1use crate::config::{WebSearchConfig, WebSearchProvider};
14use crate::error::{AppError, Result};
15use scraper::{ElementRef, Html, Selector};
16use serde::{Deserialize, Serialize};
17use std::collections::HashMap;
18use std::time::Duration;
19
/// Raw result of an HTTP fetch: status, headers, and the unparsed body text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FetchResult {
    /// The URL that was requested.
    pub url: String,
    /// HTTP response status code.
    pub status_code: u16,
    /// Value of the `Content-Type` response header, if present.
    pub content_type: Option<String>,
    /// Response body, decoded as text.
    pub content: String,
    /// Response headers; only headers whose values are valid UTF-8 are kept.
    pub headers: HashMap<String, String>,
}
33
/// Structured content extracted from an HTML page by [`ContentExtractor`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedContent {
    /// Trimmed `<title>` text, if present and non-empty.
    pub title: Option<String>,
    /// `<meta name="description">` content, if present and non-empty.
    pub description: Option<String>,
    /// De-noised, whitespace-collapsed main body text (may be truncated).
    pub main_content: String,
    /// Up to 50 unique anchor links found on the page.
    pub links: Vec<PageLink>,
    /// `h1`–`h6` headings in document order per level.
    pub headings: Vec<Heading>,
    /// Code blocks found in `<pre>`/`<code>` elements.
    pub code_blocks: Vec<CodeBlock>,
    /// The URL the content was extracted from.
    pub url: String,
}
52
/// A hyperlink found on a page: its anchor text and (resolved) target.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageLink {
    /// Anchor text of the link.
    pub text: String,
    /// Link target; absolute where resolution against the base URL succeeded.
    pub href: String,
}
59
/// A document heading: `level` is 1–6 (from `h1`–`h6`), `text` is the trimmed heading text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    pub level: u8, pub text: String,
}
66
/// A code snippet found in a `<pre>`/`<code>` element.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language hint parsed from a `language-*`, `lang-*`, or `hljs-*` CSS class.
    pub language: Option<String>,
    /// The code text itself.
    pub code: String,
}
73
/// The outcome of a web search: the original query, its results, and which
/// provider produced them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The query string that was searched.
    pub query: String,
    /// Result items, possibly enriched with extracted page content.
    pub results: Vec<SearchItem>,
    /// Name of the provider that produced the results (e.g. "local", "brave").
    pub provider: String,
}
82
/// A single search hit: title, URL, snippet, and optional extracted content.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SearchItem {
    /// Result title.
    pub title: String,
    /// Result URL (redirect-unwrapped for DuckDuckGo results).
    pub url: String,
    /// Short text snippet describing the result.
    pub snippet: String,
    /// Extracted page content, populated only when enrichment is enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<ExtractedContent>,
}
93
/// Extracts structured, de-noised content from raw HTML.
#[derive(Debug, Clone)]
pub struct ContentExtractor {
    /// CSS selectors tried in order to locate the main content container.
    main_content_selectors: Vec<&'static str>,
    /// CSS selectors for elements excluded from main-content text (nav, ads, …).
    noise_selectors: Vec<&'static str>,
    /// Maximum byte length of the extracted main content.
    max_content_length: usize,
}
108
impl Default for ContentExtractor {
    /// Default extractor caps main content at 10,000 bytes.
    fn default() -> Self {
        Self::new(10_000)
    }
}
114
115impl ContentExtractor {
116 pub fn new(max_content_length: usize) -> Self {
117 Self {
118 main_content_selectors: vec![
120 "article",
121 "main",
122 "[role=\"main\"]",
123 ".post-content",
124 ".article-content",
125 ".entry-content",
126 ".content",
127 ".markdown-body",
128 ".prose",
129 "#content",
130 "#main",
131 "body",
132 ],
133 noise_selectors: vec![
135 "script",
136 "style",
137 "nav",
138 "header",
139 "footer",
140 "aside",
141 ".sidebar",
142 ".navigation",
143 ".menu",
144 ".ads",
145 ".advertisement",
146 ".comments",
147 ".social-share",
148 "[role=\"navigation\"]",
149 "[role=\"banner\"]",
150 "[role=\"complementary\"]",
151 ],
152 max_content_length,
153 }
154 }
155
156 pub fn extract(&self, html: &str, url: &str) -> ExtractedContent {
158 let document = Html::parse_document(html);
159
160 let title = self.extract_title(&document);
162
163 let description = self.extract_meta_description(&document);
165
166 let headings = self.extract_headings(&document);
168
169 let code_blocks = self.extract_code_blocks(&document);
171
172 let links = self.extract_links(&document, url);
174
175 let main_content = self.extract_main_content(&document);
177
178 ExtractedContent {
179 title,
180 description,
181 main_content,
182 links,
183 headings,
184 code_blocks,
185 url: url.to_string(),
186 }
187 }
188
189 fn extract_title(&self, doc: &Html) -> Option<String> {
190 let selector = Selector::parse("title").ok()?;
191 doc.select(&selector)
192 .next()
193 .map(|el| self.get_text(&el).trim().to_string())
194 .filter(|s| !s.is_empty())
195 }
196
197 fn extract_meta_description(&self, doc: &Html) -> Option<String> {
198 let selector = Selector::parse("meta[name=\"description\"]").ok()?;
199 doc.select(&selector)
200 .next()
201 .and_then(|el| el.value().attr("content"))
202 .map(|s| s.trim().to_string())
203 .filter(|s| !s.is_empty())
204 }
205
206 fn extract_headings(&self, doc: &Html) -> Vec<Heading> {
207 let mut headings = Vec::new();
208 for level in 1..=6 {
209 if let Ok(selector) = Selector::parse(&format!("h{}", level)) {
210 for el in doc.select(&selector) {
211 let text = self.get_text(&el).trim().to_string();
212 if !text.is_empty() && text.len() < 200 {
213 headings.push(Heading {
214 level: level as u8,
215 text,
216 });
217 }
218 }
219 }
220 }
221 headings
222 }
223
224 fn extract_code_blocks(&self, doc: &Html) -> Vec<CodeBlock> {
225 let mut blocks = Vec::new();
226
227 let extract_lang = |class: Option<&str>| -> Option<String> {
229 class.and_then(|c| {
230 c.split_whitespace()
231 .find(|cls| {
232 cls.starts_with("language-")
233 || cls.starts_with("lang-")
234 || cls.starts_with("hljs-")
235 })
236 .map(|cls| {
237 cls.trim_start_matches("language-")
238 .trim_start_matches("lang-")
239 .trim_start_matches("hljs-")
240 .to_string()
241 })
242 })
243 };
244
245 if let Ok(selector) = Selector::parse("pre code") {
247 for el in doc.select(&selector) {
248 let code = self.get_text(&el);
249 if code.len() > 10 && code.len() < 10_000 {
250 let language = extract_lang(el.value().attr("class"));
252 blocks.push(CodeBlock { language, code });
253 }
254 }
255 }
256
257 if let Ok(selector) = Selector::parse("pre") {
259 for el in doc.select(&selector) {
260 if let Ok(code_sel) = Selector::parse("code")
262 && el.select(&code_sel).next().is_some()
263 {
264 continue;
265 }
266 let code = self.get_text(&el);
267 if code.len() > 10 && code.len() < 10_000 {
268 let language = extract_lang(el.value().attr("class"));
269 blocks.push(CodeBlock { language, code });
270 }
271 }
272 }
273
274 blocks
275 }
276
277 fn extract_links(&self, doc: &Html, base_url: &str) -> Vec<PageLink> {
278 let mut links = Vec::new();
279 if let Ok(selector) = Selector::parse("a[href]") {
280 for el in doc.select(&selector) {
281 if let Some(href) = el.value().attr("href") {
282 let text = self.get_text(&el).trim().to_string();
283 if !text.is_empty() && text.len() < 200 {
284 let resolved_href = if href.starts_with("http") {
286 href.to_string()
287 } else if href.starts_with('/') {
288 if let Ok(base) = url::Url::parse(base_url) {
290 format!(
291 "{}://{}{}",
292 base.scheme(),
293 base.host_str().unwrap_or(""),
294 href
295 )
296 } else {
297 href.to_string()
298 }
299 } else {
300 href.to_string()
301 };
302 links.push(PageLink {
303 text,
304 href: resolved_href,
305 });
306 }
307 }
308 }
309 }
310 links.sort_by(|a, b| a.href.cmp(&b.href));
312 links.dedup_by(|a, b| a.href == b.href);
313 links.truncate(50); links
315 }
316
317 fn extract_main_content(&self, doc: &Html) -> String {
318 for selector_str in &self.main_content_selectors {
320 if let Ok(selector) = Selector::parse(selector_str)
321 && let Some(el) = doc.select(&selector).next()
322 {
323 let text = self.get_clean_text_without_noise(&el);
325 if text.len() > 100 {
326 return self.truncate_content(&text);
327 }
328 }
329 }
330
331 if let Ok(selector) = Selector::parse("body")
333 && let Some(el) = doc.select(&selector).next()
334 {
335 return self.truncate_content(&self.get_clean_text_without_noise(&el));
336 }
337
338 String::new()
339 }
340
341 fn get_text(&self, el: &ElementRef) -> String {
342 el.text().collect::<Vec<_>>().join(" ")
343 }
344
345 fn is_noise_element(&self, el: &ElementRef) -> bool {
348 for selector_str in &self.noise_selectors {
349 if let Ok(selector) = Selector::parse(selector_str) {
350 if selector.matches(el) {
352 return true;
353 }
354 }
355 }
356 false
357 }
358
359 fn get_text_without_noise(&self, el: &ElementRef) -> String {
362 let mut text_parts = Vec::new();
363
364 for child in el.children() {
365 if let Some(text) = child.value().as_text() {
366 text_parts.push(text.to_string());
367 } else if let Some(child_el) = ElementRef::wrap(child) {
368 if !self.is_noise_element(&child_el) {
370 text_parts.push(self.get_text_without_noise(&child_el));
371 }
372 }
373 }
374
375 text_parts.join(" ")
376 }
377
378 fn get_clean_text_without_noise(&self, el: &ElementRef) -> String {
380 let raw = self.get_text_without_noise(el);
381 let ws_re = regex::Regex::new(r"\s+").unwrap();
383 ws_re.replace_all(&raw, " ").trim().to_string()
384 }
385
386 fn truncate_content(&self, content: &str) -> String {
387 if content.len() <= self.max_content_length {
388 content.to_string()
389 } else {
390 let truncated = &content[..self.max_content_length];
392 if let Some(last_period) = truncated.rfind(". ") {
393 format!("{}...", &truncated[..=last_period])
394 } else {
395 format!("{}...", truncated)
396 }
397 }
398 }
399}
400
/// Abstraction over a web-search backend.
#[async_trait::async_trait]
pub trait SearchProvider: Send + Sync {
    /// Short provider identifier used in logs and [`SearchResult::provider`].
    fn name(&self) -> &str;

    /// Runs `query`, returning up to `max_results` items.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>>;
}
414
/// Keyless provider that scrapes DuckDuckGo's HTML results page.
pub struct LocalSearchProvider {
    /// HTTP client preconfigured with browser-like headers.
    client: reqwest::Client,
    /// Extractor used when enriching results with page content.
    extractor: ContentExtractor,
}
422
impl Default for LocalSearchProvider {
    /// Defaults: 30 s request timeout, 10,000-byte content cap.
    fn default() -> Self {
        Self::new(Duration::from_secs(30), 10_000)
    }
}
428
429impl LocalSearchProvider {
430 const USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
433 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
434
435 pub fn new(timeout: Duration, max_content_length: usize) -> Self {
436 use reqwest::header::{self, HeaderMap, HeaderValue};
437
438 let mut default_headers = HeaderMap::new();
439 default_headers.insert(
440 header::ACCEPT,
441 HeaderValue::from_static(
442 "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
443 ),
444 );
445 default_headers.insert(
446 header::ACCEPT_LANGUAGE,
447 HeaderValue::from_static("en-US,en;q=0.9"),
448 );
449 default_headers.insert(
450 header::REFERER,
451 HeaderValue::from_static("https://html.duckduckgo.com/"),
452 );
453
454 Self {
455 client: reqwest::Client::builder()
456 .timeout(timeout)
457 .user_agent(Self::USER_AGENT)
458 .default_headers(default_headers)
459 .build()
460 .unwrap_or_default(),
461 extractor: ContentExtractor::new(max_content_length),
462 }
463 }
464
465 fn is_captcha_page(html: &str) -> bool {
468 html.contains("anomaly-modal") || html.contains("cc=botnet")
469 }
470
471 fn parse_ddg_html(&self, html: &str, query: &str) -> Vec<SearchItem> {
473 let document = Html::parse_document(html);
474 let mut results = Vec::new();
475
476 if let Ok(result_selector) = Selector::parse(".result, .web-result") {
479 let result_count = document.select(&result_selector).count();
480 tracing::debug!("Found {} result containers in DDG HTML", result_count);
481
482 for result_el in document.select(&result_selector) {
483 let (title, url) = if let Ok(link_sel) =
485 Selector::parse(".result__a, .result-link, a.result__url")
486 {
487 if let Some(link) = result_el.select(&link_sel).next() {
488 let title = link.text().collect::<String>().trim().to_string();
489 let url = link.value().attr("href").unwrap_or("").to_string();
490 (title, url)
491 } else {
492 tracing::debug!("No link found in result container");
493 continue;
494 }
495 } else {
496 continue;
497 };
498
499 let snippet =
501 if let Ok(snippet_sel) = Selector::parse(".result__snippet, .result-snippet") {
502 result_el
503 .select(&snippet_sel)
504 .next()
505 .map(|el| el.text().collect::<String>().trim().to_string())
506 .unwrap_or_default()
507 } else {
508 String::new()
509 };
510
511 let resolved_url = self.resolve_ddg_url(&url);
513
514 if !title.is_empty() && !resolved_url.is_empty() {
515 tracing::debug!("Parsed result: {} -> {}", title, resolved_url);
516 results.push(SearchItem {
517 title,
518 url: resolved_url,
519 snippet,
520 content: None,
521 });
522 }
523 }
524 }
525
526 if results.is_empty() {
528 tracing::warn!("Primary DDG parser returned no results, trying fallback parser");
529
530 if let Ok(link_sel) = Selector::parse("a.result__a, .results a[href*=\"http\"]") {
531 let link_count = document.select(&link_sel).count();
532 tracing::debug!("Fallback parser found {} links", link_count);
533
534 for link in document.select(&link_sel).take(10) {
535 let title = link.text().collect::<String>().trim().to_string();
536 let url = link.value().attr("href").unwrap_or("").to_string();
537 if !title.is_empty() && url.contains("http") {
538 let resolved_url = self.resolve_ddg_url(&url);
539 tracing::debug!("Fallback result: {} -> {}", title, resolved_url);
540 results.push(SearchItem {
541 title,
542 url: resolved_url,
543 snippet: format!("Search result for: {}", query),
544 content: None,
545 });
546 }
547 }
548 }
549 }
550
551 if results.is_empty() {
552 tracing::error!(
553 "DDG HTML parsing found no results. HTML length: {} bytes. \
554 DuckDuckGo may have changed their HTML structure, or the \
555 response is an unrecognised challenge page. \
556 Consider configuring a Brave or SerpAPI key for reliable search.",
557 html.len()
558 );
559 } else {
560 tracing::info!(
561 "Successfully parsed {} results from DDG HTML",
562 results.len()
563 );
564 }
565
566 results
567 }
568
569 fn resolve_ddg_url(&self, url: &str) -> String {
571 if url.contains("uddg=")
573 && let Some(encoded) = url.split("uddg=").nth(1)
574 {
575 if let Some(end) = encoded.find('&') {
576 return urlencoding::decode(&encoded[..end])
577 .unwrap_or_default()
578 .to_string();
579 }
580 return urlencoding::decode(encoded).unwrap_or_default().to_string();
581 }
582 if url.starts_with("//") {
584 return format!("https:{}", url);
585 }
586 url.to_string()
587 }
588
589 pub async fn fetch_content(&self, url: &str) -> Result<ExtractedContent> {
592 Self::fetch_content_with(self.client.clone(), self.extractor.clone(), url.to_string()).await
593 }
594
595 async fn fetch_content_with(
596 client: reqwest::Client,
597 extractor: ContentExtractor,
598 url: String,
599 ) -> Result<ExtractedContent> {
600 let response = client.get(&url).send().await.map_err(|e| {
601 AppError::Io(std::io::Error::other(format!(
602 "Failed to fetch URL {}: {}",
603 url, e
604 )))
605 })?;
606
607 let html = response.text().await.map_err(|e| {
608 AppError::Io(std::io::Error::other(format!(
609 "Failed to read response from {}: {}",
610 url, e
611 )))
612 })?;
613
614 let url_for_extract = url.clone();
615 tokio::task::spawn_blocking(move || extractor.extract(&html, &url_for_extract))
616 .await
617 .map_err(|error| {
618 AppError::Io(std::io::Error::other(format!(
619 "Content extraction task failed for {}: {}",
620 url, error
621 )))
622 })
623 }
624
625 pub async fn enrich_result(&self, mut item: SearchItem) -> SearchItem {
628 if let Ok(mut extracted) = self.fetch_content(&item.url).await {
629 if extracted.main_content.len() > 500 {
631 extracted.main_content = format!("{}...", &extracted.main_content[..500]);
632 }
633 item.content = Some(extracted);
634 }
635 item
636 }
637
638 pub async fn search_with_content(
641 &self,
642 query: &str,
643 max_results: usize,
644 fetch_content_count: usize,
645 ) -> Result<Vec<SearchItem>> {
646 let mut results = self.search_basic(query, max_results).await?;
647
648 let fetch_count = fetch_content_count.min(results.len());
650 if fetch_count == 0 {
651 return Ok(results);
652 }
653
654 let mut tasks = tokio::task::JoinSet::new();
655 for (index, item) in results.iter().enumerate().take(fetch_count) {
656 let url = item.url.clone();
657 let client = self.client.clone();
658 let extractor = self.extractor.clone();
659 tasks.spawn(async move {
660 let content = Self::fetch_content_with(client, extractor, url).await.ok();
661 (index, content)
662 });
663 }
664
665 while let Some(joined) = tasks.join_next().await {
666 match joined {
667 Ok((index, Some(mut extracted))) => {
668 if extracted.main_content.len() > 500 {
669 extracted.main_content = format!("{}...", &extracted.main_content[..500]);
670 }
671 if let Some(item) = results.get_mut(index) {
672 item.content = Some(extracted);
673 }
674 }
675 Ok((_index, None)) => {}
676 Err(error) => {
677 tracing::warn!(error = %error, "Search content enrichment task failed");
678 }
679 }
680 }
681
682 Ok(results)
683 }
684
685 async fn search_basic(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
691 let search_url = "https://html.duckduckgo.com/html/";
692
693 let response = self
694 .client
695 .post(search_url)
696 .form(&[("q", query)])
697 .send()
698 .await
699 .map_err(|e| {
700 AppError::Io(std::io::Error::other(format!("Search request failed: {e}")))
701 })?;
702
703 let html = response.text().await.map_err(|e| {
704 AppError::Io(std::io::Error::other(format!(
705 "Failed to read response: {e}"
706 )))
707 })?;
708
709 if Self::is_captcha_page(&html) {
713 tracing::warn!(
714 "DuckDuckGo returned a CAPTCHA bot-challenge page ({} bytes). \
715 Automated requests are being rate-limited. \
716 Consider configuring a Brave or SerpAPI key for reliable search.",
717 html.len()
718 );
719 return Err(AppError::Io(std::io::Error::other(
720 "DuckDuckGo returned a CAPTCHA challenge — automated requests are being blocked. \
721 Configure a Brave Search or SerpAPI key in Settings → Web Search for reliable results.",
722 )));
723 }
724
725 let mut results = self.parse_ddg_html(&html, query);
726 results.truncate(max_results);
727
728 Ok(results)
729 }
730}
731
#[async_trait::async_trait]
impl SearchProvider for LocalSearchProvider {
    fn name(&self) -> &str {
        "local"
    }

    /// Basic search without content enrichment; use
    /// [`LocalSearchProvider::search_with_content`] to fetch page content too.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
        self.search_basic(query, max_results).await
    }
}
743
/// Provider backed by the SerpAPI Google-results API (requires an API key).
pub struct SerpApiProvider {
    client: reqwest::Client,
    /// SerpAPI subscription key, sent as a query parameter.
    api_key: String,
}
749
750impl SerpApiProvider {
751 pub fn new(api_key: String, timeout: Duration) -> Self {
752 Self {
753 client: reqwest::Client::builder()
754 .timeout(timeout)
755 .build()
756 .unwrap_or_default(),
757 api_key,
758 }
759 }
760}
761
762#[async_trait::async_trait]
763impl SearchProvider for SerpApiProvider {
764 fn name(&self) -> &str {
765 "serpapi"
766 }
767
768 async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
769 let encoded_query = urlencoding::encode(query);
770 let search_url = format!(
771 "https://serpapi.com/search.json?q={}&api_key={}&num={}",
772 encoded_query, self.api_key, max_results
773 );
774
775 let response = self.client.get(&search_url).send().await.map_err(|e| {
776 AppError::Io(std::io::Error::other(format!(
777 "SerpAPI request failed: {e}"
778 )))
779 })?;
780
781 let json: serde_json::Value = response.json().await.map_err(|e| {
782 AppError::Io(std::io::Error::other(format!(
783 "Failed to parse SerpAPI response: {e}"
784 )))
785 })?;
786
787 let mut results = Vec::new();
788 if let Some(organic) = json.get("organic_results").and_then(|v| v.as_array()) {
789 for item in organic.iter().take(max_results) {
790 let title = item
791 .get("title")
792 .and_then(|v| v.as_str())
793 .unwrap_or("")
794 .to_string();
795 let url = item
796 .get("link")
797 .and_then(|v| v.as_str())
798 .unwrap_or("")
799 .to_string();
800 let snippet = item
801 .get("snippet")
802 .and_then(|v| v.as_str())
803 .unwrap_or("")
804 .to_string();
805
806 if !title.is_empty() && !url.is_empty() {
807 results.push(SearchItem {
808 title,
809 url,
810 snippet,
811 content: None,
812 });
813 }
814 }
815 }
816
817 Ok(results)
818 }
819}
820
/// Keyless provider using DuckDuckGo's Instant Answer JSON API
/// (abstracts and related topics, not full web results).
pub struct DuckDuckGoApiProvider {
    client: reqwest::Client,
}
825
impl Default for DuckDuckGoApiProvider {
    /// Default: 30-second request timeout.
    fn default() -> Self {
        Self::new(Duration::from_secs(30))
    }
}
831
832impl DuckDuckGoApiProvider {
833 pub fn new(timeout: Duration) -> Self {
834 Self {
835 client: reqwest::Client::builder()
836 .timeout(timeout)
837 .build()
838 .unwrap_or_default(),
839 }
840 }
841}
842
#[async_trait::async_trait]
impl SearchProvider for DuckDuckGoApiProvider {
    fn name(&self) -> &str {
        "duckduckgo"
    }

    /// Queries the Instant Answer API and synthesizes results from the
    /// `Abstract` (if any) plus `RelatedTopics` entries, up to `max_results`.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
        let encoded_query = urlencoding::encode(query);
        let search_url = format!(
            "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1",
            encoded_query
        );

        let response = self.client.get(&search_url).send().await.map_err(|e| {
            AppError::Io(std::io::Error::other(format!(
                "DDG API request failed: {e}"
            )))
        })?;

        let json: serde_json::Value = response.json().await.map_err(|e| {
            AppError::Io(std::io::Error::other(format!(
                "Failed to parse DDG response: {e}"
            )))
        })?;

        let mut results = Vec::new();

        // The abstract (a topic summary) becomes the first result when present.
        if let Some(abstract_text) = json.get("Abstract").and_then(|v| v.as_str())
            && !abstract_text.is_empty()
        {
            let url = json
                .get("AbstractURL")
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let source = json
                .get("AbstractSource")
                .and_then(|v| v.as_str())
                .unwrap_or("DuckDuckGo");
            results.push(SearchItem {
                title: format!("{} - {}", query, source),
                url,
                snippet: abstract_text.to_string(),
                content: None,
            });
        }

        // Fill remaining slots from related topics that carry both text and a URL.
        if let Some(topics) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
            for topic in topics
                .iter()
                .take(max_results.saturating_sub(results.len()))
            {
                if let (Some(text), Some(url)) = (
                    topic.get("Text").and_then(|v| v.as_str()),
                    topic.get("FirstURL").and_then(|v| v.as_str()),
                ) && !text.is_empty()
                    && !url.is_empty()
                {
                    // Topic text is usually "Title - description"; split on the
                    // first " - " (ASCII, so byte indexing is safe here).
                    let (title, snippet) = if let Some(idx) = text.find(" - ") {
                        (text[..idx].to_string(), text[idx + 3..].to_string())
                    } else {
                        (query.to_string(), text.to_string())
                    };
                    results.push(SearchItem {
                        title,
                        url: url.to_string(),
                        snippet,
                        content: None,
                    });
                }
            }
        }

        Ok(results)
    }
}
923
/// Provider backed by the Brave Web Search API (requires a subscription token).
pub struct BraveSearchProvider {
    client: reqwest::Client,
    /// Brave subscription token, sent via the `X-Subscription-Token` header.
    api_key: String,
}
929
930impl BraveSearchProvider {
931 pub fn new(api_key: String, timeout: Duration) -> Self {
932 Self {
933 client: reqwest::Client::builder()
934 .timeout(timeout)
935 .build()
936 .unwrap_or_default(),
937 api_key,
938 }
939 }
940}
941
942#[async_trait::async_trait]
943impl SearchProvider for BraveSearchProvider {
944 fn name(&self) -> &str {
945 "brave"
946 }
947
948 async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
949 let encoded_query = urlencoding::encode(query);
950 let search_url = format!(
951 "https://api.search.brave.com/res/v1/web/search?q={}&count={}",
952 encoded_query, max_results
953 );
954
955 let response = self
956 .client
957 .get(&search_url)
958 .header("X-Subscription-Token", &self.api_key)
959 .header("Accept", "application/json")
960 .send()
961 .await
962 .map_err(|e| {
963 AppError::Io(std::io::Error::other(format!(
964 "Brave API request failed: {e}"
965 )))
966 })?;
967
968 let json: serde_json::Value = response.json().await.map_err(|e| {
969 AppError::Io(std::io::Error::other(format!(
970 "Failed to parse Brave response: {e}"
971 )))
972 })?;
973
974 let mut results = Vec::new();
975 if let Some(web) = json
976 .get("web")
977 .and_then(|v| v.get("results"))
978 .and_then(|v| v.as_array())
979 {
980 for item in web.iter().take(max_results) {
981 let title = item
982 .get("title")
983 .and_then(|v| v.as_str())
984 .unwrap_or("")
985 .to_string();
986 let url = item
987 .get("url")
988 .and_then(|v| v.as_str())
989 .unwrap_or("")
990 .to_string();
991 let snippet = item
992 .get("description")
993 .and_then(|v| v.as_str())
994 .unwrap_or("")
995 .to_string();
996
997 if !title.is_empty() && !url.is_empty() {
998 results.push(SearchItem {
999 title,
1000 url,
1001 snippet,
1002 content: None,
1003 });
1004 }
1005 }
1006 }
1007
1008 Ok(results)
1009 }
1010}
1011
/// High-level search service: picks a provider from config (with fallbacks),
/// runs searches, and optionally enriches results with page content.
pub struct WebSearchService {
    /// Search configuration (provider selection, keys, limits).
    config: WebSearchConfig,
    /// Shared HTTP client for direct fetches.
    client: reqwest::Client,
    /// HTML-to-structured-content extractor.
    extractor: ContentExtractor,
}
1022
impl Default for WebSearchService {
    /// Builds the service from the default [`WebSearchConfig`].
    fn default() -> Self {
        Self::new(WebSearchConfig::default())
    }
}
1028
1029impl WebSearchService {
1030 pub fn new(config: WebSearchConfig) -> Self {
1031 let client = reqwest::Client::builder()
1032 .timeout(Duration::from_secs(config.timeout_secs))
1033 .user_agent(&config.user_agent)
1034 .build()
1035 .unwrap_or_default();
1036
1037 Self {
1038 extractor: ContentExtractor::new(config.max_content_length),
1039 config,
1040 client,
1041 }
1042 }
1043
1044 fn create_provider(&self, provider_type: &WebSearchProvider) -> Box<dyn SearchProvider> {
1046 let timeout = Duration::from_secs(self.config.timeout_secs);
1047
1048 match provider_type {
1049 WebSearchProvider::Local => Box::new(LocalSearchProvider::new(
1050 timeout,
1051 self.config.max_content_length,
1052 )),
1053 WebSearchProvider::SerpApi => {
1054 if let Some(ref key) = self.config.serpapi_key {
1055 Box::new(SerpApiProvider::new(key.clone(), timeout))
1056 } else {
1057 tracing::warn!("SerpAPI key not configured, falling back to local");
1058 Box::new(LocalSearchProvider::new(
1059 timeout,
1060 self.config.max_content_length,
1061 ))
1062 }
1063 }
1064 WebSearchProvider::DuckDuckGo => Box::new(DuckDuckGoApiProvider::new(timeout)),
1065 WebSearchProvider::Brave => {
1066 if let Some(ref key) = self.config.brave_key {
1067 Box::new(BraveSearchProvider::new(key.clone(), timeout))
1068 } else {
1069 tracing::warn!("Brave API key not configured, falling back to local");
1070 Box::new(LocalSearchProvider::new(
1071 timeout,
1072 self.config.max_content_length,
1073 ))
1074 }
1075 }
1076 }
1077 }
1078
1079 pub async fn search(&self, query: &str) -> Result<SearchResult> {
1081 let max_results = self.config.max_results;
1082
1083 let mut providers = vec![self.config.provider.clone()];
1085 providers.extend(self.config.fallback_providers.clone());
1086
1087 let mut last_error: Option<AppError> = None;
1088
1089 for provider_type in &providers {
1090 let provider = self.create_provider(provider_type);
1091 tracing::info!(
1092 "Trying search provider '{}' for query: {}",
1093 provider.name(),
1094 query
1095 );
1096
1097 match provider.search(query, max_results).await {
1098 Ok(mut results) => {
1099 if results.is_empty() {
1100 tracing::warn!(
1101 "Provider '{}' returned 0 results for query: {}. This may indicate HTML structure changes or API issues.",
1102 provider.name(),
1103 query
1104 );
1105 last_error = Some(AppError::Io(std::io::Error::other(format!(
1107 "Provider '{}' returned no results",
1108 provider.name()
1109 ))));
1110 continue;
1111 }
1112
1113 tracing::info!(
1114 "Provider '{}' returned {} results",
1115 provider.name(),
1116 results.len()
1117 );
1118
1119 if self.config.extract_content {
1121 results = self.enrich_with_content(results).await;
1122 }
1123
1124 return Ok(SearchResult {
1125 query: query.to_string(),
1126 results,
1127 provider: provider.name().to_string(),
1128 });
1129 }
1130 Err(e) => {
1131 tracing::warn!("Provider '{}' failed with error: {}", provider.name(), e);
1132 last_error = Some(e);
1133 }
1134 }
1135 }
1136
1137 Err(last_error.unwrap_or_else(|| {
1139 AppError::Io(std::io::Error::other(
1140 "All search providers failed or returned no results. \
1141 Consider configuring a search API (Brave or SerpAPI) in your config file. \
1142 The default Local provider uses DuckDuckGo HTML scraping which may be unreliable.",
1143 ))
1144 }))
1145 }
1146
1147 async fn enrich_with_content(&self, mut results: Vec<SearchItem>) -> Vec<SearchItem> {
1149 let fetch_count = 3.min(results.len());
1151
1152 for item in results.iter_mut().take(fetch_count) {
1153 if let Ok(content) = self.fetch_and_extract(&item.url).await {
1154 item.content = Some(content);
1155 }
1156 }
1157
1158 results
1159 }
1160
1161 pub async fn fetch_and_extract(&self, url: &str) -> Result<ExtractedContent> {
1163 let response = self
1164 .client
1165 .get(url)
1166 .send()
1167 .await
1168 .map_err(|e| AppError::Io(std::io::Error::other(format!("Fetch failed: {e}"))))?;
1169
1170 let html = response
1171 .text()
1172 .await
1173 .map_err(|e| AppError::Io(std::io::Error::other(format!("Read failed: {e}"))))?;
1174
1175 let extractor = self.extractor.clone();
1176 let url = url.to_string();
1177 let error_url = url.clone();
1178 tokio::task::spawn_blocking(move || extractor.extract(&html, &url))
1179 .await
1180 .map_err(|error| {
1181 AppError::Io(std::io::Error::other(format!(
1182 "Content extraction task failed for {}: {}",
1183 error_url, error
1184 )))
1185 })
1186 }
1187
1188 pub async fn fetch(&self, url: &str) -> Result<FetchResult> {
1190 let response = self
1191 .client
1192 .get(url)
1193 .send()
1194 .await
1195 .map_err(|e| AppError::Io(std::io::Error::other(format!("HTTP error: {e}"))))?;
1196
1197 let status_code = response.status().as_u16();
1198 let content_type = response
1199 .headers()
1200 .get("content-type")
1201 .and_then(|v| v.to_str().ok())
1202 .map(|s| s.to_string());
1203
1204 let mut headers = HashMap::new();
1205 for (key, value) in response.headers() {
1206 if let Ok(v) = value.to_str() {
1207 headers.insert(key.to_string(), v.to_string());
1208 }
1209 }
1210
1211 let content = response
1212 .text()
1213 .await
1214 .map_err(|e| AppError::Io(std::io::Error::other(format!("Read error: {e}"))))?;
1215
1216 Ok(FetchResult {
1217 url: url.to_string(),
1218 status_code,
1219 content_type,
1220 content,
1221 headers,
1222 })
1223 }
1224}
1225
/// Thin facade over [`WebSearchService`] exposing the tool-level API
/// (fetch, search, HTML extraction helpers).
pub struct WebTools {
    service: WebSearchService,
}
1235
impl Default for WebTools {
    /// Builds with the default search configuration.
    fn default() -> Self {
        Self::new()
    }
}
1241
impl WebTools {
    /// Builds the tools with a default-configured [`WebSearchService`].
    pub fn new() -> Self {
        Self {
            service: WebSearchService::default(),
        }
    }

    /// Builds the tools with an explicit search configuration.
    pub fn with_config(config: WebSearchConfig) -> Self {
        Self {
            service: WebSearchService::new(config),
        }
    }

    /// Raw GET of `url` (status, headers, unparsed body).
    pub async fn fetch(&self, url: &str) -> Result<FetchResult> {
        self.service.fetch(url).await
    }

    /// Runs a search; `num_results` overrides the configured `max_results`.
    ///
    /// NOTE(review): overriding `num_results` builds a temporary service
    /// (including a fresh HTTP client) per call — acceptable for occasional
    /// use, but worth revisiting if this is called in a hot path.
    pub async fn search(&self, query: &str, num_results: Option<usize>) -> Result<SearchResult> {
        if let Some(count) = num_results {
            let mut config = self.service.config.clone();
            config.max_results = count;
            let temp_service = WebSearchService::new(config);
            temp_service.search(query).await
        } else {
            self.service.search(query).await
        }
    }

    /// Converts an HTML string to de-noised plain text (main content only).
    pub fn html_to_text(&self, html: &str) -> String {
        let content = self.service.extractor.extract(html, "");
        content.main_content
    }

    /// Extracts structured content from an HTML string; `url` is used for
    /// link resolution and echoed in the result.
    pub fn extract_content(&self, html: &str, url: &str) -> ExtractedContent {
        self.service.extractor.extract(html, url)
    }

    /// Fetches `url` and extracts structured content.
    pub async fn fetch_and_extract(&self, url: &str) -> Result<ExtractedContent> {
        self.service.fetch_and_extract(url).await
    }
}
1291
#[cfg(test)]
mod tests {
    use super::*;

    // Title, main content, links, and headings all come through on a minimal page.
    #[test]
    fn test_content_extractor_basic() {
        let extractor = ContentExtractor::new(1000);
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Hello World</h1>
                <p>This is a test paragraph.</p>
                <a href="https://example.com">Example Link</a>
            </body>
            </html>
        "#;

        let content = extractor.extract(html, "https://test.com");
        assert_eq!(content.title, Some("Test Page".to_string()));
        assert!(content.main_content.contains("Hello World"));
        assert!(content.main_content.contains("test paragraph"));
        assert!(!content.links.is_empty());
        assert!(!content.headings.is_empty());
    }

    // The "language-*" class on <pre><code> is parsed into a language hint.
    #[test]
    fn test_content_extractor_code_blocks() {
        let extractor = ContentExtractor::new(1000);
        let html = r#"
            <html>
            <body>
                <pre><code class="language-rust">fn main() { println!("Hello"); }</code></pre>
            </body>
            </html>
        "#;

        let content = extractor.extract(html, "https://test.com");
        assert!(!content.code_blocks.is_empty());
        assert_eq!(content.code_blocks[0].language, Some("rust".to_string()));
    }

    // resolve_ddg_url: unwraps uddg= redirects, upgrades protocol-relative
    // URLs to https, and passes normal URLs through unchanged.
    #[test]
    fn test_ddg_url_resolution() {
        let provider = LocalSearchProvider::default();

        let redirect = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpath";
        assert_eq!(
            provider.resolve_ddg_url(redirect),
            "https://example.com/path"
        );

        let relative = "//example.com/path";
        assert_eq!(
            provider.resolve_ddg_url(relative),
            "https://example.com/path"
        );

        let normal = "https://example.com";
        assert_eq!(provider.resolve_ddg_url(normal), "https://example.com");
    }

    // Default config sanity: local provider, 5 results, enrichment enabled.
    #[test]
    fn test_web_search_config_default() {
        let config = WebSearchConfig::default();
        assert!(matches!(config.provider, WebSearchProvider::Local));
        assert_eq!(config.max_results, 5);
        assert!(config.extract_content);
    }
}