gestura_core_tools/
web.rs

1//! Web fetching and search tool
2//!
3//! Provides web operations with structured output including:
4//! - Smart content extraction from web pages
5//! - Multiple search provider support (Local, SerpAPI, DuckDuckGo, Brave)
6//! - Configurable fallback chains
7//!
8//! # Default Behavior
9//! By default, uses local HTTP-based search via DuckDuckGo HTML scraping (no API key required).
10//! This provides a "batteries included" experience while allowing users to upgrade to
11//! API-based providers for better results.
12
13use crate::config::{WebSearchConfig, WebSearchProvider};
14use crate::error::{AppError, Result};
15use scraper::{ElementRef, Html, Selector};
16use serde::{Deserialize, Serialize};
17use std::collections::HashMap;
18use std::time::Duration;
19
20// ============================================================================
21// Core Data Types
22// ============================================================================
23
/// Web page fetch result
///
/// Raw result of fetching a URL, before any content extraction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FetchResult {
    /// The URL that was fetched.
    pub url: String,
    /// HTTP response status code (e.g. 200, 404).
    pub status_code: u16,
    /// Response content type, if the server reported one.
    pub content_type: Option<String>,
    /// Response body as text.
    pub content: String,
    /// Response headers as name → value pairs.
    pub headers: HashMap<String, String>,
}
33
/// Extracted content from a web page
///
/// Produced by [`ContentExtractor::extract`]; every field is derived from the
/// parsed HTML document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedContent {
    /// Page title from the `<title>` element, if present and non-empty
    pub title: Option<String>,
    /// Meta description from `<meta name="description">`, if present and non-empty
    pub description: Option<String>,
    /// Main content text (noise-filtered, whitespace-collapsed, truncated to
    /// the extractor's configured maximum length)
    pub main_content: String,
    /// Links found on the page (deduplicated by URL, capped at 50)
    pub links: Vec<PageLink>,
    /// Headings structure (h1–h6, grouped by level)
    pub headings: Vec<Heading>,
    /// Code blocks found in `<pre>`/`<code>` elements
    pub code_blocks: Vec<CodeBlock>,
    /// Source URL the content was extracted from
    pub url: String,
}
52
/// A link found on a page
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageLink {
    /// Visible anchor text of the link.
    pub text: String,
    /// Link target; absolute-path hrefs are resolved against the page's base
    /// URL during extraction, other relative hrefs are kept as-is.
    pub href: String,
}
59
/// A heading from the document
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading level: 1-6 for h1-h6.
    pub level: u8, // 1-6 for h1-h6
    /// Heading text content (trimmed).
    pub text: String,
}
66
/// A code block from the document
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language detected from `language-*` / `lang-*` / `hljs-*` CSS classes,
    /// if any were present.
    pub language: Option<String>,
    /// Raw code text.
    pub code: String,
}
73
/// Web search result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The query string that produced these results.
    pub query: String,
    /// Ordered list of result items.
    pub results: Vec<SearchItem>,
    /// Which provider returned these results
    pub provider: String,
}
82
/// A single search result item
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SearchItem {
    /// Result title as reported by the provider.
    pub title: String,
    /// Result URL (provider redirect wrappers resolved where possible).
    pub url: String,
    /// Short description of the result; may be empty.
    pub snippet: String,
    /// Optional extracted content (if extract_content is enabled)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<ExtractedContent>,
}
93
94// ============================================================================
95// Content Extractor
96// ============================================================================
97
/// Smart content extraction from HTML pages
///
/// Holds CSS selector strings (parsed on demand) that locate the main content
/// area and the noise elements to strip from it.
#[derive(Debug, Clone)]
pub struct ContentExtractor {
    /// Selectors for main content areas (priority order)
    main_content_selectors: Vec<&'static str>,
    /// Elements to filter out during extraction (ads, navigation, footers, etc.)
    noise_selectors: Vec<&'static str>,
    /// Maximum content length in bytes for extracted main content
    max_content_length: usize,
}
108
impl Default for ContentExtractor {
    /// Default extractor caps main content at 10,000 bytes.
    fn default() -> Self {
        Self::new(10_000)
    }
}
114
115impl ContentExtractor {
116    pub fn new(max_content_length: usize) -> Self {
117        Self {
118            // Priority order: most specific content containers first
119            main_content_selectors: vec![
120                "article",
121                "main",
122                "[role=\"main\"]",
123                ".post-content",
124                ".article-content",
125                ".entry-content",
126                ".content",
127                ".markdown-body",
128                ".prose",
129                "#content",
130                "#main",
131                "body",
132            ],
133            // Noise elements to remove
134            noise_selectors: vec![
135                "script",
136                "style",
137                "nav",
138                "header",
139                "footer",
140                "aside",
141                ".sidebar",
142                ".navigation",
143                ".menu",
144                ".ads",
145                ".advertisement",
146                ".comments",
147                ".social-share",
148                "[role=\"navigation\"]",
149                "[role=\"banner\"]",
150                "[role=\"complementary\"]",
151            ],
152            max_content_length,
153        }
154    }
155
156    /// Extract structured content from HTML
157    pub fn extract(&self, html: &str, url: &str) -> ExtractedContent {
158        let document = Html::parse_document(html);
159
160        // Extract title
161        let title = self.extract_title(&document);
162
163        // Extract meta description
164        let description = self.extract_meta_description(&document);
165
166        // Extract headings
167        let headings = self.extract_headings(&document);
168
169        // Extract code blocks
170        let code_blocks = self.extract_code_blocks(&document);
171
172        // Extract links
173        let links = self.extract_links(&document, url);
174
175        // Extract main content
176        let main_content = self.extract_main_content(&document);
177
178        ExtractedContent {
179            title,
180            description,
181            main_content,
182            links,
183            headings,
184            code_blocks,
185            url: url.to_string(),
186        }
187    }
188
189    fn extract_title(&self, doc: &Html) -> Option<String> {
190        let selector = Selector::parse("title").ok()?;
191        doc.select(&selector)
192            .next()
193            .map(|el| self.get_text(&el).trim().to_string())
194            .filter(|s| !s.is_empty())
195    }
196
197    fn extract_meta_description(&self, doc: &Html) -> Option<String> {
198        let selector = Selector::parse("meta[name=\"description\"]").ok()?;
199        doc.select(&selector)
200            .next()
201            .and_then(|el| el.value().attr("content"))
202            .map(|s| s.trim().to_string())
203            .filter(|s| !s.is_empty())
204    }
205
206    fn extract_headings(&self, doc: &Html) -> Vec<Heading> {
207        let mut headings = Vec::new();
208        for level in 1..=6 {
209            if let Ok(selector) = Selector::parse(&format!("h{}", level)) {
210                for el in doc.select(&selector) {
211                    let text = self.get_text(&el).trim().to_string();
212                    if !text.is_empty() && text.len() < 200 {
213                        headings.push(Heading {
214                            level: level as u8,
215                            text,
216                        });
217                    }
218                }
219            }
220        }
221        headings
222    }
223
224    fn extract_code_blocks(&self, doc: &Html) -> Vec<CodeBlock> {
225        let mut blocks = Vec::new();
226
227        // Helper to extract language from class attribute
228        let extract_lang = |class: Option<&str>| -> Option<String> {
229            class.and_then(|c| {
230                c.split_whitespace()
231                    .find(|cls| {
232                        cls.starts_with("language-")
233                            || cls.starts_with("lang-")
234                            || cls.starts_with("hljs-")
235                    })
236                    .map(|cls| {
237                        cls.trim_start_matches("language-")
238                            .trim_start_matches("lang-")
239                            .trim_start_matches("hljs-")
240                            .to_string()
241                    })
242            })
243        };
244
245        // First, look for <pre><code> blocks (most common pattern)
246        if let Ok(selector) = Selector::parse("pre code") {
247            for el in doc.select(&selector) {
248                let code = self.get_text(&el);
249                if code.len() > 10 && code.len() < 10_000 {
250                    // Try to detect language from the code element's class
251                    let language = extract_lang(el.value().attr("class"));
252                    blocks.push(CodeBlock { language, code });
253                }
254            }
255        }
256
257        // Then look for standalone <pre> blocks (without nested <code>)
258        if let Ok(selector) = Selector::parse("pre") {
259            for el in doc.select(&selector) {
260                // Skip if this <pre> contains a <code> element (already handled above)
261                if let Ok(code_sel) = Selector::parse("code")
262                    && el.select(&code_sel).next().is_some()
263                {
264                    continue;
265                }
266                let code = self.get_text(&el);
267                if code.len() > 10 && code.len() < 10_000 {
268                    let language = extract_lang(el.value().attr("class"));
269                    blocks.push(CodeBlock { language, code });
270                }
271            }
272        }
273
274        blocks
275    }
276
277    fn extract_links(&self, doc: &Html, base_url: &str) -> Vec<PageLink> {
278        let mut links = Vec::new();
279        if let Ok(selector) = Selector::parse("a[href]") {
280            for el in doc.select(&selector) {
281                if let Some(href) = el.value().attr("href") {
282                    let text = self.get_text(&el).trim().to_string();
283                    if !text.is_empty() && text.len() < 200 {
284                        // Resolve relative URLs
285                        let resolved_href = if href.starts_with("http") {
286                            href.to_string()
287                        } else if href.starts_with('/') {
288                            // Absolute path
289                            if let Ok(base) = url::Url::parse(base_url) {
290                                format!(
291                                    "{}://{}{}",
292                                    base.scheme(),
293                                    base.host_str().unwrap_or(""),
294                                    href
295                                )
296                            } else {
297                                href.to_string()
298                            }
299                        } else {
300                            href.to_string()
301                        };
302                        links.push(PageLink {
303                            text,
304                            href: resolved_href,
305                        });
306                    }
307                }
308            }
309        }
310        // Deduplicate by URL
311        links.sort_by(|a, b| a.href.cmp(&b.href));
312        links.dedup_by(|a, b| a.href == b.href);
313        links.truncate(50); // Limit links
314        links
315    }
316
317    fn extract_main_content(&self, doc: &Html) -> String {
318        // Try each main content selector in priority order
319        for selector_str in &self.main_content_selectors {
320            if let Ok(selector) = Selector::parse(selector_str)
321                && let Some(el) = doc.select(&selector).next()
322            {
323                // Use noise-filtered extraction for cleaner content
324                let text = self.get_clean_text_without_noise(&el);
325                if text.len() > 100 {
326                    return self.truncate_content(&text);
327                }
328            }
329        }
330
331        // Fallback: get all text from body, filtering noise
332        if let Ok(selector) = Selector::parse("body")
333            && let Some(el) = doc.select(&selector).next()
334        {
335            return self.truncate_content(&self.get_clean_text_without_noise(&el));
336        }
337
338        String::new()
339    }
340
341    fn get_text(&self, el: &ElementRef) -> String {
342        el.text().collect::<Vec<_>>().join(" ")
343    }
344
345    /// Check if an element matches any noise selector.
346    /// Used to filter out navigation, ads, footers, etc.
347    fn is_noise_element(&self, el: &ElementRef) -> bool {
348        for selector_str in &self.noise_selectors {
349            if let Ok(selector) = Selector::parse(selector_str) {
350                // Check if this element matches the noise selector
351                if selector.matches(el) {
352                    return true;
353                }
354            }
355        }
356        false
357    }
358
359    /// Get text content from an element, excluding noise elements.
360    /// Recursively extracts text while filtering out ads, navigation, etc.
361    fn get_text_without_noise(&self, el: &ElementRef) -> String {
362        let mut text_parts = Vec::new();
363
364        for child in el.children() {
365            if let Some(text) = child.value().as_text() {
366                text_parts.push(text.to_string());
367            } else if let Some(child_el) = ElementRef::wrap(child) {
368                // Skip noise elements
369                if !self.is_noise_element(&child_el) {
370                    text_parts.push(self.get_text_without_noise(&child_el));
371                }
372            }
373        }
374
375        text_parts.join(" ")
376    }
377
378    /// Get clean text content, excluding noise elements.
379    fn get_clean_text_without_noise(&self, el: &ElementRef) -> String {
380        let raw = self.get_text_without_noise(el);
381        // Collapse whitespace and clean up
382        let ws_re = regex::Regex::new(r"\s+").unwrap();
383        ws_re.replace_all(&raw, " ").trim().to_string()
384    }
385
386    fn truncate_content(&self, content: &str) -> String {
387        if content.len() <= self.max_content_length {
388            content.to_string()
389        } else {
390            // Try to truncate at a sentence boundary
391            let truncated = &content[..self.max_content_length];
392            if let Some(last_period) = truncated.rfind(". ") {
393                format!("{}...", &truncated[..=last_period])
394            } else {
395                format!("{}...", truncated)
396            }
397        }
398    }
399}
400
401// ============================================================================
402// Search Providers
403// ============================================================================
404
/// Trait for search providers
///
/// Each implementation wraps a specific backend (local DuckDuckGo scraping,
/// SerpAPI, DuckDuckGo API, …) behind a common async interface.
#[async_trait::async_trait]
pub trait SearchProvider: Send + Sync {
    /// Provider name for logging
    fn name(&self) -> &str;

    /// Execute a search query
    ///
    /// Returns at most `max_results` items; errors surface backend request or
    /// parse failures.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>>;
}
414
/// Local HTTP-based search using DuckDuckGo HTML scraping
/// No API key required - default provider
pub struct LocalSearchProvider {
    /// HTTP client preconfigured with browser-like headers and a timeout.
    client: reqwest::Client,
    /// Content extractor for fetching and parsing search result pages
    extractor: ContentExtractor,
}
422
impl Default for LocalSearchProvider {
    /// Defaults: 30-second request timeout, 10,000-byte content cap.
    fn default() -> Self {
        Self::new(Duration::from_secs(30), 10_000)
    }
}
428
429impl LocalSearchProvider {
430    /// Realistic Chrome user agent to avoid bot detection by DuckDuckGo.
431    /// A truncated UA string is immediately flagged as automated traffic.
432    const USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
433        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
434
435    pub fn new(timeout: Duration, max_content_length: usize) -> Self {
436        use reqwest::header::{self, HeaderMap, HeaderValue};
437
438        let mut default_headers = HeaderMap::new();
439        default_headers.insert(
440            header::ACCEPT,
441            HeaderValue::from_static(
442                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
443            ),
444        );
445        default_headers.insert(
446            header::ACCEPT_LANGUAGE,
447            HeaderValue::from_static("en-US,en;q=0.9"),
448        );
449        default_headers.insert(
450            header::REFERER,
451            HeaderValue::from_static("https://html.duckduckgo.com/"),
452        );
453
454        Self {
455            client: reqwest::Client::builder()
456                .timeout(timeout)
457                .user_agent(Self::USER_AGENT)
458                .default_headers(default_headers)
459                .build()
460                .unwrap_or_default(),
461            extractor: ContentExtractor::new(max_content_length),
462        }
463    }
464
465    /// Returns `true` when DuckDuckGo has responded with a CAPTCHA /
466    /// bot-challenge page instead of real search results.
467    fn is_captcha_page(html: &str) -> bool {
468        html.contains("anomaly-modal") || html.contains("cc=botnet")
469    }
470
471    /// Parse DuckDuckGo HTML search results
472    fn parse_ddg_html(&self, html: &str, query: &str) -> Vec<SearchItem> {
473        let document = Html::parse_document(html);
474        let mut results = Vec::new();
475
476        // DuckDuckGo HTML structure: .result class contains search results
477        // Each result has .result__a (link) and .result__snippet (description)
478        if let Ok(result_selector) = Selector::parse(".result, .web-result") {
479            let result_count = document.select(&result_selector).count();
480            tracing::debug!("Found {} result containers in DDG HTML", result_count);
481
482            for result_el in document.select(&result_selector) {
483                // Get title and URL from the link
484                let (title, url) = if let Ok(link_sel) =
485                    Selector::parse(".result__a, .result-link, a.result__url")
486                {
487                    if let Some(link) = result_el.select(&link_sel).next() {
488                        let title = link.text().collect::<String>().trim().to_string();
489                        let url = link.value().attr("href").unwrap_or("").to_string();
490                        (title, url)
491                    } else {
492                        tracing::debug!("No link found in result container");
493                        continue;
494                    }
495                } else {
496                    continue;
497                };
498
499                // Get snippet
500                let snippet =
501                    if let Ok(snippet_sel) = Selector::parse(".result__snippet, .result-snippet") {
502                        result_el
503                            .select(&snippet_sel)
504                            .next()
505                            .map(|el| el.text().collect::<String>().trim().to_string())
506                            .unwrap_or_default()
507                    } else {
508                        String::new()
509                    };
510
511                // Resolve DuckDuckGo redirect URLs
512                let resolved_url = self.resolve_ddg_url(&url);
513
514                if !title.is_empty() && !resolved_url.is_empty() {
515                    tracing::debug!("Parsed result: {} -> {}", title, resolved_url);
516                    results.push(SearchItem {
517                        title,
518                        url: resolved_url,
519                        snippet,
520                        content: None,
521                    });
522                }
523            }
524        }
525
526        // Fallback: try to extract from simpler structure
527        if results.is_empty() {
528            tracing::warn!("Primary DDG parser returned no results, trying fallback parser");
529
530            if let Ok(link_sel) = Selector::parse("a.result__a, .results a[href*=\"http\"]") {
531                let link_count = document.select(&link_sel).count();
532                tracing::debug!("Fallback parser found {} links", link_count);
533
534                for link in document.select(&link_sel).take(10) {
535                    let title = link.text().collect::<String>().trim().to_string();
536                    let url = link.value().attr("href").unwrap_or("").to_string();
537                    if !title.is_empty() && url.contains("http") {
538                        let resolved_url = self.resolve_ddg_url(&url);
539                        tracing::debug!("Fallback result: {} -> {}", title, resolved_url);
540                        results.push(SearchItem {
541                            title,
542                            url: resolved_url,
543                            snippet: format!("Search result for: {}", query),
544                            content: None,
545                        });
546                    }
547                }
548            }
549        }
550
551        if results.is_empty() {
552            tracing::error!(
553                "DDG HTML parsing found no results. HTML length: {} bytes. \
554                 DuckDuckGo may have changed their HTML structure, or the \
555                 response is an unrecognised challenge page. \
556                 Consider configuring a Brave or SerpAPI key for reliable search.",
557                html.len()
558            );
559        } else {
560            tracing::info!(
561                "Successfully parsed {} results from DDG HTML",
562                results.len()
563            );
564        }
565
566        results
567    }
568
569    /// Resolve DuckDuckGo redirect URL to actual URL
570    fn resolve_ddg_url(&self, url: &str) -> String {
571        // DDG sometimes uses redirect URLs like //duckduckgo.com/l/?uddg=...
572        if url.contains("uddg=")
573            && let Some(encoded) = url.split("uddg=").nth(1)
574        {
575            if let Some(end) = encoded.find('&') {
576                return urlencoding::decode(&encoded[..end])
577                    .unwrap_or_default()
578                    .to_string();
579            }
580            return urlencoding::decode(encoded).unwrap_or_default().to_string();
581        }
582        // Handle //hostname/path format
583        if url.starts_with("//") {
584            return format!("https:{}", url);
585        }
586        url.to_string()
587    }
588
589    /// Fetch and extract content from a URL.
590    /// Returns extracted content including title, description, and main text.
591    pub async fn fetch_content(&self, url: &str) -> Result<ExtractedContent> {
592        Self::fetch_content_with(self.client.clone(), self.extractor.clone(), url.to_string()).await
593    }
594
595    async fn fetch_content_with(
596        client: reqwest::Client,
597        extractor: ContentExtractor,
598        url: String,
599    ) -> Result<ExtractedContent> {
600        let response = client.get(&url).send().await.map_err(|e| {
601            AppError::Io(std::io::Error::other(format!(
602                "Failed to fetch URL {}: {}",
603                url, e
604            )))
605        })?;
606
607        let html = response.text().await.map_err(|e| {
608            AppError::Io(std::io::Error::other(format!(
609                "Failed to read response from {}: {}",
610                url, e
611            )))
612        })?;
613
614        let url_for_extract = url.clone();
615        tokio::task::spawn_blocking(move || extractor.extract(&html, &url_for_extract))
616            .await
617            .map_err(|error| {
618                AppError::Io(std::io::Error::other(format!(
619                    "Content extraction task failed for {}: {}",
620                    url, error
621                )))
622            })
623    }
624
625    /// Enrich a search result by fetching and extracting content from its URL.
626    /// Returns the search item with the `content` field populated.
627    pub async fn enrich_result(&self, mut item: SearchItem) -> SearchItem {
628        if let Ok(mut extracted) = self.fetch_content(&item.url).await {
629            // Truncate main content for summary if too long
630            if extracted.main_content.len() > 500 {
631                extracted.main_content = format!("{}...", &extracted.main_content[..500]);
632            }
633            item.content = Some(extracted);
634        }
635        item
636    }
637
638    /// Search with content extraction - fetches and extracts content from top results.
639    /// This is slower but provides richer results with actual page content.
640    pub async fn search_with_content(
641        &self,
642        query: &str,
643        max_results: usize,
644        fetch_content_count: usize,
645    ) -> Result<Vec<SearchItem>> {
646        let mut results = self.search_basic(query, max_results).await?;
647
648        // Fetch content for top N results
649        let fetch_count = fetch_content_count.min(results.len());
650        if fetch_count == 0 {
651            return Ok(results);
652        }
653
654        let mut tasks = tokio::task::JoinSet::new();
655        for (index, item) in results.iter().enumerate().take(fetch_count) {
656            let url = item.url.clone();
657            let client = self.client.clone();
658            let extractor = self.extractor.clone();
659            tasks.spawn(async move {
660                let content = Self::fetch_content_with(client, extractor, url).await.ok();
661                (index, content)
662            });
663        }
664
665        while let Some(joined) = tasks.join_next().await {
666            match joined {
667                Ok((index, Some(mut extracted))) => {
668                    if extracted.main_content.len() > 500 {
669                        extracted.main_content = format!("{}...", &extracted.main_content[..500]);
670                    }
671                    if let Some(item) = results.get_mut(index) {
672                        item.content = Some(extracted);
673                    }
674                }
675                Ok((_index, None)) => {}
676                Err(error) => {
677                    tracing::warn!(error = %error, "Search content enrichment task failed");
678                }
679            }
680        }
681
682        Ok(results)
683    }
684
685    /// Basic search without content extraction (faster).
686    ///
687    /// Uses HTTP **POST** with form-encoded data, matching DuckDuckGo's own
688    /// HTML search form. GET requests to this endpoint are aggressively
689    /// blocked with CAPTCHA challenges.
690    async fn search_basic(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
691        let search_url = "https://html.duckduckgo.com/html/";
692
693        let response = self
694            .client
695            .post(search_url)
696            .form(&[("q", query)])
697            .send()
698            .await
699            .map_err(|e| {
700                AppError::Io(std::io::Error::other(format!("Search request failed: {e}")))
701            })?;
702
703        let html = response.text().await.map_err(|e| {
704            AppError::Io(std::io::Error::other(format!(
705                "Failed to read response: {e}"
706            )))
707        })?;
708
709        // Detect CAPTCHA / bot-challenge page before attempting to parse.
710        // Returning an error (instead of an empty Vec) ensures the fallback
711        // provider chain in WebSearchService::search() is triggered.
712        if Self::is_captcha_page(&html) {
713            tracing::warn!(
714                "DuckDuckGo returned a CAPTCHA bot-challenge page ({} bytes). \
715                 Automated requests are being rate-limited. \
716                 Consider configuring a Brave or SerpAPI key for reliable search.",
717                html.len()
718            );
719            return Err(AppError::Io(std::io::Error::other(
720                "DuckDuckGo returned a CAPTCHA challenge — automated requests are being blocked. \
721                 Configure a Brave Search or SerpAPI key in Settings → Web Search for reliable results.",
722            )));
723        }
724
725        let mut results = self.parse_ddg_html(&html, query);
726        results.truncate(max_results);
727
728        Ok(results)
729    }
730}
731
#[async_trait::async_trait]
impl SearchProvider for LocalSearchProvider {
    fn name(&self) -> &str {
        "local"
    }

    /// Scrape DuckDuckGo's HTML endpoint for up to `max_results` items.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
        // Use basic search for the trait implementation (fast, no content fetching)
        self.search_basic(query, max_results).await
    }
}
743
/// SerpAPI provider for Google search results
pub struct SerpApiProvider {
    /// HTTP client with the configured request timeout.
    client: reqwest::Client,
    /// SerpAPI key, sent as a query parameter on every request.
    api_key: String,
}
749
750impl SerpApiProvider {
751    pub fn new(api_key: String, timeout: Duration) -> Self {
752        Self {
753            client: reqwest::Client::builder()
754                .timeout(timeout)
755                .build()
756                .unwrap_or_default(),
757            api_key,
758        }
759    }
760}
761
762#[async_trait::async_trait]
763impl SearchProvider for SerpApiProvider {
764    fn name(&self) -> &str {
765        "serpapi"
766    }
767
768    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
769        let encoded_query = urlencoding::encode(query);
770        let search_url = format!(
771            "https://serpapi.com/search.json?q={}&api_key={}&num={}",
772            encoded_query, self.api_key, max_results
773        );
774
775        let response = self.client.get(&search_url).send().await.map_err(|e| {
776            AppError::Io(std::io::Error::other(format!(
777                "SerpAPI request failed: {e}"
778            )))
779        })?;
780
781        let json: serde_json::Value = response.json().await.map_err(|e| {
782            AppError::Io(std::io::Error::other(format!(
783                "Failed to parse SerpAPI response: {e}"
784            )))
785        })?;
786
787        let mut results = Vec::new();
788        if let Some(organic) = json.get("organic_results").and_then(|v| v.as_array()) {
789            for item in organic.iter().take(max_results) {
790                let title = item
791                    .get("title")
792                    .and_then(|v| v.as_str())
793                    .unwrap_or("")
794                    .to_string();
795                let url = item
796                    .get("link")
797                    .and_then(|v| v.as_str())
798                    .unwrap_or("")
799                    .to_string();
800                let snippet = item
801                    .get("snippet")
802                    .and_then(|v| v.as_str())
803                    .unwrap_or("")
804                    .to_string();
805
806                if !title.is_empty() && !url.is_empty() {
807                    results.push(SearchItem {
808                        title,
809                        url,
810                        snippet,
811                        content: None,
812                    });
813                }
814            }
815        }
816
817        Ok(results)
818    }
819}
820
/// DuckDuckGo Instant Answer API provider
pub struct DuckDuckGoApiProvider {
    /// HTTP client with the configured request timeout.
    client: reqwest::Client,
}
825
impl Default for DuckDuckGoApiProvider {
    /// Default provider uses a 30-second request timeout.
    fn default() -> Self {
        Self::new(Duration::from_secs(30))
    }
}
831
832impl DuckDuckGoApiProvider {
833    pub fn new(timeout: Duration) -> Self {
834        Self {
835            client: reqwest::Client::builder()
836                .timeout(timeout)
837                .build()
838                .unwrap_or_default(),
839        }
840    }
841}
842
843#[async_trait::async_trait]
844impl SearchProvider for DuckDuckGoApiProvider {
845    fn name(&self) -> &str {
846        "duckduckgo"
847    }
848
849    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
850        let encoded_query = urlencoding::encode(query);
851        // DuckDuckGo Instant Answer API
852        let search_url = format!(
853            "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1",
854            encoded_query
855        );
856
857        let response = self.client.get(&search_url).send().await.map_err(|e| {
858            AppError::Io(std::io::Error::other(format!(
859                "DDG API request failed: {e}"
860            )))
861        })?;
862
863        let json: serde_json::Value = response.json().await.map_err(|e| {
864            AppError::Io(std::io::Error::other(format!(
865                "Failed to parse DDG response: {e}"
866            )))
867        })?;
868
869        let mut results = Vec::new();
870
871        // Check for abstract (main answer)
872        if let Some(abstract_text) = json.get("Abstract").and_then(|v| v.as_str())
873            && !abstract_text.is_empty()
874        {
875            let url = json
876                .get("AbstractURL")
877                .and_then(|v| v.as_str())
878                .unwrap_or("")
879                .to_string();
880            let source = json
881                .get("AbstractSource")
882                .and_then(|v| v.as_str())
883                .unwrap_or("DuckDuckGo");
884            results.push(SearchItem {
885                title: format!("{} - {}", query, source),
886                url,
887                snippet: abstract_text.to_string(),
888                content: None,
889            });
890        }
891
892        // Add related topics
893        if let Some(topics) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
894            for topic in topics
895                .iter()
896                .take(max_results.saturating_sub(results.len()))
897            {
898                if let (Some(text), Some(url)) = (
899                    topic.get("Text").and_then(|v| v.as_str()),
900                    topic.get("FirstURL").and_then(|v| v.as_str()),
901                ) && !text.is_empty()
902                    && !url.is_empty()
903                {
904                    // Extract title from text (usually format: "Title - Description")
905                    let (title, snippet) = if let Some(idx) = text.find(" - ") {
906                        (text[..idx].to_string(), text[idx + 3..].to_string())
907                    } else {
908                        (query.to_string(), text.to_string())
909                    };
910                    results.push(SearchItem {
911                        title,
912                        url: url.to_string(),
913                        snippet,
914                        content: None,
915                    });
916                }
917            }
918        }
919
920        Ok(results)
921    }
922}
923
/// Brave Search API provider
///
/// Requires a paid/free-tier subscription token, sent as the
/// `X-Subscription-Token` header on every request.
pub struct BraveSearchProvider {
    // HTTP client configured with a per-request timeout in `new`.
    client: reqwest::Client,
    // Brave Search subscription token.
    api_key: String,
}
929
930impl BraveSearchProvider {
931    pub fn new(api_key: String, timeout: Duration) -> Self {
932        Self {
933            client: reqwest::Client::builder()
934                .timeout(timeout)
935                .build()
936                .unwrap_or_default(),
937            api_key,
938        }
939    }
940}
941
942#[async_trait::async_trait]
943impl SearchProvider for BraveSearchProvider {
944    fn name(&self) -> &str {
945        "brave"
946    }
947
948    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchItem>> {
949        let encoded_query = urlencoding::encode(query);
950        let search_url = format!(
951            "https://api.search.brave.com/res/v1/web/search?q={}&count={}",
952            encoded_query, max_results
953        );
954
955        let response = self
956            .client
957            .get(&search_url)
958            .header("X-Subscription-Token", &self.api_key)
959            .header("Accept", "application/json")
960            .send()
961            .await
962            .map_err(|e| {
963                AppError::Io(std::io::Error::other(format!(
964                    "Brave API request failed: {e}"
965                )))
966            })?;
967
968        let json: serde_json::Value = response.json().await.map_err(|e| {
969            AppError::Io(std::io::Error::other(format!(
970                "Failed to parse Brave response: {e}"
971            )))
972        })?;
973
974        let mut results = Vec::new();
975        if let Some(web) = json
976            .get("web")
977            .and_then(|v| v.get("results"))
978            .and_then(|v| v.as_array())
979        {
980            for item in web.iter().take(max_results) {
981                let title = item
982                    .get("title")
983                    .and_then(|v| v.as_str())
984                    .unwrap_or("")
985                    .to_string();
986                let url = item
987                    .get("url")
988                    .and_then(|v| v.as_str())
989                    .unwrap_or("")
990                    .to_string();
991                let snippet = item
992                    .get("description")
993                    .and_then(|v| v.as_str())
994                    .unwrap_or("")
995                    .to_string();
996
997                if !title.is_empty() && !url.is_empty() {
998                    results.push(SearchItem {
999                        title,
1000                        url,
1001                        snippet,
1002                        content: None,
1003                    });
1004                }
1005            }
1006        }
1007
1008        Ok(results)
1009    }
1010}
1011
1012// ============================================================================
1013// Web Search Service (Unified Facade)
1014// ============================================================================
1015
/// Unified web search service with configurable providers and fallback chains
///
/// Wraps provider selection (`Local`, `SerpApi`, `DuckDuckGo`, `Brave`),
/// page fetching, and structured content extraction behind one facade.
pub struct WebSearchService {
    // Provider choice, fallback chain, timeouts, and result limits.
    config: WebSearchConfig,
    // HTTP client used for direct fetch()/fetch_and_extract() calls
    // (providers build their own clients).
    client: reqwest::Client,
    // HTML-to-structured-content extractor, sized by max_content_length.
    extractor: ContentExtractor,
}
1022
impl Default for WebSearchService {
    /// Service using `WebSearchConfig::default()` (local DuckDuckGo
    /// HTML-scrape provider, no API key required).
    fn default() -> Self {
        Self::new(WebSearchConfig::default())
    }
}
1028
impl WebSearchService {
    /// Build the service from a config: constructs the shared HTTP client
    /// (timeout + user agent from config) and the content extractor.
    ///
    /// If the client builder fails, a default `reqwest::Client` (no timeout,
    /// default UA) is used so construction never errors.
    pub fn new(config: WebSearchConfig) -> Self {
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(config.timeout_secs))
            .user_agent(&config.user_agent)
            .build()
            .unwrap_or_default();

        Self {
            // Extractor is built before `config` is moved into the struct.
            extractor: ContentExtractor::new(config.max_content_length),
            config,
            client,
        }
    }

    /// Create provider based on configuration
    ///
    /// API-keyed providers (SerpAPI, Brave) silently degrade to the local
    /// provider — with a warning log — when their key is missing, so this
    /// never fails.
    fn create_provider(&self, provider_type: &WebSearchProvider) -> Box<dyn SearchProvider> {
        let timeout = Duration::from_secs(self.config.timeout_secs);

        match provider_type {
            WebSearchProvider::Local => Box::new(LocalSearchProvider::new(
                timeout,
                self.config.max_content_length,
            )),
            WebSearchProvider::SerpApi => {
                if let Some(ref key) = self.config.serpapi_key {
                    Box::new(SerpApiProvider::new(key.clone(), timeout))
                } else {
                    tracing::warn!("SerpAPI key not configured, falling back to local");
                    Box::new(LocalSearchProvider::new(
                        timeout,
                        self.config.max_content_length,
                    ))
                }
            }
            WebSearchProvider::DuckDuckGo => Box::new(DuckDuckGoApiProvider::new(timeout)),
            WebSearchProvider::Brave => {
                if let Some(ref key) = self.config.brave_key {
                    Box::new(BraveSearchProvider::new(key.clone(), timeout))
                } else {
                    tracing::warn!("Brave API key not configured, falling back to local");
                    Box::new(LocalSearchProvider::new(
                        timeout,
                        self.config.max_content_length,
                    ))
                }
            }
        }
    }

    /// Search using configured provider with fallback chain
    ///
    /// Tries the primary provider first, then each fallback in order.
    /// A provider that errors OR returns zero results advances the chain;
    /// the first non-empty result set wins. If `extract_content` is set,
    /// the top results are enriched with fetched page content before
    /// returning.
    ///
    /// # Errors
    /// Returns the last provider error when every provider in the chain
    /// fails or comes back empty.
    pub async fn search(&self, query: &str) -> Result<SearchResult> {
        let max_results = self.config.max_results;

        // Build provider chain: primary first, then fallbacks
        let mut providers = vec![self.config.provider.clone()];
        providers.extend(self.config.fallback_providers.clone());

        let mut last_error: Option<AppError> = None;

        for provider_type in &providers {
            let provider = self.create_provider(provider_type);
            tracing::info!(
                "Trying search provider '{}' for query: {}",
                provider.name(),
                query
            );

            match provider.search(query, max_results).await {
                Ok(mut results) => {
                    if results.is_empty() {
                        tracing::warn!(
                            "Provider '{}' returned 0 results for query: {}. This may indicate HTML structure changes or API issues.",
                            provider.name(),
                            query
                        );
                        // Don't treat empty results as an error - continue to next provider
                        last_error = Some(AppError::Io(std::io::Error::other(format!(
                            "Provider '{}' returned no results",
                            provider.name()
                        ))));
                        continue;
                    }

                    tracing::info!(
                        "Provider '{}' returned {} results",
                        provider.name(),
                        results.len()
                    );

                    // Optionally extract content from result pages
                    if self.config.extract_content {
                        results = self.enrich_with_content(results).await;
                    }

                    return Ok(SearchResult {
                        query: query.to_string(),
                        results,
                        provider: provider.name().to_string(),
                    });
                }
                Err(e) => {
                    tracing::warn!("Provider '{}' failed with error: {}", provider.name(), e);
                    last_error = Some(e);
                }
            }
        }

        // All providers failed or returned empty results
        Err(last_error.unwrap_or_else(|| {
            AppError::Io(std::io::Error::other(
                "All search providers failed or returned no results. \
                Consider configuring a search API (Brave or SerpAPI) in your config file. \
                The default Local provider uses DuckDuckGo HTML scraping which may be unreliable.",
            ))
        }))
    }

    /// Enrich search results with extracted content from top results
    ///
    /// Fetch failures are ignored: an item simply keeps `content: None`.
    /// Fetches run sequentially, one page at a time.
    async fn enrich_with_content(&self, mut results: Vec<SearchItem>) -> Vec<SearchItem> {
        // Only fetch content for first 3 results to avoid overwhelming requests
        let fetch_count = 3.min(results.len());

        for item in results.iter_mut().take(fetch_count) {
            if let Ok(content) = self.fetch_and_extract(&item.url).await {
                item.content = Some(content);
            }
        }

        results
    }

    /// Fetch a URL and extract structured content
    ///
    /// HTML parsing/extraction runs on a blocking thread via
    /// `spawn_blocking` so CPU-bound work doesn't stall the async runtime.
    ///
    /// # Errors
    /// Returns an I/O-wrapped error if the fetch, body read, or the
    /// blocking extraction task fails (e.g. the task panics or is
    /// cancelled).
    pub async fn fetch_and_extract(&self, url: &str) -> Result<ExtractedContent> {
        let response = self
            .client
            .get(url)
            .send()
            .await
            .map_err(|e| AppError::Io(std::io::Error::other(format!("Fetch failed: {e}"))))?;

        let html = response
            .text()
            .await
            .map_err(|e| AppError::Io(std::io::Error::other(format!("Read failed: {e}"))))?;

        // Clone the extractor and URL so the closure is 'static for
        // spawn_blocking; keep a second URL copy for the error message.
        let extractor = self.extractor.clone();
        let url = url.to_string();
        let error_url = url.clone();
        tokio::task::spawn_blocking(move || extractor.extract(&html, &url))
            .await
            .map_err(|error| {
                AppError::Io(std::io::Error::other(format!(
                    "Content extraction task failed for {}: {}",
                    error_url, error
                )))
            })
    }

    /// Fetch a web page (raw)
    ///
    /// Returns the body as text plus status code, content type, and all
    /// headers whose values are valid UTF-8 (others are dropped).
    /// Non-2xx statuses are NOT treated as errors here; callers should
    /// inspect `status_code`.
    pub async fn fetch(&self, url: &str) -> Result<FetchResult> {
        let response = self
            .client
            .get(url)
            .send()
            .await
            .map_err(|e| AppError::Io(std::io::Error::other(format!("HTTP error: {e}"))))?;

        let status_code = response.status().as_u16();
        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .map(|s| s.to_string());

        let mut headers = HashMap::new();
        for (key, value) in response.headers() {
            // Skip header values that aren't valid visible-ASCII/UTF-8.
            if let Ok(v) = value.to_str() {
                headers.insert(key.to_string(), v.to_string());
            }
        }

        let content = response
            .text()
            .await
            .map_err(|e| AppError::Io(std::io::Error::other(format!("Read error: {e}"))))?;

        Ok(FetchResult {
            url: url.to_string(),
            status_code,
            content_type,
            content,
            headers,
        })
    }
}
1225
1226// ============================================================================
1227// Legacy WebTools (for backward compatibility)
1228// ============================================================================
1229
/// Web operations service (legacy interface)
/// Prefer using WebSearchService for new code
///
/// Thin wrapper that delegates everything to an owned `WebSearchService`.
pub struct WebTools {
    // Backing service; all methods delegate to it.
    service: WebSearchService,
}
1235
impl Default for WebTools {
    /// Equivalent to `WebTools::new()` (default search configuration).
    fn default() -> Self {
        Self::new()
    }
}
1241
1242impl WebTools {
1243    pub fn new() -> Self {
1244        Self {
1245            service: WebSearchService::default(),
1246        }
1247    }
1248
1249    pub fn with_config(config: WebSearchConfig) -> Self {
1250        Self {
1251            service: WebSearchService::new(config),
1252        }
1253    }
1254
1255    /// Fetch a web page
1256    pub async fn fetch(&self, url: &str) -> Result<FetchResult> {
1257        self.service.fetch(url).await
1258    }
1259
1260    /// Search the web
1261    pub async fn search(&self, query: &str, num_results: Option<usize>) -> Result<SearchResult> {
1262        // Create a temporary config with custom result count if provided
1263        if let Some(count) = num_results {
1264            let mut config = self.service.config.clone();
1265            config.max_results = count;
1266            let temp_service = WebSearchService::new(config);
1267            temp_service.search(query).await
1268        } else {
1269            self.service.search(query).await
1270        }
1271    }
1272
1273    /// Convert HTML to plain text
1274    pub fn html_to_text(&self, html: &str) -> String {
1275        let content = self.service.extractor.extract(html, "");
1276        content.main_content
1277    }
1278
1279    /// Extract structured content from HTML
1280    pub fn extract_content(&self, html: &str, url: &str) -> ExtractedContent {
1281        self.service.extractor.extract(html, url)
1282    }
1283
1284    /// Fetch a web page and extract structured content
1285    /// This is more efficient than fetch() for LLM consumption as it returns
1286    /// only the extracted content instead of raw HTML
1287    pub async fn fetch_and_extract(&self, url: &str) -> Result<ExtractedContent> {
1288        self.service.fetch_and_extract(url).await
1289    }
1290}
1291
1292// ============================================================================
1293// Tests
1294// ============================================================================
1295
#[cfg(test)]
mod tests {
    use super::*;

    // Verifies basic extraction: title, main text, links, and headings are
    // all populated from a minimal well-formed document.
    #[test]
    fn test_content_extractor_basic() {
        let extractor = ContentExtractor::new(1000);
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Hello World</h1>
                <p>This is a test paragraph.</p>
                <a href="https://example.com">Example Link</a>
            </body>
            </html>
        "#;

        let content = extractor.extract(html, "https://test.com");
        assert_eq!(content.title, Some("Test Page".to_string()));
        assert!(content.main_content.contains("Hello World"));
        assert!(content.main_content.contains("test paragraph"));
        assert!(!content.links.is_empty());
        assert!(!content.headings.is_empty());
    }

    // Verifies <pre><code> blocks are captured and the language is parsed
    // from the `language-*` class convention.
    #[test]
    fn test_content_extractor_code_blocks() {
        let extractor = ContentExtractor::new(1000);
        let html = r#"
            <html>
            <body>
                <pre><code class="language-rust">fn main() { println!("Hello"); }</code></pre>
            </body>
            </html>
        "#;

        let content = extractor.extract(html, "https://test.com");
        assert!(!content.code_blocks.is_empty());
        assert_eq!(content.code_blocks[0].language, Some("rust".to_string()));
    }

    // Covers the three URL shapes the local DDG scraper must normalize:
    // redirect links (uddg=), protocol-relative links, and absolute URLs.
    #[test]
    fn test_ddg_url_resolution() {
        let provider = LocalSearchProvider::default();

        // Test redirect URL
        let redirect = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpath";
        assert_eq!(
            provider.resolve_ddg_url(redirect),
            "https://example.com/path"
        );

        // Test protocol-relative URL
        let relative = "//example.com/path";
        assert_eq!(
            provider.resolve_ddg_url(relative),
            "https://example.com/path"
        );

        // Test normal URL
        let normal = "https://example.com";
        assert_eq!(provider.resolve_ddg_url(normal), "https://example.com");
    }

    // Pins the documented "batteries included" defaults: local provider,
    // five results, content extraction enabled.
    #[test]
    fn test_web_search_config_default() {
        let config = WebSearchConfig::default();
        assert!(matches!(config.provider, WebSearchProvider::Local));
        assert_eq!(config.max_results, 5);
        assert!(config.extract_content);
    }
}