gestura_core_intent/
lib.rs

1//! Unified intent normalization layer for Gestura.
2//!
3//! `gestura-core-intent` converts every input modality—voice transcripts, chat
4//! text, ring gesture data (IMU taps/tilts), and future inputs—into one
5//! consistent [`Intent`] struct before any agentic processing.
6//!
7//! ## Design role
8//!
9//! This crate sits immediately after raw input capture and immediately before
10//! the pipeline's planning/execution phase. It ensures that downstream
11//! orchestration, tool selection, and response generation all operate on a
12//! single, modality-agnostic representation.
13//!
14//! ## Feature gating
15//!
16//! Intent normalization is gated behind `advanced-primitives`. When the feature
17//! is disabled the [`INTENT_NORMALIZATION_ENABLED`] constant is `false` and the
18//! middleware branch in the pipeline constant-folds away, preserving the
19//! original agentic loop behavior.
20//!
21//! ## Stable import paths
22//!
23//! Application code should import through the facade:
24//!
25//! - `gestura_core::intent::*`
26
27use std::collections::HashMap;
28
29use chrono::{DateTime, Utc};
30use gestura_core_pipeline::RequestSource;
31use serde::{Deserialize, Serialize};
32use uuid::Uuid;
33
/// Whether intent normalization was compiled into this build.
///
/// The value is `true` only when the crate is built with the
/// `advanced-primitives` feature. Because it is a `const`, a downstream
/// `if INTENT_NORMALIZATION_ENABLED { .. }` branch can be constant-folded away
/// when the feature is disabled.
pub const INTENT_NORMALIZATION_ENABLED: bool = cfg!(feature = "advanced-primitives");
37
38// ---------------------------------------------------------------------------
39// InputModality
40// ---------------------------------------------------------------------------
41
42/// Input modality that produced the raw input.
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44#[serde(rename_all = "snake_case")]
45pub enum InputModality {
46    /// Transcribed voice input from microphone / STT.
47    Voice,
48    /// Typed text input from GUI or CLI.
49    Chat,
50    /// Gesture input from the Haptic Harmony ring (IMU taps/tilts).
51    Gesture,
52    /// Any future input modality not yet enumerated.
53    Future(String),
54}
55
56impl InputModality {
57    /// Derive an [`InputModality`] from the pipeline's [`RequestSource`].
58    pub fn from_request_source(source: &RequestSource) -> Self {
59        match source {
60            RequestSource::GuiVoice => Self::Voice,
61            RequestSource::GuiText | RequestSource::CliTui | RequestSource::CliBasic => Self::Chat,
62            // Orchestrator-initiated requests are treated as chat since they
63            // originate from structured text delegation.
64            RequestSource::Orchestrator | RequestSource::Unknown => Self::Chat,
65        }
66    }
67
68    /// Short label for telemetry and metadata hints.
69    pub fn label(&self) -> &str {
70        match self {
71            Self::Voice => "voice",
72            Self::Chat => "chat",
73            Self::Gesture => "gesture",
74            Self::Future(name) => name.as_str(),
75        }
76    }
77}
78
79// ---------------------------------------------------------------------------
80// GestureData
81// ---------------------------------------------------------------------------
82
83/// Optional IMU gesture data from the Haptic Harmony ring.
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct GestureData {
86    /// Gesture type identifier (e.g. `"tap"`, `"double_tap"`, `"tilt_left"`).
87    pub gesture_type: String,
88    /// Optional raw IMU acceleration values.
89    #[serde(default, skip_serializing_if = "Option::is_none")]
90    pub acceleration: Option<[f32; 3]>,
91    /// Optional raw IMU gyroscope values.
92    #[serde(default, skip_serializing_if = "Option::is_none")]
93    pub gyroscope: Option<[f32; 3]>,
94    /// Gesture confidence from the on-device classifier (0.0–1.0).
95    #[serde(default = "default_gesture_confidence")]
96    pub confidence: f32,
97}
98
/// Serde fallback for `GestureData::confidence` when the field is missing
/// from the incoming payload.
fn default_gesture_confidence() -> f32 {
    const FALLBACK_CONFIDENCE: f32 = 0.9;
    FALLBACK_CONFIDENCE
}
102
103// ---------------------------------------------------------------------------
104// RawInput
105// ---------------------------------------------------------------------------
106
107/// Raw input envelope handed to the normalization layer.
108#[derive(Debug, Clone)]
109pub struct RawInput {
110    /// The primary text payload (transcription, typed message, or gesture label).
111    pub text: String,
112    /// Detected or declared input modality.
113    pub modality: InputModality,
114    /// Session identifier if the request is already session-scoped.
115    pub session_id: Option<String>,
116    /// Optional structured gesture data from the Haptic Harmony ring.
117    pub gesture_data: Option<GestureData>,
118}
119
120// ---------------------------------------------------------------------------
121// Intent
122// ---------------------------------------------------------------------------
123
124/// Unified, modality-agnostic intent produced by the normalization layer.
125///
126/// Every input—voice, chat, gesture, or future modality—is converted into an
127/// `Intent` before any agentic processing, ensuring a single consistent
128/// representation throughout the pipeline.
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct Intent {
131    /// Unique intent identifier (UUID v4).
132    pub id: String,
133    /// Timestamp when the intent was normalized.
134    pub timestamp: DateTime<Utc>,
135    /// Input modality that produced this intent.
136    pub modality: InputModality,
137    /// Original raw input preserved for debugging and audit.
138    pub raw_source: String,
139    /// Extracted primary action or command.
140    pub primary_action: String,
141    /// Structured parameters extracted from the input.
142    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
143    pub parameters: HashMap<String, serde_json::Value>,
144    /// Normalization confidence score (0.0–1.0).
145    pub confidence: f32,
146    /// Contextual hints derived during normalization.
147    #[serde(default, skip_serializing_if = "Vec::is_empty")]
148    pub context_hints: Vec<String>,
149}
150
151// ---------------------------------------------------------------------------
152// Normalization
153// ---------------------------------------------------------------------------
154
155/// Normalize a [`RawInput`] into a unified [`Intent`].
156///
157/// This is the single public entry point for intent normalization. It
158/// dispatches to modality-specific logic internally.
159pub fn normalize_input_to_intent(raw_input: RawInput) -> Intent {
160    match &raw_input.modality {
161        InputModality::Voice => normalize_voice(raw_input),
162        InputModality::Chat => normalize_chat(raw_input),
163        InputModality::Gesture => normalize_gesture(raw_input),
164        InputModality::Future(_) => normalize_future(raw_input),
165    }
166}
167
168/// Extract the primary action from the first sentence of text.
169///
170/// Uses [`find_sentence_boundary`] for dot handling, which avoids truncating
171/// on non-sentence dots such as filenames (`foo.rs`), version numbers (`1.5`),
172/// and URLs (`example.com`).
173fn extract_primary_action(text: &str) -> String {
174    let trimmed = text.trim();
175    if trimmed.is_empty() {
176        return String::new();
177    }
178
179    let end = find_sentence_boundary(trimmed);
180    let first_sentence = trimmed[..end].trim();
181
182    // Limit to a reasonable length for the action label.
183    first_sentence.chars().take(128).collect()
184}
185
/// Return the byte offset of the first sentence boundary in `text`.
///
/// `!`, `?`, and `\n` are **unconditional** sentence terminators.
///
/// `.` is a sentence terminator **only** when the immediately following
/// character is ASCII whitespace (`' '`, `'\t'`, `'\r'`, `'\n'`) or there is
/// no following character (end of string).  This prevents false splits on:
///
/// - filenames       (`foo.rs`, `lib.rs`, `Cargo.toml`)
/// - version numbers (`1.5`, `2.0.1`)
/// - URLs            (`https://example.com/path`)
/// - method calls    (`vec.push()`)
///
/// A dot directly before a newline is reported at the dot, so
/// `"Fix the bug.\nNext"` and `"Fix the bug. Next"` both produce the sentence
/// `"Fix the bug"`.
///
/// Returns `text.len()` when no boundary exists, i.e. the whole text is the
/// first "sentence".
///
/// # Implementation note
///
/// The scan operates on raw bytes.  This is safe for UTF-8 because the
/// sentinel characters (`.`, `!`, `?`, `\n`, space, tab, CR) are all
/// single-byte ASCII values that never appear inside a multi-byte UTF-8
/// sequence.  The returned index is therefore always a valid UTF-8 character
/// boundary.
fn find_sentence_boundary(text: &str) -> usize {
    let bytes = text.as_bytes();
    (0..bytes.len())
        .find(|&i| match bytes[i] {
            b'!' | b'?' | b'\n' => true,
            // A dot ends the sentence only when followed by whitespace or
            // end-of-string; a word/digit/symbol after the dot means it is
            // part of a filename, number, URL, or similar.
            b'.' => matches!(
                bytes.get(i + 1).copied(),
                None | Some(b' ' | b'\t' | b'\r' | b'\n')
            ),
            _ => false,
        })
        .unwrap_or(text.len())
}
225
226// ---- Voice normalization ----
227
/// Words and phrases that STT engines commonly emit but that add no meaning.
///
/// `strip_fillers` processes the entries in the order listed here and
/// lower-cases each one before matching, so casing in this table is
/// irrelevant. Keep longer phrases ahead of entries they contain
/// (`"so basically"` before `"basically"`) so the full phrase is removed
/// first.
const VOICE_FILLER_WORDS: &[&str] = &[
    "um",
    "uh",
    "er",
    "ah",
    "like",
    "you know",
    "so basically",
    "basically",
    "I mean",
    "well",
    "okay so",
    "right so",
];
243
244fn strip_fillers(text: &str) -> String {
245    let mut result = text.to_string();
246    for filler in VOICE_FILLER_WORDS {
247        // Case-insensitive removal; collapse resulting double-spaces.
248        // `find_filler_in_original` returns byte ranges measured in the
249        // *original* string so that slicing is always at valid UTF-8
250        // char boundaries — see its doc-comment for why a naïve
251        // `haystack.to_lowercase().find(needle)` is unsafe here.
252        let pattern_lower = filler.to_lowercase();
253        while let Some((pos, end)) = find_filler_in_original(&result, &pattern_lower) {
254            result = format!("{}{}", &result[..pos], &result[end..]);
255        }
256    }
257    // Collapse multiple spaces.
258    collapse_whitespace(&result)
259}
260
/// Find the first case-insensitive occurrence of `needle` in `haystack`.
///
/// `needle` must already be lower-case. The result is
/// `(start_byte, end_byte)` measured in the **original** `haystack`, or
/// `None` when there is no match. An empty needle matches at `(0, 0)`.
///
/// # Why not `haystack.to_lowercase().find(needle)`?
///
/// `to_lowercase()` can change the UTF-8 byte length of a character.
/// For example, U+0130 `İ` (2 bytes) lowercases to `i` + U+0307 combining
/// dot (3 bytes). A byte offset found inside the lowercased copy therefore
/// does not point to the same position in the original string — using it
/// directly to slice `haystack` can panic (non-char boundary) or silently
/// produce wrong output.
///
/// This function walks `haystack.char_indices()` instead, so both returned
/// offsets are guaranteed char boundaries of `haystack` regardless of what
/// case-folding does to individual characters. It allocates nothing.
fn find_filler_in_original(haystack: &str, needle: &str) -> Option<(usize, usize)> {
    if needle.is_empty() {
        return Some((0, 0));
    }

    haystack.char_indices().find_map(|(start, _)| {
        let mut candidate = haystack[start..].chars();
        let mut end = start;
        for nc in needle.chars() {
            // Running out of haystack means no match at this start.
            let hc = candidate.next()?;
            // Compare the complete lower-case expansion of the haystack char
            // against the single needle char, so a char whose lower-case form
            // is several scalars (e.g. `İ` → `i` + U+0307) never matches.
            if !hc.to_lowercase().eq(std::iter::once(nc)) {
                return None;
            }
            end += hc.len_utf8();
        }
        Some((start, end))
    })
}
313
/// Collapse every run of whitespace in `text` into a single ASCII space and
/// drop leading and trailing whitespace.
///
/// "Whitespace" follows `char::is_whitespace`, which is the definition
/// `str::split_whitespace` uses.
fn collapse_whitespace(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !result.is_empty() {
            result.push(' ');
        }
        result.push_str(word);
    }
    result
}
330
/// Heuristic confidence for a voice transcript, based only on its word count.
///
/// Very short transcripts may be noisy, so they score lower:
///
/// | words | confidence |
/// |-------|------------|
/// | 0     | 0.0        |
/// | 1–2   | 0.6        |
/// | 3–7   | 0.75       |
/// | 8+    | 0.85       |
fn voice_confidence(text: &str) -> f32 {
    match text.split_whitespace().count() {
        0 => 0.0,
        1..=2 => 0.6,
        3..=7 => 0.75,
        _ => 0.85,
    }
}
345
346fn normalize_voice(raw: RawInput) -> Intent {
347    let cleaned = strip_fillers(&raw.text);
348    let primary_action = extract_primary_action(&cleaned);
349    let confidence = voice_confidence(&cleaned);
350
351    let mut context_hints = Vec::new();
352    context_hints.push("source:voice_transcript".to_string());
353    if cleaned.len() < raw.text.len() {
354        context_hints.push("fillers_stripped".to_string());
355    }
356
357    Intent {
358        id: Uuid::new_v4().to_string(),
359        timestamp: Utc::now(),
360        modality: InputModality::Voice,
361        raw_source: raw.text,
362        primary_action,
363        parameters: HashMap::new(),
364        confidence,
365        context_hints,
366    }
367}
368
369// ---- Chat normalization ----
370
371fn normalize_chat(raw: RawInput) -> Intent {
372    let trimmed = raw.text.trim().to_string();
373    let primary_action = extract_primary_action(&trimmed);
374
375    Intent {
376        id: Uuid::new_v4().to_string(),
377        timestamp: Utc::now(),
378        modality: InputModality::Chat,
379        raw_source: raw.text,
380        primary_action,
381        parameters: HashMap::new(),
382        confidence: 0.95,
383        context_hints: vec!["source:chat_text".to_string()],
384    }
385}
386
387// ---- Gesture normalization ----
388
/// Map a gesture type label to its semantic action and mapping confidence.
///
/// The lookup ignores case and surrounding whitespace, so `"tap"`, `"TAP"`
/// and `" tap\n"` all resolve to the same action. Labels that are not in the
/// table map to `("unknown_gesture", 0.5)`.
///
/// Returns `(action_label, mapping_confidence)`.
fn gesture_to_action(gesture_type: &str) -> (&'static str, f32) {
    // Trim here so that device-supplied labels get the same treatment as the
    // text fallback, which is already trimmed by the caller.
    match gesture_type.trim().to_lowercase().as_str() {
        "tap" => ("confirm", 0.9),
        "double_tap" => ("execute", 0.92),
        "triple_tap" => ("cancel", 0.88),
        "tilt_left" => ("previous", 0.85),
        "tilt_right" => ("next", 0.85),
        "tilt_up" => ("scroll_up", 0.8),
        "tilt_down" => ("scroll_down", 0.8),
        "twist_cw" => ("increase", 0.82),
        "twist_ccw" => ("decrease", 0.82),
        "shake" => ("dismiss", 0.78),
        "hold" => ("select", 0.88),
        _ => ("unknown_gesture", 0.5),
    }
}
406
407fn normalize_gesture(raw: RawInput) -> Intent {
408    let gesture_type = raw
409        .gesture_data
410        .as_ref()
411        .map(|g| g.gesture_type.as_str())
412        .unwrap_or_else(|| raw.text.trim());
413
414    let device_confidence = raw
415        .gesture_data
416        .as_ref()
417        .map(|g| g.confidence)
418        .unwrap_or(0.9);
419
420    let (action, mapping_confidence) = gesture_to_action(gesture_type);
421
422    // Combined confidence: device classifier × mapping certainty.
423    let confidence = device_confidence * mapping_confidence;
424
425    let mut parameters = HashMap::new();
426    parameters.insert(
427        "gesture_type".to_string(),
428        serde_json::Value::String(gesture_type.to_string()),
429    );
430
431    if let Some(ref gesture) = raw.gesture_data {
432        if let Some(accel) = gesture.acceleration {
433            parameters.insert("acceleration".to_string(), serde_json::json!(accel));
434        }
435        if let Some(gyro) = gesture.gyroscope {
436            parameters.insert("gyroscope".to_string(), serde_json::json!(gyro));
437        }
438    }
439
440    let mut context_hints = vec!["source:gesture_ring".to_string()];
441    if action == "unknown_gesture" {
442        context_hints.push("unmapped_gesture".to_string());
443    }
444
445    Intent {
446        id: Uuid::new_v4().to_string(),
447        timestamp: Utc::now(),
448        modality: InputModality::Gesture,
449        raw_source: raw.text,
450        primary_action: action.to_string(),
451        parameters,
452        confidence,
453        context_hints,
454    }
455}
456
457// ---- Future modality ----
458
459fn normalize_future(raw: RawInput) -> Intent {
460    let primary_action = extract_primary_action(&raw.text);
461
462    Intent {
463        id: Uuid::new_v4().to_string(),
464        timestamp: Utc::now(),
465        modality: raw.modality.clone(),
466        raw_source: raw.text,
467        primary_action,
468        parameters: HashMap::new(),
469        confidence: 0.7,
470        context_hints: vec!["source:future_modality".to_string()],
471    }
472}
473
474// ---------------------------------------------------------------------------
475// Tests
476// ---------------------------------------------------------------------------
477
478#[cfg(test)]
479mod tests {
480    use super::*;
481
482    #[test]
483    fn voice_produces_valid_intent() {
484        let raw = RawInput {
485            text: "Um, like, please create a new file called foo.rs".to_string(),
486            modality: InputModality::Voice,
487            session_id: Some("session-1".to_string()),
488            gesture_data: None,
489        };
490
491        let intent = normalize_input_to_intent(raw);
492
493        assert_eq!(intent.modality, InputModality::Voice);
494        assert!(!intent.id.is_empty());
495        assert!(!intent.primary_action.is_empty());
496        // Fillers should be stripped from the action.
497        assert!(
498            !intent.primary_action.to_lowercase().contains("um,"),
499            "Filler 'um' should be stripped"
500        );
501        assert!(intent.confidence > 0.0 && intent.confidence <= 1.0);
502        assert!(
503            intent
504                .context_hints
505                .contains(&"source:voice_transcript".to_string())
506        );
507        assert!(intent.raw_source.contains("Um")); // raw preserved
508    }
509
510    #[test]
511    fn chat_produces_valid_intent() {
512        let raw = RawInput {
513            text: "Refactor the authentication module to use OAuth2".to_string(),
514            modality: InputModality::Chat,
515            session_id: None,
516            gesture_data: None,
517        };
518
519        let intent = normalize_input_to_intent(raw);
520
521        assert_eq!(intent.modality, InputModality::Chat);
522        assert!(!intent.id.is_empty());
523        assert_eq!(
524            intent.primary_action,
525            "Refactor the authentication module to use OAuth2"
526        );
527        assert!(
528            (intent.confidence - 0.95).abs() < f32::EPSILON,
529            "Chat confidence should be 0.95"
530        );
531        assert!(
532            intent
533                .context_hints
534                .contains(&"source:chat_text".to_string())
535        );
536    }
537
538    #[test]
539    fn gesture_produces_valid_intent() {
540        let raw = RawInput {
541            text: "double_tap".to_string(),
542            modality: InputModality::Gesture,
543            session_id: Some("session-2".to_string()),
544            gesture_data: Some(GestureData {
545                gesture_type: "double_tap".to_string(),
546                acceleration: Some([0.1, 9.8, 0.3]),
547                gyroscope: None,
548                confidence: 0.95,
549            }),
550        };
551
552        let intent = normalize_input_to_intent(raw);
553
554        assert_eq!(intent.modality, InputModality::Gesture);
555        assert_eq!(intent.primary_action, "execute");
556        assert!(intent.confidence > 0.8);
557        assert!(intent.parameters.contains_key("gesture_type"));
558        assert!(intent.parameters.contains_key("acceleration"));
559        assert!(
560            intent
561                .context_hints
562                .contains(&"source:gesture_ring".to_string())
563        );
564    }
565
566    #[test]
567    fn gesture_without_data_falls_back_to_text() {
568        let raw = RawInput {
569            text: "tap".to_string(),
570            modality: InputModality::Gesture,
571            session_id: None,
572            gesture_data: None,
573        };
574
575        let intent = normalize_input_to_intent(raw);
576        assert_eq!(intent.primary_action, "confirm");
577    }
578
579    #[test]
580    fn unknown_gesture_has_low_confidence() {
581        let raw = RawInput {
582            text: "backflip".to_string(),
583            modality: InputModality::Gesture,
584            session_id: None,
585            gesture_data: Some(GestureData {
586                gesture_type: "backflip".to_string(),
587                acceleration: None,
588                gyroscope: None,
589                confidence: 0.9,
590            }),
591        };
592
593        let intent = normalize_input_to_intent(raw);
594        assert_eq!(intent.primary_action, "unknown_gesture");
595        assert!(intent.confidence < 0.6);
596        assert!(
597            intent
598                .context_hints
599                .contains(&"unmapped_gesture".to_string())
600        );
601    }
602
603    #[test]
604    fn future_modality_passes_through() {
605        let raw = RawInput {
606            text: "Neural signal: focus next element".to_string(),
607            modality: InputModality::Future("neural".to_string()),
608            session_id: None,
609            gesture_data: None,
610        };
611
612        let intent = normalize_input_to_intent(raw);
613        assert_eq!(intent.modality, InputModality::Future("neural".to_string()));
614        assert!(!intent.primary_action.is_empty());
615        assert!((intent.confidence - 0.7).abs() < f32::EPSILON);
616    }
617
618    #[test]
619    fn voice_chat_gesture_produce_equivalent_structs() {
620        // All three produce the same Intent shape — the struct fields are
621        // always populated regardless of modality.
622        let voice = normalize_input_to_intent(RawInput {
623            text: "hello world".to_string(),
624            modality: InputModality::Voice,
625            session_id: None,
626            gesture_data: None,
627        });
628        let chat = normalize_input_to_intent(RawInput {
629            text: "hello world".to_string(),
630            modality: InputModality::Chat,
631            session_id: None,
632            gesture_data: None,
633        });
634        let gesture = normalize_input_to_intent(RawInput {
635            text: "tap".to_string(),
636            modality: InputModality::Gesture,
637            session_id: None,
638            gesture_data: None,
639        });
640
641        // All should have non-empty required fields.
642        for intent in [&voice, &chat, &gesture] {
643            assert!(!intent.id.is_empty());
644            assert!(!intent.primary_action.is_empty());
645            assert!(intent.confidence > 0.0);
646            assert!(!intent.context_hints.is_empty());
647        }
648    }
649
650    #[test]
651    fn modality_from_request_source() {
652        assert_eq!(
653            InputModality::from_request_source(&RequestSource::GuiVoice),
654            InputModality::Voice
655        );
656        assert_eq!(
657            InputModality::from_request_source(&RequestSource::GuiText),
658            InputModality::Chat
659        );
660        assert_eq!(
661            InputModality::from_request_source(&RequestSource::CliTui),
662            InputModality::Chat
663        );
664        assert_eq!(
665            InputModality::from_request_source(&RequestSource::Orchestrator),
666            InputModality::Chat
667        );
668    }
669
670    #[test]
671    fn intent_serialization_roundtrip() {
672        let intent = normalize_input_to_intent(RawInput {
673            text: "Build the project".to_string(),
674            modality: InputModality::Chat,
675            session_id: Some("s-1".to_string()),
676            gesture_data: None,
677        });
678
679        let json = serde_json::to_string(&intent).expect("serialize");
680        let parsed: Intent = serde_json::from_str(&json).expect("deserialize");
681        assert_eq!(parsed.id, intent.id);
682        assert_eq!(parsed.primary_action, intent.primary_action);
683        assert_eq!(parsed.modality, intent.modality);
684    }
685
686    #[test]
687    fn empty_text_has_zero_voice_confidence() {
688        let intent = normalize_input_to_intent(RawInput {
689            text: "".to_string(),
690            modality: InputModality::Voice,
691            session_id: None,
692            gesture_data: None,
693        });
694
695        assert!((intent.confidence - 0.0).abs() < f32::EPSILON);
696    }
697
698    // -----------------------------------------------------------------------
699    // strip_fillers / find_filler_in_original
700    // -----------------------------------------------------------------------
701
702    #[test]
703    fn strip_fillers_removes_ascii_filler_case_insensitively() {
704        // All-ASCII path: basic case-insensitive removal.
705        // Note: only the exact filler token is removed; adjacent punctuation
706        // (commas, etc.) is preserved by design.
707        assert_eq!(
708            strip_fillers("Um please open the file"),
709            "please open the file"
710        );
711        assert_eq!(
712            strip_fillers("like please open the file"),
713            "please open the file"
714        );
715        assert_eq!(strip_fillers("UM like uh do it"), "do it");
716    }
717
718    #[test]
719    fn strip_fillers_preserves_non_filler_content() {
720        let input = "Create a new Rust project";
721        assert_eq!(strip_fillers(input), input);
722    }
723
724    #[test]
725    fn strip_fillers_with_non_ascii_prefix_does_not_panic() {
726        // U+0130 İ (2 UTF-8 bytes) lowercases to 'i' + U+0307 (3 bytes).
727        // A naïve `haystack.to_lowercase().find(needle)` returns an offset
728        // inside the lowercased string that is 1 byte ahead of the real
729        // position in the original, potentially slicing in the middle of the
730        // İ codepoint and causing a panic.  The fixed implementation must not
731        // panic and must remove the filler word correctly.
732        let input = "İ um please do this";
733        let result = strip_fillers(input);
734        // The filler "um" must be removed; the non-ASCII prefix must survive.
735        assert!(
736            !result.contains("um"),
737            "filler 'um' should be removed, got: {result:?}"
738        );
739        assert!(
740            result.contains('İ'),
741            "non-ASCII prefix should be preserved, got: {result:?}"
742        );
743    }
744
745    #[test]
746    fn strip_fillers_with_non_ascii_interleaved_does_not_panic() {
747        // Mix of non-ASCII chars around a filler word.
748        // "Ñoño" contains no filler substrings, so only "um" is stripped.
749        // (Note: "über" is intentionally avoided here because it contains
750        // "er", which IS in VOICE_FILLER_WORDS and would also be removed.)
751        let input = "Ñoño um test";
752        let result = strip_fillers(input);
753        assert!(
754            !result.contains("um"),
755            "filler 'um' should be removed, got: {result:?}"
756        );
757        assert!(
758            result.contains("Ñoño"),
759            "non-ASCII word should survive, got: {result:?}"
760        );
761    }
762
763    #[test]
764    fn find_filler_returns_valid_original_byte_range() {
765        // Verify that the returned (start, end) range is always a valid slice
766        // of the original haystack, even with a non-ASCII prefix.
767        let haystack = "İ um test"; // İ = 2 bytes
768        let needle = "um";
769        let (start, end) = find_filler_in_original(haystack, needle).expect("should find 'um'");
770        // Slicing at these offsets must not panic.
771        let before = &haystack[..start];
772        let after = &haystack[end..];
773        assert!(before.contains('İ'));
774        assert_eq!(after.trim(), "test");
775    }
776
777    #[test]
778    fn find_filler_returns_none_when_not_present() {
779        assert!(find_filler_in_original("hello world", "um").is_none());
780    }
781
782    #[test]
783    fn find_filler_empty_needle_returns_zero_range() {
784        assert_eq!(find_filler_in_original("hello", ""), Some((0, 0)));
785    }
786
787    // -----------------------------------------------------------------------
788    // extract_primary_action / find_sentence_boundary
789    // -----------------------------------------------------------------------
790
791    #[test]
792    fn extract_primary_action_does_not_split_on_filename_dot() {
793        // Dots embedded in filenames must NOT be treated as sentence terminators.
794        assert_eq!(
795            extract_primary_action("please create foo.rs and add tests"),
796            "please create foo.rs and add tests",
797        );
798        assert_eq!(
799            extract_primary_action("open lib.rs for editing"),
800            "open lib.rs for editing",
801        );
802        assert_eq!(
803            extract_primary_action("edit Cargo.toml to add the dependency"),
804            "edit Cargo.toml to add the dependency",
805        );
806    }
807
808    #[test]
809    fn extract_primary_action_does_not_split_on_version_number() {
810        assert_eq!(
811            extract_primary_action("upgrade to version 1.5 of the SDK"),
812            "upgrade to version 1.5 of the SDK",
813        );
814        assert_eq!(
815            extract_primary_action("pin gestura-core to 2.0.1 in Cargo.toml"),
816            "pin gestura-core to 2.0.1 in Cargo.toml",
817        );
818    }
819
820    #[test]
821    fn extract_primary_action_does_not_split_on_url() {
822        assert_eq!(
823            extract_primary_action("visit https://example.com/path for the docs"),
824            "visit https://example.com/path for the docs",
825        );
826    }
827
828    #[test]
829    fn extract_primary_action_does_not_split_on_method_call_dot() {
830        assert_eq!(
831            extract_primary_action("call vec.push() and return the result"),
832            "call vec.push() and return the result",
833        );
834    }
835
836    #[test]
837    fn extract_primary_action_splits_on_sentence_ending_dot() {
838        // A dot followed by a space IS a sentence terminator.
839        assert_eq!(
840            extract_primary_action("Fix the bug. Add tests afterwards."),
841            "Fix the bug",
842        );
843    }
844
845    #[test]
846    fn extract_primary_action_splits_on_dot_at_end_of_string() {
847        // A dot at the very end of the string (no following char) is a sentence end.
848        // The result preserves the filename inside the sentence up to the final dot.
849        assert_eq!(extract_primary_action("Check file.rs."), "Check file.rs",);
850    }
851
852    #[test]
853    fn extract_primary_action_splits_on_exclamation_and_question() {
854        assert_eq!(
855            extract_primary_action("Do it now! Please hurry."),
856            "Do it now",
857        );
858        assert_eq!(
859            extract_primary_action("What should I do? Maybe this."),
860            "What should I do",
861        );
862    }
863
864    #[test]
865    fn extract_primary_action_splits_on_newline() {
866        assert_eq!(
867            extract_primary_action("First line\nSecond line"),
868            "First line",
869        );
870    }
871
872    #[test]
873    fn extract_primary_action_no_punctuation_returns_whole_text() {
874        let input = "update the authentication module to use OAuth2";
875        assert_eq!(extract_primary_action(input), input);
876    }
877
878    #[test]
879    fn extract_primary_action_caps_at_128_chars() {
880        let long_input = "a".repeat(200);
881        let result = extract_primary_action(&long_input);
882        assert_eq!(result.chars().count(), 128);
883    }
884
885    #[test]
886    fn find_sentence_boundary_dot_before_non_ascii_is_not_a_boundary() {
        // A multi-byte UTF-8 char after '.' (e.g. 'ü' = 0xC3 0xBC) must not
        // be mistaken for whitespace; the byte right after the dot is its
        // lead byte 0xC3, which is > 0x7F.
889        assert_eq!(extract_primary_action("foo.über alles"), "foo.über alles",);
890    }
891}