gestura_core_pipeline/
reflection.rs

1//! ERL-inspired experiential reflection types and pure helpers.
2//!
3//! This module adapts the high-level loop from Experiential Reinforcement
4//! Learning (ERL) into Gestura's pipeline model:
5//!
6//! 1. **Experience** — the agent makes an initial attempt and observes tool
7//!    outcomes plus any obvious failure signals.
8//! 2. **Reflection** — when the response quality score falls below a configured
9//!    threshold, the runtime asks the model for a structured explanation of what
10//!    went wrong and how to improve.
11//! 3. **Consolidation** — the resulting reflection can be reused within the same
12//!    turn, stored in session working memory, and promoted into long-term memory
13//!    for later prompt injection.
14//!
15//! This crate owns the *portable* pieces of that design:
16//!
17//! - reflection configuration and data structures
18//! - quality-signal extraction and heuristic scoring
19//! - prompt construction for reflection generation
20//! - parsing of the structured reflection response format
21//!
22//! The concrete runtime integration lives in
23//! `gestura-core/src/pipeline/reflection.rs`, which wires these helpers into the
24//! agent loop, streaming events, session storage, and memory-bank promotion.
25
26use chrono::{DateTime, Utc};
27use gestura_core_foundation::OutcomeSignal;
28use serde::{Deserialize, Serialize};
29
30use crate::types::{AgentResponse, ToolResult};
31
32/// Configuration for the experiential reflection system.
33///
34/// The settings map the ERL-inspired design onto Gestura's runtime behavior:
35///
36/// - `enabled` keeps the feature opt-in because it adds an extra LLM call on
37///   weak turns and can therefore increase latency/cost.
38/// - `quality_threshold` maps to ERL's τ-style gate for deciding when a turn is
39///   poor enough to merit reflection.
40/// - `max_injected_reflections` limits how much cross-episode corrective memory
41///   can be injected back into future prompts.
42/// - `max_retry_attempts` bounds same-turn corrective retries. A retry may be a
43///   text-only revision or one safe read-only re-execution driven by the
44///   reflection strategy, but the runtime still caps it to a single retry.
45/// - `promotion_confidence` gates whether a reflection is strong enough to move
46///   from short-term/session memory into long-term memory-bank storage.
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct ReflectionConfig {
49    /// Enable the reflection phase in the agent loop.
50    pub enabled: bool,
51    /// Quality threshold (0.0–1.0). Reflection only triggers when
52    /// the response quality score falls below this value.
53    /// Maps to ERL's τ parameter (gated reflection).
54    pub quality_threshold: f32,
55    /// Maximum number of past reflections to inject into prompt context.
56    pub max_injected_reflections: usize,
57    /// Maximum number of reflection-guided corrective retries per turn.
58    ///
59    /// The runtime currently applies at most one bounded retry. That retry may
60    /// be a text-only revision or one safe re-execution with read-only tool
61    /// policy.
62    pub max_retry_attempts: usize,
63    /// Minimum confidence for a reflection to be promoted to long-term memory.
64    pub promotion_confidence: f32,
65}
66
67impl Default for ReflectionConfig {
68    fn default() -> Self {
69        Self {
70            enabled: true,          // On by default
71            quality_threshold: 0.6, // Trigger reflection below 60% quality
72            max_injected_reflections: 3,
73            max_retry_attempts: 1,
74            promotion_confidence: 0.75,
75        }
76    }
77}
78
79/// A structured reflection generated after a suboptimal agent turn.
80///
81/// This is Gestura's durable representation of ERL's corrective reflection: a
82/// concise summary of the attempted action, the failure mode, and the strategy
83/// the agent should apply next time.
84///
85/// The runtime can:
86///
87/// - use it immediately for a same-turn retry,
88/// - store it in session working memory as short-term corrective context, and
89/// - promote it into `MemoryType::Reflection` for retrieval in future turns.
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct AgentReflection {
92    /// Stable identifier so downstream outcomes can update the same reflection.
93    pub reflection_id: String,
94    /// What the agent attempted.
95    pub attempt_summary: String,
96    /// What went wrong or was suboptimal.
97    pub failure_analysis: String,
98    /// Concrete corrective strategy for future attempts.
99    pub corrective_strategy: String,
100    /// Quality improvement score (0.0–1.0) — did the reflection help?
101    /// Set after a subsequent attempt to measure improvement.
102    pub improvement_score: Option<f32>,
103    /// Tags for retrieval (tool names, error categories, task types).
104    #[serde(default, skip_serializing_if = "Vec::is_empty")]
105    pub tags: Vec<String>,
106    /// Durable outcome signals linked back from retries, gates, and task outcomes.
107    #[serde(default, skip_serializing_if = "Vec::is_empty")]
108    pub outcome_signals: Vec<OutcomeSignal>,
109    /// Session context.
110    pub session_id: String,
111    /// Task ID if available.
112    pub task_id: Option<String>,
113    /// When this reflection was created.
114    pub timestamp: DateTime<Utc>,
115}
116
117impl AgentReflection {
118    /// Create a new reflection.
119    pub fn new(
120        session_id: impl Into<String>,
121        attempt_summary: impl Into<String>,
122        failure_analysis: impl Into<String>,
123        corrective_strategy: impl Into<String>,
124    ) -> Self {
125        let session_id = session_id.into();
126        Self {
127            reflection_id: build_reflection_id(&session_id),
128            attempt_summary: attempt_summary.into(),
129            failure_analysis: failure_analysis.into(),
130            corrective_strategy: corrective_strategy.into(),
131            improvement_score: None,
132            tags: Vec::new(),
133            outcome_signals: Vec::new(),
134            session_id,
135            task_id: None,
136            timestamp: Utc::now(),
137        }
138    }
139
140    /// Attach tags for retrieval.
141    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
142        self.tags = tags;
143        self
144    }
145
146    /// Attach a task ID.
147    pub fn with_task(mut self, task_id: impl Into<String>) -> Self {
148        self.task_id = Some(task_id.into());
149        self
150    }
151
152    /// Attach durable outcome signals for corrective-learning provenance.
153    #[must_use]
154    pub fn with_outcome_signals(mut self, outcome_signals: Vec<OutcomeSignal>) -> Self {
155        self.outcome_signals = outcome_signals;
156        self
157    }
158
159    /// Record a single durable outcome signal.
160    pub fn push_outcome_signal(&mut self, signal: OutcomeSignal) {
161        self.outcome_signals = merge_outcome_signals(&self.outcome_signals, &[signal]);
162    }
163
164    /// Score how strongly this reflection should be promoted into durable memory.
165    #[must_use]
166    pub fn promotion_confidence(&self) -> f32 {
167        reflection_promotion_confidence(self.improvement_score, &self.outcome_signals)
168    }
169
170    /// Format as a prompt section for context injection.
171    pub fn to_prompt_section(&self) -> String {
172        let improvement = self.improvement_score.map(|score| {
173            format!(
174                "             - Observed improvement after retry: {:.0}%\n",
175                score * 100.0
176            )
177        });
178        let outcomes = if self.outcome_signals.is_empty() {
179            String::new()
180        } else {
181            format!(
182                "             - Outcomes: {}\n",
183                self.outcome_signals
184                    .iter()
185                    .map(|signal| signal.kind.label())
186                    .collect::<Vec<_>>()
187                    .join(", ")
188            )
189        };
190
191        format!(
192            "**Reflection** ({})\n\
193             - Attempted: {}\n\
194             - Issue: {}\n\
195             - Strategy: {}\n{}{}",
196            self.timestamp.format("%Y-%m-%d %H:%M UTC"),
197            self.attempt_summary,
198            self.failure_analysis,
199            self.corrective_strategy,
200            improvement.unwrap_or_default(),
201            outcomes,
202        )
203    }
204}
205
206fn build_reflection_id(session_id: &str) -> String {
207    let nanos = Utc::now()
208        .timestamp_nanos_opt()
209        .unwrap_or_else(|| Utc::now().timestamp_micros() * 1_000);
210    let session_fragment = session_id
211        .chars()
212        .filter(|ch| ch.is_ascii_alphanumeric())
213        .take(12)
214        .collect::<String>();
215    if session_fragment.is_empty() {
216        format!("reflection-{nanos}")
217    } else {
218        format!("reflection-{session_fragment}-{nanos}")
219    }
220}
221
222/// Score how strongly a reflection should be promoted into durable memory.
223#[must_use]
224pub fn reflection_promotion_confidence(
225    improvement_score: Option<f32>,
226    outcome_signals: &[OutcomeSignal],
227) -> f32 {
228    let base = improvement_score
229        .map(|score| (0.55 + (score * 0.35)).clamp(0.55, 0.90))
230        .unwrap_or(0.62);
231    let delta: f32 = outcome_signals
232        .iter()
233        .map(|signal| signal.kind.confidence_delta())
234        .sum();
235    (base + delta).clamp(0.30, 0.97)
236}
237
238/// Merge outcome signals, replacing older entries of the same kind with the newest.
239#[must_use]
240pub fn merge_outcome_signals(
241    existing: &[OutcomeSignal],
242    incoming: &[OutcomeSignal],
243) -> Vec<OutcomeSignal> {
244    let mut merged = existing.to_vec();
245    for signal in incoming {
246        if let Some(slot) = merged
247            .iter_mut()
248            .find(|current| current.kind == signal.kind)
249        {
250            *slot = signal.clone();
251        } else {
252            merged.push(signal.clone());
253        }
254    }
255    merged.sort_by_key(|signal| signal.observed_at);
256    merged
257}
258
/// Measure how much of the remaining quality gap a reflection-guided retry
/// closed.
///
/// The result is normalized to 0.0–1.0:
/// - `0.0` — the retry was no better than the initial attempt
/// - `1.0` — the retry closed the whole gap between the initial quality and a
///   perfect score
pub fn score_reflection_improvement(initial_quality: f32, retry_quality: f32) -> f32 {
    if retry_quality <= initial_quality {
        0.0
    } else {
        // Guard the division when the initial attempt was already (near) perfect.
        let headroom = f32::max(1.0 - initial_quality, f32::EPSILON);
        let closed_fraction = (retry_quality - initial_quality) / headroom;
        closed_fraction.clamp(0.0, 1.0)
    }
}
272
273/// Build heuristic quality signals from an agent response.
274///
275/// These signals stand in for the explicit reward signal used in ERL. Gestura
276/// does not have a verifiable task reward for most agent turns, so the runtime
277/// falls back to observable quality proxies such as tool errors, iteration
278/// pressure, truncation, and explicit failure language.
279pub fn quality_signals_for_response(
280    response: &AgentResponse,
281    max_iterations: usize,
282) -> QualitySignals {
283    let total_tool_calls = response.tool_calls.len();
284    let error_count = response
285        .tool_calls
286        .iter()
287        .filter(|tc| matches!(&tc.result, ToolResult::Error(_)))
288        .count();
289
290    let tool_error_rate = if total_tool_calls > 0 {
291        error_count as f32 / total_tool_calls as f32
292    } else {
293        0.0
294    };
295
296    QualitySignals {
297        tool_error_rate,
298        iterations_used: response.iterations,
299        max_iterations,
300        was_truncated: response.truncated,
301        has_failure_patterns: detect_failure_patterns(&response.content)
302            || detect_assertive_uncertainty(&response.content)
303            || detect_missing_debug_structure(&response.content),
304        is_empty_response: response.content.trim().is_empty(),
305    }
306}
307
/// Observable proxy signals describing how well an agent turn went.
///
/// [`QualitySignals::score`] folds them into the single number that is
/// compared against the reflection threshold (ERL's τ gate).
#[derive(Debug, Clone)]
pub struct QualitySignals {
    /// Share of tool calls that ended in an error (0.0–1.0).
    pub tool_error_rate: f32,
    /// How many agentic loop iterations the turn consumed.
    pub iterations_used: usize,
    /// The configured iteration ceiling.
    pub max_iterations: usize,
    /// Whether the response was cut short by token limits.
    pub was_truncated: bool,
    /// Whether the response text shows apology/failure patterns.
    pub has_failure_patterns: bool,
    /// Whether the response is empty or near-empty.
    pub is_empty_response: bool,
}

impl QualitySignals {
    /// Collapse the signals into one quality score (0.0–1.0, higher is better).
    ///
    /// There is no ground-truth reward to lean on, so the score starts at 1.0
    /// and loses points for each observed problem:
    ///
    /// - up to 0.4 in proportion to the tool error rate,
    /// - 0.5 per unit of iteration usage above 70% of the budget,
    /// - 0.15 for a truncated response,
    /// - 0.25 for failure/apology language.
    ///
    /// An empty response scores 0.0 outright.
    pub fn score(&self) -> f32 {
        if self.is_empty_response {
            return 0.0;
        }

        // Strong negative signal: failing tools.
        let mut quality = 1.0_f32 - self.tool_error_rate * 0.4;

        // Burning through most of the iteration budget suggests struggling.
        // A zero budget gives no ratio to judge, so it is ignored.
        let budget_usage = match self.max_iterations {
            0 => None,
            limit => Some(self.iterations_used as f32 / limit as f32),
        };
        if let Some(usage) = budget_usage {
            if usage > 0.7 {
                quality -= (usage - 0.7) * 0.5;
            }
        }

        // Truncation points at context overflow problems.
        if self.was_truncated {
            quality -= 0.15;
        }

        // Explicit failure/apology wording.
        if self.has_failure_patterns {
            quality -= 0.25;
        }

        quality.clamp(0.0, 1.0)
    }
}
365
/// Check if response text contains common failure/apology patterns.
///
/// Matching is case-insensitive. Typographic apostrophes (`‘` U+2018 and `’`
/// U+2019) are treated as a plain `'`, so "I’m unable to" is detected just
/// like "I'm unable to".
pub fn detect_failure_patterns(text: &str) -> bool {
    const FAILURE_PATTERNS: &[&str] = &[
        "i'm sorry, i can't",
        "i cannot",
        "i'm unable to",
        "unfortunately, i",
        "i don't have the ability",
        "i apologize, but i",
        "i'm not able to",
        "error occurred",
        "failed to execute",
        "i was not able to",
        "i was unable to",
        "as an ai, i",
    ];

    // Several patterns contain an ASCII apostrophe; fold curly quotes onto it
    // first so typographically "smart" output is not silently missed.
    let normalized = text
        .chars()
        .map(|ch| match ch {
            '\u{2018}' | '\u{2019}' => '\'',
            other => other,
        })
        .collect::<String>()
        .to_lowercase();

    FAILURE_PATTERNS
        .iter()
        .any(|pattern| normalized.contains(*pattern))
}
385
/// Report whether the text uses absolute-certainty phrasing.
///
/// The listed phrases present a claim as settled fact where hedged language
/// would have been appropriate. Matching is case-insensitive. The result is
/// used as one more quality signal next to [`detect_failure_patterns`].
pub fn detect_assertive_uncertainty(text: &str) -> bool {
    const OVERCONFIDENT_PHRASES: &[&str] = &[
        "the fact is that",
        "it is a fact that",
        "it is definitely the case",
        "there is no question that",
        "it is absolutely certain",
        "without any doubt",
    ];

    let haystack = text.to_lowercase();
    for phrase in OVERCONFIDENT_PHRASES.iter() {
        if haystack.contains(*phrase) {
            return true;
        }
    }
    false
}
404
/// Flag debugging-style answers that lack root-cause or verification content.
///
/// The text counts as a debugging answer when it mentions any debug marker
/// (error, bug, fix, crash, ...). Such an answer is flagged (`true`) unless it
/// contains **both** a root-cause marker ("caused by", "because", ...) and a
/// verification marker ("verify", "run ", "make sure", ...). Text without any
/// debug marker is never flagged. Matching is case-insensitive.
pub fn detect_missing_debug_structure(response: &str) -> bool {
    const DEBUG_CONTEXT_MARKERS: &[&str] = &[
        "error",
        "bug",
        "fix",
        "crash",
        "failure",
        "exception",
        "traceback",
        "stack trace",
        "panicked",
        "undefined",
        "null pointer",
        "segfault",
        "diagnos",
        "debug",
        "root cause",
    ];
    const ROOT_CAUSE_MARKERS: &[&str] = &[
        "root cause",
        "caused by",
        "because",
        "the reason",
        "this happens when",
        "this is because",
        "due to",
        "stems from",
        "originates from",
    ];
    const VERIFICATION_MARKERS: &[&str] = &[
        "verify",
        "verif",
        "to confirm",
        "run ",
        "check ",
        "test ",
        "validate",
        "you can confirm",
        "to verify",
        "make sure",
        "ensure",
    ];

    let haystack = response.to_lowercase();
    let mentions_any =
        |markers: &[&str]| markers.iter().any(|marker| haystack.contains(*marker));

    // Only debugging/diagnostic answers are held to the structure requirement.
    if !mentions_any(DEBUG_CONTEXT_MARKERS) {
        return false;
    }

    !(mentions_any(ROOT_CAUSE_MARKERS) && mentions_any(VERIFICATION_MARKERS))
}
471
472/// Build the reflection prompt that asks the LLM to analyze a suboptimal turn.
473///
474/// This is the pure prompt-construction step for reflection generation:
475///
476/// - the original user request becomes the task context,
477/// - the agent response becomes the failed/suboptimal attempt,
478/// - tool errors and quality signals become the environment feedback,
479/// - and the output contract forces the model into the structured
480///   `ATTEMPT`/`ISSUE`/`STRATEGY`/`TAGS` format expected by the parser.
481pub fn build_reflection_prompt(
482    user_request: &str,
483    agent_response: &str,
484    quality_signals: &QualitySignals,
485    tool_errors: &[String],
486) -> String {
487    let mut prompt = String::from(
488        "System: You are a self-reflective AI assistant analyzing a previous interaction \
489         that was suboptimal. Generate a structured reflection to improve future responses.\n\n",
490    );
491
492    prompt.push_str(&format!("User request: {}\n\n", user_request));
493    prompt.push_str(&format!(
494        "Agent response (quality score: {:.2}):\n{}\n\n",
495        quality_signals.score(),
496        agent_response
497    ));
498
499    if !tool_errors.is_empty() {
500        prompt.push_str("Tool errors encountered:\n");
501        for error in tool_errors {
502            prompt.push_str(&format!("- {}\n", error));
503        }
504        prompt.push('\n');
505    }
506
507    prompt.push_str(
508        "Provide a brief, structured reflection in the following format:\n\
509         ATTEMPT: [1-2 sentence summary of what was attempted]\n\
510         ISSUE: [1-2 sentence analysis of what went wrong]\n\
511         STRATEGY: [1-2 sentence corrective strategy for future attempts]\n\
512         TAGS: [comma-separated relevant tags]\n\
513         Important:\n\
514         - Output plain text only.\n\
515         - Do not wrap the reflection in Markdown code fences.\n\
516         - Do not add any preamble, explanation, or extra sections before or after the four fields.\n",
517    );
518
519    prompt
520}
521
/// The labelled sections recognized while parsing a structured reflection
/// reply.
///
/// `normalize_reflection_label` maps a textual label onto one of these
/// variants, and `parse_reflection_response` uses the variant to decide which
/// buffer the text on (and after) that label belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReflectionField {
    /// The `ATTEMPT` section: what the agent tried.
    Attempt,
    /// The `ISSUE` section: what went wrong.
    Issue,
    /// The `STRATEGY` section: the corrective strategy.
    Strategy,
    /// The `TAGS` section: comma-separated retrieval tags.
    Tags,
}
529
/// Removes every `open … close` block from `input` and trims the result.
///
/// Text outside the blocks is kept in order. When an `open` marker has no
/// matching `close`, everything from that marker to the end of the input is
/// dropped.
fn strip_tag_blocks(input: &str, open: &str, close: &str) -> String {
    let mut kept = String::new();
    let mut rest = input;

    loop {
        let Some((before, after_open)) = rest.split_once(open) else {
            // No further block: the remainder is plain text.
            kept.push_str(rest);
            break;
        };
        kept.push_str(before);

        match after_open.split_once(close) {
            Some((_, after_close)) => rest = after_close,
            // Unterminated block: discard the tail.
            None => break,
        }
    }

    kept.trim().to_string()
}
547
548fn sanitize_reflection_response(response: &str) -> String {
549    strip_tag_blocks(response, "<think>", "</think>")
550        .lines()
551        .filter(|line| !line.trim_start().starts_with("```"))
552        .collect::<Vec<_>>()
553        .join("\n")
554        .trim()
555        .to_string()
556}
557
/// Collapses every run of whitespace into a single space and drops leading
/// and trailing whitespace.
fn compact_reflection_value(value: &str) -> String {
    let mut compacted = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !compacted.is_empty() {
            compacted.push(' ');
        }
        compacted.push_str(word);
    }
    compacted
}
561
/// Strips decoration from a field value: surrounding whitespace, trailing
/// commas, and wrapping emphasis/quote characters (`*`, `_`, `` ` ``, `"`,
/// `'`), followed by a final whitespace trim.
fn clean_reflection_field_value(value: &str) -> String {
    const WRAPPERS: [char; 5] = ['*', '_', '`', '"', '\''];

    let without_commas = value.trim().trim_end_matches(',');
    let unwrapped = without_commas.trim_matches(&WRAPPERS[..]);
    unwrapped.trim().to_owned()
}
570
571fn push_reflection_segment(target: &mut String, segment: &str) {
572    let segment = compact_reflection_value(&clean_reflection_field_value(segment));
573    if segment.is_empty() {
574        return;
575    }
576    if !target.is_empty() {
577        target.push(' ');
578    }
579    target.push_str(&segment);
580}
581
582fn normalize_reflection_label(label: &str) -> Option<ReflectionField> {
583    let normalized = label
584        .trim()
585        .trim_matches(|c: char| matches!(c, '*' | '_' | '`' | '[' | ']' | '(' | ')' | '#'))
586        .chars()
587        .filter_map(|ch| {
588            if ch.is_ascii_alphanumeric() {
589                Some(ch.to_ascii_lowercase())
590            } else if matches!(ch, ' ' | '-' | '_') {
591                Some('_')
592            } else {
593                None
594            }
595        })
596        .collect::<String>();
597
598    let normalized = normalized.trim_matches('_');
599
600    match normalized {
601        "attempt" | "attempt_summary" | "summary" | "what_was_attempted" => {
602            Some(ReflectionField::Attempt)
603        }
604        "issue" | "failure" | "failure_analysis" | "problem" | "analysis" | "what_went_wrong" => {
605            Some(ReflectionField::Issue)
606        }
607        "strategy"
608        | "corrective_strategy"
609        | "correction"
610        | "fix"
611        | "improvement_strategy"
612        | "next_time" => Some(ReflectionField::Strategy),
613        "tags" | "labels" => Some(ReflectionField::Tags),
614        _ => None,
615    }
616}
617
/// Removes leading list/quote decoration from a line.
///
/// Repeatedly strips, in any combination: leading whitespace, `>` quote
/// markers, the bullets `- `, `* ` and `• `, and numbering such as `1. ` or
/// `2) `. Stops at the first character that is none of these.
fn strip_reflection_line_prefix(line: &str) -> &str {
    const BULLET_MARKERS: [&str; 3] = ["- ", "* ", "• "];

    let mut rest = line.trim_start();
    loop {
        let after_marker = rest
            .strip_prefix('>')
            .or_else(|| {
                BULLET_MARKERS
                    .iter()
                    .find_map(|marker| rest.strip_prefix(*marker))
            })
            .or_else(|| {
                // ASCII digits are single bytes, so the count is a valid
                // byte offset into `rest`.
                let digits = rest.bytes().take_while(u8::is_ascii_digit).count();
                if digits == 0 {
                    return None;
                }
                let tail = &rest[digits..];
                tail.strip_prefix(". ").or_else(|| tail.strip_prefix(") "))
            });

        match after_marker {
            Some(next) => rest = next.trim_start(),
            None => return rest,
        }
    }
}
657
658fn parse_reflection_field_line(line: &str) -> Option<(ReflectionField, String)> {
659    let candidate = strip_reflection_line_prefix(line);
660    let colon_idx = candidate.find(':')?;
661    let label = candidate[..colon_idx].trim();
662    let value = candidate[colon_idx + 1..].trim();
663    let field = normalize_reflection_label(label)?;
664    Some((field, value.to_string()))
665}
666
667fn parse_tag_values(value: &str) -> Vec<String> {
668    let trimmed = strip_reflection_line_prefix(value).trim();
669    let trimmed = clean_reflection_field_value(trimmed);
670    let trimmed = trimmed.trim_matches(|c: char| matches!(c, '[' | ']' | '{' | '}'));
671    if trimmed.is_empty() {
672        return Vec::new();
673    }
674    trimmed
675        .split(',')
676        .map(clean_reflection_field_value)
677        .filter(|tag| !tag.is_empty())
678        .collect()
679}
680
/// Extracts the string value stored under `key` from JSON-like text.
///
/// This is a tolerant scanner, not a JSON parser: it looks for `"key"`
/// followed by `:` and a double-quoted string, and returns that string
/// unescaped and trimmed.
///
/// Every occurrence of `"key"` is tried in order, so text where the key name
/// first shows up as a *value* (for example inside a tags array) still
/// resolves to the real entry. Returns `None` when no occurrence is followed
/// by a terminated string value.
///
/// Escapes: `\n`, `\r`, `\t`, `\"`, `\\` and `\uXXXX` (including surrogate
/// pairs) are decoded. Any other escaped character is kept as-is, and a
/// malformed `\u` sequence falls back to a literal `u`.
fn extract_jsonish_string_value(source: &str, key: &str) -> Option<String> {
    let needle = format!("\"{key}\"");
    source
        .match_indices(needle.as_str())
        .find_map(|(start, _)| parse_jsonish_string_after_key(&source[start + needle.len()..]))
}

/// Parses `: "value"` at the start of `after_key`, returning the unescaped,
/// trimmed value, or `None` if the shape does not match or the string is
/// unterminated.
fn parse_jsonish_string_after_key(after_key: &str) -> Option<String> {
    let body = after_key
        .trim_start()
        .strip_prefix(':')?
        .trim_start()
        .strip_prefix('"')?;

    let mut value = String::new();
    let mut chars = body.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '"' => return Some(value.trim().to_string()),
            // A dangling backslash at the end means the string never closed.
            '\\' => match chars.next()? {
                'n' => value.push('\n'),
                'r' => value.push('\r'),
                't' => value.push('\t'),
                'u' => match decode_jsonish_unicode_escape(&mut chars) {
                    Some(decoded) => value.push(decoded),
                    None => value.push('u'),
                },
                other => value.push(other),
            },
            other => value.push(other),
        }
    }

    None
}

/// Decodes the hex digits following a `\u` escape, consuming them from
/// `chars` only on success. A high surrogate immediately followed by a
/// `\uXXXX` low surrogate is combined into one character; an unpaired
/// surrogate becomes U+FFFD.
fn decode_jsonish_unicode_escape(chars: &mut std::str::Chars<'_>) -> Option<char> {
    let mut lookahead = chars.clone();
    let first = read_jsonish_hex4(&mut lookahead)?;

    let mut code = first;
    if (0xD800..0xDC00).contains(&first) {
        let mut pair = lookahead.clone();
        if pair.next() == Some('\\') && pair.next() == Some('u') {
            if let Some(low) = read_jsonish_hex4(&mut pair) {
                if (0xDC00..0xE000).contains(&low) {
                    code = 0x10000 + ((first - 0xD800) << 10) + (low - 0xDC00);
                    lookahead = pair;
                }
            }
        }
    }

    *chars = lookahead;
    Some(char::from_u32(code).unwrap_or(char::REPLACEMENT_CHARACTER))
}

/// Reads exactly four hexadecimal digits as a number.
fn read_jsonish_hex4(chars: &mut std::str::Chars<'_>) -> Option<u32> {
    let mut code = 0u32;
    for _ in 0..4 {
        code = code * 16 + chars.next()?.to_digit(16)?;
    }
    Some(code)
}
712
713fn extract_jsonish_tags(source: &str) -> Option<Vec<String>> {
714    if let Some(tags_str) = extract_jsonish_string_value(source, "tags") {
715        return Some(parse_tag_values(&tags_str));
716    }
717
718    let needle = "\"tags\"";
719    let start = source.find(needle)? + needle.len();
720    let remainder = source[start..].trim_start();
721    let remainder = remainder.strip_prefix(':')?.trim_start();
722    let remainder = remainder.strip_prefix('[')?;
723    let end = remainder.find(']')?;
724    let body = &remainder[..end];
725
726    Some(
727        body.split(',')
728            .map(|item| item.trim().trim_matches(|c: char| matches!(c, '"' | '\'')))
729            .filter(|item| !item.is_empty())
730            .map(ToOwned::to_owned)
731            .collect(),
732    )
733}
734
735fn parse_jsonish_reflection_response(response: &str, session_id: &str) -> Option<AgentReflection> {
736    let attempt = extract_jsonish_string_value(response, "attempt_summary")
737        .or_else(|| extract_jsonish_string_value(response, "attempt"))?;
738    let issue = extract_jsonish_string_value(response, "failure_analysis")
739        .or_else(|| extract_jsonish_string_value(response, "issue"))?;
740    let strategy = extract_jsonish_string_value(response, "corrective_strategy")
741        .or_else(|| extract_jsonish_string_value(response, "strategy"))?;
742    let tags = extract_jsonish_tags(response).unwrap_or_default();
743
744    Some(
745        AgentReflection::new(session_id, attempt, issue, strategy)
746            .with_tags(tags.into_iter().filter(|tag| !tag.is_empty()).collect()),
747    )
748}
749
750/// Parse a structured reflection from an LLM response.
751///
752/// Expects the format produced by `build_reflection_prompt` and intentionally
753/// fails closed if any of the core fields are missing. The runtime treats an
754/// unparsable reflection as non-durable so it does not enter session or
755/// long-term memory in a malformed shape.
756pub fn parse_reflection_response(response: &str, session_id: &str) -> Option<AgentReflection> {
757    let response = sanitize_reflection_response(response);
758    let mut attempt = String::new();
759    let mut issue = String::new();
760    let mut strategy = String::new();
761    let mut tags = Vec::new();
762    let mut current_field = None;
763
764    for line in response.lines() {
765        let trimmed = line.trim();
766        if trimmed.is_empty() {
767            continue;
768        }
769
770        if let Some((field, value)) = parse_reflection_field_line(trimmed) {
771            current_field = Some(field);
772            match field {
773                ReflectionField::Attempt => push_reflection_segment(&mut attempt, &value),
774                ReflectionField::Issue => push_reflection_segment(&mut issue, &value),
775                ReflectionField::Strategy => push_reflection_segment(&mut strategy, &value),
776                ReflectionField::Tags => tags.extend(parse_tag_values(&value)),
777            }
778            continue;
779        }
780
781        match current_field {
782            Some(ReflectionField::Attempt) => push_reflection_segment(&mut attempt, trimmed),
783            Some(ReflectionField::Issue) => push_reflection_segment(&mut issue, trimmed),
784            Some(ReflectionField::Strategy) => push_reflection_segment(&mut strategy, trimmed),
785            Some(ReflectionField::Tags) => tags.extend(parse_tag_values(trimmed)),
786            None => {}
787        }
788    }
789
790    let mut deduped_tags = Vec::new();
791    for tag in tags {
792        if !tag.is_empty() && !deduped_tags.contains(&tag) {
793            deduped_tags.push(tag);
794        }
795    }
796
797    if !attempt.is_empty() && !issue.is_empty() && !strategy.is_empty() {
798        return Some(
799            AgentReflection::new(session_id, attempt, issue, strategy).with_tags(deduped_tags),
800        );
801    }
802
803    parse_jsonish_reflection_response(&response, session_id)
804}
805
806#[cfg(test)]
807mod tests {
808    use super::*;
809    use gestura_core_foundation::OutcomeSignalKind;
810
811    #[test]
812    fn test_quality_scoring_high_quality_response() {
813        let signals = QualitySignals {
814            tool_error_rate: 0.0,
815            iterations_used: 1,
816            max_iterations: 10,
817            was_truncated: false,
818            has_failure_patterns: false,
819            is_empty_response: false,
820        };
821        let score = signals.score();
822        assert!(score > 0.9, "Good response should score > 0.9, got {score}");
823    }
824
825    #[test]
826    fn test_quality_scoring_tool_errors() {
827        let signals = QualitySignals {
828            tool_error_rate: 0.5, // Half the tools errored
829            iterations_used: 3,
830            max_iterations: 10,
831            was_truncated: false,
832            has_failure_patterns: false,
833            is_empty_response: false,
834        };
835        let score = signals.score();
836        assert!(
837            score < 0.85,
838            "50% tool errors should lower score, got {score}"
839        );
840    }
841
842    #[test]
843    fn test_quality_scoring_many_iterations() {
844        let signals = QualitySignals {
845            tool_error_rate: 0.0,
846            iterations_used: 9,
847            max_iterations: 10,
848            was_truncated: false,
849            has_failure_patterns: false,
850            is_empty_response: false,
851        };
852        let score = signals.score();
853        assert!(
854            score < 0.95,
855            "Using 90% iterations should lower score, got {score}"
856        );
857    }
858
859    #[test]
860    fn test_quality_scoring_empty_response() {
861        let signals = QualitySignals {
862            tool_error_rate: 0.0,
863            iterations_used: 1,
864            max_iterations: 10,
865            was_truncated: false,
866            has_failure_patterns: false,
867            is_empty_response: true,
868        };
869        assert_eq!(signals.score(), 0.0);
870    }
871
872    #[test]
873    fn test_quality_scoring_combined_issues() {
874        let signals = QualitySignals {
875            tool_error_rate: 0.3,
876            iterations_used: 8,
877            max_iterations: 10,
878            was_truncated: true,
879            has_failure_patterns: true,
880            is_empty_response: false,
881        };
882        let score = signals.score();
883        assert!(
884            score < 0.5,
885            "Multiple issues should produce low score, got {score}"
886        );
887    }
888
889    #[test]
890    fn test_detect_failure_patterns() {
891        assert!(detect_failure_patterns("I'm sorry, I can't do that"));
892        assert!(detect_failure_patterns(
893            "Unfortunately, I cannot access that file"
894        ));
895        assert!(!detect_failure_patterns(
896            "Here is the file content you requested"
897        ));
898    }
899
900    #[test]
901    fn test_detect_missing_debug_structure() {
902        // Debug context with neither root-cause nor verification → flag it
903        assert!(detect_missing_debug_structure(
904            "There is a bug in the config parser."
905        ));
906        // Debug context with root-cause but no verification → flag it
907        assert!(detect_missing_debug_structure(
908            "The crash is caused by a null pointer in the config parser."
909        ));
910        // Debug context with both → do not flag
911        assert!(!detect_missing_debug_structure(
912            "The crash is caused by a null pointer. To verify, run `cargo test` and check the output."
913        ));
914        // Non-debug context → do not flag regardless
915        assert!(!detect_missing_debug_structure(
916            "The deployment was successful and all services are running."
917        ));
918    }
919
920    #[test]
921    fn test_detect_assertive_uncertainty() {
922        assert!(detect_assertive_uncertainty(
923            "The fact is that Python was invented in 1989."
924        ));
925        assert!(detect_assertive_uncertainty(
926            "There is no question that this is the correct approach."
927        ));
928        assert!(!detect_assertive_uncertainty(
929            "Python is generally credited as a language designed for readability."
930        ));
931        assert!(!detect_assertive_uncertainty(
932            "The file was found at the expected path."
933        ));
934    }
935
936    #[test]
937    fn test_reflection_prompt_construction() {
938        let signals = QualitySignals {
939            tool_error_rate: 0.5,
940            iterations_used: 3,
941            max_iterations: 10,
942            was_truncated: false,
943            has_failure_patterns: false,
944            is_empty_response: false,
945        };
946        let prompt = build_reflection_prompt(
947            "Read the file",
948            "Error: file not found",
949            &signals,
950            &["FileNotFound: /tmp/missing.txt".to_string()],
951        );
952        assert!(prompt.contains("Read the file"));
953        assert!(prompt.contains("Error: file not found"));
954        assert!(prompt.contains("FileNotFound"));
955        assert!(prompt.contains("ATTEMPT:"));
956        assert!(prompt.contains("ISSUE:"));
957        assert!(prompt.contains("STRATEGY:"));
958    }
959
960    #[test]
961    fn test_reflection_response_parsing() {
962        let response = "\
963            ATTEMPT: Tried to read the file at /tmp/missing.txt\n\
964            ISSUE: The file path was incorrect; the file does not exist\n\
965            STRATEGY: Verify file existence before attempting to read; suggest alternatives\n\
966            TAGS: file, read, path-error\n";
967
968        let reflection = parse_reflection_response(response, "session-123").unwrap();
969        assert_eq!(
970            reflection.attempt_summary,
971            "Tried to read the file at /tmp/missing.txt"
972        );
973        assert!(reflection.failure_analysis.contains("incorrect"));
974        assert!(reflection.corrective_strategy.contains("Verify"));
975        assert_eq!(reflection.tags, vec!["file", "read", "path-error"]);
976        assert_eq!(reflection.session_id, "session-123");
977    }
978
979    #[test]
980    fn test_reflection_response_parsing_incomplete() {
981        let response = "ATTEMPT: Something\nISSUE: Something else\n";
982        let reflection = parse_reflection_response(response, "s1");
983        assert!(reflection.is_none(), "Missing STRATEGY should return None");
984    }
985
986    #[test]
987    fn test_reflection_response_parsing_markdown_and_multiline() {
988        let response = "<think>diagnosing tool output</think>\n\
989            - **Attempt:** Tried to inspect the missing config file.\n\
990              I answered before verifying the real path.\n\
991            - **Issue:** The response relied on an assumed file location\n\
992              instead of repository evidence.\n\
993            - **Strategy:** Search for the config file first, then answer\n\
994              only from the verified path and contents.\n\
995            - **Tags:** file, verification\n";
996
997        let reflection = parse_reflection_response(response, "session-md").unwrap();
998        assert!(
999            reflection
1000                .attempt_summary
1001                .contains("inspect the missing config file")
1002        );
1003        assert!(
1004            reflection
1005                .attempt_summary
1006                .contains("verifying the real path")
1007        );
1008        assert!(reflection.failure_analysis.contains("repository evidence"));
1009        assert!(
1010            reflection
1011                .corrective_strategy
1012                .contains("verified path and contents")
1013        );
1014        assert_eq!(reflection.tags, vec!["file", "verification"]);
1015    }
1016
1017    #[test]
1018    fn test_reflection_response_parsing_aliases_and_tag_list() {
1019        let response = "attempt_summary: Investigated a build failure without reading the actual error output.\n\
1020            failure_analysis: The explanation guessed at causes instead of grounding them in the logs.\n\
1021            corrective_strategy: Read the concrete stderr output first, then explain only the confirmed failure mode.\n\
1022            tags:\n\
1023            - shell\n\
1024            - validation\n";
1025
1026        let reflection = parse_reflection_response(response, "session-alias").unwrap();
1027        assert!(reflection.attempt_summary.contains("build failure"));
1028        assert!(
1029            reflection
1030                .failure_analysis
1031                .contains("grounding them in the logs")
1032        );
1033        assert!(
1034            reflection
1035                .corrective_strategy
1036                .contains("concrete stderr output")
1037        );
1038        assert_eq!(reflection.tags, vec!["shell", "validation"]);
1039    }
1040
1041    #[test]
1042    fn test_reflection_response_parsing_jsonish_payload() {
1043        let response = "```json\n{\n  \"attempt_summary\": \"Tried to edit the wrong file\",\n  \"failure_analysis\": \"The response assumed the target path without confirming it\",\n  \"corrective_strategy\": \"Locate the file first, then apply the edit to the verified path\",\n  \"tags\": [\"file\", \"path\"]\n}\n```";
1044
1045        let reflection = parse_reflection_response(response, "session-json").unwrap();
1046        assert_eq!(reflection.attempt_summary, "Tried to edit the wrong file");
1047        assert!(
1048            reflection
1049                .failure_analysis
1050                .contains("assumed the target path")
1051        );
1052        assert!(
1053            reflection
1054                .corrective_strategy
1055                .contains("Locate the file first")
1056        );
1057        assert_eq!(reflection.tags, vec!["file", "path"]);
1058    }
1059
1060    #[test]
1061    fn test_reflection_to_prompt_section() {
1062        let reflection = AgentReflection::new(
1063            "s1",
1064            "Read missing file",
1065            "File did not exist",
1066            "Check file existence first",
1067        )
1068        .with_outcome_signals(vec![
1069            OutcomeSignal::new(OutcomeSignalKind::RetryImproved)
1070                .with_summary("The revised answer used the correct path."),
1071        ]);
1072        let section = reflection.to_prompt_section();
1073        assert!(section.contains("Read missing file"));
1074        assert!(section.contains("File did not exist"));
1075        assert!(section.contains("Check file existence first"));
1076        assert!(section.contains("Retry improved"));
1077    }
1078
1079    #[test]
1080    fn test_reflection_improvement_score_increases_with_retry_quality() {
1081        let score = score_reflection_improvement(0.40, 0.76);
1082        assert!(
1083            score > 0.5,
1084            "Expected strong improvement signal, got {score}"
1085        );
1086    }
1087
1088    #[test]
1089    fn test_reflection_improvement_score_zero_when_retry_is_not_better() {
1090        assert_eq!(score_reflection_improvement(0.65, 0.65), 0.0);
1091        assert_eq!(score_reflection_improvement(0.65, 0.52), 0.0);
1092    }
1093
1094    #[test]
1095    fn test_promotion_confidence_uses_outcome_signals() {
1096        let baseline = AgentReflection::new(
1097            "s1",
1098            "Attempted a retry",
1099            "The first answer was weak",
1100            "Revise with the missing evidence",
1101        );
1102        let stronger = baseline.clone().with_outcome_signals(vec![
1103            OutcomeSignal::new(OutcomeSignalKind::RetryImproved),
1104            OutcomeSignal::new(OutcomeSignalKind::ReviewApproved),
1105        ]);
1106        let weaker = baseline.with_outcome_signals(vec![
1107            OutcomeSignal::new(OutcomeSignalKind::RetryDidNotImprove),
1108            OutcomeSignal::new(OutcomeSignalKind::ReviewNeedsRevision),
1109        ]);
1110
1111        assert!(stronger.promotion_confidence() > 0.70);
1112        assert!(weaker.promotion_confidence() < 0.50);
1113    }
1114
1115    #[test]
1116    fn test_merge_outcome_signals_replaces_existing_kind() {
1117        let first = OutcomeSignal::new(OutcomeSignalKind::ReviewApproved)
1118            .with_summary("Initial approval note");
1119        let replacement = OutcomeSignal::new(OutcomeSignalKind::ReviewApproved)
1120            .with_summary("Final approval note");
1121
1122        let merged = merge_outcome_signals(&[first], std::slice::from_ref(&replacement));
1123
1124        assert_eq!(merged.len(), 1);
1125        assert_eq!(merged[0].summary.as_deref(), replacement.summary.as_deref());
1126    }
1127}
gestura_core_pipeline/reflection.rs

gestura_core_pipeline/
reflection.rs