fix: stream usa ultimo frame + playground artifact_probe

- streamAnswer: el result final es el ULTIMO answer parseado (frame post-idle, limpio) en vez del mas largo, que podia ser un frame intermedio con spinner. - playground/artifact_probe.go: herramienta de auditoria que corre oneshot + stream contra varios prompts, detecta artefactos del render que se cuelan (box chars, status bar, spinner, palabras pegadas, prompt echo) y verifica consistencia del streaming (concat de deltas == result). Opcional --ref para comparar con claude -p real. No se indexa (playground del padre). El fix del spinner en si vive en parse_claude_tui_go_tui (repo padre): detecta la linea de carga por estructura (glyph + palabra + ellipsis) y firma ((Ns tokens) / esc to interrupt), no por la palabra, que cambia cada frame.
2026-06-03 23:37:39 +02:00
parent 4574f08a22
commit 22bec25d8d
4 changed files with 335 additions and 3 deletions
@@ -200,9 +200,11 @@ func streamAnswer(ctx context.Context, bin string, inputs []string, warmup, step
 			}
 			prev = ans
 		}
-		// Keep the longest answer seen as the final, even if a later frame shrank
-		// (transient reflow / parse noise).
-		if len(ans) >= len(final) {
+		// The final result is the LAST non-empty parsed answer. The last snapshot is
+		// emitted after idle (generation stopped), so it is the cleanest frame — no
+		// active spinner, fully rendered. Using "longest seen" instead would let a
+		// noisy mid-generation frame win over the clean final one.
+		if ans != "" {
 			final = ans
 		}
 	}
@@ -0,0 +1,46 @@
+# claude_pipe — artifact probe (playground)
+
+Herramienta desechable para auditar la calidad del parseo de la TUI que hace
+`claude_pipe`. No se indexa, no tiene `app.md`, vive dentro de `apps/claude_pipe/`
+y se mueve con su sub-repo.
+
+## Qué comprueba
+
+Por cada prompt, lanza el binario `claude_pipe` en modo one-shot y en modo
+`--stream`, y busca:
+
+1. **Artefactos** que se cuelan del render en la respuesta parseada: caracteres de
+   caja (`╭│╰`), reglas horizontales (`────`), fragmentos de la status bar
+   (`CTX:`, `IN:`, `$…`, `← for agents`), la línea meta `✻ Crunched`, el prompt
+   `❯`, el carácter de reemplazo `�`, palabras pegadas (>40 chars sin espacio), o
+   el prompt repetido literalmente.
+2. **Consistencia del streaming**: la concatenación de los `text_delta` debe
+   reconstruir el `result` final. Si no, la heurística de prefijo perdió o duplicó
+   texto bajo reflow.
+3. **(Opcional, `--ref`)** discrepancia contra `claude -p` real para el mismo
+   prompt (normalizado por espacios). claude no es determinista, así que solo se
+   espera que los prompts triviales coincidan exactamente; para prompts abiertos
+   la comparación es meramente informativa.
+
+Sale con código 2 si encuentra artefactos o inconsistencias de streaming (para
+poder usarlo como gate). Una discrepancia con `--ref` es solo informativa y no
+cambia el código de salida.
+
+## Cómo lanzarlo
+
+```bash
+cd apps/claude_pipe
+CGO_ENABLED=1 go build -tags fts5 -o claude_pipe .   # asegúrate de tener el binario
+
+cd playground
+
+# Set de prompts por defecto, sin comparar con claude -p
+go run artifact_probe.go --root /home/enmanuel/fn_registry
+
+# Comparando además contra `claude -p` real (gasta llamadas reales)
+go run artifact_probe.go --root /home/enmanuel/fn_registry --ref
+
+# Un solo prompt custom
+go run artifact_probe.go --root /home/enmanuel/fn_registry --prompt "tu prompt aqui"
+```
+
+`--root` debe ser un repo cuyos MCP de claude ya estén aprobados, para que la TUI
+no muestre el diálogo de arranque.
@@ -0,0 +1,281 @@
+// artifact_probe drives the claude_pipe binary across a set of prompts and looks
+// for two classes of problems that are inherent to parsing the claude TUI:
+//
+//  1. Artifacts: bits of the render that leaked into the parsed answer — box
+//     drawing characters from the banner, status-bar fragments (CTX:, IN:, $...),
+//     the "✻ Crunched" meta line, the echoed prompt, replacement characters, or
+//     glued words (a heuristic: very long runs with no spaces).
+//
+//  2. Streaming inconsistencies: in --stream mode, the concatenation of all
+//     text_delta events should reconstruct the final result. If it doesn't, the
+//     prefix-delta heuristic dropped or duplicated text under reflow.
+//
+// Optionally (--ref) it also runs the real `claude -p` for the same prompt and
+// reports whether claude_pipe's answer matches it (whitespace-normalized). claude
+// is not deterministic, so only trivial prompts are expected to match exactly;
+// for open prompts the comparison is informational.
+//
+// This is a playground tool: it is not indexed, has no registry entry, and exists
+// only to probe claude_pipe's TUI-parsing quality. Run it when you want to audit
+// the parser against real claude output.
+//
+// Usage:
+//
+//	go run artifact_probe.go --root /home/enmanuel/fn_registry            # default prompts, no ref
+//	go run artifact_probe.go --root /home/enmanuel/fn_registry --ref      # also compare vs claude -p
+//	go run artifact_probe.go --root /repo --prompt "tu prompt"            # single custom prompt
+package main
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"os/exec"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// defaultPrompts is the prompt set probed when --prompt is not given. The
+// prompts exercise different answer shapes: one word, a short list, a
+// multi-line answer, and one that mentions code (markers that often trip up TUI
+// parsing).
+var defaultPrompts = []string{
+	"responde unicamente con la palabra PONG, sin explicaciones",
+	"lista exactamente tres frutas, una por linea, sin numeracion ni texto extra",
+	"explica en dos frases que es un pseudo-terminal (PTY)",
+	"escribe una linea de codigo Go que imprima hola, sin explicaciones",
+}
+
+// artifactPatterns are substrings/regexes that should NEVER appear in a clean
+// parsed answer. Each is a piece of TUI chrome, not model output. findArtifacts
+// reports the name of every entry whose regex matches the answer.
+//
+// Entries are checked independently, so one leak can be reported under several
+// names: the box_drawing class contains '─', hence any horizontal_rule match is
+// also a box_drawing match.
+var artifactPatterns = []struct {
+	name string
+	re   *regexp.Regexp
+}{
+	{"box_drawing", regexp.MustCompile(`[╭╮╰╯┌┐└┘├┤┬┴┼│─]`)},
+	{"horizontal_rule", regexp.MustCompile(`─{8,}`)},
+	{"status_ctx", regexp.MustCompile(`CTX:\s*[\d█░]`)},
+	{"status_inout", regexp.MustCompile(`\bIN:\d|\bOUT:\d`)},
+	{"status_limits", regexp.MustCompile(`Limits:|Total:\s*↓|⎇\s`)},
+	{"status_cost", regexp.MustCompile(`\$\d+\.\d`)},
+	{"for_agents", regexp.MustCompile(`←\s*for agents`)},
+	// Spinner detected by structure (any glyph + word…) and by signature
+	// ("(Ns ... tokens", "esc to interrupt"), not by the ever-changing word.
+	{"meta_spinner", regexp.MustCompile(`[✻✽✢✶✺✷✦✳✱]|esc to interrupt|\(\d+s\b[^)]*tokens?\b`)},
+	{"prompt_marker", regexp.MustCompile(`❯`)},
+	{"replacement_char", regexp.MustCompile("�")},
+}
+
+// gluedWordRe flags a run of >40 non-space characters, the signature of stripped
+// cursor moves collapsing columns together (e.g. "2newMCPservers"). The example
+// only illustrates the kind of gluing; it is itself shorter than the 41-character
+// threshold and would not match.
+var gluedWordRe = regexp.MustCompile(`\S{41,}`)
+
+// streamEvent holds the fields parseStream reads from one NDJSON line of the
+// --stream output: Type selects the event kind, Text is collected for
+// "text_delta" events and Result for "result" events.
+type streamEvent struct {
+	Type   string `json:"type"`
+	Text   string `json:"text"`
+	Result string `json:"result"`
+}
+
+// caseResult collects everything probe learned about a single prompt; report
+// prints it and main derives the exit status from it.
+type caseResult struct {
+	// prompt is the prompt text that was sent.
+	prompt         string
+	// oneshot is the one-shot (--format text) stdout with trailing newlines removed.
+	oneshot        string
+	// streamDeltas are the text_delta payloads of the --stream run, in order.
+	streamDeltas   []string
+	// streamResult is the result payload of the --stream run.
+	streamResult   string
+	// ref is the `claude -p` stdout with trailing newlines removed (only with --ref).
+	ref            string
+	// artifactsOne lists the artifact names found in oneshot.
+	artifactsOne   []string
+	// artifactsStrm lists the artifact names found in streamResult.
+	artifactsStrm  []string
+	// streamConsistent reports whether the concatenated deltas equal the
+	// stream result after whitespace normalization.
+	streamConsistent bool
+	matchesRef       string // "yes" | "no" | "n/a"
+	// errs holds one message per command run that returned an error.
+	errs             []string
+}
+
+func main() {
+	root := flag.String("root", "/home/enmanuel/fn_registry", "cwd for claude (a repo whose MCP servers are approved)")
+	bin := flag.String("bin", "../claude_pipe", "path to the claude_pipe binary")
+	single := flag.String("prompt", "", "run a single custom prompt instead of the default set")
+	ref := flag.Bool("ref", false, "also run real `claude -p` and compare")
+	warmup := flag.String("warmup", "4s", "claude_pipe --warmup")
+	idle := flag.String("idle", "4s", "claude_pipe --idle")
+	maxDur := flag.String("max", "90s", "claude_pipe --max")
+	flag.Parse()
+
+	prompts := defaultPrompts
+	if *single != "" {
+		prompts = []string{*single}
+	}
+
+	if _, err := os.Stat(*bin); err != nil {
+		fmt.Fprintf(os.Stderr, "claude_pipe binary not found at %s — build it first:\n  (cd .. && CGO_ENABLED=1 go build -tags fts5 -o claude_pipe .)\n", *bin)
+		os.Exit(1)
+	}
+
+	var results []caseResult
+	for i, p := range prompts {
+		fmt.Fprintf(os.Stderr, "[%d/%d] probing: %s\n", i+1, len(prompts), truncate(p, 60))
+		results = append(results, probe(*bin, *root, p, *warmup, *idle, *maxDur, *ref))
+	}
+
+	report(results, *ref)
+
+	// Exit non-zero if any artifact was found, so this can gate CI if desired.
+	for _, r := range results {
+		if len(r.artifactsOne) > 0 || len(r.artifactsStrm) > 0 || !r.streamConsistent {
+			os.Exit(2)
+		}
+	}
+}
+
+func probe(bin, root, prompt, warmup, idle, maxDur string, withRef bool) caseResult {
+	r := caseResult{prompt: prompt, streamConsistent: true, matchesRef: "n/a"}
+
+	// One-shot, text format.
+	one, err := run(90*time.Second, bin,
+		"--format", "text", "--cwd", root,
+		"--warmup", warmup, "--idle", idle, "--max", maxDur, prompt)
+	if err != nil {
+		r.errs = append(r.errs, "oneshot: "+err.Error())
+	}
+	r.oneshot = strings.TrimRight(one, "\n")
+	r.artifactsOne = findArtifacts(r.oneshot, prompt)
+
+	// Streaming.
+	strm, err := run(90*time.Second, bin,
+		"--stream", "--cwd", root,
+		"--warmup", warmup, "--idle", idle, "--max", maxDur,
+		"--snapshot-interval", "150ms", prompt)
+	if err != nil {
+		r.errs = append(r.errs, "stream: "+err.Error())
+	}
+	r.streamDeltas, r.streamResult = parseStream(strm)
+	r.artifactsStrm = findArtifacts(r.streamResult, prompt)
+	// Consistency: concatenated deltas should reconstruct the final result.
+	recon := strings.Join(r.streamDeltas, "")
+	r.streamConsistent = normalize(recon) == normalize(r.streamResult)
+
+	if withRef {
+		refOut, err := run(90*time.Second, "claude", "-p", prompt)
+		if err != nil {
+			r.errs = append(r.errs, "ref: "+err.Error())
+		} else {
+			r.ref = strings.TrimRight(refOut, "\n")
+			if normalize(r.ref) == normalize(r.oneshot) {
+				r.matchesRef = "yes"
+			} else {
+				r.matchesRef = "no"
+			}
+		}
+	}
+
+	return r
+}
+
+// run executes a command with a timeout and returns its stdout.
+//
+// The captured stdout is returned even when the command fails, because callers
+// still analyse partial output. On failure the error is made more specific than
+// the bare "exit status N" / "signal: killed": a timeout is reported as such,
+// and any stderr captured by Output is appended to the message.
+func run(timeout time.Duration, name string, args ...string) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	out, err := exec.CommandContext(ctx, name, args...).Output()
+	if err == nil {
+		return string(out), nil
+	}
+	if ctx.Err() != nil {
+		// The context expired or was cancelled, so the process was killed.
+		return string(out), fmt.Errorf("%s: timed out after %s: %w", name, timeout, err)
+	}
+	// Output stores the child's stderr in ExitError.Stderr when Cmd.Stderr is nil.
+	if exitErr, ok := err.(*exec.ExitError); ok {
+		if msg := strings.TrimSpace(string(exitErr.Stderr)); msg != "" {
+			return string(out), fmt.Errorf("%w: %s", err, msg)
+		}
+	}
+	return string(out), err
+}
+
+// parseStream splits the NDJSON stream into the ordered text_delta texts and the
+// final result string.
+//
+// Blank lines and lines that are not valid JSON for a streamEvent are skipped.
+// If several "result" events are present the last one wins. Lines are read with
+// bufio.Reader.ReadString, which has no maximum line length: a Scanner with a
+// fixed buffer stops silently at the first over-long line, which would drop a
+// result event that follows it.
+func parseStream(s string) (deltas []string, result string) {
+	rd := bufio.NewReader(strings.NewReader(s))
+	for {
+		// ReadString returns the data read so far together with the error, so
+		// a final line without a trailing newline is still processed.
+		raw, err := rd.ReadString('\n')
+		if line := strings.TrimSpace(raw); line != "" {
+			var ev streamEvent
+			if json.Unmarshal([]byte(line), &ev) == nil {
+				switch ev.Type {
+				case "text_delta":
+					deltas = append(deltas, ev.Text)
+				case "result":
+					result = ev.Result
+				}
+			}
+		}
+		if err != nil {
+			// Reading from an in-memory string: the only error is end of input.
+			return deltas, result
+		}
+	}
+}
+
+// findArtifacts returns the names of all artifact checks that fire on text, in
+// a fixed order: every matching artifactPatterns entry, then "glued_words",
+// then "prompt_echo". The result is nil when nothing fires.
+func findArtifacts(text, prompt string) []string {
+	var hits []string
+	record := func(name string, fired bool) {
+		if fired {
+			hits = append(hits, name)
+		}
+	}
+
+	for i := range artifactPatterns {
+		pat := &artifactPatterns[i]
+		record(pat.name, pat.re.MatchString(text))
+	}
+	record("glued_words", gluedWordRe.MatchString(text))
+	// Prompt echoed verbatim into the answer (claude shouldn't repeat the
+	// prompt). Prompts of 12 bytes or fewer are not checked.
+	record("prompt_echo", len(prompt) > 12 && strings.Contains(text, prompt))
+
+	return hits
+}
+
+// normalize collapses all whitespace runs to single spaces and trims, so that
+// layout-induced spacing differences don't count as content differences.
+func normalize(s string) string {
+	return strings.Join(strings.Fields(s), " ")
+}
+
+// truncate returns s unchanged when it is at most n bytes long; otherwise it
+// returns the longest prefix of s that is no longer than n bytes and ends on a
+// rune boundary, followed by "…".
+//
+// Cutting on a rune boundary matters because prompts and answers contain
+// multi-byte UTF-8 characters: slicing at an arbitrary byte offset can split
+// one and leave an invalid byte sequence in the output. A negative n yields
+// just "…" instead of panicking.
+func truncate(s string, n int) string {
+	if len(s) <= n {
+		return s
+	}
+	cut := 0
+	// Ranging over a string yields the byte offset at which each rune starts.
+	for i := range s {
+		if i > n {
+			break
+		}
+		cut = i
+	}
+	return s[:cut] + "…"
+}
+
+// report prints the per-case details to stdout, followed by a summary line
+// counting the cases that have no artifacts in either mode and a consistent
+// stream. The ref comparison is printed only when withRef is set.
+func report(results []caseResult, withRef bool) {
+	fmt.Println()
+	fmt.Println("=== claude_pipe artifact probe ===")
+
+	cleanCases := 0
+	for idx := range results {
+		c := &results[idx]
+
+		fmt.Printf("\n[%d] %s\n", idx+1, truncate(c.prompt, 70))
+		fmt.Printf("    oneshot:   %q\n", truncate(c.oneshot, 80))
+		fmt.Printf("    stream:    %d deltas, result=%q\n", len(c.streamDeltas), truncate(c.streamResult, 60))
+		fmt.Printf("    consistent: %s\n", yesno(c.streamConsistent))
+		printArtifacts("    artifacts(oneshot):", c.artifactsOne)
+		printArtifacts("    artifacts(stream): ", c.artifactsStrm)
+
+		if withRef {
+			fmt.Printf("    matches claude -p: %s\n", c.matchesRef)
+			// The reference text is shown only when it differs.
+			if c.matchesRef == "no" {
+				fmt.Printf("      ref: %q\n", truncate(c.ref, 80))
+			}
+		}
+
+		for _, msg := range c.errs {
+			fmt.Printf("    ERROR: %s\n", msg)
+		}
+
+		// Tally for the summary line printed after all cases.
+		noArtifacts := len(c.artifactsOne) == 0 && len(c.artifactsStrm) == 0
+		if noArtifacts && c.streamConsistent {
+			cleanCases++
+		}
+	}
+
+	fmt.Printf("\n=== %d/%d cases clean (no artifacts, stream consistent) ===\n", cleanCases, len(results))
+}
+
+// printArtifacts writes one line to stdout: label followed by the artifact
+// names joined with ", ", or by "none" when the list is empty.
+func printArtifacts(label string, a []string) {
+	switch len(a) {
+	case 0:
+		fmt.Printf("%s none\n", label)
+	default:
+		fmt.Printf("%s %s\n", label, strings.Join(a, ", "))
+	}
+}
+
+// yesno renders a boolean for the report: "yes" for true and an upper-case
+// "NO" for false.
+func yesno(b bool) string {
+	answer := "NO"
+	if b {
+		answer = "yes"
+	}
+	return answer
+}
@@ -0,0 +1,3 @@
+module cp_playground
+
+go 1.25.0