diff --git a/main.go b/main.go index 46016a6..32c586e 100644 --- a/main.go +++ b/main.go @@ -200,9 +200,11 @@ func streamAnswer(ctx context.Context, bin string, inputs []string, warmup, step } prev = ans } - // Keep the longest answer seen as the final, even if a later frame shrank - // (transient reflow / parse noise). - if len(ans) >= len(final) { + // The final result is the LAST non-empty parsed answer. The last snapshot is + // emitted after idle (generation stopped), so it is the cleanest frame — no + // active spinner, fully rendered. Using "longest seen" instead would let a + // noisy mid-generation frame win over the clean final one. + if ans != "" { final = ans } } diff --git a/playground/README.md b/playground/README.md new file mode 100644 index 0000000..78512f4 --- /dev/null +++ b/playground/README.md @@ -0,0 +1,46 @@ +# claude_pipe — artifact probe (playground) + +Herramienta desechable para auditar la calidad del parseo de la TUI que hace +`claude_pipe`. No se indexa, no tiene `app.md`, vive dentro de `apps/claude_pipe/` +y se mueve con su sub-repo. + +## Qué comprueba + +Por cada prompt, lanza el binario `claude_pipe` en modo one-shot y en modo +`--stream`, y busca: + +1. **Artefactos** que se cuelan del render en la respuesta parseada: caracteres de + caja (`╭│╰`), reglas horizontales (`────`), fragmentos de la status bar + (`CTX:`, `IN:`, `$…`, `← for agents`), la línea meta `✻ Crunched`, el prompt + `❯`, el carácter de reemplazo `�`, palabras pegadas (>40 chars sin espacio), o + el prompt repetido literalmente. +2. **Consistencia del streaming**: la concatenación de los `text_delta` debe + reconstruir el `result` final. Si no, la heurística de prefijo perdió o duplicó + texto bajo reflow. +3. **(Opcional, `--ref`)** discrepancia contra `claude -p` real para el mismo + prompt (normalizado por espacios). claude no es determinista, así que solo los + prompts triviales se espera que coincidan exactamente. + +Sale con código 2 si encuentra artefactos o inconsistencias (para poder usarlo +como gate). + +## Cómo lanzarlo + +```bash +cd apps/claude_pipe +CGO_ENABLED=1 go build -tags fts5 -o claude_pipe . # asegúrate de tener el binario + +cd playground + +# Set de prompts por defecto, sin comparar con claude -p +go run artifact_probe.go --root /home/enmanuel/fn_registry + +# Comparando además contra `claude -p` real (gasta llamadas reales) +go run artifact_probe.go --root /home/enmanuel/fn_registry --ref + +# Un solo prompt custom +go run artifact_probe.go --root /home/enmanuel/fn_registry --prompt "tu prompt aqui" +``` + +`--root` debe ser un repo cuyos MCP de claude ya estén aprobados, para que la TUI +no muestre el diálogo de arranque. diff --git a/playground/artifact_probe.go b/playground/artifact_probe.go new file mode 100644 index 0000000..e0aaf48 --- /dev/null +++ b/playground/artifact_probe.go @@ -0,0 +1,281 @@ +// artifact_probe drives the claude_pipe binary across a set of prompts and looks +// for two classes of problems that are inherent to parsing the claude TUI: +// +// 1. Artifacts: bits of the render that leaked into the parsed answer — box +// drawing characters from the banner, status-bar fragments (CTX:, IN:, $...), +// the "✻ Crunched" meta line, the echoed prompt, replacement characters, or +// glued words (a heuristic: very long runs with no spaces). +// +// 2. Streaming inconsistencies: in --stream mode, the concatenation of all +// text_delta events should reconstruct the final result. If it doesn't, the +// prefix-delta heuristic dropped or duplicated text under reflow. +// +// Optionally (--ref) it also runs the real `claude -p` for the same prompt and +// reports whether claude_pipe's answer matches it (whitespace-normalized). claude +// is not deterministic, so only trivial prompts are expected to match exactly; +// for open prompts the comparison is informational. +// +// This is a playground tool: it is not indexed, has no registry entry, and exists +// only to probe claude_pipe's TUI-parsing quality. Run it when you want to audit +// the parser against real claude output. +// +// Usage: +// +// go run artifact_probe.go --root /home/enmanuel/fn_registry # default prompts, no ref +// go run artifact_probe.go --root /home/enmanuel/fn_registry --ref # also compare vs claude -p +// go run artifact_probe.go --root /repo --prompt "tu prompt" # single custom prompt +package main + +import ( + "bufio" + "context" + "encoding/json" + "flag" + "fmt" + "os" + "os/exec" + "regexp" + "strings" + "time" +) + +// defaultPrompts exercise different shapes: one word, a short list, a multi-line +// answer, and one that mentions code (markers that often trip up TUI parsing). +var defaultPrompts = []string{ + "responde unicamente con la palabra PONG, sin explicaciones", + "lista exactamente tres frutas, una por linea, sin numeracion ni texto extra", + "explica en dos frases que es un pseudo-terminal (PTY)", + "escribe una linea de codigo Go que imprima hola, sin explicaciones", +} + +// artifactPatterns are substrings/regexes that should NEVER appear in a clean +// parsed answer. Each is a piece of TUI chrome, not model output. +var artifactPatterns = []struct { + name string + re *regexp.Regexp +}{ + {"box_drawing", regexp.MustCompile(`[╭╮╰╯┌┐└┘├┤┬┴┼│─]`)}, + {"horizontal_rule", regexp.MustCompile(`─{8,}`)}, + {"status_ctx", regexp.MustCompile(`CTX:\s*[\d█░]`)}, + {"status_inout", regexp.MustCompile(`\bIN:\d|\bOUT:\d`)}, + {"status_limits", regexp.MustCompile(`Limits:|Total:\s*↓|⎇\s`)}, + {"status_cost", regexp.MustCompile(`\$\d+\.\d`)}, + {"for_agents", regexp.MustCompile(`←\s*for agents`)}, + // Spinner detected by structure (any glyph + word…) and by signature + // ("(Ns ... tokens", "esc to interrupt"), not by the ever-changing word. + {"meta_spinner", regexp.MustCompile(`[✻✽✢✶✺✷✦✳✱]|esc to interrupt|\(\d+s\b[^)]*tokens?\b`)}, + {"prompt_marker", regexp.MustCompile(`❯`)}, + {"replacement_char", regexp.MustCompile("�")}, +} + +// gluedWordRe flags a run of >40 non-space characters, the signature of stripped +// cursor moves collapsing columns together (e.g. "2newMCPservers"). +var gluedWordRe = regexp.MustCompile(`\S{41,}`) + +type streamEvent struct { + Type string `json:"type"` + Text string `json:"text"` + Result string `json:"result"` +} + +type caseResult struct { + prompt string + oneshot string + streamDeltas []string + streamResult string + ref string + artifactsOne []string + artifactsStrm []string + streamConsistent bool + matchesRef string // "yes" | "no" | "n/a" + errs []string +} + +func main() { + root := flag.String("root", "/home/enmanuel/fn_registry", "cwd for claude (a repo whose MCP servers are approved)") + bin := flag.String("bin", "../claude_pipe", "path to the claude_pipe binary") + single := flag.String("prompt", "", "run a single custom prompt instead of the default set") + ref := flag.Bool("ref", false, "also run real `claude -p` and compare") + warmup := flag.String("warmup", "4s", "claude_pipe --warmup") + idle := flag.String("idle", "4s", "claude_pipe --idle") + maxDur := flag.String("max", "90s", "claude_pipe --max") + flag.Parse() + + prompts := defaultPrompts + if *single != "" { + prompts = []string{*single} + } + + if _, err := os.Stat(*bin); err != nil { + fmt.Fprintf(os.Stderr, "claude_pipe binary not found at %s — build it first:\n (cd .. && CGO_ENABLED=1 go build -tags fts5 -o claude_pipe .)\n", *bin) + os.Exit(1) + } + + var results []caseResult + for i, p := range prompts { + fmt.Fprintf(os.Stderr, "[%d/%d] probing: %s\n", i+1, len(prompts), truncate(p, 60)) + results = append(results, probe(*bin, *root, p, *warmup, *idle, *maxDur, *ref)) + } + + report(results, *ref) + + // Exit non-zero if any artifact was found, so this can gate CI if desired. + for _, r := range results { + if len(r.artifactsOne) > 0 || len(r.artifactsStrm) > 0 || !r.streamConsistent { + os.Exit(2) + } + } +} + +func probe(bin, root, prompt, warmup, idle, maxDur string, withRef bool) caseResult { + r := caseResult{prompt: prompt, streamConsistent: true, matchesRef: "n/a"} + + // One-shot, text format. + one, err := run(90*time.Second, bin, + "--format", "text", "--cwd", root, + "--warmup", warmup, "--idle", idle, "--max", maxDur, prompt) + if err != nil { + r.errs = append(r.errs, "oneshot: "+err.Error()) + } + r.oneshot = strings.TrimRight(one, "\n") + r.artifactsOne = findArtifacts(r.oneshot, prompt) + + // Streaming. + strm, err := run(90*time.Second, bin, + "--stream", "--cwd", root, + "--warmup", warmup, "--idle", idle, "--max", maxDur, + "--snapshot-interval", "150ms", prompt) + if err != nil { + r.errs = append(r.errs, "stream: "+err.Error()) + } + r.streamDeltas, r.streamResult = parseStream(strm) + r.artifactsStrm = findArtifacts(r.streamResult, prompt) + // Consistency: concatenated deltas should reconstruct the final result. + recon := strings.Join(r.streamDeltas, "") + r.streamConsistent = normalize(recon) == normalize(r.streamResult) + + if withRef { + refOut, err := run(90*time.Second, "claude", "-p", prompt) + if err != nil { + r.errs = append(r.errs, "ref: "+err.Error()) + } else { + r.ref = strings.TrimRight(refOut, "\n") + if normalize(r.ref) == normalize(r.oneshot) { + r.matchesRef = "yes" + } else { + r.matchesRef = "no" + } + } + } + + return r +} + +// run executes a command with a timeout and returns its stdout. +func run(timeout time.Duration, name string, args ...string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.Output() + return string(out), err +} + +// parseStream splits the NDJSON stream into the ordered text_delta texts and the +// final result string. +func parseStream(s string) (deltas []string, result string) { + sc := bufio.NewScanner(strings.NewReader(s)) + sc.Buffer(make([]byte, 1024*1024), 1024*1024) + for sc.Scan() { + line := strings.TrimSpace(sc.Text()) + if line == "" { + continue + } + var ev streamEvent + if json.Unmarshal([]byte(line), &ev) != nil { + continue + } + switch ev.Type { + case "text_delta": + deltas = append(deltas, ev.Text) + case "result": + result = ev.Result + } + } + return deltas, result +} + +func findArtifacts(text, prompt string) []string { + var found []string + for _, ap := range artifactPatterns { + if ap.re.MatchString(text) { + found = append(found, ap.name) + } + } + if gluedWordRe.MatchString(text) { + found = append(found, "glued_words") + } + // Prompt echoed verbatim into the answer (claude shouldn't repeat the prompt). + if len(prompt) > 12 && strings.Contains(text, prompt) { + found = append(found, "prompt_echo") + } + return found +} + +// normalize collapses all whitespace runs to single spaces and trims, so that +// layout-induced spacing differences don't count as content differences. +func normalize(s string) string { + return strings.Join(strings.Fields(s), " ") +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "…" +} + +func report(results []caseResult, withRef bool) { + fmt.Println() + fmt.Println("=== claude_pipe artifact probe ===") + for i, r := range results { + fmt.Printf("\n[%d] %s\n", i+1, truncate(r.prompt, 70)) + fmt.Printf(" oneshot: %q\n", truncate(r.oneshot, 80)) + fmt.Printf(" stream: %d deltas, result=%q\n", len(r.streamDeltas), truncate(r.streamResult, 60)) + fmt.Printf(" consistent: %s\n", yesno(r.streamConsistent)) + printArtifacts(" artifacts(oneshot):", r.artifactsOne) + printArtifacts(" artifacts(stream): ", r.artifactsStrm) + if withRef { + fmt.Printf(" matches claude -p: %s\n", r.matchesRef) + if r.matchesRef == "no" { + fmt.Printf(" ref: %q\n", truncate(r.ref, 80)) + } + } + for _, e := range r.errs { + fmt.Printf(" ERROR: %s\n", e) + } + } + + // Summary. + clean := 0 + for _, r := range results { + if len(r.artifactsOne) == 0 && len(r.artifactsStrm) == 0 && r.streamConsistent { + clean++ + } + } + fmt.Printf("\n=== %d/%d cases clean (no artifacts, stream consistent) ===\n", clean, len(results)) +} + +func printArtifacts(label string, a []string) { + if len(a) == 0 { + fmt.Printf("%s none\n", label) + return + } + fmt.Printf("%s %s\n", label, strings.Join(a, ", ")) +} + +func yesno(b bool) string { + if b { + return "yes" + } + return "NO" +} diff --git a/playground/go.mod b/playground/go.mod new file mode 100644 index 0000000..deaee87 --- /dev/null +++ b/playground/go.mod @@ -0,0 +1,3 @@ +module cp_playground + +go 1.25.0