Files
claude_pipe/playground/artifact_probe.go
T
agent 22bec25d8d fix: stream usa ultimo frame + playground artifact_probe
- streamAnswer: el result final es el ULTIMO answer parseado (frame post-idle,
  limpio) en vez del mas largo, que podia ser un frame intermedio con spinner.
- playground/artifact_probe.go: herramienta de auditoria que corre oneshot +
  stream contra varios prompts, detecta artefactos del render que se cuelan
  (box chars, status bar, spinner, palabras pegadas, prompt echo) y verifica
  consistencia del streaming (concat de deltas == result). Opcional --ref para
  comparar con claude -p real. No se indexa (playground del padre).

El fix del spinner en si vive en parse_claude_tui_go_tui (repo padre): detecta
la linea de carga por estructura (glyph + palabra + ellipsis) y firma
((Ns tokens) / esc to interrupt), no por la palabra, que cambia cada frame.
2026-06-03 23:37:39 +02:00

282 lines
9.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// artifact_probe drives the claude_pipe binary across a set of prompts and looks
// for two classes of problems that are inherent to parsing the claude TUI:
//
// 1. Artifacts: bits of the render that leaked into the parsed answer — box
// drawing characters from the banner, status-bar fragments (CTX:, IN:, $...),
// the "✻ Crunched" meta line, the echoed prompt, replacement characters, or
// glued words (a heuristic: very long runs with no spaces).
//
// 2. Streaming inconsistencies: in --stream mode, the concatenation of all
// text_delta events should reconstruct the final result. If it doesn't, the
// prefix-delta heuristic dropped or duplicated text under reflow.
//
// Optionally (--ref) it also runs the real `claude -p` for the same prompt and
// reports whether claude_pipe's answer matches it (whitespace-normalized). claude
// is not deterministic, so only trivial prompts are expected to match exactly;
// for open prompts the comparison is informational.
//
// This is a playground tool: it is not indexed, has no registry entry, and exists
// only to probe claude_pipe's TUI-parsing quality. Run it when you want to audit
// the parser against real claude output.
//
// Usage:
//
// go run artifact_probe.go --root /home/enmanuel/fn_registry # default prompts, no ref
// go run artifact_probe.go --root /home/enmanuel/fn_registry --ref # also compare vs claude -p
// go run artifact_probe.go --root /repo --prompt "tu prompt" # single custom prompt
package main
import (
"bufio"
"context"
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
"regexp"
"strings"
"time"
)
// defaultPrompts is the prompt set probed when --prompt is not given. The
// entries cover different answer shapes: a single word, a short list, a
// multi-line answer, and one that mentions code (markers that often trip up TUI
// parsing).
var defaultPrompts = []string{
	"responde unicamente con la palabra PONG, sin explicaciones",
	"lista exactamente tres frutas, una por linea, sin numeracion ni texto extra",
	"explica en dos frases que es un pseudo-terminal (PTY)",
	"escribe una linea de codigo Go que imprima hola, sin explicaciones",
}
// artifactPatterns lists regexes that should NEVER match a clean parsed answer.
// Each one recognizes a piece of TUI chrome, not model output.
//
// The horizontal-rule, prompt-marker and replacement-character glyphs are
// written as \x{...} escapes rather than literal characters so they cannot be
// dropped or mangled by tools that treat unusual code points specially. An
// empty pattern would match every string and a repetition without an operand
// does not compile, so each entry must name at least one concrete glyph.
var artifactPatterns = []struct {
	name string
	re   *regexp.Regexp
}{
	{"box_drawing", regexp.MustCompile(`[╭╮╰╯┌┐└┘├┤┬┴┼│─]`)},
	// Eight or more consecutive horizontal line glyphs: U+2500 (light),
	// U+2501 (heavy), U+2550 (double).
	// NOTE(review): the exact glyph set is an assumption — confirm against the
	// rules the TUI actually draws.
	{"horizontal_rule", regexp.MustCompile(`[\x{2500}\x{2501}\x{2550}]{8,}`)},
	{"status_ctx", regexp.MustCompile(`CTX:\s*[\d█░]`)},
	{"status_inout", regexp.MustCompile(`\bIN:\d|\bOUT:\d`)},
	{"status_limits", regexp.MustCompile(`Limits:|Total:\s*↓|⎇\s`)},
	{"status_cost", regexp.MustCompile(`\$\d+\.\d`)},
	{"for_agents", regexp.MustCompile(`←\s*for agents`)},
	// Spinner detected by its glyph and by signature ("(Ns ... tokens",
	// "esc to interrupt"), not by the ever-changing word.
	{"meta_spinner", regexp.MustCompile(`[✻✽✢✶✺✷✦✳✱]|esc to interrupt|\(\d+s\b[^)]*tokens?\b`)},
	// NOTE(review): assumes the TUI input prompt glyph is U+276F (heavy
	// right-pointing angle quotation mark ornament) — confirm against the
	// real render.
	{"prompt_marker", regexp.MustCompile(`\x{276F}`)},
	// U+FFFD, the Unicode replacement character left behind by broken decoding.
	{"replacement_char", regexp.MustCompile(`\x{FFFD}`)},
}
// gluedWordRe matches any run of 41 or more consecutive non-whitespace
// characters. Runs that long are the signature of stripped cursor moves
// collapsing columns together (e.g. "2newMCPservers").
var gluedWordRe = regexp.MustCompile(`\S{41,}`)
// streamEvent holds the fields the probe decodes from one JSON line of the
// claude_pipe --stream output.
type streamEvent struct {
	// Type is the event kind, decoded from the "type" key.
	Type string `json:"type"`
	// Text is decoded from the "text" key.
	Text string `json:"text"`
	// Result is decoded from the "result" key.
	Result string `json:"result"`
}
// caseResult collects everything probe gathers for a single prompt.
type caseResult struct {
	prompt string // the prompt that was probed

	oneshot      string   // one-shot output with trailing newlines trimmed
	streamDeltas []string // text_delta texts, in stream order
	streamResult string   // text of the last result event in the stream
	ref          string   // `claude -p` output; only set when --ref is used

	artifactsOne  []string // artifact names found in oneshot
	artifactsStrm []string // artifact names found in streamResult

	streamConsistent bool   // joined deltas equal streamResult after normalize
	matchesRef       string // "yes" | "no" | "n/a"

	errs []string // errors from the runs, prefixed "oneshot: ", "stream: " or "ref: "
}
// main parses the flags, probes every prompt, prints the report, and sets the
// process exit status.
//
// Exit status: 1 when the claude_pipe binary cannot be stat'ed; 2 when any case
// has artifacts, an inconsistent stream, or a run that returned an error; 0
// otherwise.
func main() {
	root := flag.String("root", "/home/enmanuel/fn_registry", "cwd for claude (a repo whose MCP servers are approved)")
	bin := flag.String("bin", "../claude_pipe", "path to the claude_pipe binary")
	single := flag.String("prompt", "", "run a single custom prompt instead of the default set")
	ref := flag.Bool("ref", false, "also run real `claude -p` and compare")
	warmup := flag.String("warmup", "4s", "claude_pipe --warmup")
	idle := flag.String("idle", "4s", "claude_pipe --idle")
	maxDur := flag.String("max", "90s", "claude_pipe --max")
	flag.Parse()

	prompts := defaultPrompts
	if *single != "" {
		prompts = []string{*single}
	}

	if _, err := os.Stat(*bin); err != nil {
		fmt.Fprintf(os.Stderr, "claude_pipe binary not found at %s — build it first:\n (cd .. && CGO_ENABLED=1 go build -tags fts5 -o claude_pipe .)\n", *bin)
		os.Exit(1)
	}

	results := make([]caseResult, 0, len(prompts))
	for i, p := range prompts {
		fmt.Fprintf(os.Stderr, "[%d/%d] probing: %s\n", i+1, len(prompts), truncate(p, 60))
		results = append(results, probe(*bin, *root, p, *warmup, *idle, *maxDur, *ref))
	}
	report(results, *ref)

	// Exit non-zero if any case is not verifiably clean, so this can gate CI if
	// desired. A run that errored counts as a failure too: its empty output has
	// no artifacts and trivially "consistent" deltas, so without this check a
	// probe whose every run failed would pass the gate.
	for _, r := range results {
		if len(r.errs) > 0 || len(r.artifactsOne) > 0 || len(r.artifactsStrm) > 0 || !r.streamConsistent {
			os.Exit(2)
		}
	}
}
// probe runs one prompt through claude_pipe in one-shot and in streaming mode,
// optionally runs the real `claude -p` as a reference, and returns everything
// it observed.
//
// A failing run does not abort the probe: the error is appended to errs and the
// remaining checks still run on whatever output was captured.
func probe(bin, root, prompt, warmup, idle, maxDur string, withRef bool) caseResult {
	r := caseResult{prompt: prompt, streamConsistent: true, matchesRef: "n/a"}
	timeout := runTimeout(warmup, maxDur)

	// One-shot, text format.
	one, err := run(timeout, bin,
		"--format", "text", "--cwd", root,
		"--warmup", warmup, "--idle", idle, "--max", maxDur, prompt)
	if err != nil {
		r.errs = append(r.errs, "oneshot: "+err.Error())
	}
	r.oneshot = strings.TrimRight(one, "\n")
	r.artifactsOne = findArtifacts(r.oneshot, prompt)

	// Streaming.
	strm, err := run(timeout, bin,
		"--stream", "--cwd", root,
		"--warmup", warmup, "--idle", idle, "--max", maxDur,
		"--snapshot-interval", "150ms", prompt)
	if err != nil {
		r.errs = append(r.errs, "stream: "+err.Error())
	}
	r.streamDeltas, r.streamResult = parseStream(strm)
	r.artifactsStrm = findArtifacts(r.streamResult, prompt)

	// Consistency: concatenated deltas should reconstruct the final result.
	recon := strings.Join(r.streamDeltas, "")
	r.streamConsistent = normalize(recon) == normalize(r.streamResult)

	if !withRef {
		return r
	}
	refOut, err := run(90*time.Second, "claude", "-p", prompt)
	if err != nil {
		r.errs = append(r.errs, "ref: "+err.Error())
		return r
	}
	r.ref = strings.TrimRight(refOut, "\n")
	if normalize(r.ref) == normalize(r.oneshot) {
		r.matchesRef = "yes"
	} else {
		r.matchesRef = "no"
	}
	return r
}

// runTimeout returns the wall-clock limit for one claude_pipe invocation:
// warmup + maxDur plus a grace period, and never less than 90s. If either
// duration does not parse, it falls back to 90s.
//
// NOTE(review): assumes --warmup and --max together bound claude_pipe's own run
// time, so the outer kill should only fire after that budget — confirm against
// claude_pipe's flag semantics.
func runTimeout(warmup, maxDur string) time.Duration {
	const (
		floor = 90 * time.Second
		grace = 15 * time.Second
	)
	w, errW := time.ParseDuration(warmup)
	m, errM := time.ParseDuration(maxDur)
	if errW != nil || errM != nil {
		return floor
	}
	if d := w + m + grace; d > floor {
		return d
	}
	return floor
}
// run executes name with args, waits at most timeout, and returns its stdout.
//
// Whatever stdout was captured is returned even when the error is non-nil. If
// the command was still running when timeout elapsed, the returned error says
// so and wraps the context error; any other failure is returned as reported by
// os/exec.
func run(timeout time.Duration, name string, args ...string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, name, args...).Output()
	if err == nil {
		return string(out), nil
	}
	// When the deadline fires, CommandContext kills the child and err only
	// describes how the process died; name the timeout explicitly so it cannot
	// be mistaken for a crash.
	if ctx.Err() != nil {
		return string(out), fmt.Errorf("%s: timed out after %s: %w", name, timeout, ctx.Err())
	}
	return string(out), err
}
// parseStream splits the NDJSON stream into the ordered text_delta texts and the
// final result string.
//
// Blank lines, lines that are not valid JSON, and events of any other type are
// skipped. If several result events are present the last one wins. Lines are
// read without a length cap, so a very long event cannot cut the parse short
// and drop the events that follow it.
func parseStream(s string) (deltas []string, result string) {
	rd := bufio.NewReader(strings.NewReader(s))
	for {
		// ReadString returns the data read so far together with the error, so a
		// final line without a trailing newline is still processed below.
		raw, readErr := rd.ReadString('\n')
		if line := strings.TrimSpace(raw); line != "" {
			var ev streamEvent
			if err := json.Unmarshal([]byte(line), &ev); err == nil {
				switch ev.Type {
				case "text_delta":
					deltas = append(deltas, ev.Text)
				case "result":
					result = ev.Result
				}
			}
		}
		if readErr != nil {
			return deltas, result
		}
	}
}
// findArtifacts returns the names of every artifact check that text trips, in
// a fixed order: the artifactPatterns entries in declaration order, then
// "glued_words", then "prompt_echo". It returns nil when nothing matches.
func findArtifacts(text, prompt string) []string {
	var hits []string
	for i := range artifactPatterns {
		if p := &artifactPatterns[i]; p.re.MatchString(text) {
			hits = append(hits, p.name)
		}
	}
	if gluedWordRe.MatchString(text) {
		hits = append(hits, "glued_words")
	}
	// Prompt echoed verbatim into the answer (claude shouldn't repeat the
	// prompt). Prompts of 12 bytes or fewer are not checked — presumably
	// because such short text can legitimately appear inside an answer.
	echoed := len(prompt) > 12 && strings.Contains(text, prompt)
	if echoed {
		hits = append(hits, "prompt_echo")
	}
	return hits
}
// normalize rewrites s with every run of whitespace replaced by one space and
// no leading or trailing whitespace, so that spacing differences caused by
// layout do not count as content differences.
func normalize(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
// truncate returns s unchanged when it is at most n bytes long; otherwise it
// returns a prefix of at most n bytes followed by "…".
//
// The cut never lands inside a multi-byte UTF-8 sequence, so the prefix can be
// shorter than n bytes. A negative n is treated as 0.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}
	// s[n] is the first byte that would be dropped. While it is a UTF-8
	// continuation byte (10xxxxxx) the cut is mid-rune, so back off until it
	// sits on the first byte of a rune.
	for n > 0 && s[n]&0xC0 == 0x80 {
		n--
	}
	return s[:n] + "…"
}
// report prints one section per case to stdout followed by a summary line.
//
// Each section shows the truncated prompt and outputs, the stream consistency
// verdict, the artifacts found in each mode, the reference comparison when
// withRef is set, and any run errors. A case counts as clean only when it has
// no run errors, no artifacts, and a consistent stream: a run that failed
// yields empty output, which would otherwise look clean.
func report(results []caseResult, withRef bool) {
	fmt.Println()
	fmt.Println("=== claude_pipe artifact probe ===")

	clean := 0
	for i, r := range results {
		fmt.Printf("\n[%d] %s\n", i+1, truncate(r.prompt, 70))
		fmt.Printf(" oneshot: %q\n", truncate(r.oneshot, 80))
		fmt.Printf(" stream: %d deltas, result=%q\n", len(r.streamDeltas), truncate(r.streamResult, 60))
		fmt.Printf(" consistent: %s\n", yesno(r.streamConsistent))
		printArtifacts(" artifacts(oneshot):", r.artifactsOne)
		printArtifacts(" artifacts(stream): ", r.artifactsStrm)
		if withRef {
			fmt.Printf(" matches claude -p: %s\n", r.matchesRef)
			// The reference text is only worth showing when it differs.
			if r.matchesRef == "no" {
				fmt.Printf(" ref: %q\n", truncate(r.ref, 80))
			}
		}
		for _, e := range r.errs {
			fmt.Printf(" ERROR: %s\n", e)
		}

		if len(r.errs) == 0 && len(r.artifactsOne) == 0 && len(r.artifactsStrm) == 0 && r.streamConsistent {
			clean++
		}
	}

	// Summary.
	fmt.Printf("\n=== %d/%d cases clean (no errors, no artifacts, stream consistent) ===\n", clean, len(results))
}
// printArtifacts writes one line to stdout: label followed by the
// comma-separated names in a, or by "none" when a is empty.
func printArtifacts(label string, a []string) {
	if len(a) > 0 {
		fmt.Printf("%s %s\n", label, strings.Join(a, ", "))
		return
	}
	fmt.Printf("%s none\n", label)
}
// yesno renders b for the report: "yes" when true and an upper-case "NO" when
// false.
func yesno(b bool) string {
	answer := "NO"
	if b {
		answer = "yes"
	}
	return answer
}