claude_pipe/playground/artifact_probe.go

// artifact_probe drives the claude_pipe binary across a set of prompts and looks
// for two classes of problems that are inherent to parsing the claude TUI:
//
//  1. Artifacts: bits of the render that leaked into the parsed answer — box
//     drawing characters from the banner, status-bar fragments (CTX:, IN:, $...),
//     the "✻ Crunched" meta line, the echoed prompt, replacement characters, or
//     glued words (a heuristic: very long runs with no spaces).
//
//  2. Streaming inconsistencies: in --stream mode, the concatenation of all
//     text_delta events should reconstruct the final result. If it doesn't, the
//     prefix-delta heuristic dropped or duplicated text under reflow.
//
// Optionally (--ref) it also runs the real `claude -p` for the same prompt and
// reports whether claude_pipe's answer matches it (whitespace-normalized). claude
// is not deterministic, so only trivial prompts are expected to match exactly;
// for open prompts the comparison is informational.
//
// This is a playground tool: it is not indexed, has no registry entry, and exists
// only to probe claude_pipe's TUI-parsing quality. Run it when you want to audit
// the parser against real claude output.
//
// Usage:
//
//	go run artifact_probe.go --root /home/enmanuel/fn_registry            # default prompts, no ref
//	go run artifact_probe.go --root /home/enmanuel/fn_registry --ref      # also compare vs claude -p
//	go run artifact_probe.go --root /repo --prompt "tu prompt"            # single custom prompt
package main

import (
	"bufio"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"os"
	"os/exec"
	"regexp"
	"strings"
	"time"
)

// defaultPrompts is the prompt set used when --prompt is not given. The prompts
// exercise different answer shapes: one word, a short list, a multi-sentence
// answer, and one that asks for code (markers that often trip up TUI parsing).
// The prompts are written in Spanish; English glosses are given per entry.
var defaultPrompts = []string{
	// "respond only with the word PONG, no explanations"
	"responde unicamente con la palabra PONG, sin explicaciones",
	// "list exactly three fruits, one per line, no numbering or extra text"
	"lista exactamente tres frutas, una por linea, sin numeracion ni texto extra",
	// "explain in two sentences what a pseudo-terminal (PTY) is"
	"explica en dos frases que es un pseudo-terminal (PTY)",
	// "write one line of Go code that prints hola, no explanations"
	"escribe una linea de codigo Go que imprima hola, sin explicaciones",
}

// artifactPatterns are regexes that should NEVER match a clean parsed answer.
// Each one targets a piece of TUI chrome, not model output. findArtifacts
// reports the name of every entry whose regex matches the text.
var artifactPatterns = []struct {
	name string
	re   *regexp.Regexp
}{
	// Any single box-drawing glyph (corners, tees, vertical or horizontal bars).
	{"box_drawing", regexp.MustCompile(`[╭╮╰╯┌┐└┘├┤┬┴┼│─]`)},
	// A run of eight or more horizontal bars. Text matching this also matches
	// box_drawing, so both names are reported together.
	{"horizontal_rule", regexp.MustCompile(`─{8,}`)},
	// Status-bar fragments: context gauge, token counters, limits/branch
	// markers and a dollar amount with a decimal part.
	// NOTE(review): status_cost also matches a legitimate answer that quotes a
	// price such as "$1.5" — confirm this false positive is acceptable.
	{"status_ctx", regexp.MustCompile(`CTX:\s*[\d█░]`)},
	{"status_inout", regexp.MustCompile(`\bIN:\d|\bOUT:\d`)},
	{"status_limits", regexp.MustCompile(`Limits:|Total:\s*↓|⎇\s`)},
	{"status_cost", regexp.MustCompile(`\$\d+\.\d`)},
	{"for_agents", regexp.MustCompile(`←\s*for agents`)},
	// Spinner/meta line. Matches any one of the spinner glyphs on its own, the
	// literal "esc to interrupt", or a "(<N>s ... token(s)" signature, so the
	// detection does not depend on the ever-changing spinner word.
	{"meta_spinner", regexp.MustCompile(`[✻✽✢✶✺✷✦✳✱]|esc to interrupt|\(\d+s\b[^)]*tokens?\b`)},
	// The prompt marker glyph of the input line.
	{"prompt_marker", regexp.MustCompile(`❯`)},
	// The replacement character, i.e. text that did not decode cleanly.
	{"replacement_char", regexp.MustCompile("�")},
}

// gluedWordRe flags any run of 41 or more consecutive non-whitespace characters
// (i.e. more than 40). Such runs are treated as the signature of stripped
// cursor moves collapsing columns together; "2newMCPservers" illustrates the
// kind of gluing meant, although that example alone is shorter than the
// threshold.
// NOTE(review): long URLs, file paths or unbroken code in a legitimate answer
// would also match — this is a heuristic, not proof of an artifact.
var gluedWordRe = regexp.MustCompile(`\S{41,}`)

// streamEvent is the subset of one NDJSON stream line that this tool decodes.
// Only the fields read by parseStream are declared; any other keys present in
// the JSON are ignored by encoding/json.
type streamEvent struct {
	Type   string `json:"type"`   // event kind; parseStream handles "text_delta" and "result"
	Text   string `json:"text"`   // text payload, collected for "text_delta" events
	Result string `json:"result"` // final answer, taken from "result" events
}

// caseResult collects everything observed while probing a single prompt: the
// raw outputs of each run, the artifacts detected in them, and any run errors.
type caseResult struct {
	// prompt is the prompt text sent to every run of this case.
	prompt string
	// oneshot is the stdout of the one-shot text run, trailing newlines removed.
	oneshot string
	// streamDeltas holds the text of every text_delta event, in stream order.
	streamDeltas []string
	// streamResult is the final result string reported by the stream run.
	streamResult string
	// ref is the stdout of the reference `claude -p` run (empty unless --ref).
	ref string
	// artifactsOne lists the artifact names detected in oneshot.
	artifactsOne []string
	// artifactsStrm lists the artifact names detected in streamResult.
	artifactsStrm []string
	// streamConsistent reports whether the concatenated deltas reconstruct
	// streamResult after whitespace normalization.
	streamConsistent bool
	// matchesRef is "yes", "no", or "n/a" (no reference run, or it failed).
	matchesRef string
	// errs holds one message per run that returned an error.
	errs []string
}

// main parses the flags, probes every prompt sequentially, prints the report
// and sets the process exit status so the tool can gate CI:
//
//	0  every case ran and came back clean
//	1  the claude_pipe binary is missing, or at least one run returned an error
//	2  at least one artifact or streaming inconsistency was found
//
// Status 2 takes precedence over status 1 when both conditions hold.
func main() {
	root := flag.String("root", "/home/enmanuel/fn_registry", "cwd for claude (a repo whose MCP servers are approved)")
	bin := flag.String("bin", "../claude_pipe", "path to the claude_pipe binary")
	single := flag.String("prompt", "", "run a single custom prompt instead of the default set")
	ref := flag.Bool("ref", false, "also run real `claude -p` and compare")
	warmup := flag.String("warmup", "4s", "claude_pipe --warmup")
	idle := flag.String("idle", "4s", "claude_pipe --idle")
	maxDur := flag.String("max", "90s", "claude_pipe --max")
	flag.Parse()

	prompts := defaultPrompts
	if *single != "" {
		prompts = []string{*single}
	}

	if _, err := os.Stat(*bin); err != nil {
		fmt.Fprintf(os.Stderr, "claude_pipe binary not found at %s — build it first:\n  (cd .. && CGO_ENABLED=1 go build -tags fts5 -o claude_pipe .)\n", *bin)
		os.Exit(1)
	}

	results := make([]caseResult, 0, len(prompts))
	for i, p := range prompts {
		fmt.Fprintf(os.Stderr, "[%d/%d] probing: %s\n", i+1, len(prompts), truncate(p, 60))
		results = append(results, probe(*bin, *root, p, *warmup, *idle, *maxDur, *ref))
	}

	report(results, *ref)

	// A run that failed yields empty output, which trivially contains no
	// artifacts and is trivially stream-consistent. Such a case must not be
	// allowed to pass the gate, so run errors are tracked separately.
	runFailed := false
	for _, r := range results {
		if len(r.artifactsOne) > 0 || len(r.artifactsStrm) > 0 || !r.streamConsistent {
			os.Exit(2)
		}
		if len(r.errs) > 0 {
			runFailed = true
		}
	}
	if runFailed {
		fmt.Fprintln(os.Stderr, "one or more runs failed; see the ERROR lines in the report")
		os.Exit(1)
	}
}

// probe runs one prompt through claude_pipe twice — one-shot text mode and
// --stream mode — and, when withRef is set, through the real `claude -p`.
//
// It returns a caseResult holding the raw outputs, the artifacts detected in
// each claude_pipe answer, whether the stream deltas reconstruct the streamed
// result, and whether the one-shot answer matches the reference
// (whitespace-normalized). Run errors are appended to errs rather than aborting
// the case, so whatever output was produced is still analyzed.
func probe(bin, root, prompt, warmup, idle, maxDur string, withRef bool) caseResult {
	r := caseResult{prompt: prompt, streamConsistent: true, matchesRef: "n/a"}
	timeout := probeTimeout(warmup, maxDur)

	// One-shot, text format.
	one, err := run(timeout, bin,
		"--format", "text", "--cwd", root,
		"--warmup", warmup, "--idle", idle, "--max", maxDur, prompt)
	if err != nil {
		r.errs = append(r.errs, "oneshot: "+err.Error())
	}
	r.oneshot = strings.TrimRight(one, "\n")
	r.artifactsOne = findArtifacts(r.oneshot, prompt)

	// Streaming.
	strm, err := run(timeout, bin,
		"--stream", "--cwd", root,
		"--warmup", warmup, "--idle", idle, "--max", maxDur,
		"--snapshot-interval", "150ms", prompt)
	if err != nil {
		r.errs = append(r.errs, "stream: "+err.Error())
	}
	r.streamDeltas, r.streamResult = parseStream(strm)
	r.artifactsStrm = findArtifacts(r.streamResult, prompt)
	// Consistency: concatenated deltas should reconstruct the final result.
	recon := strings.Join(r.streamDeltas, "")
	r.streamConsistent = normalize(recon) == normalize(r.streamResult)

	if !withRef {
		return r
	}

	// Reference run. `claude -p` takes no time budget from this tool, so it
	// keeps a fixed timeout.
	refOut, err := run(90*time.Second, "claude", "-p", prompt)
	if err != nil {
		r.errs = append(r.errs, "ref: "+err.Error())
		return r
	}
	r.ref = strings.TrimRight(refOut, "\n")
	r.matchesRef = "no"
	if normalize(r.ref) == normalize(r.oneshot) {
		r.matchesRef = "yes"
	}
	return r
}

// probeTimeout derives the wall-clock limit for one claude_pipe run from the
// --warmup and --max values passed to it, plus fixed slack for process startup
// and shutdown. A fixed 90s limit would equal the default --max and ignore
// larger user-supplied values, killing the child before its own budget ran out.
// If maxDur does not parse as a positive Go duration the previous fixed 90s
// limit is used.
// NOTE(review): assumes claude_pipe interprets these flags as Go durations and
// that --max may not include the warmup period — confirm against claude_pipe.
func probeTimeout(warmup, maxDur string) time.Duration {
	const (
		fallback = 90 * time.Second
		slack    = 30 * time.Second
	)
	budget, err := time.ParseDuration(maxDur)
	if err != nil || budget <= 0 {
		return fallback
	}
	if w, err := time.ParseDuration(warmup); err == nil && w > 0 {
		budget += w
	}
	return budget + slack
}

// run executes name with args under a wall-clock timeout and returns whatever
// the command wrote to stdout.
//
// The captured stdout is returned even when err is non-nil, because callers
// still analyze partial output. The error is made descriptive:
//   - if the timeout expired, it says so and wraps the context error instead of
//     surfacing the bare "signal: killed";
//   - if the command exited non-zero and wrote to stderr, the trimmed stderr
//     text is appended to the exit error (which stays unwrappable).
func run(timeout time.Duration, name string, args ...string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, name, args...).Output()
	if err == nil {
		return string(out), nil
	}
	if ctx.Err() != nil {
		return string(out), fmt.Errorf("timed out after %s: %w", timeout, ctx.Err())
	}
	// Output captures stderr into ExitError.Stderr when Cmd.Stderr is unset.
	if exitErr, ok := err.(*exec.ExitError); ok {
		if msg := strings.TrimSpace(string(exitErr.Stderr)); msg != "" {
			return string(out), fmt.Errorf("%w: %s", err, msg)
		}
	}
	return string(out), err
}

// parseStream splits the NDJSON stream s into the ordered text_delta texts and
// the final result string.
//
// Blank lines, lines that are not valid JSON, and events of any other type are
// skipped. If several "result" events are present the last one wins; if none
// is present result is empty.
func parseStream(s string) (deltas []string, result string) {
	sc := bufio.NewScanner(strings.NewReader(s))
	// Allow a single line to be as long as the whole input. With a fixed cap a
	// longer line makes Scan stop with bufio.ErrTooLong, silently dropping that
	// event and everything after it — including the final result.
	sc.Buffer(make([]byte, 0, 64*1024), len(s)+1)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}
		var ev streamEvent
		if err := json.Unmarshal([]byte(line), &ev); err != nil {
			// Tolerate non-JSON noise interleaved with the events.
			continue
		}
		switch ev.Type {
		case "text_delta":
			deltas = append(deltas, ev.Text)
		case "result":
			result = ev.Result
		}
	}
	// sc.Err() is not consulted: the in-memory reader cannot fail and the
	// buffer limit above rules out ErrTooLong.
	return deltas, result
}

// findArtifacts returns the names of all TUI-chrome signatures detected in
// text, in a fixed order: every matching entry of artifactPatterns, then
// "glued_words", then "prompt_echo". It returns nil when nothing is detected.
//
// prompt is the prompt that produced text; it is only used for the echo check,
// which requires the prompt to be longer than 12 bytes and to appear verbatim.
func findArtifacts(text, prompt string) []string {
	var hits []string
	note := func(detected bool, name string) {
		if detected {
			hits = append(hits, name)
		}
	}

	for i := range artifactPatterns {
		entry := &artifactPatterns[i]
		note(entry.re.MatchString(text), entry.name)
	}
	note(gluedWordRe.MatchString(text), "glued_words")
	// claude should not repeat the prompt, so a verbatim copy is an echo.
	note(len(prompt) > 12 && strings.Contains(text, prompt), "prompt_echo")

	return hits
}

// normalize rewrites s with every run of whitespace replaced by one space and
// with leading and trailing whitespace removed, so that spacing differences
// caused by layout do not count as content differences.
func normalize(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}

// truncate shortens s for display. If s has at most n runes it is returned
// unchanged; otherwise the first n runes are kept and "…" is appended.
//
// The cut is made on rune boundaries, so multi-byte UTF-8 characters are never
// split (slicing by byte offset would emit invalid UTF-8 for non-ASCII text).
// A negative n is treated as 0.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	kept := 0
	// Ranging over a string yields the byte offset at which each rune starts.
	for offset := range s {
		if kept == n {
			return s[:offset] + "…"
		}
		kept++
	}
	return s
}

// report prints the per-case findings followed by a summary to stdout.
//
// For every case it shows the truncated prompt and outputs, the stream
// consistency verdict, the artifacts found in each mode, the reference
// comparison when withRef is set, and any run errors.
//
// A case counts as clean only if none of its runs returned an error, no
// artifact was found and the stream was consistent. A failed run produces
// empty output, which would otherwise look clean; such cases are tallied
// separately.
func report(results []caseResult, withRef bool) {
	fmt.Println()
	fmt.Println("=== claude_pipe artifact probe ===")

	clean, errored := 0, 0
	for i, r := range results {
		fmt.Printf("\n[%d] %s\n", i+1, truncate(r.prompt, 70))
		fmt.Printf("    oneshot:   %q\n", truncate(r.oneshot, 80))
		fmt.Printf("    stream:    %d deltas, result=%q\n", len(r.streamDeltas), truncate(r.streamResult, 60))
		fmt.Printf("    consistent: %s\n", yesno(r.streamConsistent))
		printArtifacts("    artifacts(oneshot):", r.artifactsOne)
		printArtifacts("    artifacts(stream): ", r.artifactsStrm)
		if withRef {
			fmt.Printf("    matches claude -p: %s\n", r.matchesRef)
			if r.matchesRef == "no" {
				fmt.Printf("      ref: %q\n", truncate(r.ref, 80))
			}
		}
		for _, e := range r.errs {
			fmt.Printf("    ERROR: %s\n", e)
		}

		switch {
		case len(r.errs) > 0:
			errored++
		case len(r.artifactsOne) == 0 && len(r.artifactsStrm) == 0 && r.streamConsistent:
			clean++
		}
	}

	// Summary.
	fmt.Printf("\n=== %d/%d cases clean (no errors, no artifacts, stream consistent) ===\n", clean, len(results))
	if errored > 0 {
		fmt.Printf("=== %d/%d cases had run errors; their output may be empty or partial ===\n", errored, len(results))
	}
}

// printArtifacts writes one report line to stdout: label followed by the
// comma-separated artifact names in a, or by "none" when a is empty.
func printArtifacts(label string, a []string) {
	switch len(a) {
	case 0:
		fmt.Printf("%s none\n", label)
	default:
		names := strings.Join(a, ", ")
		fmt.Printf("%s %s\n", label, names)
	}
}

// yesno renders b for the report: "yes" for true and an upper-case "NO" for
// false, so that failures stand out when scanning the output.
func yesno(b bool) string {
	label := "NO"
	if b {
		label = "yes"
	}
	return label
}