package main

import (
	"crypto/sha256"
	"database/sql"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"os"
	"regexp"
	"sort"
	"strings"
	"text/tabwriter"
)

// ClusterConfig controls the pattern clustering behaviour.
//
// Non-positive numeric fields and an empty Tools list are replaced by
// defaults inside ClusterPatterns.
type ClusterConfig struct {
	MinOccurrences int      // smallest cluster size that is reported (default 3)
	LookbackDays   int      // only rows from the last N days are read (default 30)
	Tools          []string // tool_used values to consider
}

// PatternCluster is one normalized cluster of inline snippets.
type PatternCluster struct {
	Hash           string   // hashSnippet of the normalized snippet text
	Representative string   // first snippet seen for this hash, shortened via trim(…, 240)
	Occurrences    int      // number of rows that normalized to Hash
	SessionIDs     []string // distinct non-empty session ids, in first-seen order
	FirstSeen      int64    // smallest ts among the cluster's rows
	LastSeen       int64    // largest ts among the cluster's rows
}

var (
	reNumber  = regexp.MustCompile(`\b\d+\b`)
	reDQuoted = regexp.MustCompile(`"[^"]*"`)
	reSQuoted = regexp.MustCompile(`'[^']*'`)
	reSpaces  = regexp.MustCompile(`\s+`)
	rePath    = regexp.MustCompile(`/[A-Za-z0-9_./-]{6,}`)
	reHex     = regexp.MustCompile(`\b[0-9a-f]{8,}\b`)
)

// normalizeSnippet collapses noise (numbers, quoted strings, hex, paths,
// whitespace) so that semantically-equivalent snippets hash the same.
//
// The replacement order is significant: quoted strings are masked first so
// their contents cannot be picked up by the later rules, and hex runs are
// masked before plain numbers because an all-digit run of eight or more
// characters matches both patterns.
func normalizeSnippet(s string) string {
	s = strings.ToLower(s)
	s = reDQuoted.ReplaceAllString(s, `"STR"`)
	s = reSQuoted.ReplaceAllString(s, `'STR'`)
	s = rePath.ReplaceAllString(s, "/PATH")
	s = reHex.ReplaceAllString(s, "HEX")
	s = reNumber.ReplaceAllString(s, "N")
	s = reSpaces.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}

// hashSnippet returns the first 8 bytes of the SHA-256 of norm, hex encoded
// (16 characters).
func hashSnippet(norm string) string {
	sum := sha256.Sum256([]byte(norm))
	return hex.EncodeToString(sum[:8]) // 16 hex chars, enough collision room
}

// ClusterPatterns groups inline snippets (function_id='') by normalized hash
// and returns the clusters whose occurrence count is at least
// cfg.MinOccurrences.
//
// It only reads from the calls table; persisting the result is a separate
// step (UpsertPatterns). Rows are restricted to the configured tools and to
// the last cfg.LookbackDays days. Non-positive MinOccurrences / LookbackDays
// and an empty Tools list fall back to 3, 30 and the built-in tool list; cfg
// is received by value, so the caller's copy is not modified.
//
// The result is ordered by Occurrences descending; ties are broken by
// LastSeen descending and then Hash ascending so the order is deterministic.
// SessionIDs is never nil, so it always encodes as a JSON array.
func ClusterPatterns(db *sql.DB, cfg ClusterConfig) ([]PatternCluster, error) {
	if cfg.MinOccurrences <= 0 {
		cfg.MinOccurrences = 3
	}
	if cfg.LookbackDays <= 0 {
		cfg.LookbackDays = 30
	}
	if len(cfg.Tools) == 0 {
		cfg.Tools = []string{"heredoc_py", "heredoc_bash", "sqlite_direct", "bash_other"}
	}

	// One "?" per tool for the IN (...) list, followed by the lookback bind.
	placeholders := make([]string, len(cfg.Tools))
	args := make([]any, 0, len(cfg.Tools)+1)
	for i, tool := range cfg.Tools {
		placeholders[i] = "?"
		args = append(args, tool)
	}
	args = append(args, cfg.LookbackDays)

	q := fmt.Sprintf(`
		SELECT command_snippet, session_id, ts
		FROM calls
		WHERE function_id = ''
		  AND command_snippet != ''
		  AND tool_used IN (%s)
		  AND ts >= CAST(strftime('%%s', 'now', '-' || ? || ' days') AS INTEGER)
		ORDER BY ts ASC`, strings.Join(placeholders, ","))

	rows, err := db.Query(q, args...)
	if err != nil {
		return nil, fmt.Errorf("query calls: %w", err)
	}
	defer rows.Close()

	clusters := map[string]*PatternCluster{}
	// Per-hash set of session ids already appended to SessionIDs.
	sessionsSeen := map[string]map[string]struct{}{}

	for rows.Next() {
		var (
			snippet, session string
			ts               int64
		)
		if err := rows.Scan(&snippet, &session, &ts); err != nil {
			return nil, fmt.Errorf("scan calls row: %w", err)
		}

		norm := normalizeSnippet(snippet)
		if norm == "" {
			continue
		}
		hash := hashSnippet(norm)

		c, ok := clusters[hash]
		if !ok {
			// Rows arrive in ts order, so the representative is the earliest
			// snippet of the cluster.
			c = &PatternCluster{
				Hash:           hash,
				Representative: trim(snippet, 240),
				SessionIDs:     []string{},
				FirstSeen:      ts,
				LastSeen:       ts,
			}
			clusters[hash] = c
			sessionsSeen[hash] = map[string]struct{}{}
		}

		c.Occurrences++
		if ts > c.LastSeen {
			c.LastSeen = ts
		}
		if ts < c.FirstSeen {
			c.FirstSeen = ts
		}
		if session == "" {
			continue
		}
		if _, seen := sessionsSeen[hash][session]; !seen {
			sessionsSeen[hash][session] = struct{}{}
			c.SessionIDs = append(c.SessionIDs, session)
		}
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterate calls: %w", err)
	}

	out := make([]PatternCluster, 0, len(clusters))
	for _, c := range clusters {
		if c.Occurrences >= cfg.MinOccurrences {
			out = append(out, *c)
		}
	}

	// Map iteration order is random and sort.Slice is not stable, so ties
	// need an explicit, total ordering.
	sort.Slice(out, func(i, j int) bool {
		a, b := &out[i], &out[j]
		if a.Occurrences != b.Occurrences {
			return a.Occurrences > b.Occurrences
		}
		if a.LastSeen != b.LastSeen {
			return a.LastSeen > b.LastSeen
		}
		return a.Hash < b.Hash
	})
	return out, nil
}

// UpsertPatterns writes clusters to the patterns table (idempotent).
func UpsertPatterns(db *sql.DB, clusters []PatternCluster) error { tx, err := db.Begin() if err != nil { return err } defer tx.Rollback() stmt, err := tx.Prepare(` INSERT INTO patterns (pattern_hash, representative_snippet, occurrences, session_ids_json, first_seen, last_seen, proposal_id) VALUES (?, ?, ?, ?, ?, ?, '') ON CONFLICT(pattern_hash) DO UPDATE SET occurrences = excluded.occurrences, session_ids_json = excluded.session_ids_json, last_seen = excluded.last_seen `) if err != nil { return err } defer stmt.Close() for _, c := range clusters { ids, _ := json.Marshal(c.SessionIDs) if _, err := stmt.Exec( c.Hash, c.Representative, c.Occurrences, string(ids), c.FirstSeen, c.LastSeen, ); err != nil { return err } } return tx.Commit() } func splitCSV(s string) []string { out := []string{} for _, p := range strings.Split(s, ",") { p = strings.TrimSpace(p) if p != "" { out = append(out, p) } } return out } func trim(s string, max int) string { s = strings.TrimSpace(s) if len(s) <= max { return s } return s[:max] } // runClusterPatterns is the CLI entry point. 
func runClusterPatterns(dbPath string, cfg ClusterConfig, persist, formatJSON bool) { db, err := openDB(dbPath) if err != nil { fmt.Fprintf(os.Stderr, "open db: %v\n", err) os.Exit(1) } defer db.Close() clusters, err := ClusterPatterns(db.conn, cfg) if err != nil { fmt.Fprintf(os.Stderr, "cluster: %v\n", err) os.Exit(1) } if persist { if err := UpsertPatterns(db.conn, clusters); err != nil { fmt.Fprintf(os.Stderr, "persist patterns: %v\n", err) os.Exit(1) } } if formatJSON { enc := json.NewEncoder(os.Stdout) enc.SetIndent("", " ") enc.Encode(clusters) return } tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) fmt.Fprintf(tw, "=== %d pattern clusters (min %d occurrences, last %dd) ===\n", len(clusters), cfg.MinOccurrences, cfg.LookbackDays) if len(clusters) == 0 { fmt.Fprintln(tw, "(no clusters found)") tw.Flush() return } fmt.Fprintln(tw, "HASH\tOCCURRENCES\tSESSIONS\tFIRST_SEEN\tLAST_SEEN\tSNIPPET") for _, c := range clusters { snippet := strings.ReplaceAll(c.Representative, "\n", "\\n") if len(snippet) > 70 { snippet = snippet[:70] + "..." } fmt.Fprintf(tw, "%s\t%d\t%d\t%d\t%d\t%s\n", c.Hash, c.Occurrences, len(c.SessionIDs), c.FirstSeen, c.LastSeen, snippet) } tw.Flush() if persist { fmt.Println() fmt.Printf("upserted %d clusters into patterns table\n", len(clusters)) } }