diff --git a/call_monitor b/call_monitor index 667a324..f00fce3 100755 Binary files a/call_monitor and b/call_monitor differ diff --git a/cluster.go b/cluster.go new file mode 100644 index 0000000..266f5c8 --- /dev/null +++ b/cluster.go @@ -0,0 +1,256 @@ +package main + +import ( + "crypto/sha256" + "database/sql" + "encoding/hex" + "encoding/json" + "fmt" + "os" + "regexp" + "sort" + "strings" + "text/tabwriter" +) + +// ClusterConfig controls the pattern clustering behaviour. +type ClusterConfig struct { + MinOccurrences int + LookbackDays int + Tools []string // tool_used values to consider +} + +// PatternCluster is one normalized cluster of inline snippets. +type PatternCluster struct { + Hash string + Representative string + Occurrences int + SessionIDs []string + FirstSeen int64 + LastSeen int64 +} + +var ( + reNumber = regexp.MustCompile(`\b\d+\b`) + reDQuoted = regexp.MustCompile(`"[^"]*"`) + reSQuoted = regexp.MustCompile(`'[^']*'`) + reSpaces = regexp.MustCompile(`\s+`) + rePath = regexp.MustCompile(`/[A-Za-z0-9_./-]{6,}`) + reHex = regexp.MustCompile(`\b[0-9a-f]{8,}\b`) +) + +// normalizeSnippet collapses noise (numbers, quoted strings, hex, paths, +// whitespace) so that semantically-equivalent snippets hash the same. +func normalizeSnippet(s string) string { + s = strings.ToLower(s) + s = reDQuoted.ReplaceAllString(s, `"STR"`) + s = reSQuoted.ReplaceAllString(s, `'STR'`) + s = rePath.ReplaceAllString(s, "/PATH") + s = reHex.ReplaceAllString(s, "HEX") + s = reNumber.ReplaceAllString(s, "N") + s = reSpaces.ReplaceAllString(s, " ") + return strings.TrimSpace(s) +} + +func hashSnippet(norm string) string { + sum := sha256.Sum256([]byte(norm)) + return hex.EncodeToString(sum[:8]) // 16 hex chars, enough collision room +} + +// ClusterPatterns groups inline snippets (function_id='') by normalized hash, +// upserts the patterns table, and returns clusters with occurrences >= min. +func ClusterPatterns(db *sql.DB, cfg ClusterConfig) ([]PatternCluster, error) { + if cfg.MinOccurrences <= 0 { + cfg.MinOccurrences = 3 + } + if cfg.LookbackDays <= 0 { + cfg.LookbackDays = 30 + } + if len(cfg.Tools) == 0 { + cfg.Tools = []string{"heredoc_py", "heredoc_bash", "sqlite_direct", "bash_other"} + } + + placeholders := make([]string, len(cfg.Tools)) + args := make([]any, 0, len(cfg.Tools)+1) + for i, t := range cfg.Tools { + placeholders[i] = "?" + args = append(args, t) + } + args = append(args, cfg.LookbackDays) + + q := fmt.Sprintf(` + SELECT command_snippet, session_id, ts + FROM calls + WHERE function_id = '' + AND command_snippet != '' + AND tool_used IN (%s) + AND ts >= CAST(strftime('%%s', 'now', '-' || ? || ' days') AS INTEGER) + ORDER BY ts ASC`, + strings.Join(placeholders, ",")) + + rows, err := db.Query(q, args...) + if err != nil { + return nil, fmt.Errorf("query calls: %w", err) + } + defer rows.Close() + + clusters := map[string]*PatternCluster{} + sessionsSeen := map[string]map[string]struct{}{} + + for rows.Next() { + var snippet, session string + var ts int64 + if err := rows.Scan(&snippet, &session, &ts); err != nil { + return nil, err + } + norm := normalizeSnippet(snippet) + if norm == "" { + continue + } + hash := hashSnippet(norm) + c, ok := clusters[hash] + if !ok { + c = &PatternCluster{ + Hash: hash, + Representative: trim(snippet, 240), + FirstSeen: ts, + LastSeen: ts, + } + clusters[hash] = c + sessionsSeen[hash] = map[string]struct{}{} + } + c.Occurrences++ + if ts > c.LastSeen { + c.LastSeen = ts + } + if ts < c.FirstSeen { + c.FirstSeen = ts + } + if session != "" { + if _, seen := sessionsSeen[hash][session]; !seen { + sessionsSeen[hash][session] = struct{}{} + c.SessionIDs = append(c.SessionIDs, session) + } + } + } + if err := rows.Err(); err != nil { + return nil, err + } + + out := make([]PatternCluster, 0, len(clusters)) + for _, c := range clusters { + if c.Occurrences >= cfg.MinOccurrences { + out = append(out, *c) + } + } + sort.Slice(out, func(i, j int) bool { return out[i].Occurrences > out[j].Occurrences }) + return out, nil +} + +// UpsertPatterns writes clusters to the patterns table (idempotent). +func UpsertPatterns(db *sql.DB, clusters []PatternCluster) error { + tx, err := db.Begin() + if err != nil { + return err + } + defer tx.Rollback() + + stmt, err := tx.Prepare(` + INSERT INTO patterns (pattern_hash, representative_snippet, occurrences, + session_ids_json, first_seen, last_seen, proposal_id) + VALUES (?, ?, ?, ?, ?, ?, '') + ON CONFLICT(pattern_hash) DO UPDATE SET + occurrences = excluded.occurrences, + session_ids_json = excluded.session_ids_json, + last_seen = excluded.last_seen + `) + if err != nil { + return err + } + defer stmt.Close() + + for _, c := range clusters { + ids, _ := json.Marshal(c.SessionIDs) + if _, err := stmt.Exec( + c.Hash, c.Representative, c.Occurrences, string(ids), + c.FirstSeen, c.LastSeen, + ); err != nil { + return err + } + } + return tx.Commit() +} + +func splitCSV(s string) []string { + out := []string{} + for _, p := range strings.Split(s, ",") { + p = strings.TrimSpace(p) + if p != "" { + out = append(out, p) + } + } + return out +} + +func trim(s string, max int) string { + s = strings.TrimSpace(s) + if len(s) <= max { + return s + } + return s[:max] +} + +// runClusterPatterns is the CLI entry point. +func runClusterPatterns(dbPath string, cfg ClusterConfig, persist, formatJSON bool) { + db, err := openDB(dbPath) + if err != nil { + fmt.Fprintf(os.Stderr, "open db: %v\n", err) + os.Exit(1) + } + defer db.Close() + + clusters, err := ClusterPatterns(db.conn, cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "cluster: %v\n", err) + os.Exit(1) + } + + if persist { + if err := UpsertPatterns(db.conn, clusters); err != nil { + fmt.Fprintf(os.Stderr, "persist patterns: %v\n", err) + os.Exit(1) + } + } + + if formatJSON { + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + enc.Encode(clusters) + return + } + + tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + fmt.Fprintf(tw, "=== %d pattern clusters (min %d occurrences, last %dd) ===\n", + len(clusters), cfg.MinOccurrences, cfg.LookbackDays) + if len(clusters) == 0 { + fmt.Fprintln(tw, "(no clusters found)") + tw.Flush() + return + } + fmt.Fprintln(tw, "HASH\tOCCURRENCES\tSESSIONS\tFIRST_SEEN\tLAST_SEEN\tSNIPPET") + for _, c := range clusters { + snippet := strings.ReplaceAll(c.Representative, "\n", "\\n") + if len(snippet) > 70 { + snippet = snippet[:70] + "..." + } + fmt.Fprintf(tw, "%s\t%d\t%d\t%d\t%d\t%s\n", + c.Hash, c.Occurrences, len(c.SessionIDs), + c.FirstSeen, c.LastSeen, snippet) + } + tw.Flush() + + if persist { + fmt.Println() + fmt.Printf("upserted %d clusters into patterns table\n", len(clusters)) + } +} diff --git a/cluster_test.go b/cluster_test.go new file mode 100644 index 0000000..15626a1 --- /dev/null +++ b/cluster_test.go @@ -0,0 +1,51 @@ +package main + +import "testing" + +func TestNormalizeSnippet(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"sqlite3 /tmp/x.db \"SELECT 42\"", "sqlite3 /PATH \"STR\""}, + {"grep -n 'foo' line 123", "grep -n 'STR' line N"}, + {" multi space ", "multi space"}, + {"sha=ab12cd34ef56", "sha=HEX"}, + } + for _, c := range cases { + got := normalizeSnippet(c.in) + if got != c.want { + t.Errorf("normalize(%q)=%q want %q", c.in, got, c.want) + } + } +} + +func TestHashSnippetStable(t *testing.T) { + a := hashSnippet(normalizeSnippet("ls /tmp/foo")) + b := hashSnippet(normalizeSnippet("ls /tmp/bar")) + if a != b { + t.Errorf("expected same hash after normalization: %s vs %s", a, b) + } + c := hashSnippet(normalizeSnippet("cat /tmp/foo")) + if a == c { + t.Errorf("expected different hash for different commands") + } +} + +func TestSplitCSV(t *testing.T) { + cases := []struct { + in string + want int + }{ + {"a,b,c", 3}, + {" a , b , c ", 3}, + {"", 0}, + {"a,,b", 2}, + } + for _, c := range cases { + got := splitCSV(c.in) + if len(got) != c.want { + t.Errorf("splitCSV(%q) len=%d want %d (%v)", c.in, len(got), c.want, got) + } + } +} diff --git a/main.go b/main.go index 6a2f677..7b0e0c0 100644 --- a/main.go +++ b/main.go @@ -62,6 +62,19 @@ func main() { MinSuccessRate: 0.9, } runSequences(resolveDB(*dbPath), *detect, *report, *propose, *formatJSON, *root, cfg) + case "cluster-patterns": + minOcc := fs.Int("min-occurrences", 3, "Minimum occurrences to keep a cluster.") + lookback := fs.Int("lookback-days", 30, "How many days of calls to scan.") + persist := fs.Bool("persist", false, "Upsert clusters into the patterns table.") + formatJSON := fs.Bool("json", false, "Output JSON instead of text.") + toolsCSV := fs.String("tools", "heredoc_py,heredoc_bash,sqlite_direct,bash_other", "Comma-separated tool_used values to consider.") + fs.Parse(os.Args[2:]) + tools := splitCSV(*toolsCSV) + runClusterPatterns(resolveDB(*dbPath), ClusterConfig{ + MinOccurrences: *minOcc, + LookbackDays: *lookback, + Tools: tools, + }, *persist, *formatJSON) case "-h", "--help", "help": usage() default: diff --git a/operations.db b/operations.db index 3ba3357..c8f1790 100644 Binary files a/operations.db and b/operations.db differ diff --git a/operations.db-shm b/operations.db-shm deleted file mode 100644 index 04c2e50..0000000 Binary files a/operations.db-shm and /dev/null differ diff --git a/operations.db-wal b/operations.db-wal deleted file mode 100644 index 3bff7d0..0000000 Binary files a/operations.db-wal and /dev/null differ