8e51094d94
- call_monitor - main.go - operations.db - operations.db-shm - operations.db-wal - cluster.go - cluster_test.go Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
257 lines
6.4 KiB
Go
257 lines
6.4 KiB
Go
package main
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"database/sql"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"text/tabwriter"
|
|
)
|
|
|
|
// ClusterConfig holds the tunable inputs for pattern clustering.
type ClusterConfig struct {
	// MinOccurrences is the smallest occurrence count a cluster needs in
	// order to be reported.
	MinOccurrences int

	// LookbackDays limits clustering to calls from the last N days.
	LookbackDays int

	// Tools lists the tool_used values to consider.
	Tools []string
}
|
|
|
|
// PatternCluster describes one group of inline snippets that share the same
// normalized form.
type PatternCluster struct {
	// Hash identifies the cluster; it is derived from the normalized snippet.
	Hash string

	// Representative is a sample snippet shown for the cluster.
	Representative string

	// Occurrences counts the snippets that fell into this cluster.
	Occurrences int

	// SessionIDs lists the distinct sessions in which the pattern appeared.
	SessionIDs []string

	// FirstSeen and LastSeen bound the timestamps of the clustered snippets.
	FirstSeen, LastSeen int64
}
|
|
|
|
// Patterns used to normalize snippets, listed in the order normalizeSnippet
// applies them.
var (
	// reDQuoted matches a double-quoted run with no embedded double quote.
	reDQuoted = regexp.MustCompile(`"[^"]*"`)
	// reSQuoted matches a single-quoted run with no embedded single quote.
	reSQuoted = regexp.MustCompile(`'[^']*'`)
	// rePath matches a slash followed by at least six path-like characters.
	rePath = regexp.MustCompile(`/[A-Za-z0-9_./-]{6,}`)
	// reHex matches a standalone run of eight or more lowercase hex digits.
	reHex = regexp.MustCompile(`\b[0-9a-f]{8,}\b`)
	// reNumber matches a standalone run of decimal digits.
	reNumber = regexp.MustCompile(`\b\d+\b`)
	// reSpaces matches any run of whitespace.
	reSpaces = regexp.MustCompile(`\s+`)
)
|
|
|
|
// normalizeSnippet collapses noise (numbers, quoted strings, hex, paths,
|
|
// whitespace) so that semantically-equivalent snippets hash the same.
|
|
func normalizeSnippet(s string) string {
|
|
s = strings.ToLower(s)
|
|
s = reDQuoted.ReplaceAllString(s, `"STR"`)
|
|
s = reSQuoted.ReplaceAllString(s, `'STR'`)
|
|
s = rePath.ReplaceAllString(s, "/PATH")
|
|
s = reHex.ReplaceAllString(s, "HEX")
|
|
s = reNumber.ReplaceAllString(s, "N")
|
|
s = reSpaces.ReplaceAllString(s, " ")
|
|
return strings.TrimSpace(s)
|
|
}
|
|
|
|
// hashSnippet returns a 16-character lowercase hex identifier for norm: the
// first eight bytes of its SHA-256 digest.
func hashSnippet(norm string) string {
	const prefixBytes = 8 // 8 digest bytes -> 16 hex characters

	digest := sha256.Sum256([]byte(norm))
	encoded := make([]byte, hex.EncodedLen(prefixBytes))
	hex.Encode(encoded, digest[:prefixBytes])
	return string(encoded)
}
|
|
|
|
// ClusterPatterns groups inline snippets (function_id='') by normalized hash,
|
|
// upserts the patterns table, and returns clusters with occurrences >= min.
|
|
func ClusterPatterns(db *sql.DB, cfg ClusterConfig) ([]PatternCluster, error) {
|
|
if cfg.MinOccurrences <= 0 {
|
|
cfg.MinOccurrences = 3
|
|
}
|
|
if cfg.LookbackDays <= 0 {
|
|
cfg.LookbackDays = 30
|
|
}
|
|
if len(cfg.Tools) == 0 {
|
|
cfg.Tools = []string{"heredoc_py", "heredoc_bash", "sqlite_direct", "bash_other"}
|
|
}
|
|
|
|
placeholders := make([]string, len(cfg.Tools))
|
|
args := make([]any, 0, len(cfg.Tools)+1)
|
|
for i, t := range cfg.Tools {
|
|
placeholders[i] = "?"
|
|
args = append(args, t)
|
|
}
|
|
args = append(args, cfg.LookbackDays)
|
|
|
|
q := fmt.Sprintf(`
|
|
SELECT command_snippet, session_id, ts
|
|
FROM calls
|
|
WHERE function_id = ''
|
|
AND command_snippet != ''
|
|
AND tool_used IN (%s)
|
|
AND ts >= CAST(strftime('%%s', 'now', '-' || ? || ' days') AS INTEGER)
|
|
ORDER BY ts ASC`,
|
|
strings.Join(placeholders, ","))
|
|
|
|
rows, err := db.Query(q, args...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("query calls: %w", err)
|
|
}
|
|
defer rows.Close()
|
|
|
|
clusters := map[string]*PatternCluster{}
|
|
sessionsSeen := map[string]map[string]struct{}{}
|
|
|
|
for rows.Next() {
|
|
var snippet, session string
|
|
var ts int64
|
|
if err := rows.Scan(&snippet, &session, &ts); err != nil {
|
|
return nil, err
|
|
}
|
|
norm := normalizeSnippet(snippet)
|
|
if norm == "" {
|
|
continue
|
|
}
|
|
hash := hashSnippet(norm)
|
|
c, ok := clusters[hash]
|
|
if !ok {
|
|
c = &PatternCluster{
|
|
Hash: hash,
|
|
Representative: trim(snippet, 240),
|
|
FirstSeen: ts,
|
|
LastSeen: ts,
|
|
}
|
|
clusters[hash] = c
|
|
sessionsSeen[hash] = map[string]struct{}{}
|
|
}
|
|
c.Occurrences++
|
|
if ts > c.LastSeen {
|
|
c.LastSeen = ts
|
|
}
|
|
if ts < c.FirstSeen {
|
|
c.FirstSeen = ts
|
|
}
|
|
if session != "" {
|
|
if _, seen := sessionsSeen[hash][session]; !seen {
|
|
sessionsSeen[hash][session] = struct{}{}
|
|
c.SessionIDs = append(c.SessionIDs, session)
|
|
}
|
|
}
|
|
}
|
|
if err := rows.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
out := make([]PatternCluster, 0, len(clusters))
|
|
for _, c := range clusters {
|
|
if c.Occurrences >= cfg.MinOccurrences {
|
|
out = append(out, *c)
|
|
}
|
|
}
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Occurrences > out[j].Occurrences })
|
|
return out, nil
|
|
}
|
|
|
|
// UpsertPatterns writes clusters to the patterns table (idempotent).
|
|
func UpsertPatterns(db *sql.DB, clusters []PatternCluster) error {
|
|
tx, err := db.Begin()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer tx.Rollback()
|
|
|
|
stmt, err := tx.Prepare(`
|
|
INSERT INTO patterns (pattern_hash, representative_snippet, occurrences,
|
|
session_ids_json, first_seen, last_seen, proposal_id)
|
|
VALUES (?, ?, ?, ?, ?, ?, '')
|
|
ON CONFLICT(pattern_hash) DO UPDATE SET
|
|
occurrences = excluded.occurrences,
|
|
session_ids_json = excluded.session_ids_json,
|
|
last_seen = excluded.last_seen
|
|
`)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer stmt.Close()
|
|
|
|
for _, c := range clusters {
|
|
ids, _ := json.Marshal(c.SessionIDs)
|
|
if _, err := stmt.Exec(
|
|
c.Hash, c.Representative, c.Occurrences, string(ids),
|
|
c.FirstSeen, c.LastSeen,
|
|
); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return tx.Commit()
|
|
}
|
|
|
|
// splitCSV breaks s on commas and returns the whitespace-trimmed pieces,
// leaving out any piece that is empty after trimming. The result is never
// nil, even when no piece survives.
func splitCSV(s string) []string {
	fields := strings.Split(s, ",")
	kept := make([]string, 0)
	for i := range fields {
		field := strings.TrimSpace(fields[i])
		if field == "" {
			continue
		}
		kept = append(kept, field)
	}
	return kept
}
|
|
|
|
// trim strips surrounding whitespace from s and caps the result at max bytes.
//
// When the cap would fall inside a multi-byte UTF-8 sequence, the cut moves
// back to the start of that sequence, so valid UTF-8 input never ends in a
// partial character. The result can therefore be up to three bytes shorter
// than max. A max of zero or less yields "".
func trim(s string, max int) string {
	s = strings.TrimSpace(s)
	if max <= 0 {
		return ""
	}
	if len(s) <= max {
		return s
	}

	// s[cut] is the first byte that will be dropped. If it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx) the cut is mid-character, so
	// step back. A sequence has at most three continuation bytes, which
	// bounds the walk even for malformed input.
	cut := max
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut]
}
|
|
|
|
// runClusterPatterns is the CLI entry point.
|
|
func runClusterPatterns(dbPath string, cfg ClusterConfig, persist, formatJSON bool) {
|
|
db, err := openDB(dbPath)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "open db: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
|
|
clusters, err := ClusterPatterns(db.conn, cfg)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "cluster: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
if persist {
|
|
if err := UpsertPatterns(db.conn, clusters); err != nil {
|
|
fmt.Fprintf(os.Stderr, "persist patterns: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
if formatJSON {
|
|
enc := json.NewEncoder(os.Stdout)
|
|
enc.SetIndent("", " ")
|
|
enc.Encode(clusters)
|
|
return
|
|
}
|
|
|
|
tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
|
|
fmt.Fprintf(tw, "=== %d pattern clusters (min %d occurrences, last %dd) ===\n",
|
|
len(clusters), cfg.MinOccurrences, cfg.LookbackDays)
|
|
if len(clusters) == 0 {
|
|
fmt.Fprintln(tw, "(no clusters found)")
|
|
tw.Flush()
|
|
return
|
|
}
|
|
fmt.Fprintln(tw, "HASH\tOCCURRENCES\tSESSIONS\tFIRST_SEEN\tLAST_SEEN\tSNIPPET")
|
|
for _, c := range clusters {
|
|
snippet := strings.ReplaceAll(c.Representative, "\n", "\\n")
|
|
if len(snippet) > 70 {
|
|
snippet = snippet[:70] + "..."
|
|
}
|
|
fmt.Fprintf(tw, "%s\t%d\t%d\t%d\t%d\t%s\n",
|
|
c.Hash, c.Occurrences, len(c.SessionIDs),
|
|
c.FirstSeen, c.LastSeen, snippet)
|
|
}
|
|
tw.Flush()
|
|
|
|
if persist {
|
|
fmt.Println()
|
|
fmt.Printf("upserted %d clusters into patterns table\n", len(clusters))
|
|
}
|
|
}
|