chore: auto-commit (7 archivos)
- call_monitor - main.go - operations.db - operations.db-shm - operations.db-wal - cluster.go - cluster_test.go Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
+256
@@ -0,0 +1,256 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"database/sql"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"text/tabwriter"
|
||||
)
|
||||
|
||||
// ClusterConfig carries the tunables for inline-snippet pattern clustering.
// ClusterPatterns replaces non-positive numeric fields and an empty Tools
// list with its built-in defaults.
type ClusterConfig struct {
	// MinOccurrences is the smallest cluster size that is reported;
	// LookbackDays is how many days of calls, counted back from now, are scanned.
	MinOccurrences, LookbackDays int

	// Tools lists the tool_used values to consider.
	Tools []string
}
|
||||
|
||||
// PatternCluster describes one group of inline snippets that share the same
// normalized form.
type PatternCluster struct {
	// Hash identifies the cluster (the hash of the normalized snippet);
	// Representative is a raw example snippet kept for display.
	Hash, Representative string

	// Occurrences counts the calls that fell into this cluster.
	Occurrences int

	// SessionIDs holds the distinct non-empty session ids that produced the
	// cluster, in first-seen order.
	SessionIDs []string

	// FirstSeen and LastSeen are the smallest and largest ts values observed
	// for the cluster.
	FirstSeen, LastSeen int64
}
|
||||
|
||||
// Patterns used by normalizeSnippet, compiled once at package scope.
var (
	reNumber  = regexp.MustCompile(`\b\d+\b`)
	reDQuoted = regexp.MustCompile(`"[^"]*"`)
	reSQuoted = regexp.MustCompile(`'[^']*'`)
	reSpaces  = regexp.MustCompile(`\s+`)
	rePath    = regexp.MustCompile(`/[A-Za-z0-9_./-]{6,}`)
	reHex     = regexp.MustCompile(`\b[0-9a-f]{8,}\b`)
)

// normalizeSnippet collapses noise (numbers, quoted strings, hex, paths,
// whitespace) so that semantically-equivalent snippets hash the same.
//
// The input is lower-cased first, then each kind of noise is replaced by a
// fixed upper-case placeholder ("STR", /PATH, N, HEX). Because the
// placeholders are upper-case they can never be re-matched by a later
// lower-case pattern. The result has runs of whitespace collapsed to one
// space and no leading or trailing whitespace.
func normalizeSnippet(s string) string {
	s = strings.ToLower(s)
	s = reDQuoted.ReplaceAllString(s, `"STR"`)
	s = reSQuoted.ReplaceAllString(s, `'STR'`)
	s = rePath.ReplaceAllString(s, "/PATH")
	// Numbers are replaced before hex: a purely decimal token of 8+ digits
	// also matches reHex, and handling hex first would turn "sleep 12345678"
	// into "sleep HEX" while "sleep 5" becomes "sleep N", splitting one
	// pattern into two clusters. Tokens mixing digits and a-f contain no
	// word boundary inside them, so reNumber leaves them for reHex.
	s = reNumber.ReplaceAllString(s, "N")
	s = reHex.ReplaceAllString(s, "HEX")
	s = reSpaces.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}
|
||||
|
||||
// hashSnippet derives a short identifier for a normalized snippet: the first
// 8 bytes of its SHA-256 digest, rendered as 16 lower-case hex characters.
func hashSnippet(norm string) string {
	const keepBytes = 8 // 16 hex chars, enough collision room

	hasher := sha256.New()
	hasher.Write([]byte(norm)) // hash.Hash.Write never returns an error
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest[:keepBytes])
}
|
||||
|
||||
// ClusterPatterns groups inline snippets (function_id='') by normalized hash,
|
||||
// upserts the patterns table, and returns clusters with occurrences >= min.
|
||||
func ClusterPatterns(db *sql.DB, cfg ClusterConfig) ([]PatternCluster, error) {
|
||||
if cfg.MinOccurrences <= 0 {
|
||||
cfg.MinOccurrences = 3
|
||||
}
|
||||
if cfg.LookbackDays <= 0 {
|
||||
cfg.LookbackDays = 30
|
||||
}
|
||||
if len(cfg.Tools) == 0 {
|
||||
cfg.Tools = []string{"heredoc_py", "heredoc_bash", "sqlite_direct", "bash_other"}
|
||||
}
|
||||
|
||||
placeholders := make([]string, len(cfg.Tools))
|
||||
args := make([]any, 0, len(cfg.Tools)+1)
|
||||
for i, t := range cfg.Tools {
|
||||
placeholders[i] = "?"
|
||||
args = append(args, t)
|
||||
}
|
||||
args = append(args, cfg.LookbackDays)
|
||||
|
||||
q := fmt.Sprintf(`
|
||||
SELECT command_snippet, session_id, ts
|
||||
FROM calls
|
||||
WHERE function_id = ''
|
||||
AND command_snippet != ''
|
||||
AND tool_used IN (%s)
|
||||
AND ts >= CAST(strftime('%%s', 'now', '-' || ? || ' days') AS INTEGER)
|
||||
ORDER BY ts ASC`,
|
||||
strings.Join(placeholders, ","))
|
||||
|
||||
rows, err := db.Query(q, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query calls: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
clusters := map[string]*PatternCluster{}
|
||||
sessionsSeen := map[string]map[string]struct{}{}
|
||||
|
||||
for rows.Next() {
|
||||
var snippet, session string
|
||||
var ts int64
|
||||
if err := rows.Scan(&snippet, &session, &ts); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
norm := normalizeSnippet(snippet)
|
||||
if norm == "" {
|
||||
continue
|
||||
}
|
||||
hash := hashSnippet(norm)
|
||||
c, ok := clusters[hash]
|
||||
if !ok {
|
||||
c = &PatternCluster{
|
||||
Hash: hash,
|
||||
Representative: trim(snippet, 240),
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
}
|
||||
clusters[hash] = c
|
||||
sessionsSeen[hash] = map[string]struct{}{}
|
||||
}
|
||||
c.Occurrences++
|
||||
if ts > c.LastSeen {
|
||||
c.LastSeen = ts
|
||||
}
|
||||
if ts < c.FirstSeen {
|
||||
c.FirstSeen = ts
|
||||
}
|
||||
if session != "" {
|
||||
if _, seen := sessionsSeen[hash][session]; !seen {
|
||||
sessionsSeen[hash][session] = struct{}{}
|
||||
c.SessionIDs = append(c.SessionIDs, session)
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
out := make([]PatternCluster, 0, len(clusters))
|
||||
for _, c := range clusters {
|
||||
if c.Occurrences >= cfg.MinOccurrences {
|
||||
out = append(out, *c)
|
||||
}
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].Occurrences > out[j].Occurrences })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// UpsertPatterns writes clusters to the patterns table (idempotent).
|
||||
func UpsertPatterns(db *sql.DB, clusters []PatternCluster) error {
|
||||
tx, err := db.Begin()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.Prepare(`
|
||||
INSERT INTO patterns (pattern_hash, representative_snippet, occurrences,
|
||||
session_ids_json, first_seen, last_seen, proposal_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, '')
|
||||
ON CONFLICT(pattern_hash) DO UPDATE SET
|
||||
occurrences = excluded.occurrences,
|
||||
session_ids_json = excluded.session_ids_json,
|
||||
last_seen = excluded.last_seen
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, c := range clusters {
|
||||
ids, _ := json.Marshal(c.SessionIDs)
|
||||
if _, err := stmt.Exec(
|
||||
c.Hash, c.Representative, c.Occurrences, string(ids),
|
||||
c.FirstSeen, c.LastSeen,
|
||||
); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// splitCSV breaks a comma-separated list into its items. Whitespace around
// each item is removed and items that end up empty are dropped. The result is
// never nil: input without any item yields an empty slice.
func splitCSV(s string) []string {
	items := make([]string, 0)
	isComma := func(r rune) bool { return r == ',' }
	for _, field := range strings.FieldsFunc(s, isComma) {
		if item := strings.TrimSpace(field); item != "" {
			items = append(items, item)
		}
	}
	return items
}
|
||||
|
||||
// trim removes leading and trailing whitespace from s and limits the result
// to at most max bytes. The cut never lands inside a multi-byte UTF-8
// sequence: when byte max falls in the middle of a rune, that whole rune is
// dropped, so the result may be up to three bytes shorter than max. A max of
// zero or less yields the empty string.
func trim(s string, max int) string {
	s = strings.TrimSpace(s)
	if len(s) <= max {
		return s
	}
	if max <= 0 {
		return ""
	}
	// s[cut] is the first byte that would be discarded. While it is a UTF-8
	// continuation byte (10xxxxxx) the cut sits inside a rune, so step back
	// to that rune's lead byte. A rune has at most three continuation bytes,
	// which bounds the walk even on malformed input.
	cut := max
	for cut > 0 && max-cut < 3 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
|
||||
|
||||
// runClusterPatterns is the CLI entry point.
|
||||
func runClusterPatterns(dbPath string, cfg ClusterConfig, persist, formatJSON bool) {
|
||||
db, err := openDB(dbPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "open db: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
clusters, err := ClusterPatterns(db.conn, cfg)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "cluster: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if persist {
|
||||
if err := UpsertPatterns(db.conn, clusters); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "persist patterns: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
if formatJSON {
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
enc.Encode(clusters)
|
||||
return
|
||||
}
|
||||
|
||||
tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
|
||||
fmt.Fprintf(tw, "=== %d pattern clusters (min %d occurrences, last %dd) ===\n",
|
||||
len(clusters), cfg.MinOccurrences, cfg.LookbackDays)
|
||||
if len(clusters) == 0 {
|
||||
fmt.Fprintln(tw, "(no clusters found)")
|
||||
tw.Flush()
|
||||
return
|
||||
}
|
||||
fmt.Fprintln(tw, "HASH\tOCCURRENCES\tSESSIONS\tFIRST_SEEN\tLAST_SEEN\tSNIPPET")
|
||||
for _, c := range clusters {
|
||||
snippet := strings.ReplaceAll(c.Representative, "\n", "\\n")
|
||||
if len(snippet) > 70 {
|
||||
snippet = snippet[:70] + "..."
|
||||
}
|
||||
fmt.Fprintf(tw, "%s\t%d\t%d\t%d\t%d\t%s\n",
|
||||
c.Hash, c.Occurrences, len(c.SessionIDs),
|
||||
c.FirstSeen, c.LastSeen, snippet)
|
||||
}
|
||||
tw.Flush()
|
||||
|
||||
if persist {
|
||||
fmt.Println()
|
||||
fmt.Printf("upserted %d clusters into patterns table\n", len(clusters))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
package main
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestNormalizeSnippet(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{"sqlite3 /tmp/x.db \"SELECT 42\"", "sqlite3 /PATH \"STR\""},
|
||||
{"grep -n 'foo' line 123", "grep -n 'STR' line N"},
|
||||
{" multi space ", "multi space"},
|
||||
{"sha=ab12cd34ef56", "sha=HEX"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := normalizeSnippet(c.in)
|
||||
if got != c.want {
|
||||
t.Errorf("normalize(%q)=%q want %q", c.in, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHashSnippetStable(t *testing.T) {
|
||||
a := hashSnippet(normalizeSnippet("ls /tmp/foo"))
|
||||
b := hashSnippet(normalizeSnippet("ls /tmp/bar"))
|
||||
if a != b {
|
||||
t.Errorf("expected same hash after normalization: %s vs %s", a, b)
|
||||
}
|
||||
c := hashSnippet(normalizeSnippet("cat /tmp/foo"))
|
||||
if a == c {
|
||||
t.Errorf("expected different hash for different commands")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitCSV(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"a,b,c", 3},
|
||||
{" a , b , c ", 3},
|
||||
{"", 0},
|
||||
{"a,,b", 2},
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := splitCSV(c.in)
|
||||
if len(got) != c.want {
|
||||
t.Errorf("splitCSV(%q) len=%d want %d (%v)", c.in, len(got), c.want, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -62,6 +62,19 @@ func main() {
|
||||
MinSuccessRate: 0.9,
|
||||
}
|
||||
runSequences(resolveDB(*dbPath), *detect, *report, *propose, *formatJSON, *root, cfg)
|
||||
case "cluster-patterns":
|
||||
minOcc := fs.Int("min-occurrences", 3, "Minimum occurrences to keep a cluster.")
|
||||
lookback := fs.Int("lookback-days", 30, "How many days of calls to scan.")
|
||||
persist := fs.Bool("persist", false, "Upsert clusters into the patterns table.")
|
||||
formatJSON := fs.Bool("json", false, "Output JSON instead of text.")
|
||||
toolsCSV := fs.String("tools", "heredoc_py,heredoc_bash,sqlite_direct,bash_other", "Comma-separated tool_used values to consider.")
|
||||
fs.Parse(os.Args[2:])
|
||||
tools := splitCSV(*toolsCSV)
|
||||
runClusterPatterns(resolveDB(*dbPath), ClusterConfig{
|
||||
MinOccurrences: *minOcc,
|
||||
LookbackDays: *lookback,
|
||||
Tools: tools,
|
||||
}, *persist, *formatJSON)
|
||||
case "-h", "--help", "help":
|
||||
usage()
|
||||
default:
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user