package infra

import (
	"crypto/sha256"
	"database/sql"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	_ "github.com/mattn/go-sqlite3"
)

// CopiedCodeEntry represents a function body in an app file that matches
// the normalized body of a registry function. MVP only emits exact_copy.
type CopiedCodeEntry struct {
	AppFile     string  `json:"app_file"`
	AppFunction string  `json:"app_function"`
	RegistryID  string  `json:"registry_id"`
	BodyHash    string  `json:"body_hash"`
	Similarity  float64 `json:"similarity"`
	Kind        string  `json:"kind"` // exact_copy | near_copy | partial_match
}

// AuditCopiedCode walks apps/ and projects/*/apps/, extracts function
// declarations per language, computes a normalized body hash (strip
// comments + collapse whitespace), and matches against fingerprints
// built from registry.db.functions.code.
//
// MVP scope:
//   - Languages: go, py, bash, ts, cpp
//   - Match level: exact_copy only (similarity = 1.0)
//   - Skips paths whose own file_path is the registry function (a function
//     is not a copy of itself).
//
// Returns the list of suspected copies. Does NOT write to any DB.
// Persistence is the caller's responsibility (e.g. call_monitor).
//
// Errors opening or reading the database are returned wrapped; problems
// encountered while walking the app trees (missing directories, unreadable
// files) are skipped on purpose so that one bad path does not abort the audit.
func AuditCopiedCode(registryRoot string) ([]CopiedCodeEntry, error) {
	dbPath := filepath.Join(registryRoot, "registry.db")
	db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL")
	if err != nil {
		return nil, fmt.Errorf("audit_copied_code: open db: %w", err)
	}
	defer db.Close()

	fpIndex, registryFiles, err := loadCopiedCodeFingerprints(db)
	if err != nil {
		return nil, err
	}

	var out []CopiedCodeEntry
	for _, dir := range copiedCodeScanDirs(registryRoot) {
		// Best-effort walk: the callback never returns an error other than
		// SkipDir, so the WalkDir result carries no information.
		_ = filepath.WalkDir(dir, func(path string, dirent fs.DirEntry, walkErr error) error {
			if walkErr != nil {
				// Missing root or unreadable directory: skip it.
				return nil
			}
			if dirent.IsDir() {
				if shouldSkipDir(dirent.Name()) {
					return filepath.SkipDir
				}
				return nil
			}
			lang := langFromExt(path)
			if lang == "" {
				return nil
			}
			rel, relErr := filepath.Rel(registryRoot, path)
			if relErr != nil {
				// Keep a usable file name in the report rather than "".
				rel = path
			}
			// Don't audit the registry function file itself.
			if _, isRegistry := registryFiles[copiedCodePathKey(rel)]; isRegistry {
				return nil
			}
			out = append(out, auditCopiedCodeFile(path, rel, lang, fpIndex)...)
			return nil
		})
	}
	return out, nil
}

// ---- Helpers (unexported) ----

// loadCopiedCodeFingerprints reads every registry function with non-empty
// code and returns the fingerprint index (normalized hash -> registry ids)
// together with the set of registry file paths, keyed by copiedCodePathKey,
// that must not be flagged as copies of themselves. The result set is closed
// before returning, so it is not held open during the filesystem walk.
func loadCopiedCodeFingerprints(db *sql.DB) (map[string][]string, map[string]struct{}, error) {
	rows, err := db.Query("SELECT id, name, lang, code, file_path FROM functions WHERE code != ''")
	if err != nil {
		return nil, nil, fmt.Errorf("audit_copied_code: query functions: %w", err)
	}
	defer rows.Close()

	fpIndex := map[string][]string{}
	registryFiles := map[string]struct{}{}
	for rows.Next() {
		var (
			id, code             string
			name, lang, filePath sql.NullString
		)
		// NOTE(review): the table schema is not visible here; NullString keeps
		// a NULL name/lang/file_path from aborting the whole audit.
		if err := rows.Scan(&id, &name, &lang, &code, &filePath); err != nil {
			return nil, nil, fmt.Errorf("audit_copied_code: scan function row: %w", err)
		}
		if filePath.String != "" {
			registryFiles[copiedCodePathKey(filePath.String)] = struct{}{}
		}
		body := selectRegistryBody(code, lang.String, name.String)
		if body == "" {
			continue
		}
		h := normalizedBodyHash(body, lang.String)
		if h == "" {
			continue
		}
		fpIndex[h] = append(fpIndex[h], id)
	}
	if err := rows.Err(); err != nil {
		return nil, nil, fmt.Errorf("audit_copied_code: iterate functions: %w", err)
	}
	return fpIndex, registryFiles, nil
}

// selectRegistryBody picks the text to fingerprint for one registry row.
// If the code column contains a whole module (several declarations), only
// the declaration whose name matches the registry entry is used, and "" is
// returned when none matches. A single declaration is used as is. When no
// declaration is recognized, the whole code text is used.
func selectRegistryBody(code, lang, name string) string {
	decls := extractFunctions(code, lang)
	switch {
	case len(decls) > 1:
		for _, d := range decls {
			if d.Name == name {
				return d.Body
			}
		}
		return ""
	case len(decls) == 1:
		return decls[0].Body
	default:
		return code
	}
}

// copiedCodeScanDirs lists the directories to audit: <root>/apps plus
// <root>/projects/<p>/apps for every directory p under <root>/projects.
// A missing or unreadable projects/ directory simply contributes nothing.
func copiedCodeScanDirs(registryRoot string) []string {
	dirs := []string{filepath.Join(registryRoot, "apps")}
	projects := filepath.Join(registryRoot, "projects")
	entries, err := os.ReadDir(projects)
	if err != nil {
		return dirs
	}
	for _, p := range entries {
		if p.IsDir() {
			dirs = append(dirs, filepath.Join(projects, p.Name(), "apps"))
		}
	}
	return dirs
}

// copiedCodePathKey normalizes a registry-relative path so that "./a/b.go",
// "a/b.go" and (on Windows) "a\b.go" all compare equal in the skip-set.
func copiedCodePathKey(p string) string {
	return filepath.ToSlash(filepath.Clean(p))
}

// auditCopiedCodeFile fingerprints every function found in the file at path
// and returns one entry per (function, matching registry id) pair. rel is the
// name reported in AppFile. Unreadable files yield no entries.
func auditCopiedCodeFile(path, rel, lang string, fpIndex map[string][]string) []CopiedCodeEntry {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil
	}
	var found []CopiedCodeEntry
	for _, fn := range extractFunctions(string(data), lang) {
		h := normalizedBodyHash(fn.Body, lang)
		if h == "" {
			continue
		}
		for _, rid := range fpIndex[h] {
			found = append(found, CopiedCodeEntry{
				AppFile:     rel,
				AppFunction: fn.Name,
				RegistryID:  rid,
				BodyHash:    h,
				Similarity:  1.0,
				Kind:        "exact_copy",
			})
		}
	}
	return found
}

// shouldSkipDir reports whether a directory with this base name is excluded
// from the walk (VCS metadata, virtualenvs, dependency and build output).
func shouldSkipDir(name string) bool {
	switch name {
	case ".git", ".venv", "node_modules", "__pycache__", "build", "dist", "vendor", ".pytest_cache", ".cache", "_vendored":
		return true
	}
	return false
}

// langFromExt maps a file extension (case-insensitive) to the language key
// used by extractFunctions and the comment strippers, or "" when the file
// type is not audited.
func langFromExt(path string) string {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".go":
		return "go"
	case ".py":
		return "py"
	case ".sh":
		return "bash"
	case ".ts", ".tsx":
		return "ts"
	case ".cpp", ".cc", ".cxx":
		return "cpp"
	}
	return ""
}

// normalizedBodyHash returns the first 16 hex characters of the SHA-256 of
// the comment-stripped, whitespace-collapsed code, or "" when the normalized
// text is shorter than 20 bytes.
func normalizedBodyHash(code, lang string) string {
	norm := stripCommentsAndWhitespace(code, lang)
	if len(norm) < 20 {
		// Skip trivial bodies: too short to be meaningful match.
		return ""
	}
	h := sha256.Sum256([]byte(norm))
	return fmt.Sprintf("%x", h)[:16]
}

// stripCommentsAndWhitespace removes comments according to lang and then
// collapses all whitespace runs to single spaces. Unknown languages only get
// the whitespace treatment. The stripping is regex-based and does not
// recognize string literals, so comment-looking text inside a string is
// removed as well; both sides of a comparison go through the same function,
// so equal sources still produce equal output.
func stripCommentsAndWhitespace(s, lang string) string {
	switch lang {
	case "go", "ts", "cpp":
		s = stripCStyleComments(s)
	case "py":
		s = stripPythonDocstrings(s)
		s = stripHashComments(s)
	case "bash":
		s = stripHashComments(s)
	}
	return collapseWhitespace(s)
}

var (
	reCStyleSingle = regexp.MustCompile(`//[^\n]*`)
	reCStyleMulti  = regexp.MustCompile(`(?s)/\*.*?\*/`)
	reHashLine     = regexp.MustCompile(`(?m)#.*$`)
	reTripleQuote  = regexp.MustCompile(`(?s)"""[\s\S]*?"""|'''[\s\S]*?'''`)
	reWS           = regexp.MustCompile(`\s+`)
)

// stripCStyleComments replaces /* ... */ blocks and then // line comments
// with a single space each.
func stripCStyleComments(s string) string {
	s = reCStyleMulti.ReplaceAllString(s, " ")
	s = reCStyleSingle.ReplaceAllString(s, " ")
	return s
}

// stripHashComments deletes everything from a '#' to the end of its line.
func stripHashComments(s string) string {
	return reHashLine.ReplaceAllString(s, "")
}

// stripPythonDocstrings replaces every triple-quoted string (not only
// docstrings) with a single space.
func stripPythonDocstrings(s string) string {
	return reTripleQuote.ReplaceAllString(s, " ")
}

// collapseWhitespace turns each whitespace run into one space and trims the
// ends.
func collapseWhitespace(s string) string {
	return strings.TrimSpace(reWS.ReplaceAllString(s, " "))
}

// fnDecl is one extracted function declaration: its name and the source
// text taken as its body.
type fnDecl struct {
	Name string
	Body string
}

// extractFunctions extracts function declarations from source.
// MVP: regex-based, naive brace matching. May mishandle nested closures,
// methods with complex receivers, and signatures that contain an embedded {.
func extractFunctions(src, lang string) []fnDecl { switch lang { case "go": return extractBracedFunctions(src, regexp.MustCompile(`(?m)^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(`)) case "bash": return extractBracedFunctions(src, regexp.MustCompile(`(?m)^(\w[\w_]*)\s*\(\s*\)\s*\{`)) case "ts": return extractBracedFunctions(src, regexp.MustCompile(`(?m)(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(`)) case "cpp": return extractBracedFunctions(src, regexp.MustCompile(`(?m)^[\w:][\w:\s\*&<>,]*\s+(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:noexcept\s*)?\{`)) case "py": return extractPythonFunctions(src) } return nil } // extractBracedFunctions matches a name via regex, then finds the body by // scanning to the next '{' and balancing braces. Crude: ignores strings, // comments, and char literals — false positives possible on weird code. func extractBracedFunctions(src string, re *regexp.Regexp) []fnDecl { var out []fnDecl matches := re.FindAllStringSubmatchIndex(src, -1) for _, m := range matches { name := src[m[2]:m[3]] // Find first '{' after match end start := -1 for i := m[1]; i < len(src); i++ { if src[i] == '{' { start = i break } // Bail out if newline-newline encountered (not a function with body) if i+1 < len(src) && src[i] == '\n' && src[i+1] == '\n' { break } } if start == -1 { continue } depth := 0 end := -1 for i := start; i < len(src); i++ { switch src[i] { case '{': depth++ case '}': depth-- if depth == 0 { end = i + 1 } } if end != -1 { break } } if end == -1 { continue } out = append(out, fnDecl{Name: name, Body: src[start:end]}) } return out } // extractPythonFunctions uses indentation: def name(...): ... until dedent. 
func extractPythonFunctions(src string) []fnDecl { var out []fnDecl lines := strings.Split(src, "\n") re := regexp.MustCompile(`^(\s*)def\s+(\w+)\s*\(`) for i, line := range lines { m := re.FindStringSubmatch(line) if m == nil { continue } defIndent := len(m[1]) name := m[2] bodyLines := []string{line} for j := i + 1; j < len(lines); j++ { l := lines[j] if strings.TrimSpace(l) == "" { bodyLines = append(bodyLines, l) continue } indent := 0 for _, c := range l { if c == ' ' || c == '\t' { indent++ } else { break } } if indent <= defIndent { break } bodyLines = append(bodyLines, l) } out = append(out, fnDecl{Name: name, Body: strings.Join(bodyLines, "\n")}) } return out }