Files
egutierrez 47fac22230 chore: auto-commit (799 archivos)
- .claude/CLAUDE.md
- .claude/commands/subagentes.md
- .claude/rules/INDEX.md
- .mcp.json
- bash/functions/cybersecurity/analyze_dns.md
- bash/functions/cybersecurity/audit_http_headers.md
- bash/functions/cybersecurity/audit_ssh_config.md
- bash/functions/cybersecurity/check_firewall.md
- bash/functions/cybersecurity/detect_suspicious_users.md
- bash/functions/cybersecurity/encrypt_file.md
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 00:28:20 +02:00

342 lines
8.8 KiB
Go

package infra
import (
"crypto/sha256"
"database/sql"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
_ "github.com/mattn/go-sqlite3"
)
// CopiedCodeEntry describes one suspected copy: a function found in an app
// file whose normalized body hashes to the same value as a registry function.
// The MVP auditor only emits entries whose Kind is "exact_copy".
type CopiedCodeEntry struct {
	// AppFile is the path of the scanned file, relative to the registry root.
	AppFile string `json:"app_file"`
	// AppFunction is the name of the matching function inside AppFile.
	AppFunction string `json:"app_function"`
	// RegistryID is the id of the registry function whose body matched.
	RegistryID string `json:"registry_id"`
	// BodyHash is the truncated SHA-256 hex digest of the normalized body.
	BodyHash string `json:"body_hash"`
	// Similarity is the match score; exact copies are reported as 1.0.
	Similarity float64 `json:"similarity"`
	// Kind classifies the match: exact_copy | near_copy | partial_match.
	Kind string `json:"kind"`
}
// AuditCopiedCode scans application source trees for functions whose
// normalized body is identical to a function stored in the registry.
//
// It reads the functions table of <registryRoot>/registry.db, builds an index
// of normalized body hashes (comments stripped, whitespace collapsed), then
// walks <registryRoot>/apps and <registryRoot>/projects/*/apps, extracting
// function declarations per language and looking each one up in that index.
//
// MVP scope:
//   - Languages: go, py, bash, ts, cpp
//   - Match level: exact_copy only (similarity = 1.0)
//   - Files whose path relative to registryRoot equals a registry function's
//     file_path are skipped (a function is not a copy of itself).
//
// It returns the suspected copies (nil when nothing matched). Results are not
// persisted here; that is the caller's responsibility (e.g. call_monitor).
// Failures opening or reading the registry database are returned wrapped with
// an "audit_copied_code:" prefix. Unreadable directories or files inside the
// scanned trees are skipped silently, since the walk is best-effort.
func AuditCopiedCode(registryRoot string) ([]CopiedCodeEntry, error) {
	dbPath := filepath.Join(registryRoot, "registry.db")
	db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL")
	if err != nil {
		return nil, fmt.Errorf("audit_copied_code: open db: %w", err)
	}
	defer db.Close()

	fpIndex, registryFilePaths, err := loadRegistryFingerprints(db)
	if err != nil {
		return nil, err
	}

	// Collect the app trees to walk: the top-level apps/ directory plus the
	// apps/ directory of every project. A missing projects/ directory is fine.
	scanDirs := []string{filepath.Join(registryRoot, "apps")}
	if entries, err := os.ReadDir(filepath.Join(registryRoot, "projects")); err == nil {
		for _, p := range entries {
			if p.IsDir() {
				scanDirs = append(scanDirs, filepath.Join(registryRoot, "projects", p.Name(), "apps"))
			}
		}
	}

	var out []CopiedCodeEntry
	for _, d := range scanDirs {
		// The callback only ever returns nil or filepath.SkipDir, so WalkDir
		// itself cannot report an error here; discarding it is deliberate.
		_ = filepath.WalkDir(d, func(path string, dirent fs.DirEntry, walkErr error) error {
			if walkErr != nil {
				// Missing root or unreadable entry: skip it and keep going.
				return nil
			}
			if dirent.IsDir() {
				if shouldSkipDir(dirent.Name()) {
					return filepath.SkipDir
				}
				return nil
			}
			lang := langFromExt(path)
			if lang == "" {
				return nil
			}
			rel, relErr := filepath.Rel(registryRoot, path)
			if relErr != nil {
				// Never report an empty file name; fall back to the walked path.
				rel = path
			}
			// Don't audit the registry function file itself.
			if _, isRegistry := registryFilePaths[rel]; isRegistry {
				return nil
			}
			data, readErr := os.ReadFile(path)
			if readErr != nil {
				return nil
			}
			for _, fn := range extractFunctions(string(data), lang) {
				h := normalizedBodyHash(fn.Body, lang)
				if h == "" {
					continue
				}
				// Ranging over a missing key yields no iterations.
				for _, rid := range fpIndex[h] {
					out = append(out, CopiedCodeEntry{
						AppFile:     rel,
						AppFunction: fn.Name,
						RegistryID:  rid,
						BodyHash:    h,
						Similarity:  1.0,
						Kind:        "exact_copy",
					})
				}
			}
			return nil
		})
	}
	return out, nil
}

// loadRegistryFingerprints reads every registry function with non-empty code
// and returns the fingerprint index (normalized body hash -> registry ids)
// together with the set of file paths that belong to registry functions.
func loadRegistryFingerprints(db *sql.DB) (map[string][]string, map[string]struct{}, error) {
	rows, err := db.Query("SELECT id, name, lang, code, file_path FROM functions WHERE code != ''")
	if err != nil {
		return nil, nil, fmt.Errorf("audit_copied_code: query functions: %w", err)
	}
	defer rows.Close()

	fpIndex := map[string][]string{}
	registryFilePaths := map[string]struct{}{}
	for rows.Next() {
		var id, name, lang, code, filePath string
		if err := rows.Scan(&id, &name, &lang, &code, &filePath); err != nil {
			return nil, nil, fmt.Errorf("audit_copied_code: scan function row: %w", err)
		}
		if filePath != "" {
			registryFilePaths[filePath] = struct{}{}
		}

		// The code column may hold a whole module (e.g. a Python file with
		// several defs). With several declarations only the one named like
		// the registry entry is indexed; with exactly one, that one is used;
		// with none, the raw code is hashed as-is.
		decls := extractFunctions(code, lang)
		fnBody := code
		switch {
		case len(decls) == 1:
			fnBody = decls[0].Body
		case len(decls) > 1:
			fnBody = ""
			for _, d := range decls {
				if d.Name == name {
					fnBody = d.Body
					break
				}
			}
		}
		if fnBody == "" {
			continue
		}
		h := normalizedBodyHash(fnBody, lang)
		if h == "" {
			continue
		}
		fpIndex[h] = append(fpIndex[h], id)
	}
	if err := rows.Err(); err != nil {
		return nil, nil, fmt.Errorf("audit_copied_code: iterate functions: %w", err)
	}
	return fpIndex, registryFilePaths, nil
}
// ---- Helpers (unexported) ----
// shouldSkipDir reports whether a directory with the given base name is
// excluded from the audit walk. The comparison is exact and case-sensitive.
func shouldSkipDir(name string) bool {
	excluded := [...]string{
		".git",
		".venv",
		"node_modules",
		"__pycache__",
		"build",
		"dist",
		"vendor",
		".pytest_cache",
		".cache",
		"_vendored",
	}
	for _, candidate := range excluded {
		if candidate == name {
			return true
		}
	}
	return false
}
// langFromExt maps a file path to the audit language key based on its
// extension (compared case-insensitively). It returns "" for extensions the
// auditor does not handle, including paths with no extension.
func langFromExt(path string) string {
	ext := strings.ToLower(filepath.Ext(path))
	lang := ""
	switch ext {
	case ".cpp", ".cc", ".cxx":
		lang = "cpp"
	case ".ts", ".tsx":
		lang = "ts"
	case ".sh":
		lang = "bash"
	case ".py":
		lang = "py"
	case ".go":
		lang = "go"
	}
	return lang
}
// normalizedBodyHash returns a short fingerprint of code for the given
// language: the first 16 hex characters (8 bytes) of the SHA-256 digest of
// the comment-stripped, whitespace-collapsed text. Bodies whose normalized
// form is shorter than 20 bytes yield "" because they are too trivial to be
// a meaningful match.
func normalizedBodyHash(code, lang string) string {
	const minNormalizedLen = 20

	normalized := stripCommentsAndWhitespace(code, lang)
	if len(normalized) >= minNormalizedLen {
		digest := sha256.Sum256([]byte(normalized))
		// Formatting the first 8 bytes yields exactly 16 hex characters.
		return fmt.Sprintf("%x", digest[:8])
	}
	return ""
}
func stripCommentsAndWhitespace(s, lang string) string {
switch lang {
case "go", "ts", "cpp":
s = stripCStyleComments(s)
case "py":
s = stripPythonDocstrings(s)
s = stripHashComments(s)
case "bash":
s = stripHashComments(s)
}
return collapseWhitespace(s)
}
// Patterns used to normalize source text before hashing. They are purely
// textual: none of them is aware of string literals.

// reCStyleSingle matches "//" and everything up to the end of the line.
var reCStyleSingle = regexp.MustCompile(`//[^\n]*`)

// reCStyleMulti matches the shortest "/* ... */" span, across lines.
var reCStyleMulti = regexp.MustCompile(`(?s)/\*.*?\*/`)

// reHashLine matches "#" and everything up to the end of the line.
var reHashLine = regexp.MustCompile(`(?m)#.*$`)

// reTripleQuote matches the shortest triple-quoted span, for either quote
// character, across lines.
var reTripleQuote = regexp.MustCompile(`(?s)"""[\s\S]*?"""|'''[\s\S]*?'''`)

// reWS matches one or more consecutive whitespace characters.
var reWS = regexp.MustCompile(`\s+`)
// stripCStyleComments replaces every "/* ... */" span and then every "//"
// line comment with a single space. Block comments are removed first.
func stripCStyleComments(s string) string {
	withoutBlocks := reCStyleMulti.ReplaceAllString(s, " ")
	return reCStyleSingle.ReplaceAllString(withoutBlocks, " ")
}
// stripHashComments deletes every "#" and the text following it up to the end
// of its line. The line break itself is kept.
func stripHashComments(s string) string {
	const nothing = ""
	stripped := reHashLine.ReplaceAllString(s, nothing)
	return stripped
}
// stripPythonDocstrings replaces every triple-quoted span (with either quote
// character) by a single space. It is purely textual, so any triple-quoted
// string is removed, not only docstrings.
func stripPythonDocstrings(s string) string {
	const blank = " "
	cleaned := reTripleQuote.ReplaceAllString(s, blank)
	return cleaned
}
// collapseWhitespace turns every run of whitespace into a single space and
// trims leading and trailing whitespace from the result.
func collapseWhitespace(s string) string {
	squeezed := reWS.ReplaceAllString(s, " ")
	return strings.TrimSpace(squeezed)
}
// fnDecl is one function declaration pulled out of a source file: Name is the
// identifier captured by the language's declaration pattern and Body is the
// slice of source text that gets hashed for that function.
type fnDecl struct {
	Name, Body string
}
// Declaration patterns for the brace-delimited languages. They are compiled
// once here because extractFunctions runs for every registry row and for
// every scanned file. Capture group 1 is the function name in each pattern.
var (
	reGoFuncDecl   = regexp.MustCompile(`(?m)^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(`)
	reBashFuncDecl = regexp.MustCompile(`(?m)^(\w[\w_]*)\s*\(\s*\)\s*\{`)
	reTSFuncDecl   = regexp.MustCompile(`(?m)(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(`)
	reCppFuncDecl  = regexp.MustCompile(`(?m)^[\w:][\w:\s\*&<>,]*\s+(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:noexcept\s*)?\{`)
)

// extractFunctions extracts function declarations from src for the given
// language key ("go", "bash", "ts", "cpp" or "py"). Any other key yields nil.
//
// MVP: regex-based, naive brace matching. Misses nested closures, methods
// with complex receivers, multi-line signatures with embedded {.
func extractFunctions(src, lang string) []fnDecl {
	switch lang {
	case "go":
		return extractBracedFunctions(src, reGoFuncDecl)
	case "bash":
		return extractBracedFunctions(src, reBashFuncDecl)
	case "ts":
		return extractBracedFunctions(src, reTSFuncDecl)
	case "cpp":
		return extractBracedFunctions(src, reCppFuncDecl)
	case "py":
		return extractPythonFunctions(src)
	}
	return nil
}
// extractBracedFunctions finds function declarations in src using re, whose
// capture group 1 must be the function name, and returns each name together
// with its brace-delimited body (from the opening '{' through the matching
// '}', inclusive).
//
// The opening brace is located as follows:
//   - If the pattern itself ends on '{' (as the bash and cpp patterns do),
//     that brace is the body start. Searching past it would pick up an
//     unrelated inner brace such as the one in "${VAR}".
//   - Otherwise the text after the match is scanned for the first '{'. The
//     scan gives up at a blank line, treating the match as a declaration
//     without a body.
//
// Crude: brace balancing ignores strings, comments, and char literals, so
// false positives are possible on unusual code. Declarations whose braces
// never balance are dropped.
func extractBracedFunctions(src string, re *regexp.Regexp) []fnDecl {
	var out []fnDecl
	for _, m := range re.FindAllStringSubmatchIndex(src, -1) {
		// Skip matches without a usable name capture instead of panicking.
		if len(m) < 4 || m[2] < 0 {
			continue
		}
		name := src[m[2]:m[3]]

		start := -1
		if m[1] > m[0] && src[m[1]-1] == '{' {
			// The match already consumed the opening brace.
			start = m[1] - 1
		} else {
			for i := m[1]; i < len(src); i++ {
				if src[i] == '{' {
					start = i
					break
				}
				// A blank line before any '{' means there is no body here.
				if src[i] == '\n' && i+1 < len(src) && src[i+1] == '\n' {
					break
				}
			}
		}
		if start == -1 {
			continue
		}

		depth, end := 0, -1
	balance:
		for i := start; i < len(src); i++ {
			switch src[i] {
			case '{':
				depth++
			case '}':
				depth--
				if depth == 0 {
					end = i + 1
					break balance
				}
			}
		}
		if end == -1 {
			continue
		}
		out = append(out, fnDecl{Name: name, Body: src[start:end]})
	}
	return out
}
// rePyDef matches a line that starts (after optional whitespace) with
// "def name(". Group 1 is the leading whitespace, group 2 the function name.
// Lines starting with "async def" are not matched.
var rePyDef = regexp.MustCompile(`^(\s*)def\s+(\w+)\s*\(`)

// extractPythonFunctions extracts Python functions by indentation. For every
// "def" line it returns the name and a body made of that line plus all
// following lines until the first non-blank line indented no deeper than the
// def itself. Blank lines inside (and directly after) the function are kept.
//
// Signatures spread over several lines are handled by tracking bracket depth:
// while the brackets opened on the def line are still unclosed, lines are
// taken unconditionally, so a closing "):" at the def's own indentation does
// not end the function before its body. The bracket count is naive (it does
// not understand string literals); nested defs are returned as separate
// entries and also remain part of the enclosing function's body.
func extractPythonFunctions(src string) []fnDecl {
	var out []fnDecl
	lines := strings.Split(src, "\n")
	for i, line := range lines {
		m := rePyDef.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		defIndent := len(m[1])
		name := m[2]
		bodyLines := []string{line}
		depth := pyBracketDelta(line)
		for j := i + 1; j < len(lines); j++ {
			l := lines[j]
			if depth > 0 {
				// Still inside the signature's brackets.
				bodyLines = append(bodyLines, l)
				depth += pyBracketDelta(l)
				continue
			}
			if strings.TrimSpace(l) == "" {
				bodyLines = append(bodyLines, l)
				continue
			}
			indent := len(l) - len(strings.TrimLeft(l, " \t"))
			if indent <= defIndent {
				break
			}
			bodyLines = append(bodyLines, l)
		}
		out = append(out, fnDecl{Name: name, Body: strings.Join(bodyLines, "\n")})
	}
	return out
}

// pyBracketDelta returns opening minus closing brackets on line, counting
// (, [, { against ), ], } and ignoring everything from the first '#' on.
func pyBracketDelta(line string) int {
	delta := 0
	for i := 0; i < len(line); i++ {
		switch line[i] {
		case '#':
			return delta
		case '(', '[', '{':
			delta++
		case ')', ']', '}':
			delta--
		}
	}
	return delta
}