47fac22230
- .claude/CLAUDE.md - .claude/commands/subagentes.md - .claude/rules/INDEX.md - .mcp.json - bash/functions/cybersecurity/analyze_dns.md - bash/functions/cybersecurity/audit_http_headers.md - bash/functions/cybersecurity/audit_ssh_config.md - bash/functions/cybersecurity/check_firewall.md - bash/functions/cybersecurity/detect_suspicious_users.md - bash/functions/cybersecurity/encrypt_file.md - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
342 lines
8.8 KiB
Go
342 lines
8.8 KiB
Go
package infra
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"database/sql"
|
|
"fmt"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// CopiedCodeEntry describes one function found in an app file whose
// normalized body matches the normalized body of a registry function.
// The MVP auditor only ever emits entries with Kind "exact_copy".
type CopiedCodeEntry struct {
	// AppFile is the path of the scanned file that contains the function.
	AppFile string `json:"app_file"`
	// AppFunction is the name of the function as declared in AppFile.
	AppFunction string `json:"app_function"`
	// RegistryID is the id of the registry function that was matched.
	RegistryID string `json:"registry_id"`
	// BodyHash is the hash of the normalized function body.
	BodyHash string `json:"body_hash"`
	// Similarity is the match score; exact copies carry 1.0.
	Similarity float64 `json:"similarity"`
	// Kind is one of: exact_copy | near_copy | partial_match.
	Kind string `json:"kind"`
}
|
|
|
|
// AuditCopiedCode walks apps/ and projects/*/apps/, extracts function
|
|
// declarations per language, computes a normalized body hash (strip
|
|
// comments + collapse whitespace), and matches against fingerprints
|
|
// built from registry.db.functions.code.
|
|
//
|
|
// MVP scope:
|
|
// - Languages: go, py, bash, ts, cpp
|
|
// - Match level: exact_copy only (similarity = 1.0)
|
|
// - Skips paths whose own file_path is the registry function (a function
|
|
// is not a copy of itself).
|
|
//
|
|
// Returns the list of suspected copies. Does NOT write to any DB.
|
|
// Persistence is the caller's responsibility (e.g. call_monitor).
|
|
func AuditCopiedCode(registryRoot string) ([]CopiedCodeEntry, error) {
|
|
dbPath := filepath.Join(registryRoot, "registry.db")
|
|
db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("audit_copied_code: open db: %w", err)
|
|
}
|
|
defer db.Close()
|
|
|
|
// Build fingerprint index: normalized_hash -> [registry_id ...]
|
|
// and skip-set: file_path -> registry_id (so we don't flag the function
|
|
// as a copy of itself).
|
|
rows, err := db.Query("SELECT id, name, lang, code, file_path FROM functions WHERE code != ''")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("audit_copied_code: query functions: %w", err)
|
|
}
|
|
defer rows.Close()
|
|
|
|
fpIndex := map[string][]string{}
|
|
registryFilePaths := map[string]struct{}{}
|
|
for rows.Next() {
|
|
var id, name, lang, code, filePath string
|
|
if err := rows.Scan(&id, &name, &lang, &code, &filePath); err != nil {
|
|
return nil, err
|
|
}
|
|
if filePath != "" {
|
|
registryFilePaths[filePath] = struct{}{}
|
|
}
|
|
// If the `code` column contains a whole module (e.g. Python file
|
|
// with multiple defs), only index the function declaration whose
|
|
// name matches the registry entry. Otherwise (single-fn files
|
|
// like Go), hash the whole body. Both paths yield a single
|
|
// normalized hash mapped to this id.
|
|
decls := extractFunctions(code, lang)
|
|
var fnBody string
|
|
if len(decls) > 1 {
|
|
for _, d := range decls {
|
|
if d.Name == name {
|
|
fnBody = d.Body
|
|
break
|
|
}
|
|
}
|
|
} else if len(decls) == 1 {
|
|
fnBody = decls[0].Body
|
|
} else {
|
|
fnBody = code
|
|
}
|
|
if fnBody == "" {
|
|
continue
|
|
}
|
|
h := normalizedBodyHash(fnBody, lang)
|
|
if h == "" {
|
|
continue
|
|
}
|
|
fpIndex[h] = append(fpIndex[h], id)
|
|
}
|
|
if err := rows.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Walk app trees
|
|
scanDirs := []string{filepath.Join(registryRoot, "apps")}
|
|
if entries, err := os.ReadDir(filepath.Join(registryRoot, "projects")); err == nil {
|
|
for _, p := range entries {
|
|
if !p.IsDir() {
|
|
continue
|
|
}
|
|
scanDirs = append(scanDirs, filepath.Join(registryRoot, "projects", p.Name(), "apps"))
|
|
}
|
|
}
|
|
|
|
var out []CopiedCodeEntry
|
|
for _, d := range scanDirs {
|
|
_ = filepath.WalkDir(d, func(path string, dirent fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
if dirent.IsDir() {
|
|
if shouldSkipDir(dirent.Name()) {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
lang := langFromExt(path)
|
|
if lang == "" {
|
|
return nil
|
|
}
|
|
rel, _ := filepath.Rel(registryRoot, path)
|
|
// Don't audit the registry function file itself
|
|
if _, isRegistry := registryFilePaths[rel]; isRegistry {
|
|
return nil
|
|
}
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
funcs := extractFunctions(string(data), lang)
|
|
for _, fn := range funcs {
|
|
h := normalizedBodyHash(fn.Body, lang)
|
|
if h == "" {
|
|
continue
|
|
}
|
|
if matches, ok := fpIndex[h]; ok {
|
|
for _, rid := range matches {
|
|
out = append(out, CopiedCodeEntry{
|
|
AppFile: rel,
|
|
AppFunction: fn.Name,
|
|
RegistryID: rid,
|
|
BodyHash: h,
|
|
Similarity: 1.0,
|
|
Kind: "exact_copy",
|
|
})
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// ---- Helpers (unexported) ----
|
|
|
|
// shouldSkipDir reports whether a directory with the given base name is
// excluded from the walk. The comparison is exact and case-sensitive.
func shouldSkipDir(name string) bool {
	excluded := [...]string{
		".git", ".venv", "node_modules", "__pycache__", "build",
		"dist", "vendor", ".pytest_cache", ".cache", "_vendored",
	}
	for _, candidate := range excluded {
		if candidate == name {
			return true
		}
	}
	return false
}
|
|
|
|
// langFromExt maps a file path to the audit's language key based on its
// extension (compared case-insensitively). It returns "" for extensions
// that are not audited.
func langFromExt(path string) string {
	known := [...]struct {
		ext  string
		lang string
	}{
		{".go", "go"},
		{".py", "py"},
		{".sh", "bash"},
		{".ts", "ts"},
		{".tsx", "ts"},
		{".cpp", "cpp"},
		{".cc", "cpp"},
		{".cxx", "cpp"},
	}
	ext := strings.ToLower(filepath.Ext(path))
	for _, entry := range known {
		if entry.ext == ext {
			return entry.lang
		}
	}
	return ""
}
|
|
|
|
func normalizedBodyHash(code, lang string) string {
|
|
norm := stripCommentsAndWhitespace(code, lang)
|
|
if len(norm) < 20 {
|
|
// Skip trivial bodies: too short to be meaningful match.
|
|
return ""
|
|
}
|
|
h := sha256.Sum256([]byte(norm))
|
|
return fmt.Sprintf("%x", h)[:16]
|
|
}
|
|
|
|
func stripCommentsAndWhitespace(s, lang string) string {
|
|
switch lang {
|
|
case "go", "ts", "cpp":
|
|
s = stripCStyleComments(s)
|
|
case "py":
|
|
s = stripPythonDocstrings(s)
|
|
s = stripHashComments(s)
|
|
case "bash":
|
|
s = stripHashComments(s)
|
|
}
|
|
return collapseWhitespace(s)
|
|
}
|
|
|
|
// Patterns used to normalize source text before hashing. They are purely
// textual: none of them knows about string literals.
var (
	// reCStyleMulti matches a /* ... */ span, across lines, non-greedy.
	reCStyleMulti = regexp.MustCompile(`(?s)/\*.*?\*/`)
	// reCStyleSingle matches from // up to (not including) the newline.
	reCStyleSingle = regexp.MustCompile(`//[^\n]*`)
	// reTripleQuote matches a """...""" or '''...''' span, across lines.
	reTripleQuote = regexp.MustCompile(`(?s)"""[\s\S]*?"""|'''[\s\S]*?'''`)
	// reHashLine matches from '#' to the end of that line.
	reHashLine = regexp.MustCompile(`(?m)#.*$`)
	// reWS matches a run of one or more whitespace characters.
	reWS = regexp.MustCompile(`\s+`)
)
|
|
|
|
func stripCStyleComments(s string) string {
|
|
s = reCStyleMulti.ReplaceAllString(s, " ")
|
|
s = reCStyleSingle.ReplaceAllString(s, " ")
|
|
return s
|
|
}
|
|
|
|
func stripHashComments(s string) string {
|
|
return reHashLine.ReplaceAllString(s, "")
|
|
}
|
|
|
|
func stripPythonDocstrings(s string) string {
|
|
return reTripleQuote.ReplaceAllString(s, " ")
|
|
}
|
|
|
|
func collapseWhitespace(s string) string {
|
|
return strings.TrimSpace(reWS.ReplaceAllString(s, " "))
|
|
}
|
|
|
|
// fnDecl is one function declaration pulled out of a source text.
type fnDecl struct {
	// Name is the declared function name.
	Name string
	// Body is the extracted source text of the function.
	Body string
}
|
|
|
|
// extractFunctions extracts function declarations from source.
|
|
// MVP: regex-based, naive brace matching. Misses nested closures, methods
|
|
// with complex receivers, multi-line signatures with embedded {.
|
|
func extractFunctions(src, lang string) []fnDecl {
|
|
switch lang {
|
|
case "go":
|
|
return extractBracedFunctions(src, regexp.MustCompile(`(?m)^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(`))
|
|
case "bash":
|
|
return extractBracedFunctions(src, regexp.MustCompile(`(?m)^(\w[\w_]*)\s*\(\s*\)\s*\{`))
|
|
case "ts":
|
|
return extractBracedFunctions(src, regexp.MustCompile(`(?m)(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(`))
|
|
case "cpp":
|
|
return extractBracedFunctions(src, regexp.MustCompile(`(?m)^[\w:][\w:\s\*&<>,]*\s+(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:noexcept\s*)?\{`))
|
|
case "py":
|
|
return extractPythonFunctions(src)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// extractBracedFunctions matches a name via regex, then finds the body by
|
|
// scanning to the next '{' and balancing braces. Crude: ignores strings,
|
|
// comments, and char literals — false positives possible on weird code.
|
|
func extractBracedFunctions(src string, re *regexp.Regexp) []fnDecl {
|
|
var out []fnDecl
|
|
matches := re.FindAllStringSubmatchIndex(src, -1)
|
|
for _, m := range matches {
|
|
name := src[m[2]:m[3]]
|
|
// Find first '{' after match end
|
|
start := -1
|
|
for i := m[1]; i < len(src); i++ {
|
|
if src[i] == '{' {
|
|
start = i
|
|
break
|
|
}
|
|
// Bail out if newline-newline encountered (not a function with body)
|
|
if i+1 < len(src) && src[i] == '\n' && src[i+1] == '\n' {
|
|
break
|
|
}
|
|
}
|
|
if start == -1 {
|
|
continue
|
|
}
|
|
depth := 0
|
|
end := -1
|
|
for i := start; i < len(src); i++ {
|
|
switch src[i] {
|
|
case '{':
|
|
depth++
|
|
case '}':
|
|
depth--
|
|
if depth == 0 {
|
|
end = i + 1
|
|
}
|
|
}
|
|
if end != -1 {
|
|
break
|
|
}
|
|
}
|
|
if end == -1 {
|
|
continue
|
|
}
|
|
out = append(out, fnDecl{Name: name, Body: src[start:end]})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// extractPythonFunctions uses indentation: def name(...): ... until dedent.
|
|
func extractPythonFunctions(src string) []fnDecl {
|
|
var out []fnDecl
|
|
lines := strings.Split(src, "\n")
|
|
re := regexp.MustCompile(`^(\s*)def\s+(\w+)\s*\(`)
|
|
for i, line := range lines {
|
|
m := re.FindStringSubmatch(line)
|
|
if m == nil {
|
|
continue
|
|
}
|
|
defIndent := len(m[1])
|
|
name := m[2]
|
|
bodyLines := []string{line}
|
|
for j := i + 1; j < len(lines); j++ {
|
|
l := lines[j]
|
|
if strings.TrimSpace(l) == "" {
|
|
bodyLines = append(bodyLines, l)
|
|
continue
|
|
}
|
|
indent := 0
|
|
for _, c := range l {
|
|
if c == ' ' || c == '\t' {
|
|
indent++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
if indent <= defIndent {
|
|
break
|
|
}
|
|
bodyLines = append(bodyLines, l)
|
|
}
|
|
out = append(out, fnDecl{Name: name, Body: strings.Join(bodyLines, "\n")})
|
|
}
|
|
return out
|
|
}
|