package infra

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
// for every regular file found, skipping:
//   - vault_index.db, vault_index.db-shm, vault_index.db-wal
//   - .git/ directories at any depth
//   - hidden files/dirs (names starting with ".") at the vault root level only
//   - entries that are neither directories nor regular files (symlinks, devices,
//     sockets, named pipes)
//
// The vault root directory itself is never subject to the skip rules, so a vault
// that lives in a hidden directory (for example "~/.vault" or ".") is scanned normally.
//
// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
// vaultID and vaultName are copied verbatim into every returned VaultFile.
//
// MIME detection priority:
//  1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
//  2. http.DetectContentType on first 512 bytes (magic bytes, stdlib)
//
// Any error reported by the walk, or by stat/hash/MIME detection of a single file,
// aborts the scan; the returned slice is nil in that case.
//
// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
// for upload validation, not for open-ended inventory scanning where any MIME is valid.
// http.DetectContentType provides the same magic-byte detection without the allowlist
// coupling and handles a broader set of formats including text/plain for CSV fallback.
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
	var files []VaultFile

	err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// The first callback is for the walk root. Its name must not trigger the
		// ".git"/hidden rules below, otherwise a vault stored in a dot-directory
		// would be skipped wholesale and reported as empty.
		if path == vaultPath && d.IsDir() {
			return nil
		}

		name := d.Name()

		// Skip .git directories at any depth.
		if d.IsDir() && name == ".git" {
			return filepath.SkipDir
		}

		rel, err := filepath.Rel(vaultPath, path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
		}
		rel = filepath.ToSlash(rel)

		// Skip hidden entries (names starting with ".") at vault root only.
		// At root level the relative path has no separator.
		if strings.HasPrefix(name, ".") && !strings.Contains(rel, "/") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}

		if d.IsDir() {
			return nil
		}

		// Only regular files are inventoried. Hashing a named pipe would block
		// forever and a symlink to a directory would fail the whole scan.
		if !d.Type().IsRegular() {
			return nil
		}

		// Skip vault_index.db and its WAL/SHM sidecar files.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
		}

		// Compute sha256 by streaming — avoids loading large files into memory.
		sha, err := fileSha256(path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
		}

		mime, err := detectVaultFileMime(path, name)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
		}

		bucket, subBucket := vaultBucketParts(rel)
		files = append(files, VaultFile{
			VaultID:   vaultID,
			VaultName: vaultName,
			RelPath:   rel,
			Size:      info.Size(),
			Mtime:     info.ModTime().UTC().Unix(),
			Sha256:    sha,
			Mime:      mime,
			Ext:       strings.ToLower(filepath.Ext(name)),
			Bucket:    bucket,
			SubBucket: subBucket,
		})
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
	}

	sort.Slice(files, func(i, j int) bool {
		return files[i].RelPath < files[j].RelPath
	})
	return files, nil
}

// fileSha256 computes the hex-lowercase SHA-256 of the file at path by streaming
// its contents through the hash, so memory use does not grow with file size.
// It returns the open or read error unchanged when the file cannot be processed.
func fileSha256(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}

// detectVaultFileMime returns the MIME type for a vault file.
// Extension overrides take priority; otherwise http.DetectContentType is used.
func detectVaultFileMime(path, name string) (string, error) {
	// Extension overrides are decided from the name alone; the file is not
	// opened for these types.
	switch strings.ToLower(filepath.Ext(name)) {
	case ".csv":
		return "text/csv", nil
	case ".md":
		return "text/markdown", nil
	case ".parquet":
		return "application/parquet", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// http.DetectContentType considers at most the first 512 bytes. io.ReadFull
	// keeps reading until the buffer is full or the file ends, so a short read
	// cannot truncate the sniffing window. io.EOF (empty file) and
	// io.ErrUnexpectedEOF (file shorter than 512 bytes) are expected outcomes.
	buf := make([]byte, 512)
	n, err := io.ReadFull(f, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	return http.DetectContentType(buf[:n]), nil
}

// vaultBucketParts extracts the top-level bucket (for example "data" or
// "knowledge") and the second-level sub-bucket from a forward-slash relative
// file path.
//
// Only directory components are reported; the final path element is the file
// name and is never returned as a bucket or sub-bucket:
//
//	"readme.txt"            → "", ""          (file at vault root)
//	"data/file.csv"         → "data", ""      (file directly inside a bucket)
//	"data/raw/2024/f.csv"   → "data", "raw"
//
// The bucket name is returned as-is; it is not checked against a list of
// known buckets.
func vaultBucketParts(relPath string) (bucket, subBucket string) {
	parts := strings.SplitN(relPath, "/", 3)
	switch len(parts) {
	case 3:
		// bucket/sub-bucket/rest-of-path
		return parts[0], parts[1]
	case 2:
		// bucket/file — no sub-bucket directory.
		return parts[0], ""
	default:
		// A single component is a file at the vault root.
		return "", ""
	}
}