Files
fn_registry/functions/infra/vault_inventory_scan.go
T
egutierrez e3c8979e8d chore: auto-commit (95 archivos)
- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 00:50:34 +02:00

175 lines
4.8 KiB
Go

package infra
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
)
// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
// for every regular file found, skipping:
//   - vault_index.db, vault_index.db-shm, vault_index.db-wal
//   - .git/ directories at any depth
//   - hidden files/dirs (names starting with ".") at the vault root level only
//   - anything that is neither a directory nor a regular file (symlinks, FIFOs,
//     sockets, devices)
//
// The name-based filters apply to entries *inside* the vault only. The vault root
// itself is always walked, even when vaultPath is "." or its base name starts with
// a dot (for example "~/.vault").
//
// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
// vaultID and vaultName are copied verbatim into every returned VaultFile.
//
// MIME detection priority:
//  1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
//  2. http.DetectContentType on first 512 bytes (magic bytes, stdlib)
//
// The first error encountered (walking, stat, hashing or MIME sniffing) aborts the
// scan; in that case the returned slice is nil.
//
// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
// for upload validation, not for open-ended inventory scanning where any MIME is valid.
// http.DetectContentType provides the same magic-byte detection without the allowlist
// coupling and handles a broader set of formats including text/plain for CSV fallback.
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
	var files []VaultFile
	err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}

		rel, err := filepath.Rel(vaultPath, path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
		}
		rel = filepath.ToSlash(rel)
		name := d.Name()

		// rel == "." identifies the walk root. Its name must not be filtered:
		// for vaultPath "." or ".vault" the root's own name starts with a dot,
		// and skipping it would silently produce an empty inventory.
		if rel != "." {
			// Skip .git directories at any depth.
			if d.IsDir() && name == ".git" {
				return filepath.SkipDir
			}
			// Skip hidden entries at vault root only; a root-level entry has
			// no separator in its relative path.
			if strings.HasPrefix(name, ".") && !strings.Contains(rel, "/") {
				if d.IsDir() {
					return filepath.SkipDir
				}
				return nil
			}
		}

		if d.IsDir() {
			return nil
		}
		// Only regular files are inventoried. Hashing a FIFO would block, and a
		// dangling symlink or a symlink to a directory would abort the whole scan.
		if !d.Type().IsRegular() {
			return nil
		}
		// Skip vault_index.db and its WAL/SHM sidecar files.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
		}
		// Compute sha256 by streaming — avoids loading large files into memory.
		sha, err := fileSha256(path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
		}
		mime, err := detectVaultFileMime(path, name)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
		}
		bucket, subBucket := vaultBucketParts(rel)

		files = append(files, VaultFile{
			VaultID:   vaultID,
			VaultName: vaultName,
			RelPath:   rel,
			Size:      info.Size(),
			Mtime:     info.ModTime().UTC().Unix(),
			Sha256:    sha,
			Mime:      mime,
			Ext:       strings.ToLower(filepath.Ext(name)),
			Bucket:    bucket,
			SubBucket: subBucket,
		})
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
	}
	// WalkDir already visits entries in lexical order per directory, but that is
	// not the same as ordering by full slash-separated path, so sort explicitly.
	sort.Slice(files, func(i, j int) bool {
		return files[i].RelPath < files[j].RelPath
	})
	return files, nil
}
// fileSha256 streams the file at path through SHA-256 and returns the digest
// as a lowercase hex string. The file is never held in memory as a whole.
// Any error from opening or reading the file is returned unwrapped, together
// with an empty string.
func fileSha256(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	digest := sha256.New()
	_, copyErr := io.Copy(digest, src)
	if copyErr != nil {
		return "", copyErr
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum), nil
}
// detectVaultFileMime returns the MIME type for the vault file at path.
//
// name is the file's base name. Its lower-cased extension is checked first:
// .csv, .md and .parquet map to fixed MIME types and the file is not opened
// at all. For every other extension up to the first 512 bytes of the file are
// read and passed to http.DetectContentType.
//
// An error is returned only when the file cannot be opened or read; an empty
// or short file is not an error.
func detectVaultFileMime(path, name string) (string, error) {
	switch strings.ToLower(filepath.Ext(name)) {
	case ".csv":
		return "text/csv", nil
	case ".md":
		return "text/markdown", nil
	case ".parquet":
		return "application/parquet", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// http.DetectContentType considers at most 512 bytes.
	buf := make([]byte, 512)
	// A single Read may legally return fewer bytes than are available, so use
	// ReadFull to fill the sniff buffer. io.EOF (empty file) and
	// io.ErrUnexpectedEOF (file shorter than the buffer) are expected outcomes.
	n, err := io.ReadFull(f, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	return http.DetectContentType(buf[:n]), nil
}
// vaultBucketParts splits a forward-slash relative path into its top-level
// bucket (for example "data" or "knowledge") and its second path component,
// the sub-bucket.
//
// A path without any "/" names a file sitting directly in the vault root. Such
// a file belongs to no bucket, so both results are empty rather than the file
// name being reported as a bucket.
//
// The bucket name is not validated against a known set; whatever the first
// path component is gets returned.
//
// NOTE(review): for a file directly inside a bucket ("data/x.csv") the second
// component is the file name and is returned as subBucket. Confirm whether
// callers would prefer an empty sub-bucket in that case.
func vaultBucketParts(relPath string) (bucket, subBucket string) {
	// Limit of 3 keeps everything below the sub-bucket in one trailing chunk.
	parts := strings.SplitN(relPath, "/", 3)
	if len(parts) < 2 {
		return "", ""
	}
	return parts[0], parts[1]
}