chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,216 @@
|
||||
"""vault_csv_profile — Perfila un CSV del vault y persiste metadata en vault_index.db."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _detect_encoding(path: Path) -> str:
|
||||
"""Detecta encoding del archivo con chardet o por intentos."""
|
||||
try:
|
||||
import chardet
|
||||
|
||||
with open(path, "rb") as f:
|
||||
raw = f.read(min(65536, path.stat().st_size))
|
||||
result = chardet.detect(raw)
|
||||
if result and result.get("encoding") and result.get("confidence", 0) >= 0.6:
|
||||
return result["encoding"]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for enc in ("utf-8-sig", "utf-8", "latin-1", "cp1252"):
|
||||
try:
|
||||
with open(path, encoding=enc) as f:
|
||||
f.read(4096)
|
||||
return enc
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
continue
|
||||
|
||||
return "utf-8?"
|
||||
|
||||
|
||||
def _read_with_polars(path: Path, encoding: str) -> tuple[list[dict], int]:
|
||||
"""Lee CSV con polars. Retorna (cols, n_rows)."""
|
||||
import polars as pl
|
||||
|
||||
enc = encoding.rstrip("?").replace("utf-8-sig", "utf8").replace("utf-8", "utf8")
|
||||
if enc not in ("utf8", "utf-8"):
|
||||
enc = "utf8"
|
||||
|
||||
lf = pl.scan_csv(path, encoding="utf8", ignore_errors=True, infer_schema_length=1000)
|
||||
schema = lf.collect_schema()
|
||||
cols = [{"name": name, "dtype": str(dtype)} for name, dtype in schema.items()]
|
||||
n_rows = lf.select(pl.len()).collect().item()
|
||||
return cols, n_rows
|
||||
|
||||
|
||||
def _read_with_pandas(path: Path, encoding: str) -> tuple[list[dict], int]:
|
||||
"""Fallback: lee CSV con pandas."""
|
||||
import pandas as pd
|
||||
|
||||
enc = encoding.rstrip("?") or "utf-8"
|
||||
df = pd.read_csv(path, encoding=enc, encoding_errors="replace", nrows=None)
|
||||
cols = [{"name": col, "dtype": str(df[col].dtype)} for col in df.columns]
|
||||
n_rows = len(df)
|
||||
return cols, n_rows
|
||||
|
||||
|
||||
def _detect_dates(path: Path, encoding: str) -> tuple[str | None, str | None]:
|
||||
"""Intenta detectar columna de fecha y retorna (date_min, date_max) en ISO."""
|
||||
try:
|
||||
import polars as pl
|
||||
|
||||
lf = pl.scan_csv(path, encoding="utf8", ignore_errors=True, infer_schema_length=0)
|
||||
schema = lf.collect_schema()
|
||||
df = lf.collect()
|
||||
|
||||
for col_name, dtype in schema.items():
|
||||
if "Date" in str(dtype) or "Datetime" in str(dtype):
|
||||
series = df[col_name].drop_nulls()
|
||||
if len(series) > 0:
|
||||
mn = series.min()
|
||||
mx = series.max()
|
||||
return str(mn)[:10], str(mx)[:10]
|
||||
|
||||
# Intenta parsear columnas string como fecha
|
||||
for col_name, dtype in schema.items():
|
||||
if "Utf8" not in str(dtype) and "String" not in str(dtype):
|
||||
continue
|
||||
series = df[col_name].drop_nulls()
|
||||
if len(series) == 0:
|
||||
continue
|
||||
try:
|
||||
parsed = series.str.to_date(strict=False)
|
||||
valid = parsed.drop_nulls()
|
||||
if len(valid) / max(len(series), 1) >= 0.8:
|
||||
mn = valid.min()
|
||||
mx = valid.max()
|
||||
return str(mn)[:10], str(mx)[:10]
|
||||
except Exception:
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
|
||||
def _build_fts_text(path: Path, cols: list[dict], encoding: str) -> str:
|
||||
"""Construye content_text para files_fts: nombres de cols + primeras 5 filas."""
|
||||
col_names = " ".join(c["name"] for c in cols)
|
||||
try:
|
||||
import polars as pl
|
||||
|
||||
lf = pl.scan_csv(path, encoding="utf8", ignore_errors=True)
|
||||
sample = lf.head(5).collect()
|
||||
rows_text = " ".join(
|
||||
" ".join(str(v) for v in row) for row in sample.iter_rows()
|
||||
)
|
||||
return f"{col_names} {rows_text}".strip()
|
||||
except Exception:
|
||||
pass
|
||||
return col_names
|
||||
|
||||
|
||||
def _persist_profile(db: Path, profile: dict, fts_text: str) -> None:
    """Upsert ``profile`` into ``csv_profiles`` and replace its ``files_fts`` row.

    Both statements run in one transaction, which is rolled back if either
    fails. The connection is always closed.
    """
    import json

    rel_path = profile["rel_path"]
    conn = sqlite3.connect(str(db))
    try:
        conn.execute(
            """
            INSERT INTO csv_profiles(rel_path, cols_json, n_rows, encoding, date_min, date_max, profiled_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(rel_path) DO UPDATE SET
                cols_json=excluded.cols_json,
                n_rows=excluded.n_rows,
                encoding=excluded.encoding,
                date_min=excluded.date_min,
                date_max=excluded.date_max,
                profiled_at=excluded.profiled_at
            """,
            (
                rel_path,
                json.dumps(profile["cols"]),
                profile["n_rows"],
                profile["encoding"],
                profile["date_min"],
                profile["date_max"],
                int(time.time()),
            ),
        )
        # Refresh files_fts; its rowid is taken from the matching files row.
        # NOTE(review): if rel_path has no row in files the subquery yields
        # NULL — confirm whether callers guarantee the file is indexed first.
        conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,))
        conn.execute(
            """
            INSERT INTO files_fts(rowid, rel_path, content_text)
            VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?)
            """,
            (rel_path, rel_path, fts_text),
        )
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


def vault_csv_profile(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
) -> dict:
    """Profile a vault CSV (schema, row count, encoding, date range) and persist it.

    Args:
        vault_path: Path to the vault root.
        rel_path: Path of the CSV relative to the vault root.
        db_path: Override for the index database location. Defaults to
            ``<vault_path>/vault_index.db``.

    Returns:
        Dict with ``rel_path``, ``cols``, ``n_rows``, ``encoding``,
        ``date_min``, ``date_max`` and ``persisted``. ``persisted`` is False
        when the database file does not exist.

    Raises:
        RuntimeError: If the file does not exist or neither reader can parse it.
        Exception: Database errors raised while persisting are propagated
            after the transaction is rolled back.
    """
    vault = Path(vault_path)
    csv_file = vault / rel_path
    if not csv_file.exists():
        raise RuntimeError(f"vault_csv_profile: archivo no encontrado: {csv_file}")

    db = Path(db_path) if db_path else vault / "vault_index.db"

    encoding = _detect_encoding(csv_file)

    # Zero-byte files short-circuit to an empty profile; neither reader runs.
    if csv_file.stat().st_size == 0:
        cols, n_rows = [], 0
    else:
        try:
            cols, n_rows = _read_with_polars(csv_file, encoding)
        except Exception:
            # polars missing or unable to parse: fall back to pandas.
            try:
                cols, n_rows = _read_with_pandas(csv_file, encoding)
            except Exception as exc:
                raise RuntimeError(f"vault_csv_profile: no se pudo leer {rel_path}: {exc}") from exc

    # Date detection only makes sense when there is data to inspect.
    date_min = date_max = None
    if n_rows > 0 and cols:
        date_min, date_max = _detect_dates(csv_file, encoding)

    fts_text = _build_fts_text(csv_file, cols, encoding) if cols else ""

    result: dict = {
        "rel_path": rel_path,
        "cols": cols,
        "n_rows": n_rows,
        "encoding": encoding,
        "date_min": date_min,
        "date_max": date_max,
        "persisted": False,
    }

    # Persist only into an existing index; a missing database is not created.
    if db.exists():
        _persist_profile(db, result, fts_text)
        result["persisted"] = True

    return result
|
||||
Reference in New Issue
Block a user