chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,161 @@
|
||||
"""Tests for vault_csv_profile."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from vault_csv_profile import vault_csv_profile
|
||||
|
||||
|
||||
def _make_vault(tmp: Path) -> tuple[Path, Path]:
    """Create a minimal vault rooted at ``tmp`` with its index database.

    The database ``vault_index.db`` is created directly inside ``tmp`` and
    receives the ``files`` table, the contentless FTS5 table ``files_fts``
    and the ``csv_profiles`` table.

    Args:
        tmp: Directory acting as the vault root.

    Returns:
        A ``(vault_root, db_path)`` tuple; ``vault_root`` is ``tmp`` itself.
    """
    db = tmp / "vault_index.db"
    conn = sqlite3.connect(str(db))
    try:
        # NOTE: the FTS5 ``contentless_delete`` option needs SQLite 3.43.0+;
        # on older builds this script raises sqlite3.OperationalError.
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS files (
                rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                rel_path TEXT UNIQUE NOT NULL,
                size_bytes INTEGER,
                ext TEXT
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts
                USING fts5(rel_path, content_text, content='', contentless_delete=1);
            CREATE TABLE IF NOT EXISTS csv_profiles (
                rel_path TEXT PRIMARY KEY,
                cols_json TEXT,
                n_rows INTEGER,
                encoding TEXT,
                date_min TEXT,
                date_max TEXT,
                profiled_at INTEGER
            );
            """
        )
        conn.commit()
    finally:
        # Release the handle even when the schema script fails, so the
        # database file inside the temporary directory is never left open.
        conn.close()
    return tmp, db
|
||||
|
||||
|
||||
def _insert_file_entry(db: Path, rel_path: str) -> None:
    """Register ``rel_path`` in the ``files`` table of the index database.

    The row is inserted with ``size_bytes = 0`` and ``ext = '.csv'``. The
    statement uses ``INSERT OR IGNORE``, so calling this again for a path
    that is already present leaves the table unchanged.

    Args:
        db: Path to the SQLite index database (must already contain ``files``).
        rel_path: Vault-relative path to record.
    """
    conn = sqlite3.connect(str(db))
    try:
        conn.execute(
            "INSERT OR IGNORE INTO files(rel_path, size_bytes, ext) VALUES (?, 0, '.csv')",
            (rel_path,),
        )
        conn.commit()
    finally:
        # Close even if the insert fails (e.g. missing table) to avoid
        # leaking an open handle on the temporary database file.
        conn.close()
|
||||
|
||||
|
||||
def test_csv_basic(tmp_path):
    """Profile a three-row, three-column CSV and check result and persistence.

    Verifies the returned profile (path, row count, column names, persisted
    flag) and then that a matching row was written to ``csv_profiles``.
    """
    vault, db = _make_vault(tmp_path)
    rel = "data/basic.csv"
    csv_file = vault / rel
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    csv_file.write_text(
        "nombre,edad,score\nAna,30,9.5\nBob,25,8.0\nCarla,35,7.5\n",
        encoding="utf-8",
    )
    _insert_file_entry(db, rel)

    result = vault_csv_profile(str(vault), rel, db_path=str(db))

    assert result["rel_path"] == rel
    assert result["n_rows"] == 3
    assert len(result["cols"]) == 3
    col_names = [c["name"] for c in result["cols"]]
    assert "nombre" in col_names
    assert "edad" in col_names
    assert "score" in col_names
    assert result["persisted"] is True

    # Check that the profile was persisted in csv_profiles.
    conn = sqlite3.connect(str(db))
    try:
        row = conn.execute(
            "SELECT n_rows FROM csv_profiles WHERE rel_path = ?", (rel,)
        ).fetchone()
    finally:
        # Close even if the query raises so the temp database is not left open.
        conn.close()
    assert row is not None
    assert row[0] == 3
|
||||
|
||||
|
||||
def test_csv_date_detection(tmp_path):
    """Profile a CSV with an ISO-date column and check the reported date range."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "data/fechas.csv"
    target = vault_root / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    content = "fecha,valor\n2023-01-01,100\n2023-06-15,200\n2023-12-31,300\n"
    target.write_text(content, encoding="utf-8")
    _insert_file_entry(index_db, rel)

    profile = vault_csv_profile(str(vault_root), rel, db_path=str(index_db))

    # Both bounds must be reported before their values are compared.
    for bound in ("date_min", "date_max"):
        assert profile[bound] is not None
    assert profile["date_min"] <= "2023-01-01"
    assert profile["date_max"] >= "2023-12-31"
|
||||
|
||||
|
||||
def test_csv_encoding_latin1(tmp_path):
    """Profile a Latin-1 encoded CSV containing accented characters."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "data/tildes.csv"
    target = vault_root / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    raw = "ciudad,poblacion\nMálaga,500000\nCórdoba,320000\n".encode("latin-1")
    target.write_bytes(raw)
    _insert_file_entry(index_db, rel)

    profile = vault_csv_profile(str(vault_root), rel, db_path=str(index_db))

    assert profile["n_rows"] == 2
    assert profile["encoding"] != "utf-8?"
    # An encoding was detected (any non-empty value).
    assert profile["encoding"]
    assert profile["persisted"] is True
|
||||
|
||||
|
||||
def test_csv_empty(tmp_path):
    """Profile a zero-byte CSV: no rows, no columns, no date range."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "data/empty.csv"
    target = vault_root / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("", encoding="utf-8")
    _insert_file_entry(index_db, rel)

    profile = vault_csv_profile(str(vault_root), rel, db_path=str(index_db))

    assert profile["n_rows"] == 0
    assert profile["cols"] == []
    for bound in ("date_min", "date_max"):
        assert profile[bound] is None
|
||||
|
||||
|
||||
def test_csv_persists_fts(tmp_path):
    """Contentless FTS5: verify the CSV column names are searchable via MATCH."""
    vault, db = _make_vault(tmp_path)
    rel = "data/fts_test.csv"
    csv_file = vault / rel
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    csv_file.write_text("producto,precio\nManzana,1.5\nPera,2.0\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    vault_csv_profile(str(vault), rel, db_path=str(db))

    conn = sqlite3.connect(str(db))
    try:
        # Contentless FTS5 does not allow reading the content back directly,
        # so use MATCH to verify that the terms were indexed.
        row_prod = conn.execute(
            "SELECT rowid FROM files_fts WHERE files_fts MATCH 'producto'",
        ).fetchone()
        row_prec = conn.execute(
            "SELECT rowid FROM files_fts WHERE files_fts MATCH 'precio'",
        ).fetchone()
    finally:
        # Close even if a MATCH query raises so the temp database is released.
        conn.close()

    assert row_prod is not None, "FTS no encontró 'producto'"
    assert row_prec is not None, "FTS no encontró 'precio'"
|
||||
@@ -0,0 +1,147 @@
|
||||
"""Tests for vault_pdf_extract."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from vault_pdf_extract import vault_pdf_extract
|
||||
|
||||
|
||||
def _make_vault(tmp: Path) -> tuple[Path, Path]:
    """Create a minimal vault rooted at ``tmp`` with its index database.

    The database ``vault_index.db`` is created directly inside ``tmp`` and
    receives the ``files`` table, the contentless FTS5 table ``files_fts``
    and the ``pdf_extracts`` table.

    Args:
        tmp: Directory acting as the vault root.

    Returns:
        A ``(vault_root, db_path)`` tuple; ``vault_root`` is ``tmp`` itself.
    """
    db = tmp / "vault_index.db"
    conn = sqlite3.connect(str(db))
    try:
        # NOTE: the FTS5 ``contentless_delete`` option needs SQLite 3.43.0+;
        # on older builds this script raises sqlite3.OperationalError.
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS files (
                rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                rel_path TEXT UNIQUE NOT NULL,
                size_bytes INTEGER,
                ext TEXT
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts
                USING fts5(rel_path, content_text, content='', contentless_delete=1);
            CREATE TABLE IF NOT EXISTS pdf_extracts (
                rel_path TEXT PRIMARY KEY,
                page_count INTEGER,
                text_len INTEGER,
                extracted_to TEXT,
                extracted_at INTEGER
            );
            """
        )
        conn.commit()
    finally:
        # Release the handle even when the schema script fails, so the
        # database file inside the temporary directory is never left open.
        conn.close()
    return tmp, db
|
||||
|
||||
|
||||
def _insert_file_entry(db: Path, rel_path: str) -> None:
    """Register ``rel_path`` in the ``files`` table of the index database.

    The row is inserted with ``size_bytes = 0`` and ``ext = '.pdf'``. The
    statement uses ``INSERT OR IGNORE``, so calling this again for a path
    that is already present leaves the table unchanged.

    Args:
        db: Path to the SQLite index database (must already contain ``files``).
        rel_path: Vault-relative path to record.
    """
    conn = sqlite3.connect(str(db))
    try:
        conn.execute(
            "INSERT OR IGNORE INTO files(rel_path, size_bytes, ext) VALUES (?, 0, '.pdf')",
            (rel_path,),
        )
        conn.commit()
    finally:
        # Close even if the insert fails (e.g. missing table) to avoid
        # leaking an open handle on the temporary database file.
        conn.close()
|
||||
|
||||
|
||||
def _make_pdf(path: Path, text: str = "Hello vault PDF.\nPage two content.") -> None:
    """Write a minimal single-page PDF containing ``text`` for use in tests.

    Uses PyMuPDF (``fitz``), imported inside the function so the module can
    be imported without it.

    Args:
        path: Destination file for the generated PDF.
        text: Text inserted on the page at point (72, 72).
    """
    import fitz

    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), text)
        doc.save(str(path))
    finally:
        # Release the document even if text insertion or saving fails.
        doc.close()
|
||||
|
||||
|
||||
def test_pdf_extract_basic(tmp_path):
    """Extract a generated PDF and check the result and the persisted row.

    Verifies the returned dict (path, page count, text length, persisted
    flag) and then that a matching row was written to ``pdf_extracts``.
    """
    vault, db = _make_vault(tmp_path)
    rel = "docs/test.pdf"
    pdf = vault / rel
    pdf.parent.mkdir(parents=True, exist_ok=True)
    _make_pdf(pdf)
    _insert_file_entry(db, rel)

    result = vault_pdf_extract(str(vault), rel, db_path=str(db))

    assert result["rel_path"] == rel
    assert result["page_count"] >= 1
    assert result["text_len"] > 0
    assert result["persisted"] is True

    conn = sqlite3.connect(str(db))
    try:
        row = conn.execute(
            "SELECT page_count, text_len FROM pdf_extracts WHERE rel_path=?", (rel,)
        ).fetchone()
    finally:
        # Close even if the query raises so the temp database is not left open.
        conn.close()
    assert row is not None
    assert row[0] >= 1
    assert row[1] > 0
|
||||
|
||||
|
||||
def test_pdf_dump_text_creates_file(tmp_path):
    """With ``dump_text=True`` the extracted text must land in a non-empty file."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "docs/dump.pdf"
    source_pdf = vault_root / rel
    source_pdf.parent.mkdir(parents=True, exist_ok=True)
    _make_pdf(source_pdf, "Contenido para dump a disco.")
    _insert_file_entry(index_db, rel)
    # Create data/processed/ so that directory is the one used
    # (presumably the extractor's dump target; verify against vault_pdf_extract).
    vault_root.joinpath("data", "processed").mkdir(parents=True, exist_ok=True)

    outcome = vault_pdf_extract(
        str(vault_root), rel, db_path=str(index_db), dump_text=True
    )

    assert outcome["extracted_to"] is not None
    dumped = vault_root / outcome["extracted_to"]
    assert dumped.exists()
    assert dumped.stat().st_size > 0
|
||||
|
||||
|
||||
def test_pdf_no_dump(tmp_path):
    """With ``dump_text=False`` the result must report no extracted file."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "docs/nodump.pdf"
    source_pdf = vault_root / rel
    source_pdf.parent.mkdir(parents=True, exist_ok=True)
    _make_pdf(source_pdf, "No se debe volcar a disco.")
    _insert_file_entry(index_db, rel)

    outcome = vault_pdf_extract(
        str(vault_root), rel, db_path=str(index_db), dump_text=False
    )

    assert outcome["extracted_to"] is None
|
||||
|
||||
|
||||
def test_pdf_persists_to_fts(tmp_path):
    """Extract a PDF with a unique token and check it is findable via FTS MATCH."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/fts.pdf"
    pdf = vault / rel
    pdf.parent.mkdir(parents=True, exist_ok=True)
    _make_pdf(pdf, "Texto especial para FTS xyzpdftest.")
    _insert_file_entry(db, rel)

    vault_pdf_extract(str(vault), rel, db_path=str(db), dump_text=False)

    conn = sqlite3.connect(str(db))
    try:
        # Contentless FTS5 does not allow reading the content back directly,
        # so use MATCH to verify that the text was indexed.
        row = conn.execute(
            "SELECT rowid FROM files_fts WHERE files_fts MATCH 'xyzpdftest'",
        ).fetchone()
    finally:
        # Close even if the MATCH query raises so the temp database is released.
        conn.close()
    assert row is not None, "FTS no encontró el texto del PDF"
|
||||
|
||||
|
||||
def test_pdf_corrupt_errors(tmp_path):
    """A file with a PDF header but garbage content must raise ``RuntimeError``."""
    vault_root, index_db = _make_vault(tmp_path)
    rel = "docs/corrupt.pdf"
    broken_pdf = vault_root / rel
    broken_pdf.parent.mkdir(parents=True, exist_ok=True)
    garbage = b"%PDF-1.4 garbage bytes \x00\x01\x02 not a real pdf"
    broken_pdf.write_bytes(garbage)
    _insert_file_entry(index_db, rel)

    # The error message must mention one of the expected keywords.
    expect_failure = pytest.raises(RuntimeError, match="corrupto|inválido|PDF")
    with expect_failure:
        vault_pdf_extract(str(vault_root), rel, db_path=str(index_db))
|
||||
Reference in New Issue
Block a user