Files
egutierrez a802f59f55 chore: auto-commit (95 archivos)
- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 00:50:34 +02:00

122 lines
4.0 KiB
Python

"""vault_pdf_extract — Extrae texto de un PDF del vault y persiste en vault_index.db."""
from __future__ import annotations
import sqlite3
import time
from pathlib import Path
def vault_pdf_extract(
vault_path: str,
rel_path: str,
db_path: str | None = None,
dump_text: bool = True,
) -> dict:
"""Extrae texto de un PDF del vault; persiste page_count, text_len y actualiza files_fts.
Args:
vault_path: Ruta absoluta a la raiz del vault.
rel_path: Ruta relativa al PDF dentro del vault.
db_path: Override opcional de la ruta a vault_index.db.
dump_text: Si True, escribe el texto extraído a un .txt en data/processed/ o .vault_extracts/.
Returns:
Dict con: rel_path, page_count, text_len, extracted_to (ruta relativa o None), persisted.
Raises:
RuntimeError: Si el PDF no existe, está corrupto o no se puede leer.
"""
try:
import fitz # PyMuPDF
except ImportError as exc:
raise RuntimeError(
"vault_pdf_extract requiere PyMuPDF. Instalar con: uv add pymupdf"
) from exc
vault = Path(vault_path)
pdf_file = vault / rel_path
if not pdf_file.exists():
raise RuntimeError(f"vault_pdf_extract: archivo no encontrado: {pdf_file}")
db = Path(db_path) if db_path else vault / "vault_index.db"
# Abrir PDF
try:
doc = fitz.open(str(pdf_file))
except Exception as exc:
raise RuntimeError(f"vault_pdf_extract: PDF corrupto o inválido ({rel_path}): {exc}") from exc
page_count = doc.page_count
text_parts: list[str] = []
for page in doc:
try:
text_parts.append(page.get_text())
except Exception:
text_parts.append("")
doc.close()
full_text = "\n".join(text_parts)
text_len = len(full_text)
# Truncar a 10 MB para FTS
_MAX_FTS = 10 * 1024 * 1024
fts_text = full_text[:_MAX_FTS]
# Dump text a disco
extracted_to: str | None = None
if dump_text and full_text.strip():
basename = Path(rel_path).stem
# Preferir data/processed/ si existe; si no, usar .vault_extracts/
processed_dir = vault / "data" / "processed"
if not processed_dir.exists():
processed_dir = vault / ".vault_extracts"
processed_dir.mkdir(parents=True, exist_ok=True)
txt_path = processed_dir / f"{basename}.txt"
txt_path.write_text(full_text, encoding="utf-8")
extracted_to = str(txt_path.relative_to(vault))
# Persistir en vault_index.db
persisted = False
if db.exists():
conn = sqlite3.connect(str(db))
try:
now = int(time.time())
conn.execute(
"""
INSERT INTO pdf_extracts(rel_path, page_count, text_len, extracted_to, extracted_at)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT(rel_path) DO UPDATE SET
page_count=excluded.page_count,
text_len=excluded.text_len,
extracted_to=excluded.extracted_to,
extracted_at=excluded.extracted_at
""",
(rel_path, page_count, text_len, extracted_to, now),
)
# Actualizar files_fts (rowid debe coincidir con files)
conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,))
if fts_text.strip():
conn.execute(
"""
INSERT INTO files_fts(rowid, rel_path, content_text)
VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?)
""",
(rel_path, rel_path, fts_text),
)
conn.commit()
persisted = True
except Exception:
conn.rollback()
raise
finally:
conn.close()
return {
"rel_path": rel_path,
"page_count": page_count,
"text_len": text_len,
"extracted_to": extracted_to,
"persisted": persisted,
}