chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,121 @@
|
||||
"""vault_pdf_extract — Extrae texto de un PDF del vault y persiste en vault_index.db."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def vault_pdf_extract(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
    dump_text: bool = True,
) -> dict:
    """Extract the text of a PDF in the vault and record it in vault_index.db.

    The text of every page is joined with newlines. Optionally the text is
    dumped to a ``.txt`` file inside the vault, and, when the index database
    file exists, a row is upserted into ``pdf_extracts`` and the matching
    ``files_fts`` entry is replaced.

    Args:
        vault_path: Path to the vault root.
        rel_path: Path of the PDF relative to the vault root.
        db_path: Optional override for the location of vault_index.db.
            Defaults to ``<vault>/vault_index.db``.
        dump_text: When True and the extracted text is not blank, write it to
            ``data/processed/<stem>.txt`` if that directory already exists,
            otherwise to ``.vault_extracts/<stem>.txt`` (created on demand).

    Returns:
        Dict with the keys ``rel_path``, ``page_count``, ``text_len`` (length
        of the full text in characters), ``extracted_to`` (dump path relative
        to the vault, or None when nothing was written) and ``persisted``
        (True only when the database transaction was committed).

    Raises:
        RuntimeError: If PyMuPDF is not installed, the PDF does not exist, or
            the PDF cannot be opened.
        Exception: Any error raised while writing to the database is re-raised
            after the transaction has been rolled back.
    """
    # Imported lazily so the module can be loaded without PyMuPDF installed.
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise RuntimeError(
            "vault_pdf_extract requiere PyMuPDF. Instalar con: uv add pymupdf"
        ) from exc

    vault = Path(vault_path)
    pdf_file = vault / rel_path
    if not pdf_file.exists():
        raise RuntimeError(f"vault_pdf_extract: archivo no encontrado: {pdf_file}")

    db = Path(db_path) if db_path else vault / "vault_index.db"

    # Open the PDF; any failure here is reported as a corrupt/invalid file.
    try:
        doc = fitz.open(str(pdf_file))
    except Exception as exc:
        raise RuntimeError(
            f"vault_pdf_extract: PDF corrupto o inválido ({rel_path}): {exc}"
        ) from exc

    # try/finally guarantees the document handle is released even when
    # reading the page count or loading a page raises.
    text_parts: list[str] = []
    try:
        page_count = doc.page_count
        for page in doc:
            # Best effort: a page whose text cannot be extracted contributes
            # an empty string instead of aborting the whole document.
            try:
                text_parts.append(page.get_text())
            except Exception:
                text_parts.append("")
    finally:
        doc.close()

    full_text = "\n".join(text_parts)
    text_len = len(full_text)

    # Cap the text sent to FTS. The slice counts characters, not bytes, so
    # the encoded size may exceed 10 MiB for non-ASCII text.
    max_fts_chars = 10 * 1024 * 1024
    fts_text = full_text[:max_fts_chars]

    # Dump the full (untruncated) text to disk.
    extracted_to: str | None = None
    if dump_text and full_text.strip():
        # NOTE(review): only the file stem is used, so PDFs with the same
        # name in different folders write to the same .txt file.
        basename = Path(rel_path).stem
        # Prefer data/processed/ when it exists; otherwise use .vault_extracts/.
        processed_dir = vault / "data" / "processed"
        if not processed_dir.exists():
            processed_dir = vault / ".vault_extracts"
            processed_dir.mkdir(parents=True, exist_ok=True)
        txt_path = processed_dir / f"{basename}.txt"
        txt_path.write_text(full_text, encoding="utf-8")
        extracted_to = str(txt_path.relative_to(vault))

    # Persist into vault_index.db, but only when the database file exists.
    persisted = False
    if db.exists():
        conn = sqlite3.connect(str(db))
        try:
            now = int(time.time())  # extraction timestamp, Unix seconds
            conn.execute(
                """
                INSERT INTO pdf_extracts(rel_path, page_count, text_len, extracted_to, extracted_at)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(rel_path) DO UPDATE SET
                    page_count=excluded.page_count,
                    text_len=excluded.text_len,
                    extracted_to=excluded.extracted_to,
                    extracted_at=excluded.extracted_at
                """,
                (rel_path, page_count, text_len, extracted_to, now),
            )
            # Replace the files_fts entry (its rowid is meant to match files).
            # NOTE(review): if files has no row for rel_path the subquery
            # yields NULL — confirm that is acceptable for files_fts.
            conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,))
            if fts_text.strip():
                conn.execute(
                    """
                    INSERT INTO files_fts(rowid, rel_path, content_text)
                    VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?)
                    """,
                    (rel_path, rel_path, fts_text),
                )
            conn.commit()
            persisted = True
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

    return {
        "rel_path": rel_path,
        "page_count": page_count,
        "text_len": text_len,
        "extracted_to": extracted_to,
        "persisted": persisted,
    }
|
||||
Reference in New Issue
Block a user