e3c8979e8d
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
122 lines
4.0 KiB
Python
122 lines
4.0 KiB
Python
"""vault_pdf_extract — Extrae texto de un PDF del vault y persiste en vault_index.db."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
def vault_pdf_extract(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
    dump_text: bool = True,
) -> dict:
    """Extract the text of a vault PDF, optionally dump it to disk, and index it.

    The text of every page is joined with newlines. A page whose text
    extraction fails contributes an empty string instead of aborting the
    whole run. When the index database file exists, the ``pdf_extracts`` row
    for ``rel_path`` is upserted and its ``files_fts`` row is replaced.

    Args:
        vault_path: Path to the vault root.
        rel_path: Path of the PDF relative to ``vault_path``.
        db_path: Optional override for the index database location; defaults
            to ``<vault>/vault_index.db``.
        dump_text: If True and the extracted text is not blank, write it to
            ``data/processed/<stem>.txt`` when that directory exists,
            otherwise to ``.vault_extracts/<stem>.txt`` (created on demand).
            Only the PDF's stem is used for the file name, so PDFs with the
            same name in different folders share one output file.

    Returns:
        Dict with the keys ``rel_path``, ``page_count``, ``text_len`` (number
        of characters of the joined text), ``extracted_to`` (dump path
        relative to the vault, or None when nothing was written) and
        ``persisted`` (True only when the database was updated).

    Raises:
        RuntimeError: If PyMuPDF is not installed, the PDF does not exist, or
            PyMuPDF fails to open it.
        Exception: Errors raised while writing to the database propagate
            unchanged after the transaction is rolled back.
    """
    # PyMuPDF is imported lazily so that merely importing this module does not
    # require the optional dependency.
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise RuntimeError(
            "vault_pdf_extract requiere PyMuPDF. Instalar con: uv add pymupdf"
        ) from exc

    vault = Path(vault_path)
    pdf_file = vault / rel_path
    if not pdf_file.exists():
        raise RuntimeError(f"vault_pdf_extract: archivo no encontrado: {pdf_file}")

    db = Path(db_path) if db_path else vault / "vault_index.db"

    # Open the PDF; any failure here is reported as a corrupt/invalid file.
    try:
        doc = fitz.open(str(pdf_file))
    except Exception as exc:
        raise RuntimeError(
            f"vault_pdf_extract: PDF corrupto o inválido ({rel_path}): {exc}"
        ) from exc

    text_parts: list[str] = []
    try:
        page_count = doc.page_count
        for page in doc:
            # Best effort per page: an unreadable page yields an empty string
            # so the page/part alignment is kept and the rest is still
            # extracted.
            try:
                text_parts.append(page.get_text())
            except Exception:
                text_parts.append("")
    finally:
        # Always release the document handle, even if reading the page count
        # or iterating the document raises.
        doc.close()

    full_text = "\n".join(text_parts)
    text_len = len(full_text)

    # Cap the text sent to the full-text index. The limit counts characters,
    # not bytes, so the UTF-8 size can exceed 10 MiB for non-ASCII text.
    max_fts_chars = 10 * 1024 * 1024
    fts_text = full_text[:max_fts_chars]

    # Dump the full (untruncated) text to disk.
    extracted_to: str | None = None
    if dump_text and full_text.strip():
        # Prefer data/processed/ when it already exists; otherwise fall back
        # to .vault_extracts/, creating it if needed.
        out_dir = vault / "data" / "processed"
        if not out_dir.exists():
            out_dir = vault / ".vault_extracts"
            out_dir.mkdir(parents=True, exist_ok=True)
        txt_path = out_dir / f"{Path(rel_path).stem}.txt"
        txt_path.write_text(full_text, encoding="utf-8")
        extracted_to = str(txt_path.relative_to(vault))

    # Persist into the index database, but only if it already exists: this
    # function never creates the database or its tables.
    persisted = False
    if db.exists():
        conn = sqlite3.connect(str(db))
        try:
            # The connection context manager commits on success and rolls
            # back (then re-raises) on any exception.
            with conn:
                now = int(time.time())
                conn.execute(
                    """
                    INSERT INTO pdf_extracts(rel_path, page_count, text_len, extracted_to, extracted_at)
                    VALUES (?, ?, ?, ?, ?)
                    ON CONFLICT(rel_path) DO UPDATE SET
                        page_count=excluded.page_count,
                        text_len=excluded.text_len,
                        extracted_to=excluded.extracted_to,
                        extracted_at=excluded.extracted_at
                    """,
                    (rel_path, page_count, text_len, extracted_to, now),
                )
                # Replace the files_fts entry; its rowid is meant to match the
                # rowid of the corresponding row in files.
                # NOTE(review): if files has no row for rel_path the subquery
                # yields NULL; confirm how files_fts treats a NULL rowid.
                conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,))
                if fts_text.strip():
                    conn.execute(
                        """
                        INSERT INTO files_fts(rowid, rel_path, content_text)
                        VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?)
                        """,
                        (rel_path, rel_path, fts_text),
                    )
            persisted = True
        finally:
            conn.close()

    return {
        "rel_path": rel_path,
        "page_count": page_count,
        "text_len": text_len,
        "extracted_to": extracted_to,
        "persisted": persisted,
    }
|