"""vault_pdf_extract — Extrae texto de un PDF del vault y persiste en vault_index.db.""" from __future__ import annotations import sqlite3 import time from pathlib import Path def vault_pdf_extract( vault_path: str, rel_path: str, db_path: str | None = None, dump_text: bool = True, ) -> dict: """Extrae texto de un PDF del vault; persiste page_count, text_len y actualiza files_fts. Args: vault_path: Ruta absoluta a la raiz del vault. rel_path: Ruta relativa al PDF dentro del vault. db_path: Override opcional de la ruta a vault_index.db. dump_text: Si True, escribe el texto extraído a un .txt en data/processed/ o .vault_extracts/. Returns: Dict con: rel_path, page_count, text_len, extracted_to (ruta relativa o None), persisted. Raises: RuntimeError: Si el PDF no existe, está corrupto o no se puede leer. """ try: import fitz # PyMuPDF except ImportError as exc: raise RuntimeError( "vault_pdf_extract requiere PyMuPDF. Instalar con: uv add pymupdf" ) from exc vault = Path(vault_path) pdf_file = vault / rel_path if not pdf_file.exists(): raise RuntimeError(f"vault_pdf_extract: archivo no encontrado: {pdf_file}") db = Path(db_path) if db_path else vault / "vault_index.db" # Abrir PDF try: doc = fitz.open(str(pdf_file)) except Exception as exc: raise RuntimeError(f"vault_pdf_extract: PDF corrupto o inválido ({rel_path}): {exc}") from exc page_count = doc.page_count text_parts: list[str] = [] for page in doc: try: text_parts.append(page.get_text()) except Exception: text_parts.append("") doc.close() full_text = "\n".join(text_parts) text_len = len(full_text) # Truncar a 10 MB para FTS _MAX_FTS = 10 * 1024 * 1024 fts_text = full_text[:_MAX_FTS] # Dump text a disco extracted_to: str | None = None if dump_text and full_text.strip(): basename = Path(rel_path).stem # Preferir data/processed/ si existe; si no, usar .vault_extracts/ processed_dir = vault / "data" / "processed" if not processed_dir.exists(): processed_dir = vault / ".vault_extracts" processed_dir.mkdir(parents=True, exist_ok=True) txt_path = processed_dir / f"{basename}.txt" txt_path.write_text(full_text, encoding="utf-8") extracted_to = str(txt_path.relative_to(vault)) # Persistir en vault_index.db persisted = False if db.exists(): conn = sqlite3.connect(str(db)) try: now = int(time.time()) conn.execute( """ INSERT INTO pdf_extracts(rel_path, page_count, text_len, extracted_to, extracted_at) VALUES (?, ?, ?, ?, ?) ON CONFLICT(rel_path) DO UPDATE SET page_count=excluded.page_count, text_len=excluded.text_len, extracted_to=excluded.extracted_to, extracted_at=excluded.extracted_at """, (rel_path, page_count, text_len, extracted_to, now), ) # Actualizar files_fts (rowid debe coincidir con files) conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,)) if fts_text.strip(): conn.execute( """ INSERT INTO files_fts(rowid, rel_path, content_text) VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?) """, (rel_path, rel_path, fts_text), ) conn.commit() persisted = True except Exception: conn.rollback() raise finally: conn.close() return { "rel_path": rel_path, "page_count": page_count, "text_len": text_len, "extracted_to": extracted_to, "persisted": persisted, }