chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
---
|
||||
name: vault_pdf_extract
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def vault_pdf_extract(vault_path: str, rel_path: str, db_path: str | None = None, dump_text: bool = True) -> dict"
|
||||
description: "Extrae texto de un PDF del vault con PyMuPDF; persiste page_count y text_len en pdf_extracts; vuelca texto a .txt en data/processed/ o .vault_extracts/; actualiza files_fts para búsqueda por contenido."
|
||||
tags: [vault, pdf, extract, pymupdf, fts, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [sqlite3, time, pathlib, fitz]
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta absoluta a la raíz del vault donde vive el PDF y vault_index.db."
|
||||
- name: rel_path
|
||||
desc: "Ruta relativa al PDF dentro del vault (ej. 'docs/informe.pdf')."
|
||||
- name: db_path
|
||||
desc: "Override opcional de la ruta a vault_index.db. Por defecto <vault_path>/vault_index.db."
|
||||
- name: dump_text
|
||||
desc: "Si True (default), escribe el texto extraído a un .txt. La carpeta destino es <vault_path>/data/processed/ si existe; si no, <vault_path>/.vault_extracts/."
|
||||
output: "Dict con: rel_path (str), page_count (int), text_len (int), extracted_to (ruta relativa al .txt o None), persisted (bool)."
|
||||
tested: true
|
||||
tests:
|
||||
- "test_pdf_extract_basic"
|
||||
- "test_pdf_dump_text_creates_file"
|
||||
- "test_pdf_no_dump"
|
||||
- "test_pdf_persists_to_fts"
|
||||
- "test_pdf_corrupt_errors"
|
||||
test_file_path: "python/functions/datascience/tests/test_vault_pdf_extract.py"
|
||||
file_path: "python/functions/datascience/vault_pdf_extract.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from vault_pdf_extract import vault_pdf_extract
|
||||
|
||||
result = vault_pdf_extract("/vaults/mi_vault", "docs/informe_anual.pdf")
|
||||
# {
|
||||
# "rel_path": "docs/informe_anual.pdf",
|
||||
# "page_count": 24,
|
||||
# "text_len": 45210,
|
||||
# "extracted_to": "data/processed/informe_anual.txt",
|
||||
# "persisted": True
|
||||
# }
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Requiere PyMuPDF (paquete `pymupdf`, importado como `fitz`). Ya instalado en python/.venv.
|
||||
- El texto se trunca a 10 MB antes de insertarlo en files_fts para evitar tablas FTS5 masivas.
|
||||
- Layout de volcado: si `<vault_path>/data/processed/` existe, se usa; si no, se crea `<vault_path>/.vault_extracts/`.
|
||||
- Los PDFs corruptos lanzan RuntimeError con un mensaje descriptivo.
|
||||
- El rowid de files_fts se ancla al rowid de la tabla files (subquery) para que vault_search funcione correctamente.
|
||||
- Si vault_index.db no existe, retorna el dict sin intentar persistir (persisted=False).
|
||||
Reference in New Issue
Block a user