Files
fn_registry/python/functions/infra/vault_knowledge_parse.py
egutierrez e3c8979e8d chore: auto-commit (95 archivos)
- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 00:50:34 +02:00

143 lines
4.3 KiB
Python

"""vault_knowledge_parse — Parsea un Markdown del vault y persiste en knowledge_docs."""
from __future__ import annotations
import json
import re
import sqlite3
import time
from pathlib import Path
# Closing frontmatter fence: a line consisting of exactly "---" (trailing
# spaces/tabs tolerated), followed by a line break or the end of the text.
_FRONTMATTER_CLOSE_RE = re.compile(r"\n---[ \t]*(?=\r?\n|\Z)")


def _parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a leading YAML frontmatter block from the Markdown body.

    The text must start with a ``---`` line; the block ends at the next line
    that consists solely of ``---``.

    Args:
        text: Full file contents.

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is ``{}`` when there is no
        frontmatter, when the block is empty, when it does not parse to a
        mapping, or when YAML parsing fails for any reason (including PyYAML
        not being importable). ``body`` is the text after the closing fence
        with leading line breaks removed, or the whole ``text`` when no
        complete frontmatter block is found.
    """
    if not text.startswith(("---\n", "---\r\n")):
        return {}, text
    # Start at index 3 (the newline that ends the opening fence) so that an
    # empty block ("---\n---\n") still finds its closing fence.
    closing = _FRONTMATTER_CLOSE_RE.search(text, 3)
    if closing is None:
        return {}, text
    yaml_block = text[4:closing.start()].strip()
    body = text[closing.end():].lstrip("\n\r")
    try:
        # Imported lazily: parsing is best-effort and must not fail the
        # caller when PyYAML is missing or the block is malformed.
        import yaml
        fm = yaml.safe_load(yaml_block) or {}
        if not isinstance(fm, dict):
            fm = {}
    except Exception:
        fm = {}
    return fm, body
def _extract_headings(body: str) -> list[dict]:
    """Collect ATX headings (``#`` to ``######``) from a Markdown body.

    Lines inside fenced code blocks (``` or ~~~) are ignored, so shell or
    Python comments in code samples are not reported as headings.

    Args:
        body: Markdown text without the frontmatter block.

    Returns:
        A list of ``{"level": int, "text": str}`` dicts in document order.
    """
    headings = []
    open_fence = ""  # marker of the fence currently open, "" when outside
    for line in body.splitlines():
        fence = re.match(r" {0,3}(`{3,}|~{3,})(.*)", line)
        if fence:
            marker, rest = fence.groups()
            if not open_fence:
                # A backtick fence may not carry backticks in its info
                # string; such a line is an inline code span, not a fence.
                if marker[0] == "~" or "`" not in rest:
                    open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                open_fence = ""
            # A fence-like line can never be a heading itself.
            continue
        if open_fence:
            continue
        m = re.match(r"^(#{1,6})\s+(.*)", line)
        if m:
            headings.append({"level": len(m.group(1)), "text": m.group(2).strip()})
    return headings
def _extract_title(frontmatter: dict, body: str, basename: str) -> str:
    """Pick a document title.

    Precedence: a truthy ``frontmatter["title"]`` (converted with ``str``),
    then the first non-empty level-1 heading outside fenced code blocks,
    then ``basename``.

    Args:
        frontmatter: Parsed frontmatter mapping (may be empty).
        body: Markdown text without the frontmatter block.
        basename: Fallback title.

    Returns:
        The chosen title string.
    """
    fm_title = frontmatter.get("title")
    if fm_title:
        return str(fm_title)
    open_fence = ""  # marker of the fence currently open, "" when outside
    for line in body.splitlines():
        fence = re.match(r" {0,3}(`{3,}|~{3,})(.*)", line)
        if fence:
            marker, rest = fence.groups()
            if not open_fence:
                # A backtick fence may not carry backticks in its info
                # string; such a line is an inline code span, not a fence.
                if marker[0] == "~" or "`" not in rest:
                    open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                open_fence = ""
            continue
        if open_fence:
            # "# ..." inside a code sample is a comment, not a title.
            continue
        m = re.match(r"^#\s+(.*)", line)
        if m:
            heading = m.group(1).strip()
            # An empty H1 carries no title; keep looking / fall back.
            if heading:
                return heading
    return basename
def vault_knowledge_parse(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
) -> dict:
    """Parse one Markdown file of the vault and persist it when an index DB exists.

    Extracts frontmatter, title, headings and body text. If the index
    database file exists, upserts a row into ``knowledge_docs`` and refreshes
    the file's entry in ``files_fts`` within a single transaction.

    Args:
        vault_path: Path to the vault root.
        rel_path: Path of the Markdown file relative to the vault root.
        db_path: Optional override for the database path; defaults to
            ``<vault_path>/vault_index.db``.

    Returns:
        Dict with keys ``rel_path``, ``title``, ``frontmatter``, ``headings``,
        ``content_text`` and ``persisted`` (``True`` only when the database
        file existed and the writes were committed).

    Raises:
        RuntimeError: If the file does not exist, is not a regular file, or
            cannot be read.
        sqlite3.Error: If writing to the database fails (the transaction is
            rolled back first).
    """
    vault = Path(vault_path)
    # NOTE(review): rel_path is joined as-is, so an absolute path or ".."
    # segments can point outside the vault — confirm callers pass trusted input.
    md_file = vault / rel_path
    # is_file() rather than exists(): a directory would otherwise slip through
    # and fail later with an OSError instead of the documented RuntimeError.
    if not md_file.is_file():
        raise RuntimeError(f"vault_knowledge_parse: archivo no encontrado: {md_file}")
    db = Path(db_path) if db_path else vault / "vault_index.db"

    try:
        try:
            # utf-8-sig drops a leading BOM, which would otherwise hide the
            # "---" frontmatter fence at the start of the file.
            text = md_file.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte, so this fallback cannot fail to decode.
            text = md_file.read_text(encoding="latin-1", errors="replace")
    except OSError as exc:
        raise RuntimeError(
            f"vault_knowledge_parse: no se pudo leer el archivo: {md_file}"
        ) from exc

    frontmatter, body = _parse_frontmatter(text)
    headings = _extract_headings(body)
    title = _extract_title(frontmatter, body, md_file.stem)
    content_text = body

    # Persist into the index database, only when it already exists.
    persisted = False
    if db.exists():
        conn = sqlite3.connect(str(db))
        try:
            # The connection context manager commits on success and rolls
            # back if any statement raises; the exception then propagates.
            with conn:
                conn.execute(
                    """
                    INSERT INTO knowledge_docs(rel_path, title, frontmatter_json, headings_json, parsed_at)
                    VALUES (?, ?, ?, ?, ?)
                    ON CONFLICT(rel_path) DO UPDATE SET
                        title=excluded.title,
                        frontmatter_json=excluded.frontmatter_json,
                        headings_json=excluded.headings_json,
                        parsed_at=excluded.parsed_at
                    """,
                    (
                        rel_path,
                        title,
                        json.dumps(frontmatter, ensure_ascii=False),
                        json.dumps(headings, ensure_ascii=False),
                        int(time.time()),
                    ),
                )
                # Refresh files_fts; its rowid is meant to match the row in files.
                # NOTE(review): if files has no row for rel_path the subquery
                # yields NULL and the rowid will not match — confirm the file
                # is always registered in files before this runs.
                conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,))
                conn.execute(
                    """
                    INSERT INTO files_fts(rowid, rel_path, content_text)
                    VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?)
                    """,
                    (rel_path, rel_path, content_text),
                )
            persisted = True
        finally:
            conn.close()

    return {
        "rel_path": rel_path,
        "title": title,
        "frontmatter": frontmatter,
        "headings": headings,
        "content_text": content_text,
        "persisted": persisted,
    }