"""vault_knowledge_parse — Parsea un Markdown del vault y persiste en knowledge_docs.""" from __future__ import annotations import json import re import sqlite3 import time from pathlib import Path def _parse_frontmatter(text: str) -> tuple[dict, str]: """Separa YAML frontmatter del cuerpo. Retorna (frontmatter_dict, body).""" if not text.startswith("---\n") and not text.startswith("---\r\n"): return {}, text # Buscar cierre del frontmatter end = text.find("\n---", 4) if end == -1: return {}, text yaml_block = text[4:end].strip() body = text[end + 4:].lstrip("\n\r") try: import yaml fm = yaml.safe_load(yaml_block) or {} if not isinstance(fm, dict): fm = {} except Exception: fm = {} return fm, body def _extract_headings(body: str) -> list[dict]: """Extrae headings Markdown (# ... ### ...) del cuerpo.""" headings = [] for line in body.splitlines(): m = re.match(r"^(#{1,6})\s+(.*)", line) if m: headings.append({"level": len(m.group(1)), "text": m.group(2).strip()}) return headings def _extract_title(frontmatter: dict, body: str, basename: str) -> str: """Extrae título: frontmatter['title'] > primer H1 > basename.""" if frontmatter.get("title"): return str(frontmatter["title"]) for line in body.splitlines(): m = re.match(r"^#\s+(.*)", line) if m: return m.group(1).strip() return basename def vault_knowledge_parse( vault_path: str, rel_path: str, db_path: str | None = None, ) -> dict: """Parsea un archivo Markdown del vault: extrae frontmatter, título, headings y cuerpo. Args: vault_path: Ruta absoluta a la raiz del vault. rel_path: Ruta relativa al archivo Markdown dentro del vault. db_path: Override opcional de la ruta a vault_index.db. Returns: Dict con: rel_path, title, frontmatter, headings, content_text, persisted. Raises: RuntimeError: Si el archivo no existe o no se puede leer. """ vault = Path(vault_path) md_file = vault / rel_path if not md_file.exists(): raise RuntimeError(f"vault_knowledge_parse: archivo no encontrado: {md_file}") db = Path(db_path) if db_path else vault / "vault_index.db" try: text = md_file.read_text(encoding="utf-8") except UnicodeDecodeError: text = md_file.read_text(encoding="latin-1", errors="replace") frontmatter, body = _parse_frontmatter(text) headings = _extract_headings(body) basename = md_file.stem title = _extract_title(frontmatter, body, basename) content_text = body # Persistir en vault_index.db persisted = False if db.exists(): conn = sqlite3.connect(str(db)) try: now = int(time.time()) conn.execute( """ INSERT INTO knowledge_docs(rel_path, title, frontmatter_json, headings_json, parsed_at) VALUES (?, ?, ?, ?, ?) ON CONFLICT(rel_path) DO UPDATE SET title=excluded.title, frontmatter_json=excluded.frontmatter_json, headings_json=excluded.headings_json, parsed_at=excluded.parsed_at """, ( rel_path, title, json.dumps(frontmatter, ensure_ascii=False), json.dumps(headings, ensure_ascii=False), now, ), ) # Actualizar files_fts (rowid debe coincidir con files) conn.execute("DELETE FROM files_fts WHERE rel_path = ?", (rel_path,)) conn.execute( """ INSERT INTO files_fts(rowid, rel_path, content_text) VALUES ((SELECT rowid FROM files WHERE rel_path = ?), ?, ?) """, (rel_path, rel_path, content_text), ) conn.commit() persisted = True except Exception: conn.rollback() raise finally: conn.close() return { "rel_path": rel_path, "title": title, "frontmatter": frontmatter, "headings": headings, "content_text": content_text, "persisted": persisted, }