docs(flows): DoD obligatorio con user-facing surface + abrir issues 0100-0103 (taxonomia, frontmatter migration, dev_console, work dashboard)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 00:07:03 +02:00
parent a03675113a
commit 6ad82167bb
72 changed files with 3920 additions and 303 deletions
@@ -110,11 +110,23 @@ def validate_recipe_yaml(yaml_text: str) -> dict:
                            )

            sink = output.get("sink")
-            valid_sinks = {"data_factory.runs", "stdout", "json_file"}
+            # duckdb sink: requires output.duckdb_path (relative or absolute) and
+            # output.table (table name). Optional output.database_id (default =
+            # recipe_name + "_db") used to register/lookup in data_factory.databases.
+            valid_sinks = {"data_factory.runs", "stdout", "json_file", "duckdb"}
            if sink is not None and sink not in valid_sinks:
                errors.append(
                    f"Campo 'output.sink' debe ser uno de {sorted(valid_sinks)}, got '{sink}'."
                )
+            if sink == "duckdb":
+                if not output.get("duckdb_path"):
+                    errors.append(
+                        "Sink 'duckdb' requiere 'output.duckdb_path' (ruta al archivo .duckdb)."
+                    )
+                if not output.get("table"):
+                    errors.append(
+                        "Sink 'duckdb' requiere 'output.table' (nombre de la tabla destino)."
+                    )

    return {
        "valid": len(errors) == 0,
@@ -1,9 +1,28 @@
 """Invoca `claude -p` via subprocess y devuelve la respuesta como string."""

+import os
 import shutil
 import subprocess


+def _resolve_claude_bin() -> str | None:
+    """Locate the `claude` CLI executable.
+
+    Looks on PATH first (via `shutil.which`), then falls back to a fixed list
+    of conventional install locations.
+
+    Returns:
+        The path to the executable, or None when it is not on PATH and none of
+        the fallback candidates is an executable regular file.
+    """
+    p = shutil.which("claude")
+    if p:
+        return p
+    # Common fallback paths (e.g. a WSL subsession started without .profile loaded).
+    home = os.path.expanduser("~")
+    candidates = [
+        f"{home}/.local/bin/claude",
+        "/usr/local/bin/claude",
+        "/opt/homebrew/bin/claude",
+    ]
+    for c in candidates:
+        # Accept only regular files the current process is allowed to execute.
+        if os.path.isfile(c) and os.access(c, os.X_OK):
+            return c
+    return None
+
+
 def claude_cli_prompt(
    prompt: str,
    timeout_s: int = 60,
@@ -24,16 +43,18 @@ def claude_cli_prompt(
        Respuesta de Claude como texto (stdout), truncada a max_chars_response.

    Raises:
-        FileNotFoundError: Si `claude` no esta en PATH.
+        FileNotFoundError: Si `claude` no esta en PATH ni rutas convencionales.
        RuntimeError: Si exit code != 0 (incluye primeros 500 chars de stderr).
        subprocess.TimeoutExpired: Si la llamada supera timeout_s segundos.
    """
-    if shutil.which("claude") is None:
+    claude_bin = _resolve_claude_bin()
+    if claude_bin is None:
        raise FileNotFoundError(
-            "'claude' CLI no encontrado en PATH. Instala Claude Code."
+            "'claude' CLI no encontrado en PATH ni rutas convencionales "
+            "(~/.local/bin, /usr/local/bin, /opt/homebrew/bin). Instala Claude Code."
        )

-    cmd = ["claude", "-p", prompt]
+    cmd = [claude_bin, "-p", prompt]
    if model:
        cmd.extend(["--model", model])
    if extra_args:
@@ -0,0 +1,75 @@
+---
+name: codegen_app_modules
+kind: function
+lang: py
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int"
+description: "Reads app.md uses_modules + modules/<name>/module.md frontmatters, emits <app>_modules_generated.cpp with fn::app_modules_array[] + fn::app_modules_count. CMake hook for add_imgui_app. Pure YAML parsing, no registry.db dep."
+tags: [codegen, modules, cmake, cpp, build]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports:
+  - yaml
+example: |
+  python python/functions/infra/codegen_app_modules.py \
+    --app-md apps/data_factory/app.md \
+    --modules-root modules \
+    --app-name data_factory \
+    --out cpp/build/apps/data_factory/data_factory_modules_generated.cpp
+file_path: "python/functions/infra/codegen_app_modules.py"
+params:
+  - name: app_md
+    desc: "Path absoluto al app.md de la app consumidora. Lee uses_modules del frontmatter YAML."
+  - name: modules_root
+    desc: "Raiz del directorio modules/. Cada modulo es modules/<name>/module.md."
+  - name: app_name
+    desc: "Nombre de la app (solo para el comment-header del .cpp generado)."
+  - name: out_path
+    desc: "Path donde escribir el .cpp generado. Idempotente: skip si contenido coincide."
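+output: "generate() retorna 0 si OK o 2 si algun modulo declarado no existe (warning). El CLI convierte ese 2 en exit code 0 para no romper el build; los errores reales se propagan como excepcion (exit code distinto de 0)."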
+---
+
+## Ejemplo
+
+Generar el .cpp para `data_factory`:
+
+```bash
+python python/functions/infra/codegen_app_modules.py \
+  --app-md apps/data_factory/app.md \
+  --modules-root modules \
+  --app-name data_factory \
+  --out /tmp/data_factory_modules_generated.cpp
+```
+
+Si `apps/data_factory/app.md` declara `uses_modules: [data_table_cpp]`, el .cpp generado es:
+
+```cpp
+// Auto-generated by codegen_app_modules.py — do not edit.
+// App: data_factory
+// Source of truth: apps/data_factory/app.md (uses_modules)
+
+#include "app_modules.h"
+
+namespace fn {
+const ModuleInfo app_modules_array[] = {
+    { "data_table", "1.4.0", "Reusable C++ ImGui module..." },
+};
+const unsigned long app_modules_count = 1;
+} // namespace fn
+```
+
+## Cuando usarla
+
+CMake hook automatico — la macro `add_imgui_app` la invoca al configurar el build. Apps no la llaman manualmente. Manual override: solo si quieres regenerar fuera del flujo cmake (debugging).
+
+## Gotchas
+
+- Resuelve `<name>_cpp` strippeando el sufijo `_cpp/_py/_ts/_bash/_go`. Mismo patron que `GenerateModuleID`.
+- Si un modulo declarado en `uses_modules` no existe, emite warning a stderr y `generate()` retorna 2; el CLI convierte ese 2 en exit code 0, por eso no falla el build.
+- Idempotente: solo reescribe si el contenido cambia. Evita rebuilds innecesarios cuando los modulos no cambiaron.
+- Requiere `pyyaml`. Disponible en `python/.venv` del registry.
@@ -0,0 +1,149 @@
+"""Generate <app>_modules_generated.cpp from app.md uses_modules + modules/*/module.md.
+
+Stand-alone — no dependencies beyond PyYAML. Invoked from CMake at configure time.
+Reads YAML frontmatter directly (no registry.db dependency, no Go binary).
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+
+def _read_frontmatter(md_path: Path) -> dict:
+    """Parse the leading YAML frontmatter of a markdown file.
+
+    The file must start with a `---` line; the block ends at the next line
+    that begins with `---`.
+
+    Args:
+        md_path: Path of the markdown file to read (UTF-8).
+
+    Returns:
+        The frontmatter mapping, or {} when the file does not exist, has no
+        frontmatter delimiters, contains invalid YAML, or its YAML root is not
+        a mapping.
+    """
+    if not md_path.exists():
+        return {}
+    text = md_path.read_text(encoding="utf-8")
+    if not text.startswith(("---\n", "---\r\n")):
+        return {}
+    end = text.find("\n---", 4)
+    if end < 0:
+        return {}
+    raw = text[4:end]
+    try:
+        data = yaml.safe_load(raw)
+    except yaml.YAMLError:
+        return {}
+    # safe_load may yield a list or scalar for odd frontmatter; callers rely on
+    # `.get`, so anything that is not a mapping counts as "no frontmatter".
+    return data if isinstance(data, dict) else {}
+
+
+def _escape_c_string(s: str) -> str:
+    """Escape text for embedding inside a double-quoted C/C++ string literal.
+
+    Backslash, double quote, newline, carriage return and tab use their
+    mnemonic escapes. Other control characters (code point < 32) are written
+    as fixed-width three-digit octal escapes.
+
+    Args:
+        s: Text to escape. None yields ""; any other non-string value (e.g. a
+            version that YAML parsed as a number) is converted with `str`.
+
+    Returns:
+        The escaped text, without the surrounding quotes.
+    """
+    text = "" if s is None else str(s)
+    out = []
+    for ch in text:
+        if ch == "\\":
+            out.append("\\\\")
+        elif ch == '"':
+            out.append('\\"')
+        elif ch == "\n":
+            out.append("\\n")
+        elif ch == "\r":
+            out.append("\\r")
+        elif ch == "\t":
+            out.append("\\t")
+        elif ord(ch) < 32:
+            # Octal, not hex: a C `\x` escape swallows every following hex
+            # digit, so "\x01" followed by "a" would be read as one escape.
+            # An octal escape stops after three digits.
+            out.append(f"\\{ord(ch):03o}")
+        else:
+            out.append(ch)
+    return "".join(out)
+
+
+def _resolve_module(modules_root: Path, mod_id: str) -> Optional[dict]:
+    """Look up the metadata of one module referenced from `uses_modules`.
+
+    `mod_id` is e.g. `data_table_cpp`: the first matching language suffix
+    (`_cpp`, `_py`, `_ts`, `_bash`, `_go`) is stripped and the frontmatter of
+    `<modules_root>/<name>/module.md` is read.
+
+    Returns:
+        A dict with `name`, `version` and `description` (defaulting to the
+        stripped name, "0.0.0" and "" when the frontmatter lacks the key), or
+        None when the frontmatter is missing, unparsable or empty.
+    """
+    name = mod_id
+    for suffix in ("_cpp", "_py", "_ts", "_bash", "_go"):
+        if name.endswith(suffix):
+            name = name[: -len(suffix)]
+            # Strip at most one suffix.
+            break
+    md = modules_root / name / "module.md"
+    fm = _read_frontmatter(md)
+    if not fm:
+        return None
+    return {
+        "name": fm.get("name", name),
+        "version": fm.get("version", "0.0.0"),
+        "description": fm.get("description", ""),
+    }
+
+
+def generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int:
+    """Write the generated modules .cpp for one app.
+
+    Reads `uses_modules` from the frontmatter of `app_md`, resolves each id
+    with `_resolve_module`, and renders a translation unit that defines
+    `fn::app_modules_array` and `fn::app_modules_count`. The file is only
+    rewritten when its content would change.
+
+    Args:
+        app_md: Path to the consuming app's app.md.
+        modules_root: Root directory holding `<name>/module.md` per module.
+        app_name: App name, used only in the generated comment header.
+        out_path: Destination of the generated .cpp; parent dirs are created.
+
+    Returns:
+        0 when every declared module was resolved, 2 when at least one was not
+        found (the file is still generated from the modules that were found).
+    """
+    fm = _read_frontmatter(app_md)
+    uses_modules = fm.get("uses_modules") or []
+    if not isinstance(uses_modules, list):
+        uses_modules = []
+
+    entries: list[dict] = []
+    missing: list[str] = []
+    for mid in uses_modules:
+        info = _resolve_module(modules_root, str(mid))
+        if info is None:
+            missing.append(str(mid))
+        else:
+            entries.append(info)
+
+    lines: list[str] = [
+        "// Auto-generated by codegen_app_modules.py — do not edit.",
+        f"// App: {app_name}",
+        f"// Source of truth: {app_md.as_posix()} (uses_modules)",
+        "",
+        '#include "app_modules.h"',
+        "",
+        "namespace fn {",
+    ]
+    if entries:
+        lines.append("const ModuleInfo app_modules_array[] = {")
+        for e in entries:
+            lines.append(
+                '    { "%s", "%s", "%s" },'
+                % (
+                    _escape_c_string(e["name"]),
+                    _escape_c_string(e["version"]),
+                    _escape_c_string(e["description"]),
+                )
+            )
+        lines.append("};")
+        lines.append(f"const unsigned long app_modules_count = {len(entries)};")
+    else:
+        # Keep the symbol defined with a single null entry when no module resolved.
+        lines.append("const ModuleInfo app_modules_array[1] = { { nullptr, nullptr, nullptr } };")
+        lines.append("const unsigned long app_modules_count = 0;")
+    lines.append("} // namespace fn")
+    lines.append("")
+
+    new_content = "\n".join(lines)
+
+    rc = 2 if missing else 0
+    if missing:
+        # Warn on every run, including when the output file is already up to
+        # date; otherwise a repeated build hides the missing module.
+        sys.stderr.write(
+            f"codegen_app_modules: WARNING — module(s) not found: {', '.join(missing)} "
+            f"(app {app_name})\n"
+        )
+
+    # Idempotent: skip the rewrite when the content matches, so the file's
+    # mtime does not change and dependents are not rebuilt.
+    if out_path.exists() and out_path.read_text(encoding="utf-8") == new_content:
+        return rc
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(new_content, encoding="utf-8")
+    return rc
+
+
+def main() -> int:
+    """CLI entry point: parse arguments, run `generate`, return the exit code.
+
+    A `generate` result of 2 (some declared module was not found) is reported
+    as exit code 0; any other non-zero result is returned unchanged.
+    """
+    ap = argparse.ArgumentParser(description="Generate <app>_modules_generated.cpp from app.md")
+    ap.add_argument("--app-md", required=True, help="Path to app.md")
+    ap.add_argument("--modules-root", required=True, help="Path to modules/ root")
+    ap.add_argument("--app-name", required=True, help="App name (for comment header)")
+    ap.add_argument("--out", required=True, help="Output path for generated .cpp")
+    args = ap.parse_args()
+
+    rc = generate(
+        app_md=Path(args.app_md),
+        modules_root=Path(args.modules_root),
+        app_name=args.app_name,
+        out_path=Path(args.out),
+    )
+    # A missing-module warning (2) must not surface as a failing exit code.
+    return 0 if rc in (0, 2) else rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,72 @@
+---
+name: export_hub_manifest
+kind: function
+lang: py
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict"
+description: "Genera el TSV sidecar para app_hub_launcher: consulta registry.db por todas las apps cpp/imgui, lee su app.md para extraer nombre, descripcion y accent_hex, y escribe un archivo TSV con cabecera a out_path. Retorna {ok, count, out_path}."
+tags: [hub, launcher, manifest, suite, cpp-windows]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [sqlite3, yaml, pathlib]
+params:
+  - name: out_path
+    desc: "Ruta de destino del archivo TSV. Puede ser absoluta o relativa al cwd. El directorio padre se crea si no existe."
+  - name: registry_root
+    desc: "Raiz del fn_registry. Si None, usa la variable de entorno FN_REGISTRY_ROOT o /home/lucas/fn_registry como fallback."
+output: "Dict {ok: True, count: N, out_path: str} con la ruta absoluta del TSV escrito y el numero de apps incluidas."
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/infra/export_hub_manifest.py"
+---
+
+## Ejemplo
+
+```bash
+# Uso directo con fn run (la salida JSON se imprime en stdout)
+./fn run export_hub_manifest_py_infra /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
+```
+
+```python
+# Desde un heredoc o pipeline Python
+import sys
+sys.path.insert(0, "python/functions")
+from infra import export_hub_manifest
+
+result = export_hub_manifest(
+    "/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv"
+)
+print(result)
+# {'ok': True, 'count': 12, 'out_path': '/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv'}
+```
+
+```bash
+# Ver el contenido del TSV generado
+head -5 /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
+# name          display_name        description                         accent_hex
+# chart_demo    Chart Demo          Demo ImGui de primitivos viz...     #0ea5e9
+# dag_engine_ui Dag Engine Ui       Motor de DAGs con frontend...       #f59e0b
+```
+
+## Cuando usarla
+
+Antes de desplegar `app_hub_launcher` a Windows: genera el `hub_manifest.tsv` que el hub lee al arrancar para listar y colorear los botones de cada app. El hub en runtime no tiene acceso a `registry.db` ni a los `app.md` del WSL, por lo que necesita este sidecar. Ejecutar tras añadir o modificar una app C++ imgui en el registry.
+
+## Gotchas
+
+- **PyYAML en el venv**: requiere `yaml` disponible en `python/.venv`. Ya instalado por defecto. Si falta: `cd python && uv pip install pyyaml`.
+- **app.md faltante no aborta**: si un `app.md` no existe o tiene frontmatter malformado, la app sigue apareciendo en el TSV con `description` vacía y accent `#64748b` (slate). Se imprime un WARN a stderr.
+- **Filtro estricto `lang='cpp' AND framework='imgui'`**: solo apps C++ con el shell `fn::run_app`. Apps Python, Bash o C++ sin imgui quedan excluidas. Correcto para el hub.
+- **La ruta `dir_path` en registry.db es relativa a la raiz del registry**: la funcion la combina con `registry_root` para construir el path absoluto al `app.md`. Si una app tiene `dir_path` incorrecto en su `app.md`, el WARN indicara cual falló.
+- **TSV UTF-8**: el hub debe abrir el archivo con encoding UTF-8. Los tabs y saltos de linea (`\n`) de cada campo se reemplazan por un espacio y los retornos de carro (`\r`) se eliminan.
+- **`display_name` es generado, no leido**: se deriva del `name` de la app convirtiendo snake_case a Title Case. No se puede personalizar desde el `app.md` en esta version.
+
+## Capability growth log
+
+*(sin cambios desde v1.0.0)*
@@ -0,0 +1,142 @@
+"""export_hub_manifest — genera el TSV sidecar para app_hub_launcher."""
+
+from __future__ import annotations
+
+import os
+import sqlite3
+import sys
+from pathlib import Path
+from typing import Any
+
+
+def _read_frontmatter(md_path: Path) -> dict[str, Any]:
+    """Parse YAML frontmatter from a .md file.
+
+    Returns {} when the file does not start with `---`, has no closing `---`
+    line, or its YAML root is not a mapping. Any exception (a failed `yaml`
+    import, a read error, invalid YAML) is reported as a WARN on stderr and
+    also yields {}.
+    """
+    try:
+        import yaml  # PyYAML — available in python/.venv
+
+        text = md_path.read_text(encoding="utf-8")
+        if not text.startswith("---"):
+            return {}
+        # Find the closing ---
+        end = text.find("\n---", 3)
+        if end == -1:
+            return {}
+        yaml_block = text[3:end].strip()
+        data = yaml.safe_load(yaml_block)
+        return data if isinstance(data, dict) else {}
+    except Exception as exc:
+        print(f"[export_hub_manifest] WARN: could not parse {md_path}: {exc}", file=sys.stderr)
+        return {}
+
+
+def _snake_to_display(name: str) -> str:
+    """Convert snake_case name to Title Case With Spaces.
+
+    Each underscore-separated part goes through `str.capitalize`, which also
+    lower-cases the rest of the part (hence `ui` -> `Ui`, never `UI`).
+
+    Examples:
+        graph_explorer   -> Graph Explorer
+        dag_engine_ui    -> Dag Engine Ui
+        app_hub_launcher -> App Hub Launcher
+    """
+    return " ".join(part.capitalize() for part in name.split("_"))
+
+
+def export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict:
+    """Generate TSV sidecar manifest for app_hub_launcher.
+
+    Queries registry.db for all cpp/imgui apps, reads each app.md frontmatter
+    for the description and accent color, then writes a UTF-8 TSV with a
+    header row to out_path. An app whose app.md is missing or has no usable
+    frontmatter is still listed, with an empty description and the default
+    accent, and a WARN is printed to stderr.
+
+    Args:
+        out_path: Destination path for the TSV manifest file. Parent
+            directories are created when needed.
+        registry_root: Path to the fn_registry root directory.
+            Defaults to FN_REGISTRY_ROOT env var or /home/lucas/fn_registry.
+
+    Returns:
+        {"ok": True, "count": N, "out_path": "<abs_path>"}
+
+    Raises:
+        FileNotFoundError: If registry.db does not exist under the root.
+    """
+    root = Path(
+        registry_root
+        or os.environ.get("FN_REGISTRY_ROOT", "/home/lucas/fn_registry")
+    ).resolve()
+
+    db_path = root / "registry.db"
+    if not db_path.exists():
+        raise FileNotFoundError(f"registry.db not found at {db_path}")
+
+    con = sqlite3.connect(str(db_path))
+    con.row_factory = sqlite3.Row
+    try:
+        rows = con.execute(
+            "SELECT id, name, dir_path FROM apps WHERE lang='cpp' AND framework='imgui' ORDER BY name"
+        ).fetchall()
+    finally:
+        con.close()
+
+    DEFAULT_ACCENT = "#64748b"
+    TSV_HEADER = "name\tdisplay_name\tdescription\taccent_hex\n"
+
+    def clean(value: object) -> str:
+        # TSV values must not contain tabs or newlines. Frontmatter scalars may
+        # come back from YAML as non-strings (e.g. a numeric accent), so coerce
+        # to str before replacing.
+        return str(value).replace("\t", " ").replace("\n", " ").replace("\r", "")
+
+    lines: list[str] = [TSV_HEADER]
+    count = 0
+
+    for row in rows:
+        app_name: str = row["name"]
+        dir_path: str = row["dir_path"]
+
+        # Derive defaults in case app.md is missing / malformed
+        display_name = _snake_to_display(app_name)
+        description = ""
+        accent_hex = DEFAULT_ACCENT
+
+        md_path = root / dir_path / "app.md"
+        if md_path.exists():
+            fm = _read_frontmatter(md_path)
+            if fm:
+                description = fm.get("description", "") or ""
+                icon_block = fm.get("icon")
+                if isinstance(icon_block, dict):
+                    accent_hex = icon_block.get("accent", DEFAULT_ACCENT) or DEFAULT_ACCENT
+            else:
+                print(
+                    f"[export_hub_manifest] WARN: empty/malformed frontmatter in {md_path}",
+                    file=sys.stderr,
+                )
+        else:
+            print(
+                f"[export_hub_manifest] WARN: app.md missing for {app_name} at {md_path}",
+                file=sys.stderr,
+            )
+
+        lines.append(
+            f"{clean(app_name)}\t{clean(display_name)}\t{clean(description)}\t{clean(accent_hex)}\n"
+        )
+        count += 1
+
+    out = Path(out_path).resolve()
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text("".join(lines), encoding="utf-8")
+
+    return {"ok": True, "count": count, "out_path": str(out)}
+
+
+if __name__ == "__main__":
+    # CLI wrapper: export the manifest and print the result dict as JSON on stdout.
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(
+        description="Export hub manifest TSV for app_hub_launcher."
+    )
+    parser.add_argument("out_path", help="Destination .tsv file path")
+    parser.add_argument(
+        "--registry-root",
+        default=None,
+        help="Path to fn_registry root (default: FN_REGISTRY_ROOT env or /home/lucas/fn_registry)",
+    )
+    args = parser.parse_args()
+
+    result = export_hub_manifest(args.out_path, registry_root=args.registry_root)
+    print(json.dumps(result, indent=2))
@@ -3,7 +3,7 @@ name: cdp_extract_recipe
 kind: pipeline
 lang: py
 domain: pipelines
-version: "1.0.0"
+version: "1.2.0"
 purity: impure
 signature: "def cdp_extract_recipe(recipe_path: str, debug_port: int = 9222, tab_id: str | None = None, record_run: bool = True) -> dict"
 description: "Ejecuta una recipe YAML contra Chrome remoto via CDP. Valida recipe, busca tab por url_pattern, ejecuta steps (wait_selector/js) y envia resultado al sink declarado."
@@ -22,7 +22,7 @@ params:
  - name: tab_id
    desc: "ID del tab a usar. Si None, busca tab cuyo URL matchee url_pattern de la recipe."
  - name: record_run
-    desc: "Si True y output.sink=='data_factory.runs', registra la ejecucion en data_factory."
+    desc: "Si True, registra la ejecucion en data_factory.runs (para sink 'data_factory.runs' y 'duckdb')."
 output: "dict {status: ok|error, rows_out: int, kb_out: float, duration_ms: int, error: str, sample_rows: list}"
 tested: false
 tests: []
@@ -60,6 +60,10 @@ output:

 Cuando tienes una recipe YAML validada y Chrome corriendo con remote debugging, y quieres extraer datos en un solo paso sin montar pipeline manualmente. Encadena con `cdp_open_url_and_wait` si necesitas abrir la URL primero.

+## Capability growth log
+
+- v1.2.0 (2026-05-16) — sink `duckdb` writes rows to a DuckDB file + registers run in data_factory.runs with storage_db_id/storage_table for traceability.
+
 ## Gotchas

 - Chrome debe estar corriendo con `--remote-debugging-port=<debug_port>`.
@@ -41,9 +41,14 @@ def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 1


 def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
-    """Polling cada 200ms hasta que document.querySelector(selector) no sea null."""
+    """Polling cada 200ms hasta que document.querySelector(selector) no sea null.
+
+    Drena eventos CDP (paginas con Page.enable emiten loads, frames, etc.) y
+    matchea por `id` para evitar leer respuestas ajenas o eventos del server.
+    """
    deadline = time.time() + timeout_s
    msg_id = 1000
+    ws.settimeout(0.5)
    while time.time() < deadline:
        ws.send(json.dumps({
            "id": msg_id,
@@ -53,19 +58,28 @@ def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
                "returnByValue": True,
            }
        }))
-        time.sleep(0.2)
-        msg_id += 1
-        # Leer respuesta en loop simple (websocket-client sync)
-        # Para modo sync usamos recv()
-        try:
-            raw = ws.sock.recv()
-            if raw:
+        # Leer hasta 30 frames buscando uno con nuestro id; ignorar eventos.
+        got_response = False
+        for _ in range(30):
+            try:
+                raw = ws.recv()
+            except Exception:
+                break
+            if not raw:
+                break
+            try:
                msg = json.loads(raw)
+            except Exception:
+                continue
+            if msg.get("id") == msg_id:
+                got_response = True
                val = msg.get("result", {}).get("result", {}).get("value", False)
                if val:
                    return True
-        except Exception:
-            pass
+                break
+        msg_id += 1
+        if not got_response:
+            time.sleep(0.2)
    return False


@@ -188,16 +202,114 @@ def cdp_extract_recipe(
        out_path = output_cfg.get("path", "output.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(rows, f, ensure_ascii=False, indent=2)
+    elif sink == "duckdb":
+        duckdb_path = output_cfg.get("duckdb_path", "")
+        table_name  = output_cfg.get("table", "")
+        if not duckdb_path or not table_name:
+            # not fatal: rows already returned via sample_rows
+            pass
+        else:
+            import duckdb
+            import uuid
+            import datetime
+            # resolve duckdb_path relative to FN_REGISTRY_ROOT if not absolute
+            if not os.path.isabs(duckdb_path):
+                duckdb_path = os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), duckdb_path)
+            os.makedirs(os.path.dirname(duckdb_path), exist_ok=True)
+            conn = duckdb.connect(duckdb_path)
+            try:
+                if rows:
+                    # Detect columns from first row keys (assumes list of dicts).
+                    if not isinstance(rows[0], dict):
+                        # Fallback: wrap scalar rows as {"value": v}.
+                        rows = [{"value": r} for r in rows]
+                    cols = list(rows[0].keys())
+                    # Build CREATE TABLE IF NOT EXISTS with VARCHAR for safety
+                    # plus extracted_at TIMESTAMP and run_id VARCHAR for lineage.
+                    col_defs = ", ".join(f'"{c}" VARCHAR' for c in cols)
+                    ddl = (
+                        f'CREATE TABLE IF NOT EXISTS "{table_name}" ('
+                        f'  run_id VARCHAR, extracted_at TIMESTAMP, {col_defs}'
+                        f')'
+                    )
+                    conn.execute(ddl)
+                    run_id_str = uuid.uuid4().hex[:16]
+                    now_iso = datetime.datetime.utcnow().isoformat() + "Z"
+                    placeholders = ", ".join(["?"] * (len(cols) + 2))
+                    insert_sql = (
+                        f'INSERT INTO "{table_name}" '
+                        f'(run_id, extracted_at, {", ".join(chr(34) + c + chr(34) for c in cols)}) '
+                        f'VALUES ({placeholders})'
+                    )
+                    for r in rows:
+                        vals = [run_id_str, now_iso] + [str(r.get(c, "")) for c in cols]
+                        conn.execute(insert_sql, vals)
+                    # Also record into data_factory.runs with storage info
+                    registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
+                    if registry_root and record_run:
+                        import sqlite3
+                        df_db = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
+                        if os.path.exists(df_db):
+                            try:
+                                df_conn = sqlite3.connect(df_db)
+                                df_conn.execute("PRAGMA foreign_keys = ON")
+                                trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
+                                db_id = output_cfg.get("database_id", recipe.get("name", "unknown") + "_db")
+                                df_run_id = uuid.uuid4().hex[:16]
+                                df_conn.execute(
+                                    "INSERT INTO runs(id, node_id, started_at, finished_at, status,"
+                                    " rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes,"
+                                    " storage_db_id, storage_table)"
+                                    " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                                    (
+                                        df_run_id, recipe.get("name", "unknown"),
+                                        now_iso, now_iso, "success",
+                                        0, rows_out, 0, int(round(kb_out)), duration_ms,
+                                        trigger, "",
+                                        json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
+                                        db_id, table_name,
+                                    ),
+                                )
+                                df_conn.commit()
+                                df_conn.close()
+                            except Exception:
+                                pass
+            finally:
+                conn.close()
    elif sink == "data_factory.runs" and record_run:
+        # Escribe DIRECTO a data_factory.db evitando spawn `fn run` (loop infinito
+        # si data_factory_record_run re-ejecuta esta misma funcion). Confia en que
+        # el node ya existe en `nodes` con id == recipe.name.
        try:
-            from pipelines.data_factory_record_run import data_factory_record_run
-            data_factory_record_run(
-                node_id=recipe.get("name", "unknown"),
-                function_id="cdp_extract_recipe_py_pipelines",
-                args={"recipe_path": recipe_path, "debug_port": debug_port},
+            import sqlite3
+            import datetime
+            import uuid
+            registry_root = os.environ.get("FN_REGISTRY_ROOT", "").strip()
+            if not registry_root:
+                # No fatal — el dato ya fue extraido / impreso por otro sink
+                raise RuntimeError("FN_REGISTRY_ROOT not set; cannot locate data_factory.db")
+            db_path = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
+            trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
+            run_id = uuid.uuid4().hex[:16]
+            now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+            node_id = recipe.get("name", "unknown")
+            conn = sqlite3.connect(db_path)
+            conn.execute("PRAGMA foreign_keys = ON")
+            conn.execute(
+                "INSERT INTO runs(id, node_id, started_at, finished_at, status,"
+                " rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes)"
+                " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                (
+                    run_id, node_id, now, now, "success",
+                    0, rows_out, 0, int(round(kb_out)), duration_ms,
+                    trigger, "",
+                    json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
+                ),
            )
-        except Exception as e:
-            # No fatal — el dato ya fue extraido
+            conn.commit()
+            conn.close()
+        except Exception:
+            # No fatal — el dato ya fue extraido (sample_rows en retorno)
            pass

    return {
@@ -0,0 +1,60 @@
+---
+name: dedup_duckdb_table_by_hash
+kind: pipeline
+lang: py
+domain: pipelines
+purity: impure
+version: "1.0.0"
+signature: "def dedup_duckdb_table_by_hash(duckdb_path: str, table: str, exclude_cols: list[str] | None = None) -> dict"
+description: "Elimina filas duplicadas de una tabla DuckDB calculando un md5 de las columnas de datos. Anade columna row_hash idempotentemente, actualiza hashes nulos y borra duplicados conservando la primera insercion por rowid."
+tags: [dedup, duckdb, transformer, pipeline, dataops]
+uses_functions: [cdp_extract_recipe_py_pipelines]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: error_go_core
+imports: [duckdb]
+tested: true
+tests:
+  - "dedup elimina filas duplicadas y conserva unicas"
+test_file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash_test.py"
+file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash.py"
+params:
+  - name: duckdb_path
+    desc: "Ruta DuckDB file (absoluta o relativa a FN_REGISTRY_ROOT)."
+  - name: table
+    desc: "Nombre tabla a deduplicar."
+  - name: exclude_cols
+    desc: "Cols a excluir del hash (metadata como run_id, extracted_at, row_hash). None usa default [run_id, extracted_at, row_hash]."
+output: "dict {status, rows_before, rows_after, dedup_removed, duration_ms, hash_column}"
+---
+
+## Ejemplo
+
+```python
+from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
+
+r = dedup_duckdb_table_by_hash("apps/data_factory/data/hn_top_stories.duckdb", "hn_stories")
+print(r)
+# {"status": "ok", "rows_before": 120, "rows_after": 30, "dedup_removed": 90, "duration_ms": 45, "hash_column": "row_hash"}
+```
+
+CLI directo:
+
+```bash
+/home/lucas/fn_registry/python/.venv/bin/python3 \
+  python/functions/pipelines/dedup_duckdb_table_by_hash.py \
+  apps/data_factory/data/hn_top_stories.duckdb hn_stories
+```
+
+## Cuando usarla
+
+Cuando un extractor periodico re-inserta filas iguales (mismo contenido, distinto `run_id`/`extracted_at`) y quieres deduplicar in-place sin tocar el pipeline upstream. Tipicamente como paso `transformer` despues de `cdp_extract_recipe` en un DAG de scraping.
+
+## Gotchas
+
+- **rowid y VACUUM**: DuckDB rowid puede recalcularse tras `VACUUM`. En esta funcion solo se usa dentro de la misma transaccion de DELETE, por lo que no hay inconsistencia practica.
+- **Colisiones md5**: md5 no colisiona en practica para tablas de escala HN (miles de filas). Si la tabla crece a millones de filas con datos binarios, cambiar `md5(...)` por `sha256(...)` en el SQL.
+- **Tabla inexistente**: si `<table>` no existe en el DuckDB, retorna `status=error` con mensaje descriptivo en lugar de lanzar excepcion.
+- **exclude_cols case**: la comparacion de columnas excluidas es case-insensitive (`c.lower()`), pero el nombre en la query se usa tal cual lo devuelve `DESCRIBE`.
+- **Re-ejecucion**: si la tabla ya tiene `row_hash` de una ejecucion anterior, solo se actualizan las filas con `row_hash IS NULL` (idempotente).
@@ -0,0 +1,141 @@
+"""dedup_duckdb_table_by_hash — Remove duplicate rows from a DuckDB table using md5 hash of data columns."""
+
+from __future__ import annotations
+
+import os
+import time
+from typing import Any
+
+
+def dedup_duckdb_table_by_hash(
+    duckdb_path: str,
+    table: str,
+    exclude_cols: list[str] | None = None,
+) -> dict[str, Any]:
+    """Remove duplicate rows from a DuckDB table by computing md5 hash of data columns.
+
+    Args:
+        duckdb_path: Path to DuckDB file. Absolute or relative to FN_REGISTRY_ROOT.
+        table: Table name to deduplicate.
+        exclude_cols: Columns to exclude from hash computation (metadata cols).
+                      Defaults to ["run_id", "extracted_at", "row_hash"].
+
+    Returns:
+        dict with keys: status, rows_before, rows_after, dedup_removed,
+        duration_ms, hash_column.
+    """
+    import duckdb  # type: ignore
+
+    t0 = time.monotonic()
+
+    # Resolve path against FN_REGISTRY_ROOT if relative
+    if not os.path.isabs(duckdb_path):
+        root = os.environ.get("FN_REGISTRY_ROOT", os.getcwd())
+        duckdb_path = os.path.join(root, duckdb_path)
+
+    if exclude_cols is None:
+        exclude_cols = ["run_id", "extracted_at", "row_hash"]
+
+    exclude_set = {c.lower() for c in exclude_cols}
+
+    conn = duckdb.connect(duckdb_path)
+    try:
+        # Verify table exists
+        tables = [r[0] for r in conn.execute("SHOW TABLES").fetchall()]
+        if table not in tables:
+            return {
+                "status": "error",
+                "error": f"Table '{table}' not found in {duckdb_path}. Available: {tables}",
+                "rows_before": 0,
+                "rows_after": 0,
+                "dedup_removed": 0,
+                "duration_ms": int((time.monotonic() - t0) * 1000),
+                "hash_column": "row_hash",
+            }
+
+        # Introspect columns
+        desc = conn.execute(f'DESCRIBE "{table}"').fetchall()
+        all_cols = [r[0] for r in desc]
+        existing_col_names_lower = {c.lower() for c in all_cols}
+
+        # Add row_hash column if missing (idempotent)
+        if "row_hash" not in existing_col_names_lower:
+            conn.execute(f'ALTER TABLE "{table}" ADD COLUMN row_hash VARCHAR')
+            all_cols.append("row_hash")
+            existing_col_names_lower.add("row_hash")
+
+        # Data columns = all columns minus excluded
+        data_cols = [c for c in all_cols if c.lower() not in exclude_set]
+
+        if not data_cols:
+            return {
+                "status": "error",
+                "error": "No data columns remaining after exclusion.",
+                "rows_before": 0,
+                "rows_after": 0,
+                "dedup_removed": 0,
+                "duration_ms": int((time.monotonic() - t0) * 1000),
+                "hash_column": "row_hash",
+            }
+
+        # Build md5 expression: md5(col1 || '\t' || col2 || ...)
+        # Each col: COALESCE(CAST("colname" AS VARCHAR), '')
+        parts = " || '\t' || ".join(
+            f"COALESCE(CAST(\"{c}\" AS VARCHAR), '')" for c in data_cols
+        )
+        hash_expr = f"md5({parts})"
+
+        # Update row_hash where NULL
+        conn.execute(
+            f'UPDATE "{table}" SET row_hash = {hash_expr} WHERE row_hash IS NULL'
+        )
+
+        # Count rows before dedup
+        rows_before = conn.execute(f'SELECT count(*) FROM "{table}"').fetchone()[0]
+
+        # Delete duplicates, keeping row with smallest rowid (earliest insert)
+        conn.execute(
+            f"""
+            DELETE FROM "{table}"
+            WHERE rowid NOT IN (
+                SELECT min(rowid) FROM "{table}" GROUP BY row_hash
+            )
+            """
+        )
+
+        # Count rows after dedup
+        rows_after = conn.execute(f'SELECT count(*) FROM "{table}"').fetchone()[0]
+
+    finally:
+        conn.close()
+
+    duration_ms = int((time.monotonic() - t0) * 1000)
+    dedup_removed = rows_before - rows_after
+
+    return {
+        "status": "ok",
+        "rows_before": rows_before,
+        "rows_after": rows_after,
+        "dedup_removed": dedup_removed,
+        "duration_ms": duration_ms,
+        "hash_column": "row_hash",
+    }
+
+
+if __name__ == "__main__":
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Dedup a DuckDB table by row hash.")
+    parser.add_argument("duckdb_path", help="Path to DuckDB file")
+    parser.add_argument("table", help="Table name to deduplicate")
+    parser.add_argument(
+        "--exclude-cols",
+        nargs="*",
+        default=None,
+        help="Columns to exclude from hash (default: run_id extracted_at row_hash)",
+    )
+    args = parser.parse_args()
+
+    result = dedup_duckdb_table_by_hash(args.duckdb_path, args.table, args.exclude_cols)
+    print(json.dumps(result, indent=2))
@@ -0,0 +1,95 @@
+"""Tests para dedup_duckdb_table_by_hash."""
+
+from __future__ import annotations
+
+import os
+import tempfile
+
+import duckdb
+import pytest
+
+from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
+
+
+def _make_test_db(path: str) -> None:
+    """Create a test DuckDB with 5 rows: 3 unique data, 2 duplicates."""
+    conn = duckdb.connect(path)
+    conn.execute(
+        """
+        CREATE TABLE stories (
+            run_id      VARCHAR,
+            extracted_at TIMESTAMP,
+            rank        INTEGER,
+            title       VARCHAR,
+            url         VARCHAR,
+            points      INTEGER
+        )
+        """
+    )
+    conn.execute(
+        """
+        INSERT INTO stories VALUES
+          ('run-001', '2026-05-16 10:00:00', 1, 'Story A', 'https://a.com', 100),
+          ('run-001', '2026-05-16 10:00:00', 2, 'Story B', 'https://b.com', 200),
+          ('run-001', '2026-05-16 10:00:00', 3, 'Story C', 'https://c.com', 300),
+          ('run-002', '2026-05-16 10:30:00', 1, 'Story A', 'https://a.com', 100),
+          ('run-002', '2026-05-16 10:30:00', 2, 'Story B', 'https://b.com', 200)
+        """
+    )
+    conn.close()
+
+
+def test_dedup_elimina_filas_duplicadas_y_conserva_unicas():
+    """dedup elimina filas duplicadas y conserva unicas"""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = os.path.join(tmpdir, "test.duckdb")
+        _make_test_db(db_path)
+
+        result = dedup_duckdb_table_by_hash(db_path, "stories")
+
+        assert result["status"] == "ok", f"Expected ok, got: {result}"
+        assert result["rows_before"] == 5
+        assert result["rows_after"] == 3, f"Expected 3 unique rows, got {result['rows_after']}"
+        assert result["dedup_removed"] == 2
+        assert result["hash_column"] == "row_hash"
+        assert result["duration_ms"] >= 0
+
+        # Verify row_hash column exists and is populated
+        conn = duckdb.connect(db_path)
+        hashes = conn.execute("SELECT DISTINCT row_hash FROM stories").fetchall()
+        conn.close()
+        assert len(hashes) == 3, f"Expected 3 distinct hashes, got {len(hashes)}"
+        # All hashes should be non-null
+        assert all(h[0] is not None for h in hashes), "Some row_hash values are NULL"
+
+
+def test_dedup_idempotente():
+    """Running dedup twice leaves rows_after unchanged."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = os.path.join(tmpdir, "test.duckdb")
+        _make_test_db(db_path)
+
+        r1 = dedup_duckdb_table_by_hash(db_path, "stories")
+        r2 = dedup_duckdb_table_by_hash(db_path, "stories")
+
+        assert r1["status"] == "ok"
+        assert r2["status"] == "ok"
+        assert r2["rows_before"] == 3
+        assert r2["rows_after"] == 3
+        assert r2["dedup_removed"] == 0
+
+
+def test_dedup_tabla_inexistente():
+    """Returns status=error when table does not exist."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = os.path.join(tmpdir, "empty.duckdb")
+        conn = duckdb.connect(db_path)
+        conn.close()
+
+        result = dedup_duckdb_table_by_hash(db_path, "nonexistent_table")
+        assert result["status"] == "error"
+        assert "nonexistent_table" in result["error"]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
@@ -0,0 +1,66 @@
+---
+name: regenerate_app_icons
+kind: pipeline
+lang: py
+domain: pipelines
+version: "1.0.0"
+purity: impure
+signature: "def regenerate_app_icons(only: list[str] | None = None) -> dict"
+description: "Escanea todas las apps C++ del registry, lee el bloque `icon: {phosphor, accent}` de cada app.md y regenera el appicon.ico via generate_app_icon. Reemplaza el script ad-hoc dev/gen_app_icons.py."
+tags: [cpp-windows, icon, phosphor, batch]
+uses_functions: [generate_app_icon_py_infra]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [os, sys, pathlib, typing, yaml]
+params:
+  - name: only
+    desc: "Lista opcional de nombres de app (campo `name` del frontmatter) a procesar. Si None, regenera todas las apps C++ con icon: declarado."
+output: "dict {ok: [name], skipped: [{name, reason}], failed: [{name, error}]}"
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/pipelines/regenerate_app_icons.py"
+---
+
+## Ejemplo
+
+```bash
+# Regenerar todas las apps C++ con icon: declarado
+./fn run regenerate_app_icons
+
+# Solo una app
+./fn run regenerate_app_icons chart_demo
+
+# Varias apps
+./fn run regenerate_app_icons chart_demo registry_dashboard
+```
+
+```python
+import sys
+sys.path.insert(0, "python/functions")
+from pipelines.regenerate_app_icons import regenerate_app_icons
+
+result = regenerate_app_icons()
+print(f"OK: {len(result['ok'])}, FAIL: {len(result['failed'])}")
+```
+
+Bloque `icon:` esperado en `app.md`:
+```yaml
+icon:
+  phosphor: "chart-bar"
+  accent: "#0ea5e9"
+```
+
+## Cuando usarla
+
+Cuando anades una app C++ nueva (anades `icon:` a su `app.md` y corres el pipeline), cambias el color/glyph de una app existente, o traes cambios de iconos desde otra rama. Ejecutala antes de `redeploy_cpp_app_windows` para que el `.exe` lleve el icono actualizado.
+
+## Gotchas
+
+- **Sobreescribe `appicon.ico` sin warning** — igual que `generate_app_icon`. Hacer backup si necesitas preservar version anterior.
+- **Requiere `sources/phosphor-core/`**: clonar con `git clone --depth=1 https://github.com/phosphor-icons/core.git sources/phosphor-core` si no existe.
+- **Solo procesa apps con `lang: cpp`** en frontmatter — apps Go/Python se ignoran aunque tengan `icon:`.
+- **Apps sin `icon:` se reportan en `skipped`**, no son error. Util para detectar apps C++ a las que falta declarar el icono.
+- **No invalida el cache de iconos de Windows** — si Explorer no muestra el icono nuevo tras redeploy: `ie4uinit.exe -show` o reiniciar Explorer.
@@ -0,0 +1,97 @@
+"""Regenera el appicon.ico de todas las apps C++ que declaren bloque icon: en su app.md."""
+
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from infra.generate_app_icon import generate_app_icon
+
+
+def _find_registry_root() -> Path:
+    env_root = os.environ.get("FN_REGISTRY_ROOT")
+    if env_root:
+        return Path(env_root).resolve()
+    current = Path(__file__).resolve()
+    for parent in current.parents:
+        if (parent / "registry.db").exists():
+            return parent
+    raise FileNotFoundError("registry.db no encontrado; define FN_REGISTRY_ROOT")
+
+
+def _read_frontmatter(md_path: Path) -> Optional[dict]:
+    text = md_path.read_text(encoding="utf-8")
+    if not text.startswith("---"):
+        return None
+    end = text.find("\n---", 3)
+    if end < 0:
+        return None
+    try:
+        return yaml.safe_load(text[3:end])
+    except yaml.YAMLError:
+        return None
+
+
+def _iter_cpp_app_mds(root: Path):
+    for pattern in ("apps/*/app.md", "projects/*/apps/*/app.md"):
+        for md in sorted(root.glob(pattern)):
+            fm = _read_frontmatter(md)
+            if not fm or fm.get("lang") != "cpp":
+                continue
+            yield md, fm
+
+
+def regenerate_app_icons(only: Optional[list[str]] = None) -> dict:
+    """Recorre apps C++ con bloque icon: en su frontmatter y regenera appicon.ico.
+
+    Args:
+        only: Lista opcional de nombres de app a filtrar (campo `name`). Si None,
+            procesa todas las apps C++ con `icon:` declarado.
+
+    Returns:
+        dict con keys: ok (list[str]), skipped (list[dict]), failed (list[dict]).
+    """
+    root = _find_registry_root()
+    ok, skipped, failed = [], [], []
+
+    for md, fm in _iter_cpp_app_mds(root):
+        name = fm.get("name", md.parent.name)
+        if only and name not in only:
+            continue
+        icon = fm.get("icon")
+        if not icon or not isinstance(icon, dict):
+            skipped.append({"name": name, "reason": "no icon: block"})
+            continue
+        phosphor = icon.get("phosphor")
+        accent = icon.get("accent")
+        if not phosphor or not accent:
+            skipped.append({"name": name, "reason": "icon: missing phosphor/accent"})
+            continue
+        out_ico = md.parent / "appicon.ico"
+        try:
+            generate_app_icon(
+                phosphor_icon_name=phosphor,
+                accent_hex=accent,
+                out_ico_path=str(out_ico),
+            )
+            ok.append(name)
+        except Exception as e:
+            failed.append({"name": name, "error": str(e)})
+
+    return {"ok": ok, "skipped": skipped, "failed": failed}
+
+
+if __name__ == "__main__":
+    only = sys.argv[1:] or None
+    result = regenerate_app_icons(only=only)
+    for name in result["ok"]:
+        print(f"OK   {name}")
+    for s in result["skipped"]:
+        print(f"SKIP {s['name']}: {s['reason']}")
+    for f in result["failed"]:
+        print(f"FAIL {f['name']}: {f['error']}")
+    sys.exit(1 if result["failed"] else 0)