docs(flows): DoD obligatorio con user-facing surface + abrir issues 0100-0103 (taxonomia, frontmatter migration, dev_console, work dashboard)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-17 00:07:03 +02:00
parent a03675113a
commit 6ad82167bb
72 changed files with 3920 additions and 303 deletions
+13 -1
View File
@@ -110,11 +110,23 @@ def validate_recipe_yaml(yaml_text: str) -> dict:
)
sink = output.get("sink")
valid_sinks = {"data_factory.runs", "stdout", "json_file"}
# duckdb sink: requires output.duckdb_path (relative or absolute) and
# output.table (table name). Optional output.database_id (default =
# recipe_name + "_db") used to register/lookup in data_factory.databases.
valid_sinks = {"data_factory.runs", "stdout", "json_file", "duckdb"}
if sink is not None and sink not in valid_sinks:
errors.append(
f"Campo 'output.sink' debe ser uno de {sorted(valid_sinks)}, got '{sink}'."
)
if sink == "duckdb":
if not output.get("duckdb_path"):
errors.append(
"Sink 'duckdb' requiere 'output.duckdb_path' (ruta al archivo .duckdb)."
)
if not output.get("table"):
errors.append(
"Sink 'duckdb' requiere 'output.table' (nombre de la tabla destino)."
)
return {
"valid": len(errors) == 0,
+25 -4
View File
@@ -1,9 +1,28 @@
"""Invoca `claude -p` via subprocess y devuelve la respuesta como string."""
import os
import shutil
import subprocess
def _resolve_claude_bin() -> str | None:
"""Localiza claude CLI: PATH first, luego rutas convencionales."""
p = shutil.which("claude")
if p:
return p
# Fallback paths comunes (WSL subsession sin .profile cargado, etc).
home = os.path.expanduser("~")
candidates = [
f"{home}/.local/bin/claude",
"/usr/local/bin/claude",
"/opt/homebrew/bin/claude",
]
for c in candidates:
if os.path.isfile(c) and os.access(c, os.X_OK):
return c
return None
def claude_cli_prompt(
prompt: str,
timeout_s: int = 60,
@@ -24,16 +43,18 @@ def claude_cli_prompt(
Respuesta de Claude como texto (stdout), truncada a max_chars_response.
Raises:
FileNotFoundError: Si `claude` no esta en PATH.
FileNotFoundError: Si `claude` no esta en PATH ni rutas convencionales.
RuntimeError: Si exit code != 0 (incluye primeros 500 chars de stderr).
subprocess.TimeoutExpired: Si la llamada supera timeout_s segundos.
"""
if shutil.which("claude") is None:
claude_bin = _resolve_claude_bin()
if claude_bin is None:
raise FileNotFoundError(
"'claude' CLI no encontrado en PATH. Instala Claude Code."
"'claude' CLI no encontrado en PATH ni rutas convencionales "
"(~/.local/bin, /usr/local/bin, /opt/homebrew/bin). Instala Claude Code."
)
cmd = ["claude", "-p", prompt]
cmd = [claude_bin, "-p", prompt]
if model:
cmd.extend(["--model", model])
if extra_args:
@@ -0,0 +1,75 @@
---
name: codegen_app_modules
kind: function
lang: py
domain: infra
version: "1.0.0"
purity: impure
signature: "generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int"
description: "Reads app.md uses_modules + modules/<name>/module.md frontmatters, emits <app>_modules_generated.cpp with fn::app_modules_array[] + fn::app_modules_count. CMake hook for add_imgui_app. Pure YAML parsing, no registry.db dep."
tags: [codegen, modules, cmake, cpp, build]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- yaml
example: |
python python/functions/infra/codegen_app_modules.py \
--app-md apps/data_factory/app.md \
--modules-root modules \
--app-name data_factory \
--out cpp/build/apps/data_factory/data_factory_modules_generated.cpp
file_path: "python/functions/infra/codegen_app_modules.py"
params:
- name: app_md
desc: "Path absoluto al app.md de la app consumidora. Lee uses_modules del frontmatter YAML."
- name: modules_root
desc: "Raiz del directorio modules/. Cada modulo es modules/<name>/module.md."
- name: app_name
desc: "Nombre de la app (solo para el comment-header del .cpp generado)."
- name: out_path
desc: "Path donde escribir el .cpp generado. Idempotente: skip si contenido coincide."
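output: "Valor de retorno de generate(): 0 si OK, 2 si OK pero algun modulo declarado no existe (warning a stderr). El CLI (main) convierte el 2 en exit code 0 para no fallar el build; cualquier otro valor se propaga como exit code."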
---
## Ejemplo
Generar el .cpp para `data_factory`:
```bash
python python/functions/infra/codegen_app_modules.py \
--app-md apps/data_factory/app.md \
--modules-root modules \
--app-name data_factory \
--out /tmp/data_factory_modules_generated.cpp
```
Si `apps/data_factory/app.md` declara `uses_modules: [data_table_cpp]`, el .cpp generado es:
```cpp
// Auto-generated by codegen_app_modules.py — do not edit.
// App: data_factory
// Source of truth: apps/data_factory/app.md (uses_modules)
#include "app_modules.h"
namespace fn {
const ModuleInfo app_modules_array[] = {
{ "data_table", "1.4.0", "Reusable C++ ImGui module..." },
};
const unsigned long app_modules_count = 1;
} // namespace fn
```
## Cuando usarla
CMake hook automatico — la macro `add_imgui_app` la invoca al configurar el build. Apps no la llaman manualmente. Manual override: solo si quieres regenerar fuera del flujo cmake (debugging).
## Gotchas
- Resuelve `<name>_cpp` strippeando el sufijo `_cpp/_py/_ts/_bash/_go`. Mismo patron que `GenerateModuleID`.
- Si un modulo declarado en `uses_modules` no existe, `generate()` emite un warning a stderr y retorna 2; `main()` traduce ese 2 a exit code 0, por eso no falla el build.
- Idempotente: solo reescribe si el contenido cambia. Evita rebuilds innecesarios cuando los modulos no cambiaron.
- Requiere `pyyaml`. Disponible en `python/.venv` del registry.
@@ -0,0 +1,149 @@
"""Generate <app>_modules_generated.cpp from app.md uses_modules + modules/*/module.md.
Stand-alone — no dependencies beyond PyYAML. Invoked from CMake at configure time.
Reads YAML frontmatter directly (no registry.db dependency, no Go binary).
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from typing import Optional
import yaml
def _read_frontmatter(md_path: Path) -> dict:
    """Return the YAML frontmatter of a markdown file as a dict.

    The frontmatter is the text between a leading ``---`` line and the next
    line that starts with ``---``.

    Args:
        md_path: Path to the markdown file.

    Returns:
        The parsed mapping, or ``{}`` when the file does not exist, has no
        frontmatter delimiters, contains invalid YAML, or the YAML document
        is not a mapping.
    """
    if not md_path.exists():
        return {}
    text = md_path.read_text(encoding="utf-8")
    if not text.startswith(("---\n", "---\r\n")):
        return {}
    end = text.find("\n---", 4)
    if end < 0:
        return {}
    try:
        parsed = yaml.safe_load(text[4:end])
    except yaml.YAMLError:
        return {}
    # A YAML document may legally be a list or a scalar; callers use .get(),
    # so anything that is not a mapping is treated as "no frontmatter".
    return parsed if isinstance(parsed, dict) else {}
def _escape_c_string(s: str) -> str:
    """Escape text so it can be placed inside a C/C++ string literal.

    Backslash, double quote, newline, carriage return and tab use their
    short escapes. Any other character below 0x20 is written as a
    three-digit octal escape.

    Args:
        s: Text to escape. ``None`` and other falsy values yield ``""``;
            non-string scalars (e.g. a YAML-parsed float) are converted
            with ``str()``.

    Returns:
        The escaped text, without surrounding quotes.
    """
    text = s or ""
    if not isinstance(text, str):
        text = str(text)

    short_escapes = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    out = []
    for ch in text:
        if ch in short_escapes:
            out.append(short_escapes[ch])
        elif ord(ch) < 32:
            # Octal, always 3 digits: a C hex escape (\xNN) has no length
            # limit and would swallow following hex-digit characters
            # (e.g. "\x01abc"), whereas an octal escape stops after 3 digits.
            out.append(f"\\{ord(ch):03o}")
        else:
            out.append(ch)
    return "".join(out)
def _resolve_module(modules_root: Path, mod_id: str) -> Optional[dict]:
    """Look up the metadata of a module referenced by id.

    ``mod_id`` is e.g. ``data_table_cpp``; at most one trailing language
    suffix is removed to obtain the directory name under ``modules_root``.

    Returns:
        A dict with ``name``, ``version`` and ``description`` taken from the
        module's ``module.md`` frontmatter, or ``None`` when that frontmatter
        is missing or empty.
    """
    lang_suffixes = ("_cpp", "_py", "_ts", "_bash", "_go")
    matched = next((sfx for sfx in lang_suffixes if mod_id.endswith(sfx)), "")
    base_name = mod_id[: len(mod_id) - len(matched)]

    frontmatter = _read_frontmatter(modules_root / base_name / "module.md")
    if not frontmatter:
        return None

    # Fall back to the directory name / neutral defaults for absent keys.
    defaults = (("name", base_name), ("version", "0.0.0"), ("description", ""))
    return {key: frontmatter.get(key, fallback) for key, fallback in defaults}
def generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int:
    """Write the generated C++ module table for one app.

    Reads ``uses_modules`` from the frontmatter of ``app_md``, resolves each
    entry under ``modules_root`` and renders a ``.cpp`` file that defines
    ``fn::app_modules_array`` and ``fn::app_modules_count``.

    Args:
        app_md: Path to the consuming app's ``app.md``.
        modules_root: Directory that contains ``<name>/module.md`` per module.
        app_name: App name; used in the generated header comment and in the
            warning message.
        out_path: Destination of the generated ``.cpp``. Parent directories
            are created as needed; the file is left untouched when its
            content already matches.

    Returns:
        0 when every declared module was resolved, 2 when at least one was
        not found (the file is still generated from the ones that were).
    """
    fm = _read_frontmatter(app_md)
    declared = fm.get("uses_modules") if isinstance(fm, dict) else None
    if not isinstance(declared, list):
        declared = []

    entries: list[dict] = []
    missing: list[str] = []
    for mid in declared:
        info = _resolve_module(modules_root, str(mid))
        if info is None:
            missing.append(str(mid))
        else:
            entries.append(info)

    lines: list[str] = [
        "// Auto-generated by codegen_app_modules.py — do not edit.",
        f"// App: {app_name}",
        f"// Source of truth: {app_md.as_posix()} (uses_modules)",
        "",
        '#include "app_modules.h"',
        "",
        "namespace fn {",
    ]
    if entries:
        lines.append("const ModuleInfo app_modules_array[] = {")
        for e in entries:
            lines.append(
                ' { "%s", "%s", "%s" },'
                % (
                    _escape_c_string(e["name"]),
                    _escape_c_string(e["version"]),
                    _escape_c_string(e["description"]),
                )
            )
        lines.append("};")
        lines.append(f"const unsigned long app_modules_count = {len(entries)};")
    else:
        # One placeholder element is emitted for the empty case; the count
        # stays 0.
        lines.append("const ModuleInfo app_modules_array[1] = { { nullptr, nullptr, nullptr } };")
        lines.append("const unsigned long app_modules_count = 0;")
    lines.append("} // namespace fn")
    lines.append("")
    new_content = "\n".join(lines)

    status = 0
    if missing:
        # Warn on every run, not only when the file is rewritten: otherwise
        # the problem becomes invisible once the output is up to date.
        sys.stderr.write(
            f"codegen_app_modules: WARNING — module(s) not found: {', '.join(missing)} "
            f"(app {app_name})\n"
        )
        status = 2

    # Idempotent: skip the rewrite when the content already matches.
    unchanged = out_path.exists() and out_path.read_text(encoding="utf-8") == new_content
    if not unchanged:
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(new_content, encoding="utf-8")
    return status
def main() -> int:
    """Command-line entry point: parse the flags and run ``generate``.

    Returns:
        The process exit code. A ``generate`` result of 2 (some declared
        module was not found) is reported as 0, like a clean run; any other
        value is passed through unchanged.
    """
    parser = argparse.ArgumentParser(description="Generate <app>_modules_generated.cpp from app.md")
    required_flags = (
        ("--app-md", "Path to app.md"),
        ("--modules-root", "Path to modules/ root"),
        ("--app-name", "App name (for comment header)"),
        ("--out", "Output path for generated .cpp"),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)
    opts = parser.parse_args()

    status = generate(
        app_md=Path(opts.app_md),
        modules_root=Path(opts.modules_root),
        app_name=opts.app_name,
        out_path=Path(opts.out),
    )
    if status in (0, 2):
        return 0
    return status


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,72 @@
---
name: export_hub_manifest
kind: function
lang: py
domain: infra
version: "1.0.0"
purity: impure
signature: "export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict"
description: "Genera el TSV sidecar para app_hub_launcher: consulta registry.db por todas las apps cpp/imgui, lee su app.md para extraer nombre, descripcion y accent_hex, y escribe un archivo TSV con cabecera a out_path. Retorna {ok, count, out_path}."
tags: [hub, launcher, manifest, suite, cpp-windows]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [sqlite3, yaml, pathlib]
params:
- name: out_path
desc: "Ruta de destino del archivo TSV. Puede ser absoluta o relativa al cwd. El directorio padre se crea si no existe."
- name: registry_root
desc: "Raiz del fn_registry. Si None, usa la variable de entorno FN_REGISTRY_ROOT o /home/lucas/fn_registry como fallback."
output: "Dict {ok: True, count: N, out_path: str} con la ruta absoluta del TSV escrito y el numero de apps incluidas."
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/infra/export_hub_manifest.py"
---
## Ejemplo
```bash
# Uso directo con fn run (la salida JSON se imprime en stdout)
./fn run export_hub_manifest_py_infra /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
```
```python
# Desde un heredoc o pipeline Python
import sys
sys.path.insert(0, "python/functions")
from infra import export_hub_manifest
result = export_hub_manifest(
"/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv"
)
print(result)
# {'ok': True, 'count': 12, 'out_path': '/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv'}
```
```bash
# Ver el contenido del TSV generado
head -5 /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
# name display_name description accent_hex
# chart_demo Chart Demo Demo ImGui de primitivos viz... #0ea5e9
# dag_engine_ui Dag Engine Ui Motor de DAGs con frontend... #f59e0b
```
## Cuando usarla
Antes de desplegar `app_hub_launcher` a Windows: genera el `hub_manifest.tsv` que el hub lee al arrancar para listar y colorear los botones de cada app. El hub en runtime no tiene acceso a `registry.db` ni a los `app.md` del WSL, por lo que necesita este sidecar. Ejecutar tras añadir o modificar una app C++ imgui en el registry.
## Gotchas
- **PyYAML en el venv**: requiere `yaml` disponible en `python/.venv`. Ya instalado por defecto. Si falta: `cd python && uv pip install pyyaml`.
- **app.md faltante no aborta**: si un `app.md` no existe o tiene frontmatter malformado, la app sigue apareciendo en el TSV con `description` vacía y accent `#64748b` (slate). Se imprime un WARN a stderr.
- **Filtro estricto `lang='cpp' AND framework='imgui'`**: solo apps C++ con el shell `fn::run_app`. Apps Python, Bash o C++ sin imgui quedan excluidas. Correcto para el hub.
- **La ruta `dir_path` en registry.db es relativa a la raiz del registry**: la funcion la combina con `registry_root` para construir el path absoluto al `app.md`. Si una app tiene un `dir_path` incorrecto en registry.db, el WARN a stderr indica la ruta del `app.md` que no se encontro.
- **TSV UTF-8**: el hub debe abrir el archivo con encoding UTF-8. Los tabs y saltos de linea (`\n`) dentro de los campos se reemplazan por un espacio y los retornos de carro (`\r`) se eliminan.
- **`display_name` es generado, no leido**: se deriva del `name` de la app convirtiendo snake_case a Title Case. No se puede personalizar desde el `app.md` en esta version.
## Capability growth log
*(sin cambios desde v1.0.0)*
@@ -0,0 +1,142 @@
"""export_hub_manifest — genera el TSV sidecar para app_hub_launcher."""
from __future__ import annotations
import os
import sqlite3
import sys
from pathlib import Path
from typing import Any
def _read_frontmatter(md_path: Path) -> dict[str, Any]:
    """Parse the YAML frontmatter of a markdown file.

    Returns ``{}`` when the file has no ``---`` delimited frontmatter or the
    parsed document is not a mapping. Any exception (unreadable file, YAML
    error, PyYAML not importable) is reported as a WARN on stderr and also
    yields ``{}``.
    """
    try:
        import yaml  # PyYAML — available in python/.venv

        content = md_path.read_text(encoding="utf-8")
        # Position of the closing delimiter, or -1 if there is no frontmatter.
        closing = content.find("\n---", 3) if content.startswith("---") else -1
        if closing < 0:
            return {}
        parsed = yaml.safe_load(content[3:closing].strip())
        if isinstance(parsed, dict):
            return parsed
        return {}
    except Exception as exc:
        print(f"[export_hub_manifest] WARN: could not parse {md_path}: {exc}", file=sys.stderr)
        return {}
def _snake_to_display(name: str) -> str:
    """Turn a snake_case identifier into a space-separated display name.

    Each underscore-separated part is capitalized (first letter upper-case,
    the rest lower-case).

    Examples:
        graph_explorer -> Graph Explorer
        dag_engine_ui -> Dag Engine Ui
        app_hub_launcher -> App Hub Launcher
    """
    words = name.split("_")
    return " ".join(map(str.capitalize, words))
def export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict:
    """Generate the TSV sidecar manifest for app_hub_launcher.

    Queries ``registry.db`` for all apps with ``lang='cpp'`` and
    ``framework='imgui'``, reads each app's ``app.md`` frontmatter for its
    description and ``icon.accent`` color, and writes a UTF-8 TSV with a
    header row to ``out_path``.

    Apps whose ``app.md`` is missing or has empty/malformed frontmatter are
    still listed, with an empty description and the default accent; a WARN
    is printed to stderr for each.

    Args:
        out_path: Destination path for the TSV manifest file. The parent
            directory is created if needed.
        registry_root: Path to the fn_registry root directory. Defaults to
            the FN_REGISTRY_ROOT env var or /home/lucas/fn_registry.

    Returns:
        {"ok": True, "count": N, "out_path": "<abs_path>"}

    Raises:
        FileNotFoundError: If ``registry.db`` does not exist under the root.
    """
    default_accent = "#64748b"
    tsv_header = "name\tdisplay_name\tdescription\taccent_hex\n"

    def clean(value: Any) -> str:
        # TSV values must not contain tabs or newlines. Frontmatter values
        # are not guaranteed to be strings (YAML may yield numbers, lists,
        # ...), so coerce before sanitizing instead of crashing on .replace.
        text = value if isinstance(value, str) else str(value)
        return text.replace("\t", " ").replace("\n", " ").replace("\r", "")

    root = Path(
        registry_root
        or os.environ.get("FN_REGISTRY_ROOT", "/home/lucas/fn_registry")
    ).resolve()
    db_path = root / "registry.db"
    if not db_path.exists():
        raise FileNotFoundError(f"registry.db not found at {db_path}")

    con = sqlite3.connect(str(db_path))
    con.row_factory = sqlite3.Row
    try:
        rows = con.execute(
            "SELECT id, name, dir_path FROM apps WHERE lang='cpp' AND framework='imgui' ORDER BY name"
        ).fetchall()
    finally:
        con.close()

    lines: list[str] = [tsv_header]
    count = 0
    for row in rows:
        app_name: str = row["name"]
        dir_path: str = row["dir_path"]

        # Defaults used when app.md is missing or malformed.
        display_name = _snake_to_display(app_name)
        description = ""
        accent_hex = default_accent

        md_path = root / dir_path / "app.md"
        if md_path.exists():
            fm = _read_frontmatter(md_path)
            if fm:
                description = fm.get("description", "") or ""
                icon_block = fm.get("icon")
                if isinstance(icon_block, dict):
                    accent_hex = icon_block.get("accent", default_accent) or default_accent
            else:
                print(
                    f"[export_hub_manifest] WARN: empty/malformed frontmatter in {md_path}",
                    file=sys.stderr,
                )
        else:
            print(
                f"[export_hub_manifest] WARN: app.md missing for {app_name} at {md_path}",
                file=sys.stderr,
            )

        lines.append(
            f"{clean(app_name)}\t{clean(display_name)}\t{clean(description)}\t{clean(accent_hex)}\n"
        )
        count += 1

    out = Path(out_path).resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("".join(lines), encoding="utf-8")
    return {"ok": True, "count": count, "out_path": str(out)}
if __name__ == "__main__":
    # Script mode: export the manifest and print the result dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Export hub manifest TSV for app_hub_launcher.")
    cli.add_argument("out_path", help="Destination .tsv file path")
    cli.add_argument(
        "--registry-root",
        default=None,
        help="Path to fn_registry root (default: FN_REGISTRY_ROOT env or /home/lucas/fn_registry)",
    )
    opts = cli.parse_args()

    summary = export_hub_manifest(opts.out_path, registry_root=opts.registry_root)
    print(json.dumps(summary, indent=2))
@@ -3,7 +3,7 @@ name: cdp_extract_recipe
kind: pipeline
lang: py
domain: pipelines
version: "1.0.0"
version: "1.2.0"
purity: impure
signature: "def cdp_extract_recipe(recipe_path: str, debug_port: int = 9222, tab_id: str | None = None, record_run: bool = True) -> dict"
description: "Ejecuta una recipe YAML contra Chrome remoto via CDP. Valida recipe, busca tab por url_pattern, ejecuta steps (wait_selector/js) y envia resultado al sink declarado."
@@ -22,7 +22,7 @@ params:
- name: tab_id
desc: "ID del tab a usar. Si None, busca tab cuyo URL matchee url_pattern de la recipe."
- name: record_run
desc: "Si True y output.sink=='data_factory.runs', registra la ejecucion en data_factory."
desc: "Si True, registra la ejecucion en data_factory.runs (para sink 'data_factory.runs' y 'duckdb')."
output: "dict {status: ok|error, rows_out: int, kb_out: float, duration_ms: int, error: str, sample_rows: list}"
tested: false
tests: []
@@ -60,6 +60,10 @@ output:
Cuando tienes una recipe YAML validada y Chrome corriendo con remote debugging, y quieres extraer datos en un solo paso sin montar pipeline manualmente. Encadena con `cdp_open_url_and_wait` si necesitas abrir la URL primero.
## Capability growth log
- v1.2.0 (2026-05-16) — sink `duckdb` writes rows to a DuckDB file + registers run in data_factory.runs with storage_db_id/storage_table for traceability.
## Gotchas
- Chrome debe estar corriendo con `--remote-debugging-port=<debug_port>`.
+129 -17
View File
@@ -41,9 +41,14 @@ def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 1
def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null."""
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null.
Drena eventos CDP (paginas con Page.enable emiten loads, frames, etc.) y
matchea por `id` para evitar leer respuestas ajenas o eventos del server.
"""
deadline = time.time() + timeout_s
msg_id = 1000
ws.settimeout(0.5)
while time.time() < deadline:
ws.send(json.dumps({
"id": msg_id,
@@ -53,19 +58,28 @@ def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
"returnByValue": True,
}
}))
time.sleep(0.2)
msg_id += 1
# Leer respuesta en loop simple (websocket-client sync)
# Para modo sync usamos recv()
try:
raw = ws.sock.recv()
if raw:
# Leer hasta 30 frames buscando uno con nuestro id; ignorar eventos.
got_response = False
for _ in range(30):
try:
raw = ws.recv()
except Exception:
break
if not raw:
break
try:
msg = json.loads(raw)
except Exception:
continue
if msg.get("id") == msg_id:
got_response = True
val = msg.get("result", {}).get("result", {}).get("value", False)
if val:
return True
except Exception:
pass
break
msg_id += 1
if not got_response:
time.sleep(0.2)
return False
@@ -188,16 +202,114 @@ def cdp_extract_recipe(
out_path = output_cfg.get("path", "output.json")
with open(out_path, "w", encoding="utf-8") as f:
json.dump(rows, f, ensure_ascii=False, indent=2)
elif sink == "duckdb":
duckdb_path = output_cfg.get("duckdb_path", "")
table_name = output_cfg.get("table", "")
if not duckdb_path or not table_name:
# not fatal: rows already returned via sample_rows
pass
else:
import duckdb
import uuid
import datetime
# resolve duckdb_path relative to FN_REGISTRY_ROOT if not absolute
if not os.path.isabs(duckdb_path):
duckdb_path = os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), duckdb_path)
os.makedirs(os.path.dirname(duckdb_path), exist_ok=True)
conn = duckdb.connect(duckdb_path)
try:
if rows:
# Detect columns from first row keys (assumes list of dicts).
if not isinstance(rows[0], dict):
# Fallback: wrap scalar rows as {"value": v}.
rows = [{"value": r} for r in rows]
cols = list(rows[0].keys())
# Build CREATE TABLE IF NOT EXISTS with VARCHAR for safety
# plus extracted_at TIMESTAMP and run_id VARCHAR for lineage.
col_defs = ", ".join(f'"{c}" VARCHAR' for c in cols)
ddl = (
f'CREATE TABLE IF NOT EXISTS "{table_name}" ('
f' run_id VARCHAR, extracted_at TIMESTAMP, {col_defs}'
f')'
)
conn.execute(ddl)
run_id_str = uuid.uuid4().hex[:16]
now_iso = datetime.datetime.utcnow().isoformat() + "Z"
placeholders = ", ".join(["?"] * (len(cols) + 2))
insert_sql = (
f'INSERT INTO "{table_name}" '
f'(run_id, extracted_at, {", ".join(chr(34) + c + chr(34) for c in cols)}) '
f'VALUES ({placeholders})'
)
for r in rows:
vals = [run_id_str, now_iso] + [str(r.get(c, "")) for c in cols]
conn.execute(insert_sql, vals)
# Also record into data_factory.runs with storage info
registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
if registry_root and record_run:
import sqlite3
df_db = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
if os.path.exists(df_db):
try:
df_conn = sqlite3.connect(df_db)
df_conn.execute("PRAGMA foreign_keys = ON")
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
db_id = output_cfg.get("database_id", recipe.get("name", "unknown") + "_db")
df_run_id = uuid.uuid4().hex[:16]
df_conn.execute(
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes,"
" storage_db_id, storage_table)"
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
(
df_run_id, recipe.get("name", "unknown"),
now_iso, now_iso, "success",
0, rows_out, 0, int(round(kb_out)), duration_ms,
trigger, "",
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
db_id, table_name,
),
)
df_conn.commit()
df_conn.close()
except Exception:
pass
finally:
conn.close()
elif sink == "data_factory.runs" and record_run:
# Escribe DIRECTO a data_factory.db evitando spawn `fn run` (loop infinito
# si data_factory_record_run re-ejecuta esta misma funcion). Confia en que
# el node ya existe en `nodes` con id == recipe.name.
try:
from pipelines.data_factory_record_run import data_factory_record_run
data_factory_record_run(
node_id=recipe.get("name", "unknown"),
function_id="cdp_extract_recipe_py_pipelines",
args={"recipe_path": recipe_path, "debug_port": debug_port},
import sqlite3
import datetime
import uuid
registry_root = os.environ.get("FN_REGISTRY_ROOT", "").strip()
if not registry_root:
# No fatal — el dato ya fue extraido / impreso por otro sink
raise RuntimeError("FN_REGISTRY_ROOT not set; cannot locate data_factory.db")
db_path = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
run_id = uuid.uuid4().hex[:16]
now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
node_id = recipe.get("name", "unknown")
conn = sqlite3.connect(db_path)
conn.execute("PRAGMA foreign_keys = ON")
conn.execute(
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes)"
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)",
(
run_id, node_id, now, now, "success",
0, rows_out, 0, int(round(kb_out)), duration_ms,
trigger, "",
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
),
)
except Exception as e:
# No fatal — el dato ya fue extraido
conn.commit()
conn.close()
except Exception:
# No fatal — el dato ya fue extraido (sample_rows en retorno)
pass
return {
@@ -0,0 +1,60 @@
---
name: dedup_duckdb_table_by_hash
kind: pipeline
lang: py
domain: pipelines
purity: impure
version: "1.0.0"
signature: "def dedup_duckdb_table_by_hash(duckdb_path: str, table: str, exclude_cols: list[str] | None = None) -> dict"
description: "Elimina filas duplicadas de una tabla DuckDB calculando un md5 de las columnas de datos. Anade columna row_hash idempotentemente, actualiza hashes nulos y borra duplicados conservando la primera insercion por rowid."
tags: [dedup, duckdb, transformer, pipeline, dataops]
uses_functions: [cdp_extract_recipe_py_pipelines]
uses_types: []
returns: []
returns_optional: false
error_type: error_go_core
imports: [duckdb]
tested: true
tests:
- "dedup elimina filas duplicadas y conserva unicas"
test_file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash_test.py"
file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash.py"
params:
- name: duckdb_path
desc: "Ruta DuckDB file (absoluta o relativa a FN_REGISTRY_ROOT)."
- name: table
desc: "Nombre tabla a deduplicar."
- name: exclude_cols
desc: "Cols a excluir del hash (metadata como run_id, extracted_at, row_hash). None usa default [run_id, extracted_at, row_hash]."
output: "dict {status, rows_before, rows_after, dedup_removed, duration_ms, hash_column}"
---
## Ejemplo
```python
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
r = dedup_duckdb_table_by_hash("apps/data_factory/data/hn_top_stories.duckdb", "hn_stories")
print(r)
# {"status": "ok", "rows_before": 120, "rows_after": 30, "dedup_removed": 90, "duration_ms": 45, "hash_column": "row_hash"}
```
CLI directo:
```bash
/home/lucas/fn_registry/python/.venv/bin/python3 \
python/functions/pipelines/dedup_duckdb_table_by_hash.py \
apps/data_factory/data/hn_top_stories.duckdb hn_stories
```
## Cuando usarla
Cuando un extractor periodico re-inserta filas iguales (mismo contenido, distinto `run_id`/`extracted_at`) y quieres deduplicar in-place sin tocar el pipeline upstream. Tipicamente como paso `transformer` despues de `cdp_extract_recipe` en un DAG de scraping.
## Gotchas
- **rowid y VACUUM**: DuckDB rowid puede recalcularse tras `VACUUM`. En esta funcion solo se usa dentro de la misma transaccion de DELETE, por lo que no hay inconsistencia practica.
- **Colisiones md5**: md5 no colisiona en practica para tablas de escala HN (miles de filas). Si la tabla crece a millones de filas con datos binarios, cambiar `md5(...)` por `sha256(...)` en el SQL.
- **Tabla inexistente**: si `<table>` no existe en el DuckDB, retorna `status=error` con mensaje descriptivo en lugar de lanzar excepcion.
- **exclude_cols case**: la comparacion de columnas excluidas es case-insensitive (`c.lower()`), pero el nombre en la query se usa tal cual lo devuelve `DESCRIBE`.
- **Ejecuciones posteriores**: si la tabla ya tiene `row_hash` de una ejecucion anterior, solo se actualizan las filas con `row_hash IS NULL` (idempotente).
@@ -0,0 +1,141 @@
"""dedup_duckdb_table_by_hash — Remove duplicate rows from a DuckDB table using md5 hash of data columns."""
from __future__ import annotations
import os
import time
from typing import Any
def dedup_duckdb_table_by_hash(
    duckdb_path: str,
    table: str,
    exclude_cols: list[str] | None = None,
) -> dict[str, Any]:
    """Remove duplicate rows from a DuckDB table using an md5 of its data columns.

    Adds a ``row_hash`` VARCHAR column when the table does not have one,
    fills it for rows where it is NULL, then deletes every row except the
    one with the smallest ``rowid`` in each ``row_hash`` group.

    Args:
        duckdb_path: Path to the DuckDB file. A relative path is resolved
            against FN_REGISTRY_ROOT, or the current directory when that
            variable is not set.
        table: Table name to deduplicate.
        exclude_cols: Columns left out of the hash (metadata columns),
            compared case-insensitively. Defaults to
            ["run_id", "extracted_at", "row_hash"].

    Returns:
        dict with keys: status, rows_before, rows_after, dedup_removed,
        duration_ms, hash_column. When the table does not exist or no data
        column remains after exclusion, ``status`` is "error", an ``error``
        message is included and the row counters are 0.
    """
    import duckdb  # type: ignore

    t0 = time.monotonic()

    def quote_ident(identifier: str) -> str:
        # Double embedded double quotes so a table/column name containing
        # '"' cannot terminate the quoted identifier early.
        return '"' + identifier.replace('"', '""') + '"'

    def error_result(message: str) -> dict[str, Any]:
        return {
            "status": "error",
            "error": message,
            "rows_before": 0,
            "rows_after": 0,
            "dedup_removed": 0,
            "duration_ms": int((time.monotonic() - t0) * 1000),
            "hash_column": "row_hash",
        }

    # Resolve path against FN_REGISTRY_ROOT if relative.
    if not os.path.isabs(duckdb_path):
        root = os.environ.get("FN_REGISTRY_ROOT", os.getcwd())
        duckdb_path = os.path.join(root, duckdb_path)

    if exclude_cols is None:
        exclude_cols = ["run_id", "extracted_at", "row_hash"]
    exclude_set = {c.lower() for c in exclude_cols}

    quoted_table = quote_ident(table)

    conn = duckdb.connect(duckdb_path)
    try:
        # Verify the table exists before touching it.
        tables = [r[0] for r in conn.execute("SHOW TABLES").fetchall()]
        if table not in tables:
            return error_result(
                f"Table '{table}' not found in {duckdb_path}. Available: {tables}"
            )

        # Introspect columns (first field of each DESCRIBE row is the name).
        all_cols = [r[0] for r in conn.execute(f"DESCRIBE {quoted_table}").fetchall()]

        # Add the row_hash column if missing (idempotent).
        if "row_hash" not in {c.lower() for c in all_cols}:
            conn.execute(f"ALTER TABLE {quoted_table} ADD COLUMN row_hash VARCHAR")
            all_cols.append("row_hash")

        # Data columns = all columns minus the excluded ones.
        data_cols = [c for c in all_cols if c.lower() not in exclude_set]
        if not data_cols:
            return error_result("No data columns remaining after exclusion.")

        # md5 over the tab-joined column values; NULLs hash as ''.
        parts = " || '\t' || ".join(
            f"COALESCE(CAST({quote_ident(c)} AS VARCHAR), '')" for c in data_cols
        )
        hash_expr = f"md5({parts})"

        # Only rows without a hash are (re)computed.
        conn.execute(
            f"UPDATE {quoted_table} SET row_hash = {hash_expr} WHERE row_hash IS NULL"
        )

        rows_before = conn.execute(f"SELECT count(*) FROM {quoted_table}").fetchone()[0]

        # Delete duplicates, keeping the row with the smallest rowid per hash.
        conn.execute(
            f"""
            DELETE FROM {quoted_table}
            WHERE rowid NOT IN (
                SELECT min(rowid) FROM {quoted_table} GROUP BY row_hash
            )
            """
        )

        rows_after = conn.execute(f"SELECT count(*) FROM {quoted_table}").fetchone()[0]
    finally:
        conn.close()

    return {
        "status": "ok",
        "rows_before": rows_before,
        "rows_after": rows_after,
        "dedup_removed": rows_before - rows_after,
        "duration_ms": int((time.monotonic() - t0) * 1000),
        "hash_column": "row_hash",
    }
if __name__ == "__main__":
    # Script mode: deduplicate one table and print the result dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Dedup a DuckDB table by row hash.")
    cli.add_argument("duckdb_path", help="Path to DuckDB file")
    cli.add_argument("table", help="Table name to deduplicate")
    cli.add_argument(
        "--exclude-cols",
        nargs="*",
        default=None,
        help="Columns to exclude from hash (default: run_id extracted_at row_hash)",
    )
    opts = cli.parse_args()

    summary = dedup_duckdb_table_by_hash(opts.duckdb_path, opts.table, opts.exclude_cols)
    print(json.dumps(summary, indent=2))
@@ -0,0 +1,95 @@
"""Tests para dedup_duckdb_table_by_hash."""
from __future__ import annotations
import os
import tempfile
import duckdb
import pytest
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
def _make_test_db(path: str) -> None:
    """Build a DuckDB fixture at ``path`` with a 5-row ``stories`` table.

    Three rows carry distinct data; the two ``run-002`` rows repeat the data
    of the first two ``run-001`` rows with different run metadata.
    """
    create_sql = """
        CREATE TABLE stories (
            run_id VARCHAR,
            extracted_at TIMESTAMP,
            rank INTEGER,
            title VARCHAR,
            url VARCHAR,
            points INTEGER
        )
        """
    seed_sql = """
        INSERT INTO stories VALUES
        ('run-001', '2026-05-16 10:00:00', 1, 'Story A', 'https://a.com', 100),
        ('run-001', '2026-05-16 10:00:00', 2, 'Story B', 'https://b.com', 200),
        ('run-001', '2026-05-16 10:00:00', 3, 'Story C', 'https://c.com', 300),
        ('run-002', '2026-05-16 10:30:00', 1, 'Story A', 'https://a.com', 100),
        ('run-002', '2026-05-16 10:30:00', 2, 'Story B', 'https://b.com', 200)
        """
    db = duckdb.connect(path)
    db.execute(create_sql)
    db.execute(seed_sql)
    db.close()
def test_dedup_elimina_filas_duplicadas_y_conserva_unicas():
    """Dedup removes the duplicated rows and keeps the unique ones."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        result = dedup_duckdb_table_by_hash(db_file, "stories")

        # Stats reported by the function: 5 rows in, 3 unique rows out.
        assert result["status"] == "ok", f"Expected ok, got: {result}"
        assert result["rows_before"] == 5
        assert result["rows_after"] == 3, f"Expected 3 unique rows, got {result['rows_after']}"
        assert result["dedup_removed"] == 2
        assert result["hash_column"] == "row_hash"
        assert result["duration_ms"] >= 0

        # Inspect the table itself: the row_hash column must exist and
        # carry one distinct, non-null value per surviving row.
        check_conn = duckdb.connect(db_file)
        hash_rows = check_conn.execute("SELECT DISTINCT row_hash FROM stories").fetchall()
        check_conn.close()

        assert len(hash_rows) == 3, f"Expected 3 distinct hashes, got {len(hash_rows)}"
        assert all(row[0] is not None for row in hash_rows), "Some row_hash values are NULL"
def test_dedup_idempotente():
    """A second dedup pass over an already deduplicated table removes nothing."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        first_pass = dedup_duckdb_table_by_hash(db_file, "stories")
        second_pass = dedup_duckdb_table_by_hash(db_file, "stories")

        for outcome in (first_pass, second_pass):
            assert outcome["status"] == "ok"

        # The second pass starts from the 3 rows left by the first one
        # and must leave them untouched.
        assert second_pass["rows_before"] == 3
        assert second_pass["rows_after"] == 3
        assert second_pass["dedup_removed"] == 0
def test_dedup_tabla_inexistente():
    """A missing table yields status=error with the table name in the message."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "empty.duckdb")
        # Opening and immediately closing a connection creates an empty database file.
        duckdb.connect(db_file).close()

        outcome = dedup_duckdb_table_by_hash(db_file, "nonexistent_table")

        assert outcome["status"] == "error"
        assert "nonexistent_table" in outcome["error"]
if __name__ == "__main__":
    # pytest.main() returns the session exit code instead of exiting;
    # propagate it so running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
@@ -0,0 +1,66 @@
---
name: regenerate_app_icons
kind: pipeline
lang: py
domain: pipelines
version: "1.0.0"
purity: impure
signature: "def regenerate_app_icons(only: list[str] | None = None) -> dict"
description: "Escanea todas las apps C++ del registry, lee el bloque `icon: {phosphor, accent}` de cada app.md y regenera el appicon.ico via generate_app_icon. Reemplaza el script ad-hoc dev/gen_app_icons.py."
tags: [cpp-windows, icon, phosphor, batch]
uses_functions: [generate_app_icon_py_infra]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [os, sys, pathlib, typing, yaml]
params:
- name: only
desc: "Lista opcional de nombres de app (campo `name` del frontmatter) a procesar. Si None, regenera todas las apps C++ con icon: declarado."
output: "dict {ok: [name], skipped: [{name, reason}], failed: [{name, error}]}"
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/pipelines/regenerate_app_icons.py"
---
## Ejemplo
```bash
# Regenerar todas las apps C++ con icon: declarado
./fn run regenerate_app_icons
# Solo una app
./fn run regenerate_app_icons chart_demo
# Varias apps
./fn run regenerate_app_icons chart_demo registry_dashboard
```
```python
import sys
sys.path.insert(0, "python/functions")
from pipelines.regenerate_app_icons import regenerate_app_icons
result = regenerate_app_icons()
print(f"OK: {len(result['ok'])}, FAIL: {len(result['failed'])}")
```
Bloque `icon:` esperado en `app.md`:
```yaml
icon:
phosphor: "chart-bar"
accent: "#0ea5e9"
```
## Cuando usarla
Cuando anades una app C++ nueva (declaras `icon:` en su `app.md` y corres el pipeline), cambias el color/glyph de una app existente, o traes (pull) cambios de iconos desde otra rama. Ejecutala siempre antes de `redeploy_cpp_app_windows` para que el `.exe` lleve el icono actualizado.
## Gotchas
- **Sobreescribe `appicon.ico` sin warning** — igual que `generate_app_icon`. Hacer backup si necesitas preservar version anterior.
- **Requiere `sources/phosphor-core/`**: clonar con `git clone --depth=1 https://github.com/phosphor-icons/core.git sources/phosphor-core` si no existe.
- **Solo procesa apps con `lang: cpp`** en frontmatter — apps Go/Python se ignoran aunque tengan `icon:`.
- **Apps sin `icon:` se reportan en `skipped`**, no son error. Util para detectar apps C++ a las que falta declarar el icono.
- **No invalida el cache de iconos de Windows** — si Explorer no muestra el icono nuevo tras redeploy: `ie4uinit.exe -show` o reiniciar Explorer.
@@ -0,0 +1,97 @@
"""Regenera el appicon.ico de todas las apps C++ que declaren bloque icon: en su app.md."""
import os
import sys
from pathlib import Path
from typing import Optional
import yaml
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from infra.generate_app_icon import generate_app_icon
def _find_registry_root() -> Path:
    """Locate the registry root directory.

    A non-empty ``FN_REGISTRY_ROOT`` environment variable takes precedence
    and is returned resolved. Otherwise the ancestors of this file are
    walked from nearest to farthest, and the first one containing a
    ``registry.db`` entry is returned.

    Raises:
        FileNotFoundError: If no ancestor directory contains ``registry.db``.
    """
    override = os.environ.get("FN_REGISTRY_ROOT")
    if override:
        return Path(override).resolve()

    this_file = Path(__file__).resolve()
    found = next(
        (folder for folder in this_file.parents if (folder / "registry.db").exists()),
        None,
    )
    if found is None:
        raise FileNotFoundError("registry.db no encontrado; define FN_REGISTRY_ROOT")
    return found
def _read_frontmatter(md_path: Path) -> Optional[dict]:
    """Parse the YAML frontmatter at the top of a markdown file.

    The frontmatter is the text between a leading ``---`` and the next
    line that starts with ``---``.

    Args:
        md_path: Path of the markdown file to read.

    Returns:
        The frontmatter as a dict, or None when the file does not start
        with ``---``, has no closing ``---``, contains malformed YAML, or
        the YAML document is not a mapping.
    """
    # utf-8-sig decodes plain UTF-8 as well, but also drops a leading BOM
    # that would otherwise make the startswith("---") check fail.
    text = md_path.read_text(encoding="utf-8-sig")
    if not text.startswith("---"):
        return None
    end = text.find("\n---", 3)
    if end < 0:
        return None
    try:
        data = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return None
    # safe_load yields None, a scalar or a list for empty/scalar/sequence
    # documents; callers use .get() on the result, so only hand back
    # mappings and treat anything else as "no frontmatter".
    return data if isinstance(data, dict) else None
def _iter_cpp_app_mds(root: Path):
    """Yield ``(app_md_path, frontmatter)`` for every C++ app under *root*.

    Looks at ``apps/*/app.md`` first and ``projects/*/apps/*/app.md``
    second, each group in sorted path order. Files whose frontmatter is
    missing/empty or whose ``lang`` is not ``"cpp"`` are skipped.
    """
    search_patterns = ("apps/*/app.md", "projects/*/apps/*/app.md")
    for search_pattern in search_patterns:
        for md_path in sorted(root.glob(search_pattern)):
            frontmatter = _read_frontmatter(md_path)
            if frontmatter and frontmatter.get("lang") == "cpp":
                yield md_path, frontmatter
def regenerate_app_icons(only: Optional[list[str]] = None) -> dict:
    """Regenerate ``appicon.ico`` for C++ apps that declare an ``icon:`` block.

    Each C++ app's frontmatter is inspected; when it carries an ``icon``
    mapping with both ``phosphor`` and ``accent`` set, the icon is written
    next to its ``app.md`` via ``generate_app_icon``.

    Args:
        only: Optional list of app names (frontmatter ``name``, falling
            back to the app directory name) to restrict the run to. When
            None or empty, every C++ app is considered.

    Returns:
        dict with keys ``ok`` (list of app names), ``skipped`` (list of
        ``{"name", "reason"}`` dicts) and ``failed`` (list of
        ``{"name", "error"}`` dicts).
    """
    registry_root = _find_registry_root()
    report = {"ok": [], "skipped": [], "failed": []}

    for md_path, frontmatter in _iter_cpp_app_mds(registry_root):
        app_name = frontmatter.get("name", md_path.parent.name)
        if only and app_name not in only:
            continue

        icon_block = frontmatter.get("icon")
        if not (icon_block and isinstance(icon_block, dict)):
            report["skipped"].append({"name": app_name, "reason": "no icon: block"})
            continue

        glyph = icon_block.get("phosphor")
        accent_color = icon_block.get("accent")
        if not (glyph and accent_color):
            report["skipped"].append(
                {"name": app_name, "reason": "icon: missing phosphor/accent"}
            )
            continue

        ico_target = md_path.parent / "appicon.ico"
        try:
            generate_app_icon(
                phosphor_icon_name=glyph,
                accent_hex=accent_color,
                out_ico_path=str(ico_target),
            )
        except Exception as exc:
            # Best-effort batch: record the failure and keep processing
            # the remaining apps.
            report["failed"].append({"name": app_name, "error": str(exc)})
        else:
            report["ok"].append(app_name)

    return report
if __name__ == "__main__":
    # CLI entry point: positional arguments are app names to process;
    # with no arguments every eligible app is regenerated.
    requested = sys.argv[1:]
    summary = regenerate_app_icons(only=requested if requested else None)

    for app_name in summary["ok"]:
        print(f"OK {app_name}")
    for entry in summary["skipped"]:
        print(f"SKIP {entry['name']}: {entry['reason']}")
    for entry in summary["failed"]:
        print(f"FAIL {entry['name']}: {entry['error']}")

    # Non-zero exit status when at least one icon could not be generated.
    exit_code = 1 if summary["failed"] else 0
    sys.exit(exit_code)