docs(flows): DoD obligatorio con user-facing surface + abrir issues 0100-0103 (taxonomia, frontmatter migration, dev_console, work dashboard)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -110,11 +110,23 @@ def validate_recipe_yaml(yaml_text: str) -> dict:
|
||||
)
|
||||
|
||||
sink = output.get("sink")
|
||||
valid_sinks = {"data_factory.runs", "stdout", "json_file"}
|
||||
# duckdb sink: requires output.duckdb_path (relative or absolute) and
|
||||
# output.table (table name). Optional output.database_id (default =
|
||||
# recipe_name + "_db") used to register/lookup in data_factory.databases.
|
||||
valid_sinks = {"data_factory.runs", "stdout", "json_file", "duckdb"}
|
||||
if sink is not None and sink not in valid_sinks:
|
||||
errors.append(
|
||||
f"Campo 'output.sink' debe ser uno de {sorted(valid_sinks)}, got '{sink}'."
|
||||
)
|
||||
if sink == "duckdb":
|
||||
if not output.get("duckdb_path"):
|
||||
errors.append(
|
||||
"Sink 'duckdb' requiere 'output.duckdb_path' (ruta al archivo .duckdb)."
|
||||
)
|
||||
if not output.get("table"):
|
||||
errors.append(
|
||||
"Sink 'duckdb' requiere 'output.table' (nombre de la tabla destino)."
|
||||
)
|
||||
|
||||
return {
|
||||
"valid": len(errors) == 0,
|
||||
|
||||
@@ -1,9 +1,28 @@
|
||||
"""Invoca `claude -p` via subprocess y devuelve la respuesta como string."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
|
||||
def _resolve_claude_bin() -> str | None:
|
||||
"""Localiza claude CLI: PATH first, luego rutas convencionales."""
|
||||
p = shutil.which("claude")
|
||||
if p:
|
||||
return p
|
||||
# Fallback paths comunes (WSL subsession sin .profile cargado, etc).
|
||||
home = os.path.expanduser("~")
|
||||
candidates = [
|
||||
f"{home}/.local/bin/claude",
|
||||
"/usr/local/bin/claude",
|
||||
"/opt/homebrew/bin/claude",
|
||||
]
|
||||
for c in candidates:
|
||||
if os.path.isfile(c) and os.access(c, os.X_OK):
|
||||
return c
|
||||
return None
|
||||
|
||||
|
||||
def claude_cli_prompt(
|
||||
prompt: str,
|
||||
timeout_s: int = 60,
|
||||
@@ -24,16 +43,18 @@ def claude_cli_prompt(
|
||||
Respuesta de Claude como texto (stdout), truncada a max_chars_response.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: Si `claude` no esta en PATH.
|
||||
FileNotFoundError: Si `claude` no esta en PATH ni rutas convencionales.
|
||||
RuntimeError: Si exit code != 0 (incluye primeros 500 chars de stderr).
|
||||
subprocess.TimeoutExpired: Si la llamada supera timeout_s segundos.
|
||||
"""
|
||||
if shutil.which("claude") is None:
|
||||
claude_bin = _resolve_claude_bin()
|
||||
if claude_bin is None:
|
||||
raise FileNotFoundError(
|
||||
"'claude' CLI no encontrado en PATH. Instala Claude Code."
|
||||
"'claude' CLI no encontrado en PATH ni rutas convencionales "
|
||||
"(~/.local/bin, /usr/local/bin, /opt/homebrew/bin). Instala Claude Code."
|
||||
)
|
||||
|
||||
cmd = ["claude", "-p", prompt]
|
||||
cmd = [claude_bin, "-p", prompt]
|
||||
if model:
|
||||
cmd.extend(["--model", model])
|
||||
if extra_args:
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
---
|
||||
name: codegen_app_modules
|
||||
kind: function
|
||||
lang: py
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int"
|
||||
description: "Reads app.md uses_modules + modules/<name>/module.md frontmatters, emits <app>_modules_generated.cpp with fn::app_modules_array[] + fn::app_modules_count. CMake hook for add_imgui_app. Pure YAML parsing, no registry.db dep."
|
||||
tags: [codegen, modules, cmake, cpp, build]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- yaml
|
||||
example: |
|
||||
python python/functions/infra/codegen_app_modules.py \
|
||||
--app-md apps/data_factory/app.md \
|
||||
--modules-root modules \
|
||||
--app-name data_factory \
|
||||
--out cpp/build/apps/data_factory/data_factory_modules_generated.cpp
|
||||
file_path: "python/functions/infra/codegen_app_modules.py"
|
||||
params:
|
||||
- name: app_md
|
||||
desc: "Path absoluto al app.md de la app consumidora. Lee uses_modules del frontmatter YAML."
|
||||
- name: modules_root
|
||||
desc: "Raiz del directorio modules/. Cada modulo es modules/<name>/module.md."
|
||||
- name: app_name
|
||||
desc: "Nombre de la app (solo para el comment-header del .cpp generado)."
|
||||
- name: out_path
|
||||
desc: "Path donde escribir el .cpp generado. Idempotente: skip si contenido coincide."
|
||||
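output: "Valor de retorno de generate(): 0 si OK, 2 si se genero el .cpp pero algun modulo declarado no existe (warning). La CLI (main) mapea 2 a exit code 0 para no fallar el build; cualquier otro error termina con exit code >0."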
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
Generar el .cpp para `data_factory`:
|
||||
|
||||
```bash
|
||||
python python/functions/infra/codegen_app_modules.py \
|
||||
--app-md apps/data_factory/app.md \
|
||||
--modules-root modules \
|
||||
--app-name data_factory \
|
||||
--out /tmp/data_factory_modules_generated.cpp
|
||||
```
|
||||
|
||||
Si `data_factory/app.md` declara `uses_modules: [data_table_cpp]`, el .cpp generado es:
|
||||
|
||||
```cpp
|
||||
// Auto-generated by codegen_app_modules.py — do not edit.
|
||||
// App: data_factory
|
||||
// Source of truth: apps/data_factory/app.md (uses_modules)
|
||||
|
||||
#include "app_modules.h"
|
||||
|
||||
namespace fn {
|
||||
const ModuleInfo app_modules_array[] = {
|
||||
{ "data_table", "1.4.0", "Reusable C++ ImGui module..." },
|
||||
};
|
||||
const unsigned long app_modules_count = 1;
|
||||
} // namespace fn
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
CMake hook automatico — la macro `add_imgui_app` la invoca al configurar el build. Apps no la llaman manualmente. Manual override: solo si quieres regenerar fuera del flujo cmake (debugging).
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Resuelve `<name>_cpp` strippeando el sufijo `_cpp/_py/_ts/_bash/_go`. Mismo patron que `GenerateModuleID`.
|
||||
- Si un modulo declarado en `uses_modules` no existe, `generate()` emite warning a stderr y devuelve 2; `main()` mapea ese 2 a exit code 0, por lo que no falla el build.
|
||||
- Idempotente: solo reescribe si el contenido cambia. Evita rebuilds innecesarios cuando los modulos no cambiaron.
|
||||
- Requiere `pyyaml`. Disponible en `python/.venv` del registry.
|
||||
@@ -0,0 +1,149 @@
|
||||
"""Generate <app>_modules_generated.cpp from app.md uses_modules + modules/*/module.md.
|
||||
|
||||
Stand-alone — no dependencies beyond PyYAML. Invoked from CMake at configure time.
|
||||
Reads YAML frontmatter directly (no registry.db dependency, no Go binary).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def _read_frontmatter(md_path: Path) -> dict:
    """Return the YAML frontmatter of a Markdown file as a dict.

    The frontmatter is the text between a leading ``---`` line and the next
    line that starts with ``---``.

    Args:
        md_path: Path to the Markdown file.

    Returns:
        The parsed mapping. An empty dict is returned when the file does not
        exist, does not start with a ``---`` line, has no closing delimiter,
        contains invalid YAML, or parses to something that is not a mapping.
    """
    if not md_path.exists():
        return {}
    text = md_path.read_text(encoding="utf-8")
    if not text.startswith(("---\n", "---\r\n")):
        return {}
    end = text.find("\n---", 4)
    if end < 0:
        return {}
    raw = text[4:end]
    try:
        data = yaml.safe_load(raw)
    except yaml.YAMLError:
        return {}
    # safe_load may yield a list or a scalar for odd frontmatter; callers
    # rely on dict.get(), so anything that is not a mapping counts as empty.
    return data if isinstance(data, dict) else {}
|
||||
|
||||
|
||||
def _escape_c_string(s: str) -> str:
    """Escape *s* so it can be placed between double quotes in C/C++ source.

    Backslash, double quote, newline, carriage return and tab use their
    mnemonic escapes. Any other character below U+0020 is emitted as a
    fixed-width three-digit octal escape. Octal is used instead of ``\\x``
    because a C hex escape has no length limit: ``"\\x01a"`` would be read as
    the single byte 0x1A rather than 0x01 followed by ``a``.

    Args:
        s: Text to escape. ``None`` yields an empty string; any other
            non-string value (for example a YAML number) is converted with
            ``str()`` first.

    Returns:
        The escaped text, without surrounding quotes.
    """
    if s is None:
        return ""
    text = s if isinstance(s, str) else str(s)
    mnemonic = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    out = []
    for ch in text:
        if ch in mnemonic:
            out.append(mnemonic[ch])
        elif ord(ch) < 32:
            # Octal escapes stop after three digits, so the next character
            # can never be absorbed into the escape.
            out.append(f"\\{ord(ch):03o}")
        else:
            out.append(ch)
    return "".join(out)
|
||||
|
||||
|
||||
def _resolve_module(modules_root: Path, mod_id: str) -> Optional[dict]:
    """Look up the metadata of a module referenced by its id.

    A module id such as ``data_table_cpp`` carries a language suffix. The
    first matching suffix out of ``_cpp``, ``_py``, ``_ts``, ``_bash`` and
    ``_go`` is removed, and ``<modules_root>/<name>/module.md`` is read.

    Args:
        modules_root: Directory that contains one sub-directory per module.
        mod_id: Module id as listed in ``uses_modules``.

    Returns:
        A dict with ``name``, ``version`` and ``description`` taken from the
        frontmatter (with defaults for absent keys), or ``None`` when the
        frontmatter is missing or empty.
    """
    lang_suffixes = ("_cpp", "_py", "_ts", "_bash", "_go")
    matched = next((sfx for sfx in lang_suffixes if mod_id.endswith(sfx)), None)
    base_name = mod_id if matched is None else mod_id[: -len(matched)]

    frontmatter = _read_frontmatter(modules_root / base_name / "module.md")
    if not frontmatter:
        return None

    info = {}
    info["name"] = frontmatter.get("name", base_name)
    info["version"] = frontmatter.get("version", "0.0.0")
    info["description"] = frontmatter.get("description", "")
    return info
|
||||
|
||||
|
||||
def generate(app_md: Path, modules_root: Path, app_name: str, out_path: Path) -> int:
    """Write the generated C++ module table for one app.

    Reads ``uses_modules`` from the frontmatter of *app_md*, resolves each
    entry under *modules_root*, and renders a ``.cpp`` file that defines
    ``fn::app_modules_array`` and ``fn::app_modules_count``.

    Args:
        app_md: Path to the app's ``app.md``.
        modules_root: Directory containing ``<name>/module.md`` per module.
        app_name: App name, used only in the generated header comment.
        out_path: Destination of the generated ``.cpp``. Parent directories
            are created as needed. The file is left untouched when its
            content already matches.

    Returns:
        ``0`` when every declared module was resolved, ``2`` when at least
        one declared module could not be resolved. In the latter case a
        warning listing the missing ids is written to stderr, whether or not
        the output file had to be rewritten.
    """
    fm = _read_frontmatter(app_md)
    uses_modules = fm.get("uses_modules") or []
    if not isinstance(uses_modules, list):
        uses_modules = []

    entries: list[dict] = []
    missing: list[str] = []
    for mid in uses_modules:
        info = _resolve_module(modules_root, str(mid))
        if info is None:
            missing.append(str(mid))
        else:
            entries.append(info)

    lines: list[str] = [
        "// Auto-generated by codegen_app_modules.py — do not edit.",
        f"// App: {app_name}",
        f"// Source of truth: {app_md.as_posix()} (uses_modules)",
        "",
        '#include "app_modules.h"',
        "",
        "namespace fn {",
    ]
    if entries:
        lines.append("const ModuleInfo app_modules_array[] = {")
        for e in entries:
            lines.append(
                '    { "%s", "%s", "%s" },'
                % (
                    _escape_c_string(e["name"]),
                    _escape_c_string(e["version"]),
                    _escape_c_string(e["description"]),
                )
            )
        lines.append("};")
        lines.append(f"const unsigned long app_modules_count = {len(entries)};")
    else:
        # With no modules, a one-element placeholder array is emitted and
        # the count is reported as 0.
        lines.append("const ModuleInfo app_modules_array[1] = { { nullptr, nullptr, nullptr } };")
        lines.append("const unsigned long app_modules_count = 0;")
    lines.append("} // namespace fn")
    lines.append("")

    new_content = "\n".join(lines)

    # Idempotent: only touch the file when the content actually changes.
    unchanged = out_path.exists() and out_path.read_text(encoding="utf-8") == new_content
    if not unchanged:
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(new_content, encoding="utf-8")

    # Report missing modules on every run, not only when the file was
    # rewritten; otherwise the warning vanishes on the second invocation
    # while the return code still says 2.
    if missing:
        sys.stderr.write(
            f"codegen_app_modules: WARNING — module(s) not found: {', '.join(missing)} "
            f"(app {app_name})\n"
        )
        return 2
    return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line, run ``generate`` and return the exit code.

    All four options (``--app-md``, ``--modules-root``, ``--app-name``,
    ``--out``) are required.

    Returns:
        ``0`` when ``generate`` returns 0 or 2 (2 means a declared module was
        missing, which is deliberately not treated as a failure here);
        otherwise the value returned by ``generate``.
    """
    parser = argparse.ArgumentParser(description="Generate <app>_modules_generated.cpp from app.md")
    required_options = (
        ("--app-md", "Path to app.md"),
        ("--modules-root", "Path to modules/ root"),
        ("--app-name", "App name (for comment header)"),
        ("--out", "Output path for generated .cpp"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)
    opts = parser.parse_args()

    status = generate(
        app_md=Path(opts.app_md),
        modules_root=Path(opts.modules_root),
        app_name=opts.app_name,
        out_path=Path(opts.out),
    )
    if status in (0, 2):
        return 0
    return status


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,72 @@
|
||||
---
|
||||
name: export_hub_manifest
|
||||
kind: function
|
||||
lang: py
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict"
|
||||
description: "Genera el TSV sidecar para app_hub_launcher: consulta registry.db por todas las apps cpp/imgui, lee su app.md para extraer nombre, descripcion y accent_hex, y escribe un archivo TSV con cabecera a out_path. Retorna {ok, count, out_path}."
|
||||
tags: [hub, launcher, manifest, suite, cpp-windows]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [sqlite3, yaml, pathlib]
|
||||
params:
|
||||
- name: out_path
|
||||
desc: "Ruta de destino del archivo TSV. Puede ser absoluta o relativa al cwd. El directorio padre se crea si no existe."
|
||||
- name: registry_root
|
||||
desc: "Raiz del fn_registry. Si None, usa la variable de entorno FN_REGISTRY_ROOT o /home/lucas/fn_registry como fallback."
|
||||
output: "Dict {ok: True, count: N, out_path: str} con la ruta absoluta del TSV escrito y el numero de apps incluidas."
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/infra/export_hub_manifest.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```bash
|
||||
# Uso directo con fn run (la salida JSON se imprime en stdout)
|
||||
./fn run export_hub_manifest_py_infra /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
|
||||
```
|
||||
|
||||
```python
|
||||
# Desde un heredoc o pipeline Python
|
||||
import sys
|
||||
sys.path.insert(0, "python/functions")
|
||||
from infra import export_hub_manifest
|
||||
|
||||
result = export_hub_manifest(
|
||||
"/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv"
|
||||
)
|
||||
print(result)
|
||||
# {'ok': True, 'count': 12, 'out_path': '/mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv'}
|
||||
```
|
||||
|
||||
```bash
|
||||
# Ver el contenido del TSV generado
|
||||
head -5 /mnt/c/Users/lucas/Desktop/apps/app_hub_launcher/local_files/hub_manifest.tsv
|
||||
# name display_name description accent_hex
|
||||
# chart_demo Chart Demo Demo ImGui de primitivos viz... #0ea5e9
|
||||
# dag_engine_ui Dag Engine Ui Motor de DAGs con frontend... #f59e0b
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Antes de desplegar `app_hub_launcher` a Windows: genera el `hub_manifest.tsv` que el hub lee al arrancar para listar y colorear los botones de cada app. El hub en runtime no tiene acceso a `registry.db` ni a los `app.md` del WSL, por lo que necesita este sidecar. Ejecutar tras añadir o modificar una app C++ imgui en el registry.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **PyYAML en el venv**: requiere `yaml` disponible en `python/.venv`. Ya instalado por defecto. Si falta: `cd python && uv pip install pyyaml`.
|
||||
- **app.md faltante no aborta**: si un `app.md` no existe o tiene frontmatter malformado, la app sigue apareciendo en el TSV con `description` vacía y accent `#64748b` (slate). Se imprime un WARN a stderr.
|
||||
- **Filtro estricto `lang='cpp' AND framework='imgui'`**: solo apps C++ con el shell `fn::run_app`. Apps Python, Bash o C++ sin imgui quedan excluidas. Correcto para el hub.
|
||||
- **La ruta `dir_path` en registry.db es relativa a la raiz del registry**: la funcion la combina con `registry_root` para construir el path absoluto al `app.md`. Si una app tiene `dir_path` incorrecto en su `app.md`, el WARN indicara cual falló.
|
||||
- **TSV UTF-8**: el hub debe abrir el archivo con encoding UTF-8. Tabs y saltos de linea (`\n`) en los campos se reemplazan automaticamente por un espacio; los retornos de carro (`\r`) se eliminan.
|
||||
- **`display_name` es generado, no leido**: se deriva del `name` de la app convirtiendo snake_case a Title Case. No se puede personalizar desde el `app.md` en esta version.
|
||||
|
||||
## Capability growth log
|
||||
|
||||
*(sin cambios desde v1.0.0)*
|
||||
@@ -0,0 +1,142 @@
|
||||
"""export_hub_manifest — genera el TSV sidecar para app_hub_launcher."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _read_frontmatter(md_path: Path) -> dict[str, Any]:
    """Parse the YAML frontmatter of a Markdown file.

    Any failure (PyYAML not importable, unreadable file, invalid YAML) is
    reported as a warning on stderr and results in an empty dict. A file
    without a leading ``---`` or without a closing ``---`` line, and a
    frontmatter that is not a mapping, also yield an empty dict, silently.

    Args:
        md_path: Path to the Markdown file.

    Returns:
        The frontmatter mapping, or ``{}``.
    """
    try:
        import yaml  # PyYAML, imported lazily so a missing package is only a warning

        content = md_path.read_text(encoding="utf-8")
        # Position of the closing delimiter; -1 also covers "no opening ---".
        closing = content.find("\n---", 3) if content.startswith("---") else -1
        if closing == -1:
            return {}
        parsed = yaml.safe_load(content[3:closing].strip())
        if isinstance(parsed, dict):
            return parsed
        return {}
    except Exception as exc:
        print(f"[export_hub_manifest] WARN: could not parse {md_path}: {exc}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def _snake_to_display(name: str) -> str:
    """Turn a snake_case identifier into space-separated capitalized words.

    Each underscore-delimited part is passed through ``str.capitalize``
    (first character upper-cased, the rest lower-cased).

    Examples:
        graph_explorer   -> Graph Explorer
        dag_engine_ui    -> Dag Engine Ui
        app_hub_launcher -> App Hub Launcher
    """
    words = name.split("_")
    return " ".join(map(str.capitalize, words))
|
||||
|
||||
|
||||
def _clean_tsv_field(value: Any) -> str:
    """Return *value* as text that is safe inside a single TSV cell.

    Non-string values are converted with ``str()``. Tabs and line feeds
    become a space and carriage returns are dropped.
    """
    text = value if isinstance(value, str) else str(value)
    return text.replace("\t", " ").replace("\n", " ").replace("\r", "")


def export_hub_manifest(out_path: str, *, registry_root: str | None = None) -> dict:
    """Generate the TSV sidecar manifest for app_hub_launcher.

    Queries ``registry.db`` for every app with ``lang='cpp'`` and
    ``framework='imgui'``, reads each app's ``app.md`` frontmatter for its
    description and ``icon.accent`` colour, and writes one TSV row per app
    (after a header row) to *out_path* as UTF-8.

    An app whose ``app.md`` is missing or has empty/malformed frontmatter is
    still listed, with an empty description and the default accent; a
    warning is printed to stderr.

    Args:
        out_path: Destination path of the TSV file. Parent directories are
            created if needed.
        registry_root: Root directory of the registry. When falsy, the
            ``FN_REGISTRY_ROOT`` environment variable is used, falling back
            to ``/home/lucas/fn_registry``.

    Returns:
        ``{"ok": True, "count": N, "out_path": "<abs_path>"}`` where ``N`` is
        the number of apps written.

    Raises:
        FileNotFoundError: If ``registry.db`` does not exist under the root.
    """
    root = Path(
        registry_root
        or os.environ.get("FN_REGISTRY_ROOT", "/home/lucas/fn_registry")
    ).resolve()

    db_path = root / "registry.db"
    if not db_path.exists():
        raise FileNotFoundError(f"registry.db not found at {db_path}")

    con = sqlite3.connect(str(db_path))
    con.row_factory = sqlite3.Row
    try:
        rows = con.execute(
            "SELECT id, name, dir_path FROM apps WHERE lang='cpp' AND framework='imgui' ORDER BY name"
        ).fetchall()
    finally:
        con.close()

    default_accent = "#64748b"
    lines: list[str] = ["name\tdisplay_name\tdescription\taccent_hex\n"]

    for row in rows:
        app_name: str = row["name"]
        dir_path: str = row["dir_path"]

        # Defaults used when app.md is missing or its frontmatter is unusable.
        display_name = _snake_to_display(app_name)
        description: Any = ""
        accent_hex: Any = default_accent

        md_path = root / dir_path / "app.md"
        if md_path.exists():
            fm = _read_frontmatter(md_path)
            if fm:
                description = fm.get("description", "") or ""
                icon_block = fm.get("icon")
                if isinstance(icon_block, dict):
                    accent_hex = icon_block.get("accent", default_accent) or default_accent
            else:
                print(
                    f"[export_hub_manifest] WARN: empty/malformed frontmatter in {md_path}",
                    file=sys.stderr,
                )
        else:
            print(
                f"[export_hub_manifest] WARN: app.md missing for {app_name} at {md_path}",
                file=sys.stderr,
            )

        # Frontmatter values are whatever YAML produced (possibly numbers or
        # lists), so every cell goes through the coercing sanitizer.
        cells = (app_name, display_name, description, accent_hex)
        lines.append("\t".join(_clean_tsv_field(cell) for cell in cells) + "\n")

    out = Path(out_path).resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("".join(lines), encoding="utf-8")

    return {"ok": True, "count": len(rows), "out_path": str(out)}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: write the manifest and print the result
    # dict as indented JSON on stdout.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Export hub manifest TSV for app_hub_launcher.")
    cli.add_argument("out_path", help="Destination .tsv file path")
    cli.add_argument(
        "--registry-root",
        default=None,
        help="Path to fn_registry root (default: FN_REGISTRY_ROOT env or /home/lucas/fn_registry)",
    )
    opts = cli.parse_args()

    manifest = export_hub_manifest(opts.out_path, registry_root=opts.registry_root)
    print(json.dumps(manifest, indent=2))
|
||||
@@ -3,7 +3,7 @@ name: cdp_extract_recipe
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
version: "1.2.0"
|
||||
purity: impure
|
||||
signature: "def cdp_extract_recipe(recipe_path: str, debug_port: int = 9222, tab_id: str | None = None, record_run: bool = True) -> dict"
|
||||
description: "Ejecuta una recipe YAML contra Chrome remoto via CDP. Valida recipe, busca tab por url_pattern, ejecuta steps (wait_selector/js) y envia resultado al sink declarado."
|
||||
@@ -22,7 +22,7 @@ params:
|
||||
- name: tab_id
|
||||
desc: "ID del tab a usar. Si None, busca tab cuyo URL matchee url_pattern de la recipe."
|
||||
- name: record_run
|
||||
desc: "Si True y output.sink=='data_factory.runs', registra la ejecucion en data_factory."
|
||||
desc: "Si True, registra la ejecucion en data_factory.runs (para sink 'data_factory.runs' y 'duckdb')."
|
||||
output: "dict {status: ok|error, rows_out: int, kb_out: float, duration_ms: int, error: str, sample_rows: list}"
|
||||
tested: false
|
||||
tests: []
|
||||
@@ -60,6 +60,10 @@ output:
|
||||
|
||||
Cuando tienes una recipe YAML validada y Chrome corriendo con remote debugging, y quieres extraer datos en un solo paso sin montar pipeline manualmente. Encadena con `cdp_open_url_and_wait` si necesitas abrir la URL primero.
|
||||
|
||||
## Capability growth log
|
||||
|
||||
- v1.2.0 (2026-05-16) — sink `duckdb` writes rows to a DuckDB file + registers run in data_factory.runs with storage_db_id/storage_table for traceability.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Chrome debe estar corriendo con `--remote-debugging-port=<debug_port>`.
|
||||
|
||||
@@ -41,9 +41,14 @@ def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 1
|
||||
|
||||
|
||||
def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
|
||||
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null."""
|
||||
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null.
|
||||
|
||||
Drena eventos CDP (paginas con Page.enable emiten loads, frames, etc.) y
|
||||
matchea por `id` para evitar leer respuestas ajenas o eventos del server.
|
||||
"""
|
||||
deadline = time.time() + timeout_s
|
||||
msg_id = 1000
|
||||
ws.settimeout(0.5)
|
||||
while time.time() < deadline:
|
||||
ws.send(json.dumps({
|
||||
"id": msg_id,
|
||||
@@ -53,19 +58,28 @@ def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
|
||||
"returnByValue": True,
|
||||
}
|
||||
}))
|
||||
time.sleep(0.2)
|
||||
msg_id += 1
|
||||
# Leer respuesta en loop simple (websocket-client sync)
|
||||
# Para modo sync usamos recv()
|
||||
try:
|
||||
raw = ws.sock.recv()
|
||||
if raw:
|
||||
# Leer hasta 30 frames buscando uno con nuestro id; ignorar eventos.
|
||||
got_response = False
|
||||
for _ in range(30):
|
||||
try:
|
||||
raw = ws.recv()
|
||||
except Exception:
|
||||
break
|
||||
if not raw:
|
||||
break
|
||||
try:
|
||||
msg = json.loads(raw)
|
||||
except Exception:
|
||||
continue
|
||||
if msg.get("id") == msg_id:
|
||||
got_response = True
|
||||
val = msg.get("result", {}).get("result", {}).get("value", False)
|
||||
if val:
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
msg_id += 1
|
||||
if not got_response:
|
||||
time.sleep(0.2)
|
||||
return False
|
||||
|
||||
|
||||
@@ -188,16 +202,114 @@ def cdp_extract_recipe(
|
||||
out_path = output_cfg.get("path", "output.json")
|
||||
with open(out_path, "w", encoding="utf-8") as f:
|
||||
json.dump(rows, f, ensure_ascii=False, indent=2)
|
||||
elif sink == "duckdb":
|
||||
duckdb_path = output_cfg.get("duckdb_path", "")
|
||||
table_name = output_cfg.get("table", "")
|
||||
if not duckdb_path or not table_name:
|
||||
# not fatal: rows already returned via sample_rows
|
||||
pass
|
||||
else:
|
||||
import duckdb
|
||||
import uuid
|
||||
import datetime
|
||||
# resolve duckdb_path relative to FN_REGISTRY_ROOT if not absolute
|
||||
if not os.path.isabs(duckdb_path):
|
||||
duckdb_path = os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), duckdb_path)
|
||||
os.makedirs(os.path.dirname(duckdb_path), exist_ok=True)
|
||||
conn = duckdb.connect(duckdb_path)
|
||||
try:
|
||||
if rows:
|
||||
# Detect columns from first row keys (assumes list of dicts).
|
||||
if not isinstance(rows[0], dict):
|
||||
# Fallback: wrap scalar rows as {"value": v}.
|
||||
rows = [{"value": r} for r in rows]
|
||||
cols = list(rows[0].keys())
|
||||
# Build CREATE TABLE IF NOT EXISTS with VARCHAR for safety
|
||||
# plus extracted_at TIMESTAMP and run_id VARCHAR for lineage.
|
||||
col_defs = ", ".join(f'"{c}" VARCHAR' for c in cols)
|
||||
ddl = (
|
||||
f'CREATE TABLE IF NOT EXISTS "{table_name}" ('
|
||||
f' run_id VARCHAR, extracted_at TIMESTAMP, {col_defs}'
|
||||
f')'
|
||||
)
|
||||
conn.execute(ddl)
|
||||
run_id_str = uuid.uuid4().hex[:16]
|
||||
now_iso = datetime.datetime.utcnow().isoformat() + "Z"
|
||||
placeholders = ", ".join(["?"] * (len(cols) + 2))
|
||||
insert_sql = (
|
||||
f'INSERT INTO "{table_name}" '
|
||||
f'(run_id, extracted_at, {", ".join(chr(34) + c + chr(34) for c in cols)}) '
|
||||
f'VALUES ({placeholders})'
|
||||
)
|
||||
for r in rows:
|
||||
vals = [run_id_str, now_iso] + [str(r.get(c, "")) for c in cols]
|
||||
conn.execute(insert_sql, vals)
|
||||
# Also record into data_factory.runs with storage info
|
||||
registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
|
||||
if registry_root and record_run:
|
||||
import sqlite3
|
||||
df_db = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
|
||||
if os.path.exists(df_db):
|
||||
try:
|
||||
df_conn = sqlite3.connect(df_db)
|
||||
df_conn.execute("PRAGMA foreign_keys = ON")
|
||||
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
|
||||
db_id = output_cfg.get("database_id", recipe.get("name", "unknown") + "_db")
|
||||
df_run_id = uuid.uuid4().hex[:16]
|
||||
df_conn.execute(
|
||||
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
|
||||
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes,"
|
||||
" storage_db_id, storage_table)"
|
||||
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(
|
||||
df_run_id, recipe.get("name", "unknown"),
|
||||
now_iso, now_iso, "success",
|
||||
0, rows_out, 0, int(round(kb_out)), duration_ms,
|
||||
trigger, "",
|
||||
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
|
||||
db_id, table_name,
|
||||
),
|
||||
)
|
||||
df_conn.commit()
|
||||
df_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
conn.close()
|
||||
elif sink == "data_factory.runs" and record_run:
|
||||
# Escribe DIRECTO a data_factory.db evitando spawn `fn run` (loop infinito
|
||||
# si data_factory_record_run re-ejecuta esta misma funcion). Confia en que
|
||||
# el node ya existe en `nodes` con id == recipe.name.
|
||||
try:
|
||||
from pipelines.data_factory_record_run import data_factory_record_run
|
||||
data_factory_record_run(
|
||||
node_id=recipe.get("name", "unknown"),
|
||||
function_id="cdp_extract_recipe_py_pipelines",
|
||||
args={"recipe_path": recipe_path, "debug_port": debug_port},
|
||||
import sqlite3
|
||||
import datetime
|
||||
import uuid
|
||||
registry_root = os.environ.get("FN_REGISTRY_ROOT", "").strip()
|
||||
if not registry_root:
|
||||
# No fatal — el dato ya fue extraido / impreso por otro sink
|
||||
raise RuntimeError("FN_REGISTRY_ROOT not set; cannot locate data_factory.db")
|
||||
db_path = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
|
||||
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
|
||||
run_id = uuid.uuid4().hex[:16]
|
||||
now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
node_id = recipe.get("name", "unknown")
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.execute("PRAGMA foreign_keys = ON")
|
||||
conn.execute(
|
||||
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
|
||||
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes)"
|
||||
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(
|
||||
run_id, node_id, now, now, "success",
|
||||
0, rows_out, 0, int(round(kb_out)), duration_ms,
|
||||
trigger, "",
|
||||
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
# No fatal — el dato ya fue extraido
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception:
|
||||
# No fatal — el dato ya fue extraido (sample_rows en retorno)
|
||||
pass
|
||||
|
||||
return {
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
name: dedup_duckdb_table_by_hash
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
purity: impure
|
||||
version: "1.0.0"
|
||||
signature: "def dedup_duckdb_table_by_hash(duckdb_path: str, table: str, exclude_cols: list[str] | None = None) -> dict"
|
||||
description: "Elimina filas duplicadas de una tabla DuckDB calculando un md5 de las columnas de datos. Anade columna row_hash idempotentemente, actualiza hashes nulos y borra duplicados conservando la primera insercion por rowid."
|
||||
tags: [dedup, duckdb, transformer, pipeline, dataops]
|
||||
uses_functions: [cdp_extract_recipe_py_pipelines]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: error_go_core
|
||||
imports: [duckdb]
|
||||
tested: true
|
||||
tests:
|
||||
- "dedup elimina filas duplicadas y conserva unicas"
|
||||
test_file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash_test.py"
|
||||
file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash.py"
|
||||
params:
|
||||
- name: duckdb_path
|
||||
desc: "Ruta DuckDB file (absoluta o relativa a FN_REGISTRY_ROOT)."
|
||||
- name: table
|
||||
desc: "Nombre tabla a deduplicar."
|
||||
- name: exclude_cols
|
||||
desc: "Cols a excluir del hash (metadata como run_id, extracted_at, row_hash). None usa default [run_id, extracted_at, row_hash]."
|
||||
output: "dict {status, rows_before, rows_after, dedup_removed, duration_ms, hash_column}"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
|
||||
|
||||
r = dedup_duckdb_table_by_hash("apps/data_factory/data/hn_top_stories.duckdb", "hn_stories")
|
||||
print(r)
|
||||
# {"status": "ok", "rows_before": 120, "rows_after": 30, "dedup_removed": 90, "duration_ms": 45, "hash_column": "row_hash"}
|
||||
```
|
||||
|
||||
CLI directo:
|
||||
|
||||
```bash
|
||||
/home/lucas/fn_registry/python/.venv/bin/python3 \
|
||||
python/functions/pipelines/dedup_duckdb_table_by_hash.py \
|
||||
apps/data_factory/data/hn_top_stories.duckdb hn_stories
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando un extractor periodico re-inserta filas iguales (mismo contenido, distinto `run_id`/`extracted_at`) y quieres deduplicar in-place sin tocar el pipeline upstream. Tipicamente como paso `transformer` despues de `cdp_extract_recipe` en un DAG de scraping.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **rowid y VACUUM**: DuckDB rowid puede recalcularse tras `VACUUM`. En esta funcion solo se usa dentro de la misma transaccion de DELETE, por lo que no hay inconsistencia practica.
|
||||
- **Colisiones md5**: md5 no colisiona en practica para tablas de escala HN (miles de filas). Si la tabla crece a millones de filas con datos binarios, cambiar `md5(...)` por `sha256(...)` en el SQL.
|
||||
- **Tabla inexistente**: si `<table>` no existe en el DuckDB, retorna `status=error` con mensaje descriptivo en lugar de lanzar excepcion.
|
||||
- **exclude_cols case**: la comparacion de columnas excluidas es case-insensitive (`c.lower()`), pero el nombre en la query se usa tal cual lo devuelve `DESCRIBE`.
|
||||
- **Primera ejecucion**: si la tabla ya tiene `row_hash` de una ejecucion anterior, solo se actualizan las filas con `row_hash IS NULL` (idempotente).
|
||||
@@ -0,0 +1,141 @@
|
||||
"""dedup_duckdb_table_by_hash — Remove duplicate rows from a DuckDB table using md5 hash of data columns."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _quote_ident(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _dedup_error(message: str, t0: float) -> dict[str, Any]:
    """Build the error-shaped result dict shared by every failure path."""
    return {
        "status": "error",
        "error": message,
        "rows_before": 0,
        "rows_after": 0,
        "dedup_removed": 0,
        "duration_ms": int((time.monotonic() - t0) * 1000),
        "hash_column": "row_hash",
    }


def dedup_duckdb_table_by_hash(
    duckdb_path: str,
    table: str,
    exclude_cols: list[str] | None = None,
) -> dict[str, Any]:
    """Remove duplicate rows from a DuckDB table using an md5 hash of its data columns.

    The hash is stored in a ``row_hash`` VARCHAR column, which is added when
    missing. Only rows whose ``row_hash`` is NULL are (re)hashed, so hashes
    written by an earlier run are kept as they are. For every distinct hash
    the row with the smallest ``rowid`` survives; the others are deleted.

    Args:
        duckdb_path: Path to the DuckDB file. A relative path is resolved
            against the FN_REGISTRY_ROOT environment variable, falling back
            to the current working directory.
        table: Name of the table to deduplicate.
        exclude_cols: Columns left out of the hash (compared
            case-insensitively). Defaults to
            ["run_id", "extracted_at", "row_hash"]. ``row_hash`` is always
            excluded, even when the caller's list omits it.

    Returns:
        dict with keys status, rows_before, rows_after, dedup_removed,
        duration_ms and hash_column. When status is "error" the dict also
        carries an "error" message and the row counters are 0.
    """
    t0 = time.monotonic()

    # Resolve path against FN_REGISTRY_ROOT if relative.
    if not os.path.isabs(duckdb_path):
        root = os.environ.get("FN_REGISTRY_ROOT", os.getcwd())
        duckdb_path = os.path.join(root, duckdb_path)

    # duckdb.connect() creates a new database when the file is missing, which
    # would leave a stray empty file behind for a mistyped path. Fail first.
    if not os.path.isfile(duckdb_path):
        return _dedup_error(f"DuckDB file not found: {duckdb_path}", t0)

    # Imported lazily so the module can be loaded without duckdb installed.
    import duckdb  # type: ignore

    if exclude_cols is None:
        exclude_cols = ["run_id", "extracted_at", "row_hash"]

    exclude_set = {c.lower() for c in exclude_cols}
    # The hash column must never feed its own hash. If it did, and every other
    # column were excluded, all rows would hash alike and all but one row
    # would be deleted.
    exclude_set.add("row_hash")

    quoted_table = _quote_ident(table)

    conn = duckdb.connect(duckdb_path)
    try:
        # Verify table exists.
        tables = [r[0] for r in conn.execute("SHOW TABLES").fetchall()]
        if table not in tables:
            return _dedup_error(
                f"Table '{table}' not found in {duckdb_path}. Available: {tables}",
                t0,
            )

        # Introspect columns.
        all_cols = [r[0] for r in conn.execute(f"DESCRIBE {quoted_table}").fetchall()]

        # Data columns = all columns minus excluded. Checked before any schema
        # change so that a failing call leaves the table untouched.
        data_cols = [c for c in all_cols if c.lower() not in exclude_set]
        if not data_cols:
            return _dedup_error("No data columns remaining after exclusion.", t0)

        # Add row_hash column if missing (idempotent).
        if not any(c.lower() == "row_hash" for c in all_cols):
            conn.execute(f"ALTER TABLE {quoted_table} ADD COLUMN row_hash VARCHAR")

        # md5 over the tab-joined text form of each data column. NULL is
        # coalesced to '', so NULL and the empty string hash identically.
        parts = " || '\t' || ".join(
            f"COALESCE(CAST({_quote_ident(c)} AS VARCHAR), '')" for c in data_cols
        )
        hash_expr = f"md5({parts})"

        # Only rows without a hash are updated; existing hashes are preserved.
        conn.execute(
            f"UPDATE {quoted_table} SET row_hash = {hash_expr} WHERE row_hash IS NULL"
        )

        rows_before = conn.execute(f"SELECT count(*) FROM {quoted_table}").fetchone()[0]

        # Delete duplicates, keeping the row with the smallest rowid per hash.
        conn.execute(
            f"""
            DELETE FROM {quoted_table}
            WHERE rowid NOT IN (
                SELECT min(rowid) FROM {quoted_table} GROUP BY row_hash
            )
            """
        )

        rows_after = conn.execute(f"SELECT count(*) FROM {quoted_table}").fetchone()[0]
    finally:
        conn.close()

    return {
        "status": "ok",
        "rows_before": rows_before,
        "rows_after": rows_after,
        "dedup_removed": rows_before - rows_after,
        "duration_ms": int((time.monotonic() - t0) * 1000),
        "hash_column": "row_hash",
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: dedup one table and print the result dict as JSON.
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Dedup a DuckDB table by row hash.")
    cli.add_argument("duckdb_path", help="Path to DuckDB file")
    cli.add_argument("table", help="Table name to deduplicate")
    # Omitting the flag leaves None, so the function applies its own default.
    cli.add_argument(
        "--exclude-cols",
        default=None,
        nargs="*",
        help="Columns to exclude from hash (default: run_id extracted_at row_hash)",
    )
    ns = cli.parse_args()

    outcome = dedup_duckdb_table_by_hash(ns.duckdb_path, ns.table, ns.exclude_cols)
    print(json.dumps(outcome, indent=2))
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Tests para dedup_duckdb_table_by_hash."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
|
||||
|
||||
|
||||
def _make_test_db(path: str) -> None:
    """Create a DuckDB file at *path* with a ``stories`` table of five rows.

    The two ``run-002`` rows repeat the data columns (rank, title, url,
    points) of the first two ``run-001`` rows, so the table holds three
    unique data rows and two duplicates.
    """
    create_sql = """
        CREATE TABLE stories (
            run_id VARCHAR,
            extracted_at TIMESTAMP,
            rank INTEGER,
            title VARCHAR,
            url VARCHAR,
            points INTEGER
        )
        """
    insert_sql = """
        INSERT INTO stories VALUES
        ('run-001', '2026-05-16 10:00:00', 1, 'Story A', 'https://a.com', 100),
        ('run-001', '2026-05-16 10:00:00', 2, 'Story B', 'https://b.com', 200),
        ('run-001', '2026-05-16 10:00:00', 3, 'Story C', 'https://c.com', 300),
        ('run-002', '2026-05-16 10:30:00', 1, 'Story A', 'https://a.com', 100),
        ('run-002', '2026-05-16 10:30:00', 2, 'Story B', 'https://b.com', 200)
        """
    db = duckdb.connect(path)
    for statement in (create_sql, insert_sql):
        db.execute(statement)
    db.close()
|
||||
|
||||
|
||||
def test_dedup_elimina_filas_duplicadas_y_conserva_unicas():
    """Dedup removes the duplicated rows and keeps the unique ones."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        outcome = dedup_duckdb_table_by_hash(db_file, "stories")

        assert outcome["status"] == "ok", f"Expected ok, got: {outcome}"
        assert outcome["rows_before"] == 5
        assert outcome["rows_after"] == 3, f"Expected 3 unique rows, got {outcome['rows_after']}"
        assert outcome["dedup_removed"] == 2
        assert outcome["hash_column"] == "row_hash"
        assert outcome["duration_ms"] >= 0

        # Re-open the database to check that row_hash exists and is populated.
        db = duckdb.connect(db_file)
        distinct_hashes = [
            row[0]
            for row in db.execute("SELECT DISTINCT row_hash FROM stories").fetchall()
        ]
        db.close()

        assert len(distinct_hashes) == 3, f"Expected 3 distinct hashes, got {len(distinct_hashes)}"
        # No surviving row may be left without a hash.
        assert None not in distinct_hashes, "Some row_hash values are NULL"
|
||||
|
||||
|
||||
def test_dedup_idempotente():
    """A second dedup pass over an already deduplicated table removes nothing."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        first_pass, second_pass = (
            dedup_duckdb_table_by_hash(db_file, "stories") for _ in range(2)
        )

        assert first_pass["status"] == "ok"
        assert second_pass["status"] == "ok"
        assert second_pass["rows_before"] == 3
        assert second_pass["rows_after"] == 3
        assert second_pass["dedup_removed"] == 0
|
||||
|
||||
|
||||
def test_dedup_tabla_inexistente():
    """A missing table yields status=error with the table name in the message."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "empty.duckdb")
        # Connecting and closing leaves an empty database file on disk.
        duckdb.connect(db_file).close()

        outcome = dedup_duckdb_table_by_hash(db_file, "nonexistent_table")

        assert outcome["status"] == "error"
        assert "nonexistent_table" in outcome["error"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly, verbosely, without the pytest CLI.
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: regenerate_app_icons
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def regenerate_app_icons(only: list[str] | None = None) -> dict"
|
||||
description: "Escanea todas las apps C++ del registry, lee el bloque `icon: {phosphor, accent}` de cada app.md y regenera el appicon.ico via generate_app_icon. Reemplaza el script ad-hoc dev/gen_app_icons.py."
|
||||
tags: [cpp-windows, icon, phosphor, batch]
|
||||
uses_functions: [generate_app_icon_py_infra]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [os, sys, pathlib, typing, yaml]
|
||||
params:
|
||||
- name: only
|
||||
desc: "Lista opcional de nombres de app (campo `name` del frontmatter) a procesar. Si None, regenera todas las apps C++ con icon: declarado."
|
||||
output: "dict {ok: [name], skipped: [{name, reason}], failed: [{name, error}]}"
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/pipelines/regenerate_app_icons.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```bash
|
||||
# Regenerar todas las apps C++ con icon: declarado
|
||||
./fn run regenerate_app_icons
|
||||
|
||||
# Solo una app
|
||||
./fn run regenerate_app_icons chart_demo
|
||||
|
||||
# Varias apps
|
||||
./fn run regenerate_app_icons chart_demo registry_dashboard
|
||||
```
|
||||
|
||||
```python
|
||||
import sys
|
||||
sys.path.insert(0, "python/functions")
|
||||
from pipelines.regenerate_app_icons import regenerate_app_icons
|
||||
|
||||
result = regenerate_app_icons()
|
||||
print(f"OK: {len(result['ok'])}, FAIL: {len(result['failed'])}")
|
||||
```
|
||||
|
||||
Bloque `icon:` esperado en `app.md`:
|
||||
```yaml
|
||||
icon:
|
||||
phosphor: "chart-bar"
|
||||
accent: "#0ea5e9"
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando anades una app C++ nueva (anades `icon:` a su `app.md` y corres el pipeline), cambias el color/glyph de una app existente, o traes cambios de iconos desde otra rama (`git pull`). Ejecutala antes de `redeploy_cpp_app_windows` para que el `.exe` lleve el icono actualizado.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Sobreescribe `appicon.ico` sin warning** — igual que `generate_app_icon`. Hacer backup si necesitas preservar version anterior.
|
||||
- **Requiere `sources/phosphor-core/`**: clonar con `git clone --depth=1 https://github.com/phosphor-icons/core.git sources/phosphor-core` si no existe.
|
||||
- **Solo procesa apps con `lang: cpp`** en frontmatter — apps Go/Python se ignoran aunque tengan `icon:`.
|
||||
- **Apps sin `icon:` se reportan en `skipped`**, no son error. Util para detectar apps C++ a las que falta declarar el icono.
|
||||
- **No invalida el cache de iconos de Windows** — si Explorer no muestra el icono nuevo tras redeploy: `ie4uinit.exe -show` o reiniciar Explorer.
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Regenera el appicon.ico de todas las apps C++ que declaren bloque icon: en su app.md."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from infra.generate_app_icon import generate_app_icon
|
||||
|
||||
|
||||
def _find_registry_root() -> Path:
    """Locate the registry root directory.

    A non-empty FN_REGISTRY_ROOT environment variable wins and is returned
    resolved. Otherwise the ancestors of this file are searched, nearest
    first, for a directory containing ``registry.db``.

    Raises:
        FileNotFoundError: if no ancestor directory holds ``registry.db``.
    """
    override = os.environ.get("FN_REGISTRY_ROOT")
    if not override:
        here = Path(__file__).resolve()
        found = next(
            (folder for folder in here.parents if (folder / "registry.db").exists()),
            None,
        )
        if found is None:
            raise FileNotFoundError("registry.db no encontrado; define FN_REGISTRY_ROOT")
        return found
    return Path(override).resolve()
|
||||
|
||||
|
||||
def _read_frontmatter(md_path: Path) -> Optional[dict]:
    """Parse the YAML frontmatter at the top of a markdown file.

    Args:
        md_path: Markdown file to read (decoded as UTF-8).

    Returns:
        The frontmatter as a dict, or None when the file does not start with
        ``---``, has no closing ``---`` line, holds invalid YAML, or the YAML
        document is not a mapping.
    """
    text = md_path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return None
    # Search from offset 3 so the opening delimiter is not matched again.
    end = text.find("\n---", 3)
    if end < 0:
        return None
    try:
        parsed = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return None
    # safe_load returns whatever the document holds (scalar, list, None, ...).
    # Callers use .get(), so a non-mapping is treated as "no frontmatter"
    # instead of raising AttributeError in the middle of a scan.
    return parsed if isinstance(parsed, dict) else None
|
||||
|
||||
|
||||
def _iter_cpp_app_mds(root: Path):
    """Yield ``(app_md_path, frontmatter)`` for every C++ app under *root*.

    Both ``apps/*/app.md`` and ``projects/*/apps/*/app.md`` are scanned, each
    in sorted order. Files with no usable frontmatter, or whose ``lang`` is
    not ``cpp``, are skipped.
    """
    patterns = ("apps/*/app.md", "projects/*/apps/*/app.md")
    for glob_pattern in patterns:
        for md_file in sorted(root.glob(glob_pattern)):
            front = _read_frontmatter(md_file)
            if front and front.get("lang") == "cpp":
                yield md_file, front
|
||||
|
||||
|
||||
def regenerate_app_icons(only: Optional[list[str]] = None) -> dict:
    """Regenerate ``appicon.ico`` for C++ apps that declare an ``icon:`` block.

    Args:
        only: Optional list of app names to process, matched against the
            frontmatter ``name`` (falling back to the app directory name).
            None or an empty list processes every C++ app.

    Returns:
        dict with keys ``ok`` (list of app names), ``skipped`` (list of
        ``{name, reason}``) and ``failed`` (list of ``{name, error}``).
    """
    registry_root = _find_registry_root()
    report: dict = {"ok": [], "skipped": [], "failed": []}

    for app_md, meta in _iter_cpp_app_mds(registry_root):
        app_name = meta.get("name", app_md.parent.name)
        if only and app_name not in only:
            continue

        icon_cfg = meta.get("icon")
        if not (icon_cfg and isinstance(icon_cfg, dict)):
            report["skipped"].append({"name": app_name, "reason": "no icon: block"})
            continue

        glyph, color = icon_cfg.get("phosphor"), icon_cfg.get("accent")
        if not (glyph and color):
            report["skipped"].append(
                {"name": app_name, "reason": "icon: missing phosphor/accent"}
            )
            continue

        # The icon is written next to the app.md it was declared in.
        target = app_md.parent / "appicon.ico"
        try:
            generate_app_icon(
                phosphor_icon_name=glyph,
                accent_hex=color,
                out_ico_path=str(target),
            )
        except Exception as exc:
            # One failing app is recorded and the batch carries on.
            report["failed"].append({"name": app_name, "error": str(exc)})
        else:
            report["ok"].append(app_name)

    return report
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Positional CLI arguments, when given, restrict the run to those app names.
    selected = sys.argv[1:] or None
    report = regenerate_app_icons(only=selected)

    for app_name in report["ok"]:
        print(f"OK {app_name}")
    for entry in report["skipped"]:
        print(f"SKIP {entry['name']}: {entry['reason']}")
    for entry in report["failed"]:
        print(f"FAIL {entry['name']}: {entry['error']}")

    # Exit non-zero when at least one icon failed to generate.
    sys.exit(int(bool(report["failed"])))
|
||||
Reference in New Issue
Block a user