docs(flows): DoD obligatorio con user-facing surface + abrir issues 0100-0103 (taxonomia, frontmatter migration, dev_console, work dashboard)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@ name: cdp_extract_recipe
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
version: "1.2.0"
|
||||
purity: impure
|
||||
signature: "def cdp_extract_recipe(recipe_path: str, debug_port: int = 9222, tab_id: str | None = None, record_run: bool = True) -> dict"
|
||||
description: "Ejecuta una recipe YAML contra Chrome remoto via CDP. Valida recipe, busca tab por url_pattern, ejecuta steps (wait_selector/js) y envia resultado al sink declarado."
|
||||
@@ -22,7 +22,7 @@ params:
|
||||
- name: tab_id
|
||||
desc: "ID del tab a usar. Si None, busca tab cuyo URL matchee url_pattern de la recipe."
|
||||
- name: record_run
|
||||
desc: "Si True y output.sink=='data_factory.runs', registra la ejecucion en data_factory."
|
||||
desc: "Si True, registra la ejecucion en data_factory.runs (para sink 'data_factory.runs' y 'duckdb')."
|
||||
output: "dict {status: ok|error, rows_out: int, kb_out: float, duration_ms: int, error: str, sample_rows: list}"
|
||||
tested: false
|
||||
tests: []
|
||||
@@ -60,6 +60,10 @@ output:
|
||||
|
||||
Cuando tienes una recipe YAML validada y Chrome corriendo con remote debugging, y quieres extraer datos en un solo paso sin montar pipeline manualmente. Encadena con `cdp_open_url_and_wait` si necesitas abrir la URL primero.
|
||||
|
||||
## Capability growth log
|
||||
|
||||
- v1.2.0 (2026-05-16) — sink `duckdb` writes rows to a DuckDB file + registers run in data_factory.runs with storage_db_id/storage_table for traceability.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Chrome debe estar corriendo con `--remote-debugging-port=<debug_port>`.
|
||||
|
||||
@@ -41,9 +41,14 @@ def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 1
|
||||
|
||||
|
||||
def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
|
||||
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null."""
|
||||
"""Polling cada 200ms hasta que document.querySelector(selector) no sea null.
|
||||
|
||||
Drena eventos CDP (paginas con Page.enable emiten loads, frames, etc.) y
|
||||
matchea por `id` para evitar leer respuestas ajenas o eventos del server.
|
||||
"""
|
||||
deadline = time.time() + timeout_s
|
||||
msg_id = 1000
|
||||
ws.settimeout(0.5)
|
||||
while time.time() < deadline:
|
||||
ws.send(json.dumps({
|
||||
"id": msg_id,
|
||||
@@ -53,19 +58,28 @@ def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
|
||||
"returnByValue": True,
|
||||
}
|
||||
}))
|
||||
time.sleep(0.2)
|
||||
msg_id += 1
|
||||
# Leer respuesta en loop simple (websocket-client sync)
|
||||
# Para modo sync usamos recv()
|
||||
try:
|
||||
raw = ws.sock.recv()
|
||||
if raw:
|
||||
# Leer hasta 30 frames buscando uno con nuestro id; ignorar eventos.
|
||||
got_response = False
|
||||
for _ in range(30):
|
||||
try:
|
||||
raw = ws.recv()
|
||||
except Exception:
|
||||
break
|
||||
if not raw:
|
||||
break
|
||||
try:
|
||||
msg = json.loads(raw)
|
||||
except Exception:
|
||||
continue
|
||||
if msg.get("id") == msg_id:
|
||||
got_response = True
|
||||
val = msg.get("result", {}).get("result", {}).get("value", False)
|
||||
if val:
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
msg_id += 1
|
||||
if not got_response:
|
||||
time.sleep(0.2)
|
||||
return False
|
||||
|
||||
|
||||
@@ -188,16 +202,114 @@ def cdp_extract_recipe(
|
||||
out_path = output_cfg.get("path", "output.json")
|
||||
with open(out_path, "w", encoding="utf-8") as f:
|
||||
json.dump(rows, f, ensure_ascii=False, indent=2)
|
||||
elif sink == "duckdb":
|
||||
duckdb_path = output_cfg.get("duckdb_path", "")
|
||||
table_name = output_cfg.get("table", "")
|
||||
if not duckdb_path or not table_name:
|
||||
# not fatal: rows already returned via sample_rows
|
||||
pass
|
||||
else:
|
||||
import duckdb
|
||||
import uuid
|
||||
import datetime
|
||||
# resolve duckdb_path relative to FN_REGISTRY_ROOT if not absolute
|
||||
if not os.path.isabs(duckdb_path):
|
||||
duckdb_path = os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), duckdb_path)
|
||||
os.makedirs(os.path.dirname(duckdb_path), exist_ok=True)
|
||||
conn = duckdb.connect(duckdb_path)
|
||||
try:
|
||||
if rows:
|
||||
# Detect columns from first row keys (assumes list of dicts).
|
||||
if not isinstance(rows[0], dict):
|
||||
# Fallback: wrap scalar rows as {"value": v}.
|
||||
rows = [{"value": r} for r in rows]
|
||||
cols = list(rows[0].keys())
|
||||
# Build CREATE TABLE IF NOT EXISTS with VARCHAR for safety
|
||||
# plus extracted_at TIMESTAMP and run_id VARCHAR for lineage.
|
||||
col_defs = ", ".join(f'"{c}" VARCHAR' for c in cols)
|
||||
ddl = (
|
||||
f'CREATE TABLE IF NOT EXISTS "{table_name}" ('
|
||||
f' run_id VARCHAR, extracted_at TIMESTAMP, {col_defs}'
|
||||
f')'
|
||||
)
|
||||
conn.execute(ddl)
|
||||
run_id_str = uuid.uuid4().hex[:16]
|
||||
now_iso = datetime.datetime.utcnow().isoformat() + "Z"
|
||||
placeholders = ", ".join(["?"] * (len(cols) + 2))
|
||||
insert_sql = (
|
||||
f'INSERT INTO "{table_name}" '
|
||||
f'(run_id, extracted_at, {", ".join(chr(34) + c + chr(34) for c in cols)}) '
|
||||
f'VALUES ({placeholders})'
|
||||
)
|
||||
for r in rows:
|
||||
vals = [run_id_str, now_iso] + [str(r.get(c, "")) for c in cols]
|
||||
conn.execute(insert_sql, vals)
|
||||
# Also record into data_factory.runs with storage info
|
||||
registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
|
||||
if registry_root and record_run:
|
||||
import sqlite3
|
||||
df_db = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
|
||||
if os.path.exists(df_db):
|
||||
try:
|
||||
df_conn = sqlite3.connect(df_db)
|
||||
df_conn.execute("PRAGMA foreign_keys = ON")
|
||||
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
|
||||
db_id = output_cfg.get("database_id", recipe.get("name", "unknown") + "_db")
|
||||
df_run_id = uuid.uuid4().hex[:16]
|
||||
df_conn.execute(
|
||||
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
|
||||
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes,"
|
||||
" storage_db_id, storage_table)"
|
||||
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(
|
||||
df_run_id, recipe.get("name", "unknown"),
|
||||
now_iso, now_iso, "success",
|
||||
0, rows_out, 0, int(round(kb_out)), duration_ms,
|
||||
trigger, "",
|
||||
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
|
||||
db_id, table_name,
|
||||
),
|
||||
)
|
||||
df_conn.commit()
|
||||
df_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
conn.close()
|
||||
elif sink == "data_factory.runs" and record_run:
|
||||
# Escribe DIRECTO a data_factory.db evitando spawn `fn run` (loop infinito
|
||||
# si data_factory_record_run re-ejecuta esta misma funcion). Confia en que
|
||||
# el node ya existe en `nodes` con id == recipe.name.
|
||||
try:
|
||||
from pipelines.data_factory_record_run import data_factory_record_run
|
||||
data_factory_record_run(
|
||||
node_id=recipe.get("name", "unknown"),
|
||||
function_id="cdp_extract_recipe_py_pipelines",
|
||||
args={"recipe_path": recipe_path, "debug_port": debug_port},
|
||||
import sqlite3
|
||||
import datetime
|
||||
import uuid
|
||||
registry_root = os.environ.get("FN_REGISTRY_ROOT", "").strip()
|
||||
if not registry_root:
|
||||
# No fatal — el dato ya fue extraido / impreso por otro sink
|
||||
raise RuntimeError("FN_REGISTRY_ROOT not set; cannot locate data_factory.db")
|
||||
db_path = os.path.join(registry_root, "apps", "data_factory", "data_factory.db")
|
||||
trigger = "dag" if os.environ.get("DAGU_ENV") else "manual"
|
||||
run_id = uuid.uuid4().hex[:16]
|
||||
now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
node_id = recipe.get("name", "unknown")
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.execute("PRAGMA foreign_keys = ON")
|
||||
conn.execute(
|
||||
"INSERT INTO runs(id, node_id, started_at, finished_at, status,"
|
||||
" rows_in, rows_out, kb_in, kb_out, duration_ms, trigger, error, notes)"
|
||||
" VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(
|
||||
run_id, node_id, now, now, "success",
|
||||
0, rows_out, 0, int(round(kb_out)), duration_ms,
|
||||
trigger, "",
|
||||
json.dumps({"sample": sample_rows[:2]}, ensure_ascii=False)[:1000],
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
# No fatal — el dato ya fue extraido
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception:
|
||||
# No fatal — el dato ya fue extraido (sample_rows en retorno)
|
||||
pass
|
||||
|
||||
return {
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
name: dedup_duckdb_table_by_hash
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
purity: impure
|
||||
version: "1.0.0"
|
||||
signature: "def dedup_duckdb_table_by_hash(duckdb_path: str, table: str, exclude_cols: list[str] | None = None) -> dict"
|
||||
description: "Elimina filas duplicadas de una tabla DuckDB calculando un md5 de las columnas de datos. Anade columna row_hash idempotentemente, actualiza hashes nulos y borra duplicados conservando la primera insercion por rowid."
|
||||
tags: [dedup, duckdb, transformer, pipeline, dataops]
|
||||
uses_functions: [cdp_extract_recipe_py_pipelines]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: error_go_core
|
||||
imports: [duckdb]
|
||||
tested: true
|
||||
tests:
|
||||
- "dedup elimina filas duplicadas y conserva unicas"
|
||||
test_file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash_test.py"
|
||||
file_path: "python/functions/pipelines/dedup_duckdb_table_by_hash.py"
|
||||
params:
|
||||
- name: duckdb_path
|
||||
desc: "Ruta DuckDB file (absoluta o relativa a FN_REGISTRY_ROOT)."
|
||||
- name: table
|
||||
desc: "Nombre tabla a deduplicar."
|
||||
- name: exclude_cols
|
||||
desc: "Cols a excluir del hash (metadata como run_id, extracted_at, row_hash). None usa default [run_id, extracted_at, row_hash]."
|
||||
output: "dict {status, rows_before, rows_after, dedup_removed, duration_ms, hash_column}"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
|
||||
|
||||
r = dedup_duckdb_table_by_hash("apps/data_factory/data/hn_top_stories.duckdb", "hn_stories")
|
||||
print(r)
|
||||
# {"status": "ok", "rows_before": 120, "rows_after": 30, "dedup_removed": 90, "duration_ms": 45, "hash_column": "row_hash"}
|
||||
```
|
||||
|
||||
CLI directo:
|
||||
|
||||
```bash
|
||||
/home/lucas/fn_registry/python/.venv/bin/python3 \
|
||||
python/functions/pipelines/dedup_duckdb_table_by_hash.py \
|
||||
apps/data_factory/data/hn_top_stories.duckdb hn_stories
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando un extractor periodico re-inserta filas iguales (mismo contenido, distinto `run_id`/`extracted_at`) y quieres deduplicar in-place sin tocar el pipeline upstream. Tipicamente como paso `transformer` despues de `cdp_extract_recipe` en un DAG de scraping.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **rowid y VACUUM**: DuckDB rowid puede recalcularse tras `VACUUM`. En esta funcion solo se usa dentro de la misma transaccion de DELETE, por lo que no hay inconsistencia practica.
|
||||
- **Colisiones md5**: md5 no colisiona en practica para tablas de escala HN (miles de filas). Si la tabla crece a millones de filas con datos binarios, cambiar `md5(...)` por `sha256(...)` en el SQL.
|
||||
- **Tabla inexistente**: si `<table>` no existe en el DuckDB, retorna `status=error` con mensaje descriptivo en lugar de lanzar excepcion.
|
||||
- **exclude_cols case**: la comparacion de columnas excluidas es case-insensitive (`c.lower()`), pero el nombre en la query se usa tal cual lo devuelve `DESCRIBE`.
|
||||
- **Re-ejecucion**: si la tabla ya tiene `row_hash` de una ejecucion anterior, solo se calcula el hash de las filas con `row_hash IS NULL`; los hashes ya existentes no se recalculan (idempotente).
|
||||
@@ -0,0 +1,141 @@
|
||||
"""dedup_duckdb_table_by_hash — Remove duplicate rows from a DuckDB table using md5 hash of data columns."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
|
||||
def dedup_duckdb_table_by_hash(
    duckdb_path: str,
    table: str,
    exclude_cols: list[str] | None = None,
) -> dict[str, Any]:
    """Remove duplicate rows from a DuckDB table using an md5 hash of its data columns.

    The function adds a ``row_hash`` VARCHAR column when the table lacks one,
    fills it for rows where it is NULL, and then deletes every row whose
    ``rowid`` is not the smallest one of its ``row_hash`` group.

    Args:
        duckdb_path: Path to the DuckDB file. A relative path is joined to
            ``FN_REGISTRY_ROOT`` (or the current working directory when that
            variable is not defined).
        table: Name of the table to deduplicate.
        exclude_cols: Columns left out of the hash (compared
            case-insensitively). ``None`` means
            ``["run_id", "extracted_at", "row_hash"]``. The hash column itself
            is always left out, even when a custom list omits it.

    Returns:
        On success: ``{"status": "ok", "rows_before", "rows_after",
        "dedup_removed", "duration_ms", "hash_column"}``.
        When the table is missing or no data column remains after exclusion:
        the same keys with zeroed counters, ``"status": "error"`` and an
        ``"error"`` message. Errors raised by DuckDB itself propagate.
    """
    import duckdb  # type: ignore

    hash_column = "row_hash"
    t0 = time.monotonic()

    def _quote(identifier: str) -> str:
        # SQL identifier quoting: an embedded double quote is escaped by doubling it.
        return '"' + identifier.replace('"', '""') + '"'

    def _error(message: str) -> dict[str, Any]:
        return {
            "status": "error",
            "error": message,
            "rows_before": 0,
            "rows_after": 0,
            "dedup_removed": 0,
            "duration_ms": int((time.monotonic() - t0) * 1000),
            "hash_column": hash_column,
        }

    # Resolve path against FN_REGISTRY_ROOT if relative
    if not os.path.isabs(duckdb_path):
        root = os.environ.get("FN_REGISTRY_ROOT", os.getcwd())
        duckdb_path = os.path.join(root, duckdb_path)

    if exclude_cols is None:
        exclude_cols = ["run_id", "extracted_at", hash_column]

    exclude_set = {c.lower() for c in exclude_cols}
    # The hash must never feed on its own column. Without this, a custom
    # exclude list that drops every data column leaves only row_hash, every
    # row hashes to md5('') and the DELETE below removes all rows but one.
    exclude_set.add(hash_column)

    quoted_table = _quote(table)

    conn = duckdb.connect(duckdb_path)
    try:
        # Verify table exists
        tables = [r[0] for r in conn.execute("SHOW TABLES").fetchall()]
        if table not in tables:
            return _error(
                f"Table '{table}' not found in {duckdb_path}. Available: {tables}"
            )

        # Introspect columns (first field of each DESCRIBE row is the column name)
        all_cols = [r[0] for r in conn.execute(f"DESCRIBE {quoted_table}").fetchall()]

        # Data columns = all columns minus excluded (hash column included in the exclusion)
        data_cols = [c for c in all_cols if c.lower() not in exclude_set]
        if not data_cols:
            return _error("No data columns remaining after exclusion.")

        # Add row_hash column if missing (idempotent)
        if hash_column not in {c.lower() for c in all_cols}:
            conn.execute(
                f"ALTER TABLE {quoted_table} ADD COLUMN {hash_column} VARCHAR"
            )

        # md5 over the tab-separated text form of every data column;
        # NULL is folded to '' so it does not null out the whole concatenation.
        parts = " || '\t' || ".join(
            f"COALESCE(CAST({_quote(c)} AS VARCHAR), '')" for c in data_cols
        )
        hash_expr = f"md5({parts})"

        # Only rows without a hash are touched, so re-runs leave existing hashes alone.
        conn.execute(
            f"UPDATE {quoted_table} SET {hash_column} = {hash_expr} "
            f"WHERE {hash_column} IS NULL"
        )

        rows_before = conn.execute(
            f"SELECT count(*) FROM {quoted_table}"
        ).fetchone()[0]

        # Delete duplicates, keeping the row with the smallest rowid per hash
        conn.execute(
            f"""
            DELETE FROM {quoted_table}
            WHERE rowid NOT IN (
                SELECT min(rowid) FROM {quoted_table} GROUP BY {hash_column}
            )
            """
        )

        rows_after = conn.execute(
            f"SELECT count(*) FROM {quoted_table}"
        ).fetchone()[0]
    finally:
        conn.close()

    return {
        "status": "ok",
        "rows_before": rows_before,
        "rows_after": rows_after,
        "dedup_removed": rows_before - rows_after,
        "duration_ms": int((time.monotonic() - t0) * 1000),
        "hash_column": hash_column,
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse
    import json

    # Command-line entry point: two positional arguments plus an optional
    # list of columns to leave out of the hash; the result dict is printed as JSON.
    cli = argparse.ArgumentParser(description="Dedup a DuckDB table by row hash.")
    for positional, help_text in (
        ("duckdb_path", "Path to DuckDB file"),
        ("table", "Table name to deduplicate"),
    ):
        cli.add_argument(positional, help=help_text)
    cli.add_argument(
        "--exclude-cols",
        nargs="*",
        default=None,
        help="Columns to exclude from hash (default: run_id extracted_at row_hash)",
    )
    ns = cli.parse_args()

    outcome = dedup_duckdb_table_by_hash(ns.duckdb_path, ns.table, ns.exclude_cols)
    print(json.dumps(outcome, indent=2))
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Tests para dedup_duckdb_table_by_hash."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from pipelines.dedup_duckdb_table_by_hash import dedup_duckdb_table_by_hash
|
||||
|
||||
|
||||
def _make_test_db(path: str) -> None:
    """Build the DuckDB fixture used by the tests.

    Creates table ``stories`` at ``path`` and inserts 5 rows: three distinct
    stories from ``run-001`` plus two of them repeated under ``run-002``.
    """
    create_sql = """
        CREATE TABLE stories (
            run_id VARCHAR,
            extracted_at TIMESTAMP,
            rank INTEGER,
            title VARCHAR,
            url VARCHAR,
            points INTEGER
        )
        """
    seed_sql = """
        INSERT INTO stories VALUES
        ('run-001', '2026-05-16 10:00:00', 1, 'Story A', 'https://a.com', 100),
        ('run-001', '2026-05-16 10:00:00', 2, 'Story B', 'https://b.com', 200),
        ('run-001', '2026-05-16 10:00:00', 3, 'Story C', 'https://c.com', 300),
        ('run-002', '2026-05-16 10:30:00', 1, 'Story A', 'https://a.com', 100),
        ('run-002', '2026-05-16 10:30:00', 2, 'Story B', 'https://b.com', 200)
        """
    db = duckdb.connect(path)
    for statement in (create_sql, seed_sql):
        db.execute(statement)
    db.close()
|
||||
|
||||
|
||||
def test_dedup_elimina_filas_duplicadas_y_conserva_unicas():
    """Dedup removes the duplicated rows and keeps the unique ones."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        outcome = dedup_duckdb_table_by_hash(db_file, "stories")

        assert outcome["status"] == "ok", f"Expected ok, got: {outcome}"
        assert outcome["rows_before"] == 5
        assert outcome["rows_after"] == 3, f"Expected 3 unique rows, got {outcome['rows_after']}"
        assert outcome["dedup_removed"] == 2
        assert outcome["hash_column"] == "row_hash"
        assert outcome["duration_ms"] >= 0

        # Re-open the file to check the hash column was created and filled in.
        db = duckdb.connect(db_file)
        distinct_hashes = [
            row[0]
            for row in db.execute("SELECT DISTINCT row_hash FROM stories").fetchall()
        ]
        db.close()
        assert len(distinct_hashes) == 3, f"Expected 3 distinct hashes, got {len(distinct_hashes)}"
        # No surviving row may be left without a hash.
        assert None not in distinct_hashes, "Some row_hash values are NULL"
|
||||
|
||||
|
||||
def test_dedup_idempotente():
    """A second dedup pass over an already deduplicated table removes nothing."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "test.duckdb")
        _make_test_db(db_file)

        # Two consecutive passes over the same file, in order.
        first, second = [
            dedup_duckdb_table_by_hash(db_file, "stories") for _ in range(2)
        ]

        assert first["status"] == "ok"
        assert second["status"] == "ok"
        for key, expected in (
            ("rows_before", 3),
            ("rows_after", 3),
            ("dedup_removed", 0),
        ):
            assert second[key] == expected
|
||||
|
||||
|
||||
def test_dedup_tabla_inexistente():
    """A missing table yields status=error and the table name in the message."""
    with tempfile.TemporaryDirectory() as workdir:
        db_file = os.path.join(workdir, "empty.duckdb")
        # Connecting and closing right away leaves a database file with no tables.
        duckdb.connect(db_file).close()

        outcome = dedup_duckdb_table_by_hash(db_file, "nonexistent_table")

        assert outcome["status"] == "error"
        assert "nonexistent_table" in outcome["error"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly, in verbose mode.
    pytest.main(args=[__file__, "-v"])
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: regenerate_app_icons
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def regenerate_app_icons(only: list[str] | None = None) -> dict"
|
||||
description: "Escanea todas las apps C++ del registry, lee el bloque `icon: {phosphor, accent}` de cada app.md y regenera el appicon.ico via generate_app_icon. Reemplaza el script ad-hoc dev/gen_app_icons.py."
|
||||
tags: [cpp-windows, icon, phosphor, batch]
|
||||
uses_functions: [generate_app_icon_py_infra]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [os, sys, pathlib, typing, yaml]
|
||||
params:
|
||||
- name: only
|
||||
desc: "Lista opcional de nombres de app (campo `name` del frontmatter) a procesar. Si None, regenera todas las apps C++ con icon: declarado."
|
||||
output: "dict {ok: [name], skipped: [{name, reason}], failed: [{name, error}]}"
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/pipelines/regenerate_app_icons.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```bash
|
||||
# Regenerar todas las apps C++ con icon: declarado
|
||||
./fn run regenerate_app_icons
|
||||
|
||||
# Solo una app
|
||||
./fn run regenerate_app_icons chart_demo
|
||||
|
||||
# Varias apps
|
||||
./fn run regenerate_app_icons chart_demo registry_dashboard
|
||||
```
|
||||
|
||||
```python
|
||||
import sys
|
||||
sys.path.insert(0, "python/functions")
|
||||
from pipelines.regenerate_app_icons import regenerate_app_icons
|
||||
|
||||
result = regenerate_app_icons()
|
||||
print(f"OK: {len(result['ok'])}, FAIL: {len(result['failed'])}")
|
||||
```
|
||||
|
||||
Bloque `icon:` esperado en `app.md`:
|
||||
```yaml
|
||||
icon:
|
||||
phosphor: "chart-bar"
|
||||
accent: "#0ea5e9"
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando anades una app C++ nueva (anades `icon:` a su `app.md` y corres el pipeline), cambias el color/glyph de una app existente, o pulleas cambios de iconos desde otra rama. Antes de `redeploy_cpp_app_windows` para que el `.exe` lleve el icono actualizado.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Sobreescribe `appicon.ico` sin warning** — igual que `generate_app_icon`. Hacer backup si necesitas preservar version anterior.
|
||||
- **Requiere `sources/phosphor-core/`**: clonar con `git clone --depth=1 https://github.com/phosphor-icons/core.git sources/phosphor-core` si no existe.
|
||||
- **Solo procesa apps con `lang: cpp`** en frontmatter — apps Go/Python se ignoran aunque tengan `icon:`.
|
||||
- **Apps sin `icon:` se reportan en `skipped`**, no son error. Util para detectar apps C++ a las que falta declarar el icono.
|
||||
- **No invalida el cache de iconos de Windows** — si Explorer no muestra el icono nuevo tras redeploy: `ie4uinit.exe -show` o reiniciar Explorer.
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Regenera el appicon.ico de todas las apps C++ que declaren bloque icon: en su app.md."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from infra.generate_app_icon import generate_app_icon
|
||||
|
||||
|
||||
def _find_registry_root() -> Path:
    """Locate the registry root directory.

    A non-empty ``FN_REGISTRY_ROOT`` environment variable wins and is returned
    resolved. Otherwise the parents of this file are walked upwards and the
    first one containing ``registry.db`` is returned.

    Raises:
        FileNotFoundError: when the variable is unset/empty and no parent
            directory holds ``registry.db``.
    """
    configured = os.environ.get("FN_REGISTRY_ROOT")
    if not configured:
        here = Path(__file__).resolve()
        found = next(
            (folder for folder in here.parents if (folder / "registry.db").exists()),
            None,
        )
        if found is None:
            raise FileNotFoundError("registry.db no encontrado; define FN_REGISTRY_ROOT")
        return found
    return Path(configured).resolve()
|
||||
|
||||
|
||||
def _read_frontmatter(md_path: Path) -> Optional[dict]:
    """Parse the leading YAML frontmatter of a markdown file.

    Args:
        md_path: Markdown file to read (decoded as UTF-8).

    Returns:
        The frontmatter as a dict, or None when the file does not start with
        ``---``, has no closing ``---`` line, holds invalid YAML, or the YAML
        document is not a mapping.
    """
    text = md_path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return None
    end = text.find("\n---", 3)
    if end < 0:
        return None
    try:
        data = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return None
    # safe_load returns whatever the document is (list, scalar, None, ...).
    # Callers use .get() on the result, so anything but a mapping is treated
    # as "no frontmatter" instead of crashing the whole scan.
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def _iter_cpp_app_mds(root: Path):
    """Yield ``(app_md_path, frontmatter)`` for every app whose frontmatter says ``lang: cpp``.

    Scans ``apps/*/app.md`` first and ``projects/*/apps/*/app.md`` second,
    each in sorted path order. Files without readable frontmatter are skipped.
    """
    patterns = ("apps/*/app.md", "projects/*/apps/*/app.md")
    for glob_expr in patterns:
        md_files = list(root.glob(glob_expr))
        md_files.sort()
        for md_file in md_files:
            front = _read_frontmatter(md_file)
            if front and front.get("lang") == "cpp":
                yield md_file, front
|
||||
|
||||
|
||||
def regenerate_app_icons(only: Optional[list[str]] = None) -> dict:
    """Regenerate ``appicon.ico`` for the C++ apps that declare an ``icon:`` block.

    Args:
        only: Optional list of app names (frontmatter ``name``, falling back to
            the app directory name). When None or empty, every C++ app found
            is considered.

    Returns:
        dict with keys ``ok`` (names regenerated), ``skipped`` (dicts with
        ``name`` and ``reason``) and ``failed`` (dicts with ``name`` and
        ``error``).
    """
    registry_root = _find_registry_root()
    done: list = []
    skipped: list = []
    failed: list = []

    for app_md, front in _iter_cpp_app_mds(registry_root):
        app_name = front.get("name", app_md.parent.name)
        if only and app_name not in only:
            continue

        icon_cfg = front.get("icon")
        if not (icon_cfg and isinstance(icon_cfg, dict)):
            skipped.append({"name": app_name, "reason": "no icon: block"})
            continue

        glyph, color = icon_cfg.get("phosphor"), icon_cfg.get("accent")
        if not (glyph and color):
            skipped.append({"name": app_name, "reason": "icon: missing phosphor/accent"})
            continue

        # The icon is written next to the app.md it was declared in.
        target = app_md.parent / "appicon.ico"
        try:
            generate_app_icon(
                phosphor_icon_name=glyph,
                accent_hex=color,
                out_ico_path=str(target),
            )
        except Exception as exc:
            # One failing app must not stop the batch; record it and move on.
            failed.append({"name": app_name, "error": str(exc)})
        else:
            done.append(app_name)

    return {"ok": done, "skipped": skipped, "failed": failed}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: positional arguments are app names; with none, every app is processed.
    cli_names = sys.argv[1:]
    report = regenerate_app_icons(only=cli_names if cli_names else None)

    lines = [f"OK {app}" for app in report["ok"]]
    lines += [f"SKIP {item['name']}: {item['reason']}" for item in report["skipped"]]
    lines += [f"FAIL {item['name']}: {item['error']}" for item in report["failed"]]
    for line in lines:
        print(line)

    # Exit status 1 when at least one app failed, 0 otherwise.
    sys.exit(int(bool(report["failed"])))
|
||||
Reference in New Issue
Block a user