diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index 6302642f..f1505d22 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -34,6 +34,7 @@ from .theils_u import theils_u from .correlation_ratio import correlation_ratio from .mutual_info_columns import mutual_info_columns from .infer_fk_containment_duckdb import infer_fk_containment_duckdb +from .detect_declared_keys_duckdb import detect_declared_keys_duckdb from .build_join_graph import build_join_graph from .association_matrix import association_matrix from .correlation_matrix_duckdb import correlation_matrix_duckdb @@ -69,8 +70,10 @@ from .build_eda_render_ctx import build_eda_render_ctx from .profile_datetime import profile_datetime from .resample_timeseries import resample_timeseries from .add_pdf_internal_links import add_pdf_internal_links +from .suggest_intratable_fk_candidates import suggest_intratable_fk_candidates __all__ = [ + "suggest_intratable_fk_candidates", "detect_time_column", "extract_timeseries_raw", "build_eda_render_ctx", @@ -97,6 +100,7 @@ __all__ = [ "correlation_ratio", "mutual_info_columns", "infer_fk_containment_duckdb", + "detect_declared_keys_duckdb", "build_join_graph", "association_matrix", "correlation_matrix_duckdb", diff --git a/python/functions/datascience/automatic_eda/chapters/relaciones.py b/python/functions/datascience/automatic_eda/chapters/relaciones.py new file mode 100644 index 00000000..eba05f76 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/relaciones.py @@ -0,0 +1,500 @@ +"""Key-relations chapter (RELACIONES) — the keys / join structure of the data. + +This chapter is the *relational* section of an AutomaticEDA report. 
It answers a +single question for the table (or the whole DuckDB source it lives in): **how do +the keys relate?** It composes, without reimplementing them, the registry's +relation primitives and degrades honestly when a layer does not apply. + +It renders, in order, only the layers that have something to say: + +1. **Declared keys** (real schema constraints) — when the DuckDB source declares + PRIMARY KEY / FOREIGN KEY / UNIQUE constraints, they are read verbatim via + ``detect_declared_keys_duckdb`` and shown as ground truth: which column is the + PK, which columns are FKs and the table/column they point to. +2. **Primary-key candidates** — the ``key_candidates`` the TableProfile already + carries (columns whose cardinality equals the row count, with no nulls). These + are *candidates*: a column that could serve as the row identifier. +3. **Foreign-key candidates** (inferred, never declared): + - **Inter-table** (the DuckDB source has several tables; shown whether or not + the schema also declares FKs): real FK candidates by + name signal + value containment via ``infer_fk_containment_duckdb``, plus the + join graph (roles + a pasteable Mermaid diagram) via ``build_join_graph``. + - **Intra-table** (only when no FK is declared for the table and no inter-table + candidate was found): columns that *look* like a foreign key by a + name+cardinality heuristic (``suggest_intratable_fk_candidates``). This is a + **suggestion**, explicitly flagged as a heuristic, never an assertion. + +``build_relaciones(profile, ctx) -> Chapter | None``: returns ``None`` when there +is nothing to say (no declared key, no key candidates, and no FK candidate — +inter- or intra-table). Reads everything defensively (``.get``) and never raises: +anything missing degrades to a note or is omitted; a failing registry call drops +its layer instead of aborting the chapter. + +ctx keys this chapter consumes (all optional): + db_path, table : str — the DuckDB file and table being profiled (set by + ``build_eda_render_ctx``). 
``db_path`` is needed to read declared + constraints, to list the sibling tables, and to run the containment-based + FK inference. Without it, only the profile-derived layers (PK candidates, + intra-table FK heuristic) are available. + glossary : model.GlossaryCollector — shared glossary; the chapter registers + the relational terms (PK, FK, containment, cardinality) and marks their + first appearance clickable. + +Contract: build_relaciones(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +# Pure/impure registry functions (group ``eda``) this chapter composes. Imported +# defensively (module-leaf imports, like the AGREGACION chapter) so the chapter +# still builds — degrading the affected layer to nothing — if a function is +# somehow unavailable / not indexed yet. +try: + from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb +except Exception: # noqa: BLE001 — keep the chapter importable no matter what. 
+ detect_declared_keys_duckdb = None # type: ignore[assignment] +try: + from datascience.infer_fk_containment_duckdb import infer_fk_containment_duckdb +except Exception: # noqa: BLE001 + infer_fk_containment_duckdb = None # type: ignore[assignment] +try: + from datascience.build_join_graph import build_join_graph +except Exception: # noqa: BLE001 + build_join_graph = None # type: ignore[assignment] +try: + from datascience.suggest_intratable_fk_candidates import ( + suggest_intratable_fk_candidates, + ) +except Exception: # noqa: BLE001 + suggest_intratable_fk_candidates = None # type: ignore[assignment] +try: + from infra import duckdb_list_tables +except Exception: # noqa: BLE001 + duckdb_list_tables = None # type: ignore[assignment] + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "relaciones" +CHAPTER_TITLE = "Relaciones de clave" + +# Cap the inter-table FK table so a wide schema does not blow up the page; the +# rest is summarized in a closing note (no silent truncation). +MAX_FK_ROWS = 40 + +# --------------------------------------------------------------------------- # +# Glossary terms this chapter explains. Registered in the shared collector and +# marked clickable on their first appearance (contract §11.1). +# --------------------------------------------------------------------------- # +_TERMS = { + "pk": ( + "Clave primaria (PK)", + "Columna (o conjunto de columnas) que identifica de forma única cada fila " + "de una tabla: sus valores no se repiten y no son nulos. Una tabla tiene " + "como mucho una clave primaria; es el ancla por la que otras tablas la " + "referencian.", + ), + "fk": ( + "Clave foránea (FK)", + "Columna de una tabla cuyos valores apuntan a la clave primaria de otra " + "tabla (o de la misma), creando una relación entre ambas. 
Una FK suele ser " + "N:1: muchas filas de la tabla origen comparten el mismo valor de la tabla " + "destino.", + ), + "containment": ( + "Containment / inclusión", + "Señal con la que se infiere una clave foránea sin que la base la declare: " + "la fracción de valores distintos de una columna A que también aparecen " + "como valores de otra columna B. Si casi todos los valores de A están " + "contenidos en B (inclusión ≈ 1) y B parece una clave, A → B es una FK " + "candidata.", + ), + "cardinalidad": ( + "Cardinalidad", + "Número de valores distintos de una columna. Cardinalidad igual al número " + "de filas (y sin nulos) señala un identificador (candidato a clave " + "primaria); cardinalidad alta pero menor que el número de filas, con " + "valores repetidos, es típica de una clave foránea.", + ), +} + + +def _register_terms(ctx: dict) -> bool: + """Register the relational terms in the shared glossary. Returns whether the + in-text appearances should be marked clickable.""" + glossary = ctx.get("glossary") + if not isinstance(glossary, model.GlossaryCollector): + return False + for key, (label, definition) in _TERMS.items(): + glossary.add(key, label, definition) + return True + + +# --------------------------------------------------------------------------- # +# Formatting helpers (mirror the other chapters' defensive style). +# --------------------------------------------------------------------------- # +def _fmt_int(value) -> str: + if value is None: + return "—" + try: + return f"{int(value):,}".replace(",", ".") + except (TypeError, ValueError): + return model._safe_str(value) + + +def _fmt_pct_fraction(value, decimals: int = 1) -> str: + """Format a 0–1 fraction as a percentage. 
None -> placeholder.""" + if value is None: + return "—" + try: + v = float(value) + except (TypeError, ValueError): + return model._safe_str(value) + if v <= 1.0: + v *= 100.0 + return f"{v:.{decimals}f}%" + + +def _fmt_ratio(value, decimals: int = 3) -> str: + """Format an already-0–1 ratio (inclusion) as a plain number.""" + if value is None: + return "—" + try: + return f"{float(value):.{decimals}f}".rstrip("0").rstrip(".") + except (TypeError, ValueError): + return model._safe_str(value) + + +def _is_dict(v) -> bool: + return isinstance(v, dict) + + +def _columns_by_name(profile: dict) -> dict: + """Index the profile columns by name for quick metric lookup.""" + out = {} + for col in (profile.get("columns") or []): + if _is_dict(col) and col.get("name") is not None: + out[col.get("name")] = col + return out + + +# --------------------------------------------------------------------------- # +# Layer 1 — declared keys (real schema constraints). +# --------------------------------------------------------------------------- # +def _declared_keys(db_path: str, table: str): + """Read declared PK/FK/UNIQUE for the source, or None if unavailable.""" + if not db_path or detect_declared_keys_duckdb is None: + return None + try: + out = detect_declared_keys_duckdb(db_path, table) + except Exception: # noqa: BLE001 — dict-no-throw: treat as unavailable. 
+ return None + if not _is_dict(out) or out.get("status") != "ok": + return None + return out + + +def _declared_section(declared: dict) -> list: + """Blocks for the declared-keys layer, or [] if there is nothing declared.""" + pks = [p for p in (declared.get("primary_keys") or []) if _is_dict(p)] + fks = [f for f in (declared.get("foreign_keys") or []) if _is_dict(f)] + uqs = [u for u in (declared.get("unique") or []) if _is_dict(u)] + if not (pks or fks or uqs): + return [] + + blocks = [ + model.Heading(text="Claves declaradas en el esquema", level=2), + model.Markdown(text=( + "La base **declara** estas relaciones de clave como restricciones " + "reales del esquema (constraints). Son la verdad de referencia: no se " + "infieren, se leen tal cual de la definición de las tablas.")), + ] + + if pks: + rows = [[model._safe_str(p.get("table")), + ", ".join(model._safe_str(c) for c in (p.get("columns") or []))] + for p in pks] + blocks.append(model.DataTable( + header=["Tabla", "Columna(s) PK"], rows=rows, + title="Claves primarias declaradas", + note="Cada fila: la clave primaria declarada de una tabla.")) + + if fks: + rows = [] + for f in fks: + src = ", ".join(model._safe_str(c) for c in (f.get("columns") or [])) + dst = ", ".join( + model._safe_str(c) for c in (f.get("referenced_columns") or [])) + rows.append([ + model._safe_str(f.get("table")), src, + model._safe_str(f.get("referenced_table")), dst]) + blocks.append(model.DataTable( + header=["Tabla origen", "Columna(s) FK", "→ Tabla destino", + "Columna(s) destino"], + rows=rows, title="Claves foráneas declaradas", + note="Cada fila: una FK declarada — origen → destino.")) + + if uqs: + rows = [[model._safe_str(u.get("table")), + ", ".join(model._safe_str(c) for c in (u.get("columns") or []))] + for u in uqs] + blocks.append(model.DataTable( + header=["Tabla", "Columna(s) UNIQUE"], rows=rows, + title="Restricciones UNIQUE declaradas")) + + return blocks + + +# 
--------------------------------------------------------------------------- # +# Layer 2 — primary-key candidates (from the profile). +# --------------------------------------------------------------------------- # +def _pk_candidates_section(profile: dict, mark: bool) -> list: + """Blocks for the PK-candidates layer, or [] if there are none.""" + keys = [k for k in (profile.get("key_candidates") or []) if k is not None] + if not keys: + return [] + by_name = _columns_by_name(profile) + + pk = ("[[term:pk]]**clave primaria**[[/term]]" if mark + else "**clave primaria**") + intro = ( + f"Estas columnas son **candidatas a {pk}**: su " + "[[term:cardinalidad]]cardinalidad[[/term]] iguala al número de filas y no " + "tienen nulos, así que cada valor identifica una fila distinta. Son " + "candidatas, no una clave declarada: la base no las marca como tal." + if mark else + "Estas columnas son **candidatas a clave primaria**: su cardinalidad " + "iguala al número de filas y no tienen nulos, así que cada valor " + "identifica una fila distinta.") + + rows = [] + for name in keys: + col = by_name.get(name) or {} + rows.append([ + model._safe_str(name), + _fmt_int(col.get("distinct_count")), + _fmt_pct_fraction(col.get("unique_pct")), + model._safe_str(col.get("inferred_type") or col.get("physical_type") or "—"), + ]) + return [ + model.Heading(text="Candidatos a clave primaria", level=2), + model.Markdown(text=intro), + model.DataTable( + header=["Columna", "Valores distintos", "% único", "Tipo"], + rows=rows, title="Candidatas a clave primaria", + note=f"{_fmt_int(profile.get('n_rows'))} filas en total como referencia."), + ] + + +# --------------------------------------------------------------------------- # +# Layer 3a — inter-table FK candidates (containment) + join graph. 
+# --------------------------------------------------------------------------- # +def _list_source_tables(db_path: str) -> list: + """List the tables in the DuckDB source, or [] if it can't be listed.""" + if not db_path or duckdb_list_tables is None: + return [] + try: + out = duckdb_list_tables(db_path) + except Exception: # noqa: BLE001 + return [] + if not _is_dict(out) or out.get("status") != "ok": + return [] + return [t for t in (out.get("tables") or []) if isinstance(t, str)] + + +def _inter_table_section(db_path: str, tables: list, mark: bool) -> list: + """Blocks for the inter-table FK layer (containment + join graph), or [].""" + if infer_fk_containment_duckdb is None or len(tables) < 2: + return [] + try: + fk = infer_fk_containment_duckdb(db_path, tables=tables) + except Exception: # noqa: BLE001 + return [] + if not _is_dict(fk) or fk.get("status") != "ok": + return [] + candidates = [c for c in (fk.get("fk_candidates") or []) if _is_dict(c)] + if not candidates: + return [] + + containment = ("[[term:containment]]containment (inclusión de valores)[[/term]]" + if mark else "containment (inclusión de valores)") + fk_term = "[[term:fk]]**claves foráneas**[[/term]]" if mark else "**claves foráneas**" + blocks = [ + model.Heading(text="Claves foráneas candidatas (inter-tabla)", level=2), + model.Markdown(text=( + f"La fuente tiene varias tablas. Estas {fk_term} candidatas se infieren " + f"por señal de nombre y por {containment}: una columna de una tabla cuyos " + "valores están contenidos en la clave de otra. 
No están declaradas por " + "la base; son la relación más probable según los datos.")), + ] + + shown = candidates[:MAX_FK_ROWS] + rows = [] + for c in shown: + rows.append([ + f"{model._safe_str(c.get('from_table'))}.{model._safe_str(c.get('from_col'))}", + f"{model._safe_str(c.get('to_table'))}.{model._safe_str(c.get('to_col'))}", + _fmt_ratio(c.get("inclusion")), + model._safe_str(c.get("cardinality") or "—"), + "sí" if c.get("name_match") else "no", + ]) + note = "Ordenadas por señal de nombre e inclusión." + if len(candidates) > len(shown): + note += f" Se muestran {len(shown)} de {len(candidates)} candidatas." + blocks.append(model.DataTable( + header=["Origen", "→ Destino", "Inclusión", "Cardinalidad", "Coincide nombre"], + rows=rows, title="FK candidatas por containment", note=note)) + + # Join graph: node roles + a pasteable Mermaid diagram, kept together. + if build_join_graph is not None: + try: + graph = build_join_graph(candidates, tables=tables) + except Exception: # noqa: BLE001 + graph = None + if _is_dict(graph): + graph_blocks = [model.Heading(text="Grafo de relaciones", level=3)] + nodes = [n for n in (graph.get("nodes") or []) if _is_dict(n)] + if nodes: + node_rows = [[ + model._safe_str(n.get("table")), + model._safe_str(n.get("role") or "—"), + _fmt_int(n.get("out_degree")), + _fmt_int(n.get("in_degree")), + ] for n in nodes] + graph_blocks.append(model.DataTable( + header=["Tabla", "Rol", "FK salientes", "FK entrantes"], + rows=node_rows, title="Tablas y su rol en el grafo", + note="Rol: fact (apunta a otras), dimension (referenciada), " + "bridge (ambas), standalone (aislada).")) + hubs = [h for h in (graph.get("hubs") or []) if h] + if hubs: + graph_blocks.append(model.Markdown(text=( + "Tablas con más relaciones salientes (candidatas a tabla de " + "hechos): " + ", ".join(model._safe_str(h) for h in hubs) + "."))) + mermaid = model._safe_str(graph.get("mermaid")).strip() + if mermaid: + graph_blocks.append(model.Markdown(text=( + 
"Diagrama de las relaciones (pegable en un bloque Mermaid):"))) + graph_blocks.append(model.Markdown( + text="```mermaid\n" + mermaid + "\n```")) + if len(graph_blocks) > 1: + blocks.append(model.Group(blocks=graph_blocks, + title="Grafo de relaciones")) + + skipped = [s for s in (fk.get("skipped") or []) if s] + if skipped: + blocks.append(model.Note( + "Algunos pares se omitieron por tamaño: " + + "; ".join(model._safe_str(s) for s in skipped) + ".")) + return blocks + + +# --------------------------------------------------------------------------- # +# Layer 3b — intra-table FK candidates (name+cardinality heuristic). +# --------------------------------------------------------------------------- # +def _intra_table_section(profile: dict, mark: bool) -> list: + """Blocks for the intra-table FK heuristic layer, or [] if no candidates.""" + if suggest_intratable_fk_candidates is None: + return [] + try: + cands = suggest_intratable_fk_candidates(profile) + except Exception: # noqa: BLE001 + return [] + cands = [c for c in (cands or []) if _is_dict(c)] + if not cands: + return [] + + fk_term = "[[term:fk]]**claves foráneas**[[/term]]" if mark else "**claves foráneas**" + blocks = [ + model.Heading(text="Posibles claves foráneas (heurística de nombre)", level=2), + model.Markdown(text=( + f"No hay otras tablas que referenciar, pero algunas columnas **parecen** " + f"{fk_term} por su nombre (terminan en «id») y su cardinalidad (muchos " + "valores repetidos, N:1). 
Es una **sugerencia heurística**, no una " + "afirmación: el nombre de la tabla destino es una conjetura y no se " + "comprueba inclusión de valores contra ninguna tabla real.")), + ] + rows = [] + for c in cands: + rows.append([ + model._safe_str(c.get("column")), + model._safe_str(c.get("ref_table_guess") or "—"), + _fmt_int(c.get("distinct_count")), + _fmt_pct_fraction(c.get("unique_pct")), + model._safe_str(c.get("inferred_type") or c.get("physical_type") or "—"), + model._safe_str(c.get("reason") or ""), + ]) + blocks.append(model.DataTable( + header=["Columna", "Posible tabla", "Valores distintos", "% único", + "Tipo", "Motivo"], + rows=rows, title="Posibles FK por nombre y cardinalidad", + note="Heurística: posibles falsos positivos/negativos. No confirma containment.")) + blocks.append(model.Note( + "Estas sugerencias se basan solo en el nombre y la cardinalidad. Para " + "confirmarlas haría falta la tabla destino y comprobar la inclusión de " + "valores (containment).")) + return blocks + + +# --------------------------------------------------------------------------- # +# Entry point. +# --------------------------------------------------------------------------- # +def _intro_blocks(mark: bool) -> list: + pk = "[[term:pk]]clave primaria[[/term]]" if mark else "clave primaria" + fk = "[[term:fk]]clave foránea[[/term]]" if mark else "clave foránea" + text = ( + f"Este capítulo analiza las **relaciones de clave** de la tabla: qué columna " + f"identifica cada fila (la {pk}) y qué columnas referencian a otra tabla (las " + f"{fk}). 
Cuando la base las **declara** como restricciones del esquema, se " + "muestran tal cual; cuando no, se proponen las más probables a partir de los " + "datos —por inclusión de valores entre tablas (containment) o, en una sola " + "tabla, por una heurística de nombre y cardinalidad— siempre marcadas como " + "candidatas, nunca como hechos.") + return [model.Heading(text=CHAPTER_TITLE, level=1), model.Markdown(text=text)] + + +def build_relaciones(profile: dict, ctx: dict): + """Build the RELACIONES Chapter, or None if there is nothing to say. + + Args: + profile: the ``eda`` group TableProfile dict (may be None/empty). + ctx: presentation context. Consumes ``db_path`` + ``table`` (to read + declared constraints, list sibling tables and run the containment FK + inference) and ``glossary`` (to register the relational terms). + + Returns: + A ``model.Chapter`` with the applicable relation layers; or ``None`` when + the dataset has no declared key, no key candidates and no FK candidate + (neither inter- nor intra-table). + """ + if not isinstance(profile, dict): + profile = {} + ctx = ctx if isinstance(ctx, dict) else {} + db_path = ctx.get("db_path") + table = ctx.get("table") + + mark = _register_terms(ctx) + + # Build each layer; the chapter is the concatenation of the non-empty ones. + declared = _declared_keys(db_path, table) + declared_blocks = _declared_section(declared) if declared else [] + declared_has_fk = bool(declared and declared.get("foreign_keys")) + + pk_blocks = _pk_candidates_section(profile, mark) + + tables = _list_source_tables(db_path) + inter_blocks = _inter_table_section(db_path, tables, mark) + + # The intra-table heuristic only makes sense when no real FK is available for + # this table — neither declared nor inferred inter-table. Otherwise the real + # relations already answer the question and the heuristic is just noise. 
+ if declared_has_fk or inter_blocks: + intra_blocks = [] + else: + intra_blocks = _intra_table_section(profile, mark) + + body = declared_blocks + pk_blocks + inter_blocks + intra_blocks + if not body: + return None # chapter does not apply: nothing to say about relations. + + blocks = _intro_blocks(mark) + body + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/relaciones_test.py b/python/functions/datascience/automatic_eda/chapters/relaciones_test.py new file mode 100644 index 00000000..5ccf620c --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/relaciones_test.py @@ -0,0 +1,273 @@ +"""Tests for the RELACIONES chapter — DoD: golden(s) + edges + no-cut render. + +Two goldens covering the two real paths of the chapter: + +- **Intra-table** (a single table, no db source for relations): the chapter shows + the primary-key candidates from the profile and the heuristic foreign-key + suggestions (name + cardinality), explicitly flagged as a heuristic. Renders to + PDF and PPTX with nothing cut. +- **Inter-table** (a real DuckDB file with two related tables, customers/orders, + with a declared FK): the chapter shows the declared keys, the containment-based + FK candidates and the join graph (roles + a pasteable Mermaid diagram). + +Edges: a profile with no key candidate and no FK-looking column returns None; +``None`` / ``{}`` profiles do not raise. The chapter registers its glossary terms. + +Layers that depend on the sibling registry functions delegated alongside this +chapter (``detect_declared_keys_duckdb``, ``suggest_intratable_fk_candidates``) +are asserted **conditionally on the function being importable**, so the chapter's +honest-degradation contract is what is tested, never a hard dependency on import +timing. 
+""" + +import os +import tempfile + +import duckdb +from pptx import Presentation +from pypdf import PdfReader + +from datascience.automatic_eda.chapters.relaciones import build_relaciones +from datascience.automatic_eda.model import Chapter, Group, GlossaryCollector +from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf +from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx + +# The optional sibling functions: their layers are asserted only when present. +try: + from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb +except Exception: # noqa: BLE001 + detect_declared_keys_duckdb = None +try: + from datascience.suggest_intratable_fk_candidates import ( + suggest_intratable_fk_candidates, + ) +except Exception: # noqa: BLE001 + suggest_intratable_fk_candidates = None + + +# --------------------------------------------------------------------------- # +# Helpers. +# --------------------------------------------------------------------------- # +def _flatten(blocks) -> list: + """Flatten Group blocks so a test can inspect every leaf block.""" + out = [] + for b in blocks: + if isinstance(b, Group): + out.extend(_flatten(b.blocks)) + else: + out.append(b) + return out + + +def _text_of(chapter: Chapter) -> str: + """Collect all visible text of a chapter's blocks into one string.""" + parts = [] + for b in _flatten(chapter.blocks): + for attr in ("text", "title", "note"): + v = getattr(b, attr, None) + if isinstance(v, str): + parts.append(v) + header = getattr(b, "header", None) + if isinstance(header, list): + parts.extend(str(c) for c in header) + rows = getattr(b, "rows", None) + if isinstance(rows, list): + for r in rows: + if isinstance(r, (list, tuple)): + parts.extend(str(c) for c in r) + else: + parts.append(str(r)) + return "\n".join(parts) + + +def _render_both(chapter: Chapter, tag: str): + """Render the chapter to PDF and PPTX; return (pdf_text, n_slides).""" + tmp = 
tempfile.mkdtemp(prefix=f"relaciones_{tag}_") + pdf_path = os.path.join(tmp, "out.pdf") + pptx_path = os.path.join(tmp, "out.pptx") + meta = {"title": f"EDA — {tag}"} + render_automatic_eda_pdf([chapter], pdf_path, meta) + render_automatic_eda_pptx([chapter], pptx_path, meta) + assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0 + assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0 + text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages) + n_slides = len(Presentation(pptx_path).slides) + return text, n_slides + + +# --------------------------------------------------------------------------- # +# Fixtures. +# --------------------------------------------------------------------------- # +def _titanic_profile() -> dict: + """A single-table profile: a PK candidate + a column that looks like a FK.""" + return { + "table": "titanic", + "source": "/data/titanic.csv", + "n_rows": 891, + "n_cols": 4, + "key_candidates": ["PassengerId"], + "columns": [ + {"name": "PassengerId", "inferred_type": "numeric", + "physical_type": "BIGINT", "distinct_count": 891, + "unique_pct": 1.0, "flags": ["possible_id"]}, + {"name": "ticket_id", "inferred_type": "numeric", + "physical_type": "BIGINT", "distinct_count": 681, + "unique_pct": 0.76, "flags": []}, + {"name": "fare", "inferred_type": "numeric", + "physical_type": "DOUBLE", "distinct_count": 248, + "unique_pct": 0.28, "flags": []}, + {"name": "sex", "inferred_type": "categorical", + "physical_type": "VARCHAR", "distinct_count": 2, + "unique_pct": 0.002, "flags": []}, + ], + } + + +def _make_relational_db(path: str) -> None: + """Create a small DuckDB with customers(id) <- orders(customer_id), real FK.""" + con = duckdb.connect(path) + con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)") + con.execute( + "CREATE TABLE orders(id INTEGER PRIMARY KEY, " + "customer_id INTEGER REFERENCES customers(id), amount DOUBLE)") + con.execute("INSERT INTO customers VALUES " + 
"(1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e')") + con.execute("INSERT INTO orders VALUES " + "(1,1,10.0),(2,1,20.0),(3,2,30.0),(4,3,40.0)," + "(5,3,50.0),(6,4,60.0),(7,5,70.0),(8,2,80.0)") + con.close() + + +def _orders_profile() -> dict: + """A profile for the `orders` table of the relational DB.""" + return { + "table": "orders", + "source": "orders", + "n_rows": 8, + "n_cols": 3, + "key_candidates": ["id"], + "columns": [ + {"name": "id", "inferred_type": "numeric", "physical_type": "INTEGER", + "distinct_count": 8, "unique_pct": 1.0, "flags": ["possible_id"]}, + {"name": "customer_id", "inferred_type": "numeric", + "physical_type": "INTEGER", "distinct_count": 5, "unique_pct": 0.625, + "flags": []}, + {"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE", + "distinct_count": 8, "unique_pct": 1.0, "flags": []}, + ], + } + + +# --------------------------------------------------------------------------- # +# Golden 1 — intra-table. +# --------------------------------------------------------------------------- # +def test_golden_intra_table_pk_and_fk_heuristic(): + """Single table: PK candidate shown; FK heuristic shown (if fn available); + renders to PDF + PPTX with nothing cut.""" + prof = _titanic_profile() + glossary = GlossaryCollector() + # No db_path: only the profile-derived layers apply (no declared, no inter). + chapter = build_relaciones(prof, {"glossary": glossary}) + + assert isinstance(chapter, Chapter) + assert chapter.id == "relaciones" + text = _text_of(chapter) + + # PK candidate is always present (comes from the profile). + assert "Candidatos a clave primaria" in text + assert "PassengerId" in text + + # Glossary terms got registered. + for key in ("pk", "fk", "cardinalidad"): + assert glossary.has(key) + + # FK heuristic layer: present iff the delegated function is importable. 
+ if suggest_intratable_fk_candidates is not None: + assert "Posibles claves foráneas" in text + assert "ticket_id" in text + # The float measure and the PK itself are NOT suggested as FKs. + assert "Posibles FK por nombre" in text + + pdf_text, n_slides = _render_both(chapter, "intra") + assert "PassengerId" in pdf_text + assert n_slides >= 1 + + +# --------------------------------------------------------------------------- # +# Golden 2 — inter-table (real DuckDB). +# --------------------------------------------------------------------------- # +def test_golden_inter_table_containment_and_join_graph(): + """Two related tables: declared FK (if fn available) + containment FK + candidate + Mermaid join graph.""" + tmp = tempfile.mkdtemp(prefix="relaciones_db_") + db_path = os.path.join(tmp, "shop.duckdb") + _make_relational_db(db_path) + + prof = _orders_profile() + glossary = GlossaryCollector() + chapter = build_relaciones( + prof, {"db_path": db_path, "table": "orders", "glossary": glossary}) + + assert isinstance(chapter, Chapter) + text = _text_of(chapter) + + # Inter-table containment FK candidate: customer_id -> customers.id. This path + # uses infer_fk_containment_duckdb + build_join_graph, both already in the + # registry, so it must be present. + assert "Claves foráneas candidatas (inter-tabla)" in text + assert "orders.customer_id" in text + assert "customers.id" in text + # Join graph with a pasteable Mermaid diagram. + assert "Grafo de relaciones" in text + assert "mermaid" in text + assert "graph LR" in text + assert "containment" in text.lower() + + # Declared-keys layer: present iff the delegated function is importable. 
+ if detect_declared_keys_duckdb is not None: + assert "Claves declaradas en el esquema" in text + assert "Claves foráneas declaradas" in text + + pdf_text, n_slides = _render_both(chapter, "inter") + assert "customer_id" in pdf_text + assert n_slides >= 1 + + +# --------------------------------------------------------------------------- # +# Edges. +# --------------------------------------------------------------------------- # +def test_none_when_no_relations(): + """No key candidates, no FK-looking columns, no db source -> None.""" + prof = { + "table": "flat", "n_rows": 100, "n_cols": 2, "key_candidates": [], + "columns": [ + {"name": "value", "inferred_type": "numeric", "physical_type": "DOUBLE", + "distinct_count": 50, "unique_pct": 0.5, "flags": []}, + {"name": "label", "inferred_type": "categorical", + "physical_type": "VARCHAR", "distinct_count": 3, "unique_pct": 0.03, + "flags": []}, + ], + } + assert build_relaciones(prof, {}) is None + + +def test_empty_and_none_profile_do_not_raise(): + """None / {} profile and missing ctx degrade to None without raising.""" + assert build_relaciones(None, None) is None + assert build_relaciones({}, {}) is None + assert build_relaciones({}, {"glossary": GlossaryCollector()}) is None + + +def test_pk_candidate_only_builds_chapter(): + """A profile with only a key candidate (no FK anything, no db) still builds: + the relations chapter applies because there is a PK candidate to report.""" + prof = { + "table": "t", "n_rows": 10, "n_cols": 1, "key_candidates": ["row_id"], + "columns": [ + {"name": "row_id", "inferred_type": "numeric", "physical_type": "BIGINT", + "distinct_count": 10, "unique_pct": 1.0, "flags": ["possible_id"]}, + ], + } + chapter = build_relaciones(prof, {}) + assert isinstance(chapter, Chapter) + assert "Candidatos a clave primaria" in _text_of(chapter) diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py index 
d4dc329d..d9030999 100644 --- a/python/functions/datascience/automatic_eda/chapters_registry.py +++ b/python/functions/datascience/automatic_eda/chapters_registry.py @@ -33,6 +33,7 @@ CHAPTER_ORDER = [ "cat_distr", # categorical distributions "calidad", # data quality "correlacion", # correlations / associations + "relaciones", # key relations: declared/candidate PK + FK (inter/intra-table) "modelos", # cheap models (PCA/KMeans/outliers) "timeseries", # time-series analysis "geospatial", # geospatial diff --git a/python/functions/datascience/detect_declared_keys_duckdb.md b/python/functions/datascience/detect_declared_keys_duckdb.md new file mode 100644 index 00000000..32b9351d --- /dev/null +++ b/python/functions/datascience/detect_declared_keys_duckdb.md @@ -0,0 +1,107 @@ +--- +name: detect_declared_keys_duckdb +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict" +description: "Detecta las claves DECLARADAS (constraints reales) de un schema DuckDB leyendo la table function duckdb_constraints(): extrae PRIMARY KEY, FOREIGN KEY y UNIQUE (ignora NOT NULL y CHECK) y las devuelve normalizadas con sus columnas, y para las FK con su tabla y columnas referenciadas. Con table=None procesa todas las tablas; con table='X' filtra a PK/UNIQUE de X y a FK cuyo origen es X (case-sensitive). A diferencia de infer_fk_containment_duckdb (que INFIERE FKs candidatas por containment de valores cuando el schema no las declara), esta funcion devuelve las relaciones de clave REALES del schema. Estilo dict-no-throw: nunca lanza. Parte del grupo eda (relaciones de clave)." +tags: [eda, duckdb, datascience, relations, primary-key, foreign-key, schema, exploratory-data-analysis] +params: + - name: db_path + desc: "Ruta al archivo DuckDB. Debe existir (lectura read-only via duckdb_query_readonly; no se crea). Un path inexistente devuelve {status:'error', ...}." 
+ - name: table + desc: "Si se pasa, filtra los resultados a esa tabla: incluye PRIMARY KEY y UNIQUE cuya tabla sea `table`, y FOREIGN KEY cuya tabla ORIGEN sea `table` (no la referenciada). None (default) devuelve los constraints de todas las tablas. La comparacion es case-sensitive (nombres tal cual los devuelve DuckDB)." +output: "dict dict-no-throw. En exito {status:'ok', primary_keys:[{table:str, columns:[str,...]}, ...], foreign_keys:[{table:str, columns:[str,...], referenced_table:str, referenced_columns:[str,...]}, ...], unique:[{table:str, columns:[str,...]}, ...], tables:[str,...]} donde tables es la lista ordenada de tablas (origen) que poseen al menos un constraint PK/FK/UNIQUE emitido. Solo se emiten constraints de clave: NOT NULL y CHECK se ignoran. En error {status:'error', error:str}." +uses_functions: [duckdb_query_readonly_py_infra] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +tested: true +tests: ["test_golden_detecta_pks_y_fk", "test_golden_ignora_not_null_y_check", "test_edge_filtra_por_tabla_orders", "test_edge_filtra_por_tabla_customers", "test_edge_unique_declarado", "test_edge_sin_constraints_listas_vacias", "test_error_db_inexistente_no_lanza", "test_shape_resultado"] +test_file_path: "python/functions/datascience/detect_declared_keys_duckdb_test.py" +file_path: "python/functions/datascience/detect_declared_keys_duckdb.py" +--- + +## Ejemplo + +```python +import sys, os, duckdb +sys.path.insert(0, os.path.join("python", "functions")) +from datascience import detect_declared_keys_duckdb + +# Base de ejemplo en /tmp: orders.customer_id -> customers.id (FK declarada) +path = "/tmp/declared_keys_demo.duckdb" +if os.path.exists(path): + os.remove(path) +con = duckdb.connect(path) +con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)") +con.execute( + "CREATE TABLE orders(" + " id INTEGER PRIMARY KEY," + " customer_id INTEGER REFERENCES customers(id)," + " amt DOUBLE)" +) 
+con.close() + +res = detect_declared_keys_duckdb(path) +if res["status"] == "ok": + for pk in res["primary_keys"]: + print(f"PK {pk['table']}({', '.join(pk['columns'])})") + for fk in res["foreign_keys"]: + print(f"FK {fk['table']}({', '.join(fk['columns'])}) -> " + f"{fk['referenced_table']}({', '.join(fk['referenced_columns'])})") + # PK customers(id) + # PK orders(id) + # FK orders(customer_id) -> customers(id) +else: + print("error:", res["error"]) + +# Filtrar a una tabla concreta (PK/UNIQUE de orders + FK con origen orders): +solo_orders = detect_declared_keys_duckdb(path, table="orders") +print(solo_orders["tables"]) # ['orders'] +``` + +## Cuando usarla + +- Cuando exploras un esquema DuckDB y quieres mostrar las relaciones de clave REALES (PK/FK/UNIQUE) que el schema ha declarado, sin inferir nada. +- Como paso del capitulo RELACIONES del grupo `eda`: primero mira las claves declaradas con esta funcion; si el schema no declara FKs, complementa con `infer_fk_containment_duckdb` (inferencia por containment). +- Antes de documentar o migrar un esquema, para listar el contrato de integridad referencial que el motor ya conoce. +- Para validar que las constraints que esperas (esa FK que creaste con `REFERENCES`) realmente estan declaradas en la base materializada. + +## Gotchas + +- **Impura**: lee de disco via la primitiva read-only `duckdb_query_readonly` (no crea ni modifica la base). El `db_path` debe existir; un path inexistente devuelve `{status:'error'}` (read_only NO crea la base). +- **Requiere `duckdb_constraints()`**: usa la table function `duckdb_constraints()`, disponible en DuckDB modernos (verificado en 1.5.2). En versiones antiguas sin esa funcion, la query falla y se devuelve `{status:'error'}`. +- **Solo claves DECLARADAS**: devuelve lo que el schema declaro con `PRIMARY KEY` / `FOREIGN KEY (... REFERENCES ...)` / `UNIQUE`. 
Una tabla materializada con `CREATE TABLE AS SELECT` NO lleva constraints — para esos casos no habra claves que mostrar y hay que INFERIRLAS (`infer_fk_containment_duckdb`). +- **NOT NULL y CHECK se ignoran**: `duckdb_constraints()` tambien emite filas `NOT NULL` (DuckDB genera una por cada columna PK) y `CHECK`; esta funcion las descarta y solo conserva PK/FK/UNIQUE. +- **Nombres case-sensitive**: el filtro `table='Orders'` no casa con una tabla `orders`. Se comparan los nombres tal cual los devuelve DuckDB. +- **FK atribuida al origen**: una FOREIGN KEY se atribuye a su tabla ORIGEN (el `table` de la entrada), no a la referenciada. El filtro `table='X'` trae las FK cuyo origen es X, no las que apuntan a X. +- **`tables` = tablas dueñas de constraints emitidos**: la lista `tables` contiene solo las tablas que poseen al menos un PK/FK/UNIQUE en el resultado (su campo `table`), ordenadas. No incluye tablas referenciadas que no tengan constraint propio en la salida. +- **Columnas como listas**: `constraint_column_names` y `referenced_column_names` son columnas LIST de DuckDB; en 1.5.2 llegan como listas Python. La funcion las normaliza a listas de strings con una red de seguridad por si llegaran como string. + +## Notas + +`duckdb_constraints()` devuelve una fila por constraint con los campos +`table_name`, `constraint_type`, `constraint_column_names`, `referenced_table`, +`referenced_column_names`. Mapeo a la salida: + +```text +PRIMARY KEY -> primary_keys[]: {table, columns} +UNIQUE -> unique[]: {table, columns} +FOREIGN KEY -> foreign_keys[]: {table, columns, referenced_table, referenced_columns} +NOT NULL -> ignorado +CHECK -> ignorado +``` + +Para una FK, `referenced_table` y `referenced_column_names` vienen poblados; para +PK/UNIQUE, `referenced_table` es NULL y `referenced_column_names` una lista vacia. 
def _as_list(value) -> list:
    """Normalize the value of a DuckDB LIST column to a list of strings.

    ``constraint_column_names`` and ``referenced_column_names`` are expected to
    arrive as Python lists; this helper is only a safety net for the case where
    they arrive as a string such as ``[id, customer_id]``.

    Args:
        value: ``None``, a list/tuple, a string, or any other scalar.

    Returns:
        ``[]`` for ``None`` or a blank/empty-bracket string; the stringified
        items for a list/tuple; the comma-separated, quote-stripped parts for
        a string; ``[str(value)]`` for anything else.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [str(v) for v in value]
    if isinstance(value, str):
        s = value.strip()
        # Drop one pair of surrounding brackets, if present.
        if s.startswith("[") and s.endswith("]"):
            s = s[1:-1]
        if not s.strip():
            return []
        return [
            part.strip().strip("'\"")
            for part in s.split(",")
            if part.strip().strip("'\"")
        ]
    return [str(value)]


def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict:
    """Detect the PRIMARY KEY / FOREIGN KEY / UNIQUE constraints declared in DuckDB.

    Queries the ``duckdb_constraints()`` table function through
    ``duckdb_query_readonly`` and keeps only key constraints; every other
    constraint type (e.g. NOT NULL, CHECK) is ignored.

    Args:
        db_path: path to the DuckDB file, handed to ``duckdb_query_readonly``.
        table: when given, only constraints whose owning ``table_name`` equals
            ``table`` are returned (plain ``!=`` comparison, so case-sensitive).
            A FOREIGN KEY is attributed to its source table, not the referenced
            one. ``None`` (default) returns constraints of every table.

    Returns:
        Never raises. On success::

            {"status": "ok",
             "primary_keys": [{"table", "columns"}, ...],
             "foreign_keys": [{"table", "columns",
                               "referenced_table", "referenced_columns"}, ...],
             "unique": [{"table", "columns"}, ...],
             "tables": [...]}  # sorted owners of at least one emitted constraint

        On failure ``{"status": "error", "error": <message>}``; the message is
        never empty.
    """
    try:
        sql = (
            "SELECT table_name, constraint_type, constraint_column_names, "
            "referenced_table, referenced_column_names FROM duckdb_constraints()"
        )
        res = duckdb_query_readonly(db_path, sql)
        if res["status"] != "ok":
            # .get + fallback: a non-ok result without an "error" entry used to
            # surface as the KeyError text "'error'", hiding the real cause.
            message = res.get("error") or (
                "duckdb_query_readonly returned a non-ok status without an "
                "error message"
            )
            return {"status": "error", "error": message}

        primary_keys = []
        foreign_keys = []
        unique = []
        tables = set()

        for row in res["rows"]:
            ctype = row["constraint_type"]
            tname = row["table_name"]

            # Filter by owning table: for PK/FK/UNIQUE the owner is
            # `table_name`, so a FK is matched on its source table.
            if table is not None and tname != table:
                continue

            cols = _as_list(row["constraint_column_names"])

            if ctype == "PRIMARY KEY":
                primary_keys.append({"table": tname, "columns": cols})
                tables.add(tname)
            elif ctype == "UNIQUE":
                unique.append({"table": tname, "columns": cols})
                tables.add(tname)
            elif ctype == "FOREIGN KEY":
                foreign_keys.append(
                    {
                        "table": tname,
                        "columns": cols,
                        "referenced_table": row["referenced_table"],
                        "referenced_columns": _as_list(
                            row["referenced_column_names"]
                        ),
                    }
                )
                tables.add(tname)
            # Any other constraint type is not a key relation and is skipped.

        return {
            "status": "ok",
            "primary_keys": primary_keys,
            "foreign_keys": foreign_keys,
            "unique": unique,
            "tables": sorted(tables),
        }
    except Exception as e:  # noqa: BLE001
        # str(e) is "" for exceptions raised without arguments; fall back to the
        # class name so callers always get a usable message.
        return {"status": "error", "error": str(e) or type(e).__name__}
def test_golden_ignora_not_null_y_check(db):
    """NOT NULL (auto-generated for the PK columns) does not show up as a key."""
    res = detect_declared_keys_duckdb(db)
    assert res["status"] == "ok"
    # Only the 2 real PKs (not the NOT NULL rows DuckDB generates per PK column).
    assert len(res["primary_keys"]) == 2
    # No UNIQUE constraint is declared in this schema.
    assert res["unique"] == []


def test_edge_filtra_por_tabla_orders(db):
    """Edge table='orders': PK of orders + its FK; NOT the PK of customers."""
    res = detect_declared_keys_duckdb(db, table="orders")
    assert res["status"] == "ok"

    # Only the PK of orders.
    assert len(res["primary_keys"]) == 1
    assert res["primary_keys"][0]["table"] == "orders"
    assert res["primary_keys"][0]["columns"] == ["id"]
    # The PK of customers is NOT present.
    assert _pk_for(res, "customers") is None

    # The FK of orders is present (source table = orders).
    assert len(res["foreign_keys"]) == 1
    assert res["foreign_keys"][0]["table"] == "orders"
    assert res["foreign_keys"][0]["referenced_table"] == "customers"

    # tables only contains orders (the owner of the emitted constraints).
    assert res["tables"] == ["orders"]


def test_edge_filtra_por_tabla_customers(db):
    """Edge table='customers': only its PK; no FK (orders is left out)."""
    res = detect_declared_keys_duckdb(db, table="customers")
    assert res["status"] == "ok"
    assert len(res["primary_keys"]) == 1
    assert res["primary_keys"][0]["table"] == "customers"
    assert res["foreign_keys"] == []
    assert res["tables"] == ["customers"]


def test_edge_unique_declarado(tmp_path):
    """Edge: a declared UNIQUE constraint shows up in `unique`."""
    path = str(tmp_path / "unique_test.duckdb")
    con = duckdb.connect(path)
    con.execute("CREATE TABLE products(sku INTEGER UNIQUE, name TEXT)")
    con.close()

    res = detect_declared_keys_duckdb(path)
    assert res["status"] == "ok"
    assert len(res["unique"]) == 1
    assert res["unique"][0]["table"] == "products"
    assert res["unique"][0]["columns"] == ["sku"]
    assert res["primary_keys"] == []
    assert res["foreign_keys"] == []
    assert res["tables"] == ["products"]


def test_edge_sin_constraints_listas_vacias(tmp_path):
    """Edge: table without PK/FK/UNIQUE -> every list empty, status ok."""
    path = str(tmp_path / "no_keys.duckdb")
    con = duckdb.connect(path)
    con.execute("CREATE TABLE log(a INTEGER, b INTEGER)")
    con.close()

    res = detect_declared_keys_duckdb(path)
    assert res["status"] == "ok"
    assert res["primary_keys"] == []
    assert res["foreign_keys"] == []
    assert res["unique"] == []
    assert res["tables"] == []


def test_error_db_inexistente_no_lanza(tmp_path):
    """Error: nonexistent db_path -> status error, without raising."""
    path = str(tmp_path / "does_not_exist.duckdb")
    res = detect_declared_keys_duckdb(path)
    assert res["status"] == "error"
    assert isinstance(res["error"], str)
    assert res["error"] != ""
"primary_keys", + "foreign_keys", + "unique", + "tables", + } + for pk in res["primary_keys"]: + assert set(pk.keys()) == {"table", "columns"} + for fk in res["foreign_keys"]: + assert set(fk.keys()) == { + "table", + "columns", + "referenced_table", + "referenced_columns", + } diff --git a/python/functions/datascience/suggest_intratable_fk_candidates.md b/python/functions/datascience/suggest_intratable_fk_candidates.md new file mode 100644 index 00000000..b6c5c321 --- /dev/null +++ b/python/functions/datascience/suggest_intratable_fk_candidates.md @@ -0,0 +1,91 @@ +--- +name: suggest_intratable_fk_candidates +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list" +description: "Sobre el TableProfile de UNA tabla (el dict de profile_table), sugiere por heuristica de nombre + cardinalidad que columnas PARECEN una clave foranea hacia otra tabla, cuando no hay relaciones inter-tabla que medir (una sola tabla). Es una SUGERENCIA, no una afirmacion: el ref_table_guess es el stem del nombre (customer_id -> customer) y NO confirma containment. Pura: solo lee el dict, sin I/O; nunca lanza (devuelve [])." +tags: [eda, datascience, relationships, foreign-key, fk, heuristic, schema, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +params: + - name: profile + desc: "TableProfile (dict que produce profile_table / summarize_table_*). Se leen de forma defensiva `columns` (lista de ColumnProfile con name/inferred_type/physical_type/distinct_count/unique_pct/flags), `n_rows` (int) y `key_candidates` (lista de nombres de columna ya candidatos a PK, que se excluyen). Si no es dict o no trae columns -> []." + - name: max_candidates + desc: "Tope de sugerencias devueltas (default 20). 
Las columnas candidatas se ordenan por distinct_count descendente (mas informativas primero) antes de cortar a este maximo." +output: "list (posiblemente vacia) de dicts, uno por columna sugerida, con claves: `column` (nombre), `ref_table_guess` (tabla conjeturada por el stem del nombre, p.ej. customer_id -> 'customer'), `reason` (frase humana que deja claro que es heuristica sin confirmar containment), `distinct_count` (int|None), `unique_pct` (float|None, fraccion 0-1 tal como viene del profile), `inferred_type` (str), `physical_type` (str). Nunca lanza." +tested: true +tests: ["test_golden_customer_id_detectado_otras_no", "test_camelcase_albumid_detectado", "test_constante_status_id_no_aparece", "test_profile_vacio_y_none_devuelven_lista_vacia", "test_category_id_casi_unico_parece_pk_no_aparece", "test_ref_table_guess_multitoken_y_orden_por_distinct", "test_max_candidates_corta_la_lista", "test_id_generico_solo_nunca_es_fk"] +test_file_path: "python/functions/datascience/suggest_intratable_fk_candidates_test.py" +file_path: "python/functions/datascience/suggest_intratable_fk_candidates.py" +--- + +## Ejemplo + +```python +from datascience import suggest_intratable_fk_candidates + +# TableProfile de UNA tabla (tipo titanic): customer_id es FK N:1; id es la PK; +# amount es una medida float; name es categorica sin sufijo de id. 
+profile = { + "n_rows": 891, + "key_candidates": ["id"], + "columns": [ + {"name": "id", "inferred_type": "numeric", "physical_type": "BIGINT", + "distinct_count": 891, "unique_pct": 1.0, "flags": ["possible_id"]}, + {"name": "customer_id", "inferred_type": "numeric", "physical_type": "BIGINT", + "distinct_count": 137, "unique_pct": 0.15, "flags": []}, + {"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE", + "distinct_count": 400, "unique_pct": 0.45, "flags": []}, + {"name": "name", "inferred_type": "categorical", "physical_type": "VARCHAR", + "distinct_count": 700, "unique_pct": 0.78, "flags": []}, + ], +} + +out = suggest_intratable_fk_candidates(profile) +[c["column"] for c in out] # -> ["customer_id"] +out[0]["ref_table_guess"] # -> "customer" +out[0]["reason"] +# -> "el nombre termina en '_id' y es N:1 (137 valores distintos < 891 filas): +# parece (heuristica por nombre, sin confirmar containment) una referencia a +# una tabla «customer»" +``` + +## Cuando usarla + +Cuando el EDA tiene SOLO UNA tabla y, por tanto, no se puede inferir una FK +inter-tabla por containment (no hay otra tabla cuyos valores contener). Es el plan B +del capitulo RELACIONES de AutomaticEDA: en vez de medir solapamiento de valores +entre tablas (lo correcto cuando hay varias, ver `infer_fk_containment_duckdb` / +`build_join_graph`), conjetura por el NOMBRE de la columna (`_id`) y por su +CARDINALIDAD N:1 que columnas parecen apuntar a una entidad externa. Usala para +enriquecer el reporte con "estas columnas parecen referencias a otras tablas" sin +prometer que esa tabla exista. NO la uses si tienes varias tablas: ahi mide +containment de verdad. + +## Gotchas + +- Es **heuristica**, no una verdad: produce **falsos positivos** (una columna + `period_id` que en realidad es un codigo libre, no una FK) y **falsos negativos** + (una FK que no se llama `*_id`, p.ej. `parent`, `owner`, `sku`). No la trates como + una afirmacion de esquema. 
+- `ref_table_guess` es una **conjetura por el nombre** (el stem sin el sufijo id): + `customer_id` -> `customer`, `AlbumId` -> `album`, `manager_staff_id` -> + `manager_staff`. Puede no coincidir con el nombre real de la tabla (plurales, + prefijos, alias). Es una pista, no un join garantizado. +- **NO confirma containment**: no comprueba que los valores de la columna existan en + ninguna otra tabla (no puede — solo recibe el perfil de una tabla). Para confirmar + una FK real con varias tablas usa `infer_fk_containment_duckdb`. +- Excluye deliberadamente: el `id`/`Id`/`ID` generico a secas (suele ser la PK + propia, no una referencia), las columnas constantes, las que parecen unicas + (`unique_pct >= 0.99`, mas PK que FK) y los tipos no-clave (float/decimal son + medidas; date/time/timestamp y boolean no son claves). En camelCase, `paid`, + `valid`, `grid` (con `id` en minuscula y sin separador) NO se confunden con FK. +- `unique_pct` se interpreta como **fraccion 0-1** (tal como la emite el profile), no + como porcentaje 0-100. diff --git a/python/functions/datascience/suggest_intratable_fk_candidates.py b/python/functions/datascience/suggest_intratable_fk_candidates.py new file mode 100644 index 00000000..60f72190 --- /dev/null +++ b/python/functions/datascience/suggest_intratable_fk_candidates.py @@ -0,0 +1,202 @@ +"""suggest_intratable_fk_candidates — heuristica de FK intra-tabla del grupo `eda`. + +Sobre el TableProfile de UNA tabla (el dict que produce ``profile_table``), sugiere +por heuristica de NOMBRE + CARDINALIDAD que columnas PARECEN una clave foranea hacia +otra tabla, util cuando no hay relaciones inter-tabla disponibles (una sola tabla y, +por tanto, sin containment cruzado que medir). Es una SUGERENCIA, no una afirmacion: +no confirma que exista la tabla referida ni que los valores esten contenidos en ella. + +La consume el capitulo RELACIONES de AutomaticEDA cuando solo hay una tabla. 
# inferred_type values compatible with a foreign key (integer/categorical).
_FK_INFERRED_OK = {"numeric", "categorical", "integer"}

# physical_type prefixes that can be a foreign key (integers, text, uuid).
_FK_PHYSICAL_PREFIXES = (
    "int", "bigint", "smallint", "tinyint", "hugeint", "uint",
    "varchar", "text", "char", "bpchar", "string", "uuid",
)

# physical_type prefixes that RULE OUT a foreign key: floating-point measures
# (float/double/decimal/numeric/real), temporals (date/time/timestamp/interval)
# and boolean. They are checked BEFORE the positive signals (exclusion wins: a
# numeric column whose physical type is DOUBLE is a measure, not a FK).
_FK_PHYSICAL_EXCLUDE = (
    "float", "double", "decimal", "numeric", "real",
    "date", "time", "timestamp", "interval",
    "bool",
)

# Cap applied when ``max_candidates`` cannot be read as an integer.
_FK_DEFAULT_MAX_CANDIDATES = 20


def _fk_name_signal(name):
    """Detect a foreign-key suffix in a column name.

    Recognizes ``_id`` (snake case, case-insensitive) and the camel-case
    endings ``ID`` and ``Id``. A bare ``id``/``Id``/``ID`` (empty stem) is not
    accepted, and lower-case endings without a separator (``paid``, ``valid``,
    ``grid``) do not match.

    Returns:
        ``(stem, suffix)`` where ``stem`` is lower-cased with trailing
        ``_-. `` characters removed (``customer_id`` -> ``"customer"``,
        ``AlbumId`` -> ``"album"``) and ``suffix`` is ``"_id"``, ``"ID"`` or
        ``"Id"``; ``None`` when ``name`` is not a string or has no signal.
    """
    if not isinstance(name, str):
        return None
    raw = name.strip()
    if not raw:
        return None
    # First matching form wins; an empty stem does not fall through to the
    # next form.
    if raw.lower().endswith("_id"):
        suffix, cut = "_id", 3
    elif raw.endswith("ID"):
        suffix, cut = "ID", 2
    elif raw.endswith("Id"):
        suffix, cut = "Id", 2
    else:
        return None
    stem = raw[:-cut].rstrip("_-. ")
    if not stem:
        return None
    return (stem.lower(), suffix)


def _fk_type_text(value):
    """Return ``value`` stripped and lower-cased, or ``""`` if it is not a string."""
    return value.strip().lower() if isinstance(value, str) else ""


def _fk_flag_names(flags):
    """Return ``flags`` as a container that is safe for ``in`` tests.

    A lone string is treated as a single flag; anything that is not a
    list/tuple/set/frozenset becomes an empty list.
    """
    if isinstance(flags, str):
        return [flags]
    if isinstance(flags, (list, tuple, set, frozenset)):
        return flags
    return []


def _fk_type_compatible(col):
    """Return True when the column's type can be a foreign key.

    The ``physical_type`` must not start with an excluded prefix (measure,
    temporal, boolean), and either ``inferred_type`` is in ``_FK_INFERRED_OK``
    or ``physical_type`` starts with an accepted prefix. Comparison is
    case-insensitive; non-string type values count as empty.
    """
    phys = _fk_type_text(col.get("physical_type"))
    inferred = _fk_type_text(col.get("inferred_type"))
    # Exclusion by physical type wins over any positive signal.
    if phys.startswith(_FK_PHYSICAL_EXCLUDE):
        return False
    return inferred in _FK_INFERRED_OK or phys.startswith(_FK_PHYSICAL_PREFIXES)


def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list:
    """Suggest columns that look like an intra-table FK by name + cardinality.

    Heuristic only (asserts nothing): a column is a candidate when its name
    has an id suffix with a non-empty stem, it is not listed in
    ``key_candidates``, it is not constant, it has at least 2 distinct values
    but fewer than ``n_rows`` (and ``unique_pct`` below 0.99), and its type is
    compatible with a key.

    Args:
        profile: table profile dict. ``columns`` (list of column dicts),
            ``n_rows`` and ``key_candidates`` are read defensively.
        max_candidates: maximum number of suggestions returned (default 20).
            Candidates are sorted by ``distinct_count`` descending before the
            cut. Negative values yield ``[]``; a value that cannot be read as
            an integer falls back to 20.

    Returns:
        A possibly empty list of dicts with keys ``column``,
        ``ref_table_guess``, ``reason``, ``distinct_count``, ``unique_pct``
        (float or ``None``), ``inferred_type`` and ``physical_type``. Malformed
        input (non-dict profile or columns, unhashable key candidates,
        non-iterable flags, non-string types) degrades to fewer or no
        suggestions instead of raising.
    """
    if not isinstance(profile, dict):
        return []
    columns = profile.get("columns")
    if not isinstance(columns, list):
        return []

    n_rows = profile.get("n_rows")
    has_n_rows = (
        isinstance(n_rows, int) and not isinstance(n_rows, bool) and n_rows > 0
    )

    key_candidates = profile.get("key_candidates")
    if not isinstance(key_candidates, (list, tuple, set)):
        key_candidates = []
    # Only string entries can equal a column name that passed the name check;
    # filtering also avoids the TypeError set() raises on unhashable entries.
    key_set = {key for key in key_candidates if isinstance(key, str)}

    # An unreadable limit (None, text, nan/inf) must not raise.
    try:
        limit = max(0, int(max_candidates))
    except (TypeError, ValueError, OverflowError):
        limit = _FK_DEFAULT_MAX_CANDIDATES

    out = []
    for col in columns:
        if not isinstance(col, dict):
            continue
        name = col.get("name")

        # 1) Name signal: id suffix with a non-empty stem.
        signal = _fk_name_signal(name)
        if signal is None:
            continue
        ref_guess, suffix = signal

        # 2) Not already a PK candidate of this table.
        if name in key_set:
            continue

        # 3) Not constant and with >= 2 distinct values.
        if "constant" in _fk_flag_names(col.get("flags")):
            continue
        dc = col.get("distinct_count")
        if not (isinstance(dc, int) and not isinstance(dc, bool) and dc >= 2):
            continue

        # 4) Cardinality below n_rows (not a PK) and not near-unique.
        if has_n_rows and dc >= n_rows:
            continue
        unique_pct = col.get("unique_pct")
        has_unique = (
            isinstance(unique_pct, (int, float)) and not isinstance(unique_pct, bool)
        )
        if has_unique and unique_pct >= 0.99:
            continue

        # 5) Type compatible with a foreign key (not a measure/temporal/boolean).
        if not _fk_type_compatible(col):
            continue

        out.append(
            {
                "column": name,
                "ref_table_guess": ref_guess,
                "reason": _build_reason(
                    suffix, dc, n_rows if has_n_rows else None, ref_guess
                ),
                "distinct_count": dc,
                "unique_pct": float(unique_pct) if has_unique else None,
                "inferred_type": col.get("inferred_type") or "",
                "physical_type": col.get("physical_type") or "",
            }
        )

    # Highest cardinality first, then cut.
    out.sort(key=lambda d: d.get("distinct_count") or 0, reverse=True)
    return out[:limit]


def _build_reason(suffix, dc, n_rows, ref_guess):
    """Build the human-readable sentence that flags the suggestion as a heuristic.

    ``n_rows`` may be ``None``, in which case the sentence omits the row count.
    """
    if n_rows is not None:
        card = f"es N:1 ({dc} valores distintos < {n_rows} filas)"
    else:
        card = f"tiene {dc} valores distintos que se repiten (cardinalidad N:1)"
    return (
        f"el nombre termina en '{suffix}' y {card}: parece (heuristica por nombre, "
        f"sin confirmar containment) una referencia a una tabla «{ref_guess}»"
    )
def _col(name, inferred_type="numeric", physical_type="BIGINT", distinct_count=10,
         unique_pct=0.1, flags=None):
    """Build a minimal ColumnProfile by hand (the dict that profile_table emits)."""
    return {
        "name": name,
        "inferred_type": inferred_type,
        "physical_type": physical_type,
        "semantic_type": "",
        "distinct_count": distinct_count,
        "unique_pct": unique_pct,
        "null_count": 0,
        "null_pct": 0.0,
        # Copy the caller's list so profiles never share a flags object.
        "flags": list(flags) if flags else [],
    }


def test_golden_customer_id_detectado_otras_no():
    # Titanic-like table: customer_id is an N:1 FK; id is the PK; amount is a
    # measure; name is categorical without an id suffix. Only customer_id
    # must be suggested.
    profile = {
        "n_rows": 891,
        "key_candidates": ["id"],
        "columns": [
            _col("id", inferred_type="numeric", physical_type="BIGINT",
                 distinct_count=891, unique_pct=1.0, flags=["possible_id"]),
            _col("customer_id", inferred_type="numeric", physical_type="BIGINT",
                 distinct_count=137, unique_pct=0.15, flags=[]),
            _col("amount", inferred_type="numeric", physical_type="DOUBLE",
                 distinct_count=400, unique_pct=0.45),
            _col("name", inferred_type="categorical", physical_type="VARCHAR",
                 distinct_count=700, unique_pct=0.78),
        ],
    }
    out = suggest_intratable_fk_candidates(profile)
    assert isinstance(out, list)
    assert [c["column"] for c in out] == ["customer_id"]
    cand = out[0]
    assert cand["ref_table_guess"] == "customer"
    assert cand["distinct_count"] == 137
    assert cand["unique_pct"] == 0.15
    assert cand["inferred_type"] == "numeric"
    assert cand["physical_type"] == "BIGINT"
    # The reason cites the suffix and the guessed table, and must flag itself
    # as a heuristic rather than a confirmed relation.
    assert "customer" in cand["reason"]
    assert "_id" in cand["reason"]
    assert "heuristica" in cand["reason"]


def test_camelcase_albumid_detectado():
    # AlbumId (camelCase, VARCHAR) -> detected, with ref_table_guess "album".
    profile = {
        "n_rows": 3503,
        "key_candidates": ["TrackId"],
        "columns": [
            _col("AlbumId", inferred_type="categorical", physical_type="VARCHAR",
                 distinct_count=347, unique_pct=0.10),
        ],
    }
    out = suggest_intratable_fk_candidates(profile)
    # TrackId is the PK candidate (listed in key_candidates); AlbumId is not,
    # so AlbumId is the one suggested.
    assert [c["column"] for c in out] == ["AlbumId"]
    assert out[0]["ref_table_guess"] == "album"
def test_constante_status_id_no_aparece():
    # A constant status_id ("constant" flag, distinct_count 1) is not a useful FK.
    status_col = _col("status_id", inferred_type="numeric", physical_type="INTEGER",
                      distinct_count=1, unique_pct=0.001, flags=["constant"])
    profile = {"n_rows": 1000, "key_candidates": [], "columns": [status_col]}
    assert suggest_intratable_fk_candidates(profile) == []


def test_profile_vacio_y_none_devuelven_lista_vacia():
    # Defensive reading: an empty dict, None, a profile without columns and a
    # profile whose columns is not a list must all return [] without raising.
    degenerate_profiles = (
        {},
        None,
        {"n_rows": 10},
        {"columns": "no-soy-lista"},
    )
    for bad_profile in degenerate_profiles:
        assert suggest_intratable_fk_candidates(bad_profile) == []


def test_category_id_casi_unico_parece_pk_no_aparece():
    # unique_pct 0.999 -> looks like a PK (not N:1) -> must not be suggested as FK.
    almost_unique = _col("category_id", inferred_type="numeric",
                         physical_type="BIGINT", distinct_count=890,
                         unique_pct=0.999)
    profile = {"n_rows": 891, "key_candidates": [], "columns": [almost_unique]}
    assert suggest_intratable_fk_candidates(profile) == []
def test_ref_table_guess_multitoken_y_orden_por_distinct():
    # manager_staff_id keeps the underscores of its stem -> "manager_staff".
    # With several candidates, they are ordered by distinct_count descending.
    profile = {
        "n_rows": 10000,
        "key_candidates": ["staff_id"],  # staff_id is the PK here; must not appear
        "columns": [
            _col("staff_id", inferred_type="numeric", physical_type="BIGINT",
                 distinct_count=10000, unique_pct=1.0, flags=["possible_id"]),
            _col("store_id", inferred_type="numeric", physical_type="INTEGER",
                 distinct_count=2, unique_pct=0.0002),
            _col("manager_staff_id", inferred_type="numeric", physical_type="INTEGER",
                 distinct_count=40, unique_pct=0.004),
        ],
    }
    out = suggest_intratable_fk_candidates(profile)
    cols = [c["column"] for c in out]
    # staff_id is excluded (PK); the other two come sorted by distinct desc.
    assert cols == ["manager_staff_id", "store_id"]
    refs = {c["column"]: c["ref_table_guess"] for c in out}
    assert refs["manager_staff_id"] == "manager_staff"
    assert refs["store_id"] == "store"


def test_max_candidates_corta_la_lista():
    # max_candidates caps the number of suggestions returned.
    profile = {
        "n_rows": 10000,
        "key_candidates": [],
        "columns": [
            _col("a_id", distinct_count=300, unique_pct=0.03),
            _col("b_id", distinct_count=200, unique_pct=0.02),
            _col("c_id", distinct_count=100, unique_pct=0.01),
        ],
    }
    out = suggest_intratable_fk_candidates(profile, max_candidates=2)
    assert [c["column"] for c in out] == ["a_id", "b_id"]
    # The cut is clamped at zero, so a zero or negative limit yields an empty
    # list instead of a slice counted from the end.
    assert suggest_intratable_fk_candidates(profile, max_candidates=0) == []
    assert suggest_intratable_fk_candidates(profile, max_candidates=-1) == []


def test_id_generico_solo_nunca_es_fk():
    # A bare 'id'/'Id'/'ID' (no stem) is never suggested as an FK.
    profile = {
        "n_rows": 500,
        "key_candidates": [],
        "columns": [
            _col("id", distinct_count=500, unique_pct=1.0),
            _col("Id", distinct_count=120, unique_pct=0.24),
            _col("ID", distinct_count=80, unique_pct=0.16),
        ],
    }
    out = suggest_intratable_fk_candidates(profile)
    assert out == []