"""Key-relations chapter (RELACIONES) — the keys / join structure of the data.

This chapter is the *relational* section of an AutomaticEDA report. It answers a
single question for the table (or the whole DuckDB source it lives in): **how do
the keys relate?** It composes, without reimplementing them, the registry's
relation primitives and degrades honestly when a layer does not apply.

It renders, in order, only the layers that have something to say:

1. **Declared keys** (real schema constraints) — when the DuckDB source declares
   PRIMARY KEY / FOREIGN KEY / UNIQUE constraints, they are read verbatim via
   ``detect_declared_keys_duckdb`` and shown as ground truth: which column is the
   PK, which columns are FKs and the table/column they point to.
2. **Primary-key candidates** — the ``key_candidates`` the TableProfile already
   carries (columns whose cardinality equals the row count, with no nulls). These
   are *candidates*: a column that could serve as the row identifier.
3. **Foreign-key candidates** when none are declared:

   - **Inter-table** (the DuckDB source has several tables): real FK candidates
     by name signal + value containment via ``infer_fk_containment_duckdb``,
     plus the join graph (roles + a pasteable Mermaid diagram) via
     ``build_join_graph``.
   - **Intra-table** (a single table): columns that *look* like a foreign key by
     a name+cardinality heuristic (``suggest_intratable_fk_candidates``). This is
     a **suggestion**, explicitly flagged as a heuristic, never an assertion.

``build_relaciones(profile, ctx) -> Chapter | None``: returns ``None`` when there
is nothing to say (no declared key, no key candidates, and no FK candidate —
inter- or intra-table). Reads everything defensively (``.get``) and never raises:
anything missing degrades to a note or is omitted; a failing registry call drops
its layer instead of aborting the chapter.

ctx keys this chapter consumes (all optional):
    db_path, table : str — the DuckDB file and table being profiled (set by
        ``build_eda_render_ctx``). ``db_path`` is needed to read declared
        constraints, to list the sibling tables, and to run the containment-based
        FK inference. Without it, only the profile-derived layers (PK candidates,
        intra-table FK heuristic) are available.
    glossary : model.GlossaryCollector — shared glossary; the chapter registers
        the relational terms (PK, FK, containment, cardinality) and marks their
        first appearance clickable.

Contract: build_relaciones(profile, ctx) -> Chapter | None ;
CHAPTER_VERSION = "x.y.z".
"""
from __future__ import annotations

from .. import model

# Pure/impure registry functions (group ``eda``) this chapter composes. Imported
# defensively (module-leaf imports, like the AGREGACION chapter) so the chapter
# still builds — degrading the affected layer to nothing — if a function is
# somehow unavailable / not indexed yet.
try:
    from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb
except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
    detect_declared_keys_duckdb = None  # type: ignore[assignment]
try:
    from datascience.infer_fk_containment_duckdb import infer_fk_containment_duckdb
except Exception:  # noqa: BLE001
    infer_fk_containment_duckdb = None  # type: ignore[assignment]
try:
    from datascience.build_join_graph import build_join_graph
except Exception:  # noqa: BLE001
    build_join_graph = None  # type: ignore[assignment]
try:
    from datascience.suggest_intratable_fk_candidates import (
        suggest_intratable_fk_candidates,
    )
except Exception:  # noqa: BLE001
    suggest_intratable_fk_candidates = None  # type: ignore[assignment]
try:
    from infra import duckdb_list_tables
except Exception:  # noqa: BLE001
    duckdb_list_tables = None  # type: ignore[assignment]

CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "relaciones"
CHAPTER_TITLE = "Relaciones de clave"

# Cap the inter-table FK table so a wide schema does not blow up the page; the
# rest is summarized in a closing note (no silent truncation).
MAX_FK_ROWS = 40

# --------------------------------------------------------------------------- #
# Glossary terms this chapter explains. Registered in the shared collector and
# marked clickable on their first appearance (contract §11.1).
# --------------------------------------------------------------------------- #
_TERMS = {
    "pk": (
        "Clave primaria (PK)",
        "Columna (o conjunto de columnas) que identifica de forma única cada fila "
        "de una tabla: sus valores no se repiten y no son nulos. Una tabla tiene "
        "como mucho una clave primaria; es el ancla por la que otras tablas la "
        "referencian.",
    ),
    "fk": (
        "Clave foránea (FK)",
        "Columna de una tabla cuyos valores apuntan a la clave primaria de otra "
        "tabla (o de la misma), creando una relación entre ambas. Una FK suele ser "
        "N:1: muchas filas de la tabla origen comparten el mismo valor de la tabla "
        "destino.",
    ),
    "containment": (
        "Containment / inclusión",
        "Señal con la que se infiere una clave foránea sin que la base la declare: "
        "la fracción de valores distintos de una columna A que también aparecen "
        "como valores de otra columna B. Si casi todos los valores de A están "
        "contenidos en B (inclusión ≈ 1) y B parece una clave, A → B es una FK "
        "candidata.",
    ),
    "cardinalidad": (
        "Cardinalidad",
        "Número de valores distintos de una columna. Cardinalidad igual al número "
        "de filas (y sin nulos) señala un identificador (candidato a clave "
        "primaria); cardinalidad alta pero menor que el número de filas, con "
        "valores repetidos, es típica de una clave foránea.",
    ),
}


def _register_terms(ctx: dict) -> bool:
    """Register the relational terms in the shared glossary.

    Returns whether the in-text appearances should be marked clickable, i.e.
    whether ``ctx["glossary"]`` is a ``model.GlossaryCollector``. Nothing is
    registered (and ``False`` is returned) otherwise.
    """
    glossary = ctx.get("glossary")
    if not isinstance(glossary, model.GlossaryCollector):
        return False
    for key, (label, definition) in _TERMS.items():
        glossary.add(key, label, definition)
    return True


# --------------------------------------------------------------------------- #
# Formatting helpers (mirror the other chapters' defensive style).
# --------------------------------------------------------------------------- #
def _fmt_int(value) -> str:
    """Format *value* as an integer using ``.`` as thousands separator.

    ``None`` becomes the ``—`` placeholder. Anything ``int()`` cannot convert
    (text, NaN, ±infinity) falls back to ``model._safe_str`` instead of raising.
    """
    if value is None:
        return "—"
    try:
        return f"{int(value):,}".replace(",", ".")
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int(float("inf")) raises; a stray infinite
        # metric must not abort the whole chapter.
        return model._safe_str(value)


def _fmt_pct_fraction(value, decimals: int = 1) -> str:
    """Format a 0–1 fraction as a percentage. None -> placeholder.

    Values ``<= 1.0`` are multiplied by 100; larger values are printed as they
    are, followed by ``%``. Unconvertible values fall back to
    ``model._safe_str``.
    """
    if value is None:
        return "—"
    try:
        v = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() of an int too large for a double.
        return model._safe_str(value)
    # NOTE(review): a value that is already a percentage but <= 1 (e.g. 0.5 %)
    # is indistinguishable from a fraction here and is scaled — confirm the
    # unit of the profile's ``unique_pct`` against the producer.
    if v <= 1.0:
        v *= 100.0
    return f"{v:.{decimals}f}%"


def _fmt_ratio(value, decimals: int = 3) -> str:
    """Format an already-0–1 ratio (inclusion) as a plain number.

    The value is rounded to *decimals* places and trailing fractional zeros
    are dropped (``0.500`` -> ``0.5``, ``1.000`` -> ``1``). ``None`` becomes
    the ``—`` placeholder; unconvertible values fall back to
    ``model._safe_str``.
    """
    if value is None:
        return "—"
    try:
        text = f"{float(value):.{decimals}f}"
    except (TypeError, ValueError, OverflowError):
        return model._safe_str(value)
    # Only trim zeros that sit after a decimal point: with ``decimals=0`` the
    # text has no point and stripping would eat significant digits
    # ("100" -> "1", "0" -> "").
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    return text


def _is_dict(v) -> bool:
    """Return True when *v* is a ``dict`` (the only shape the layers read)."""
    return isinstance(v, dict)


def _join_names(values) -> str:
    """Render a list of column names as ``"a, b, c"``.

    Empty/missing input yields ``""``. A bare string is treated as a single
    name rather than being split into characters, and a non-iterable value is
    rendered as one name instead of raising.
    """
    if not values:
        return ""
    if isinstance(values, (str, bytes)):
        values = [values]
    try:
        return ", ".join(model._safe_str(v) for v in values)
    except TypeError:
        # Not iterable (e.g. a number): show it as a single name.
        return model._safe_str(values)


def _columns_by_name(profile: dict) -> dict:
    """Index the profile columns by name for quick metric lookup.

    Entries that are not dicts, have no ``name``, or whose name cannot be used
    as a dict key are skipped.
    """
    out = {}
    for col in (profile.get("columns") or []):
        if not _is_dict(col):
            continue
        name = col.get("name")
        if name is None:
            continue
        try:
            out[name] = col
        except TypeError:
            # Unhashable name (malformed profile): ignore the column.
            continue
    return out


def _column_for(by_name: dict, name) -> dict:
    """Return the profile column dict for *name*, or ``{}`` if unknown."""
    try:
        return by_name.get(name) or {}
    except TypeError:
        # Unhashable key candidate: no metrics available for it.
        return {}


# --------------------------------------------------------------------------- #
# Layer 1 — declared keys (real schema constraints).
# --------------------------------------------------------------------------- #
def _declared_keys(db_path: str, table: str):
    """Read declared PK/FK/UNIQUE for the source, or None if unavailable.

    ``None`` is returned when there is no ``db_path``, the registry function
    could not be imported, the call raises, or its result is not a dict with
    ``status == "ok"``.
    """
    if not db_path or detect_declared_keys_duckdb is None:
        return None
    try:
        out = detect_declared_keys_duckdb(db_path, table)
    except Exception:  # noqa: BLE001 — dict-no-throw: treat as unavailable.
        return None
    if not _is_dict(out) or out.get("status") != "ok":
        return None
    return out


def _declared_section(declared: dict) -> list:
    """Blocks for the declared-keys layer, or [] if there is nothing declared.

    Reads the ``primary_keys``, ``foreign_keys`` and ``unique`` lists of
    *declared*, keeping only dict entries, and emits one table per non-empty
    kind after a heading and an explanatory paragraph.
    """
    pks = [p for p in (declared.get("primary_keys") or []) if _is_dict(p)]
    fks = [f for f in (declared.get("foreign_keys") or []) if _is_dict(f)]
    uqs = [u for u in (declared.get("unique") or []) if _is_dict(u)]
    if not (pks or fks or uqs):
        return []
    blocks = [
        model.Heading(text="Claves declaradas en el esquema", level=2),
        model.Markdown(text=(
            "La base **declara** estas relaciones de clave como restricciones "
            "reales del esquema (constraints). Son la verdad de referencia: no se "
            "infieren, se leen tal cual de la definición de las tablas.")),
    ]
    if pks:
        rows = [[model._safe_str(p.get("table")), _join_names(p.get("columns"))]
                for p in pks]
        blocks.append(model.DataTable(
            header=["Tabla", "Columna(s) PK"],
            rows=rows,
            title="Claves primarias declaradas",
            note="Cada fila: la clave primaria declarada de una tabla."))
    if fks:
        rows = [[
            model._safe_str(f.get("table")),
            _join_names(f.get("columns")),
            model._safe_str(f.get("referenced_table")),
            _join_names(f.get("referenced_columns")),
        ] for f in fks]
        blocks.append(model.DataTable(
            header=["Tabla origen", "Columna(s) FK", "→ Tabla destino",
                    "Columna(s) destino"],
            rows=rows,
            title="Claves foráneas declaradas",
            note="Cada fila: una FK declarada — origen → destino."))
    if uqs:
        rows = [[model._safe_str(u.get("table")), _join_names(u.get("columns"))]
                for u in uqs]
        blocks.append(model.DataTable(
            header=["Tabla", "Columna(s) UNIQUE"],
            rows=rows,
            title="Restricciones UNIQUE declaradas"))
    return blocks


# --------------------------------------------------------------------------- #
# Layer 2 — primary-key candidates (from the profile).
# --------------------------------------------------------------------------- #
def _pk_candidates_section(profile: dict, mark: bool) -> list:
    """Blocks for the PK-candidates layer, or [] if there are none.

    Args:
        profile: TableProfile dict; reads ``key_candidates``, ``columns`` and
            ``n_rows``.
        mark: whether glossary terms are wrapped in ``[[term:…]]`` markers.
    """
    keys = [k for k in (profile.get("key_candidates") or []) if k is not None]
    if not keys:
        return []
    by_name = _columns_by_name(profile)
    # The term marker already carries its own bold, so the surrounding bold
    # must not enclose it (nested ``**`` pairs break the emphasis).
    lead = ("**candidatas** a [[term:pk]]**clave primaria**[[/term]]"
            if mark else "**candidatas a clave primaria**")
    card = ("[[term:cardinalidad]]cardinalidad[[/term]]"
            if mark else "cardinalidad")
    # Same wording with or without a glossary: the "candidate, not declared"
    # caveat does not depend on term marking.
    intro = (
        f"Estas columnas son {lead}: su {card} iguala al número de filas y no "
        "tienen nulos, así que cada valor identifica una fila distinta. Son "
        "candidatas, no una clave declarada: la base no las marca como tal.")
    rows = []
    for name in keys:
        col = _column_for(by_name, name)
        rows.append([
            model._safe_str(name),
            _fmt_int(col.get("distinct_count")),
            _fmt_pct_fraction(col.get("unique_pct")),
            model._safe_str(
                col.get("inferred_type") or col.get("physical_type") or "—"),
        ])
    return [
        model.Heading(text="Candidatos a clave primaria", level=2),
        model.Markdown(text=intro),
        model.DataTable(
            header=["Columna", "Valores distintos", "% único", "Tipo"],
            rows=rows,
            title="Candidatas a clave primaria",
            note=f"{_fmt_int(profile.get('n_rows'))} filas en total como referencia."),
    ]


# --------------------------------------------------------------------------- #
# Layer 3a — inter-table FK candidates (containment) + join graph.
# --------------------------------------------------------------------------- #
def _list_source_tables(db_path: str) -> list:
    """List the tables in the DuckDB source, or [] if it can't be listed.

    Only string entries of the ``tables`` list of an ``status == "ok"`` result
    are kept.
    """
    if not db_path or duckdb_list_tables is None:
        return []
    try:
        out = duckdb_list_tables(db_path)
    except Exception:  # noqa: BLE001
        return []
    if not _is_dict(out) or out.get("status") != "ok":
        return []
    return [t for t in (out.get("tables") or []) if isinstance(t, str)]


def _inter_table_section(db_path: str, tables: list, mark: bool) -> list:
    """Blocks for the inter-table FK layer (containment + join graph), or [].

    Returns ``[]`` when the inference function is unavailable, there are fewer
    than two tables, the call fails / is not ``status == "ok"``, or it yields
    no dict candidates. Otherwise emits a heading, an explanation, the
    candidates table (capped at ``MAX_FK_ROWS`` rows, with the cap stated in
    the note), an optional join-graph group and an optional note listing the
    pairs the inference reported as skipped.
    """
    if infer_fk_containment_duckdb is None or len(tables) < 2:
        return []
    try:
        fk = infer_fk_containment_duckdb(db_path, tables=tables)
    except Exception:  # noqa: BLE001
        return []
    if not _is_dict(fk) or fk.get("status") != "ok":
        return []
    candidates = [c for c in (fk.get("fk_candidates") or []) if _is_dict(c)]
    if not candidates:
        return []

    containment = (
        "[[term:containment]]containment (inclusión de valores)[[/term]]"
        if mark else "containment (inclusión de valores)")
    fk_term = ("[[term:fk]]**claves foráneas**[[/term]]"
               if mark else "**claves foráneas**")
    blocks = [
        model.Heading(text="Claves foráneas candidatas (inter-tabla)", level=2),
        model.Markdown(text=(
            f"La fuente tiene varias tablas. Estas {fk_term} candidatas se infieren "
            f"por señal de nombre y por {containment}: una columna de una tabla cuyos "
            "valores están contenidos en la clave de otra. No están declaradas por "
            "la base; son la relación más probable según los datos.")),
    ]

    shown = candidates[:MAX_FK_ROWS]
    rows = [[
        f"{model._safe_str(c.get('from_table'))}.{model._safe_str(c.get('from_col'))}",
        f"{model._safe_str(c.get('to_table'))}.{model._safe_str(c.get('to_col'))}",
        _fmt_ratio(c.get("inclusion")),
        model._safe_str(c.get("cardinality") or "—"),
        "sí" if c.get("name_match") else "no",
    ] for c in shown]
    note = "Ordenadas por señal de nombre e inclusión."
    if len(candidates) > len(shown):
        note += f" Se muestran {len(shown)} de {len(candidates)} candidatas."
    blocks.append(model.DataTable(
        header=["Origen", "→ Destino", "Inclusión", "Cardinalidad",
                "Coincide nombre"],
        rows=rows,
        title="FK candidatas por containment",
        note=note))

    # Join graph: node roles + a pasteable Mermaid diagram, kept together.
    # It is built from every candidate, not only the rows shown above.
    if build_join_graph is not None:
        try:
            graph = build_join_graph(candidates, tables=tables)
        except Exception:  # noqa: BLE001
            graph = None
        if _is_dict(graph):
            graph_blocks = [model.Heading(text="Grafo de relaciones", level=3)]
            nodes = [n for n in (graph.get("nodes") or []) if _is_dict(n)]
            if nodes:
                node_rows = [[
                    model._safe_str(n.get("table")),
                    model._safe_str(n.get("role") or "—"),
                    _fmt_int(n.get("out_degree")),
                    _fmt_int(n.get("in_degree")),
                ] for n in nodes]
                graph_blocks.append(model.DataTable(
                    header=["Tabla", "Rol", "FK salientes", "FK entrantes"],
                    rows=node_rows,
                    title="Tablas y su rol en el grafo",
                    note="Rol: fact (apunta a otras), dimension (referenciada), "
                         "bridge (ambas), standalone (aislada)."))
            hubs = [h for h in (graph.get("hubs") or []) if h]
            if hubs:
                graph_blocks.append(model.Markdown(text=(
                    "Tablas con más relaciones salientes (candidatas a tabla de "
                    "hechos): " + ", ".join(model._safe_str(h) for h in hubs)
                    + ".")))
            # Skip a missing diagram explicitly rather than relying on how
            # ``model._safe_str`` renders ``None`` (not visible from here).
            raw_mermaid = graph.get("mermaid")
            mermaid = model._safe_str(raw_mermaid).strip() if raw_mermaid else ""
            if mermaid:
                graph_blocks.append(model.Markdown(text=(
                    "Diagrama de las relaciones (pegable en un bloque Mermaid):")))
                graph_blocks.append(model.Markdown(
                    text="```mermaid\n" + mermaid + "\n```"))
            # Only emit the group when it holds more than its own heading.
            if len(graph_blocks) > 1:
                blocks.append(model.Group(blocks=graph_blocks,
                                          title="Grafo de relaciones"))

    skipped = [s for s in (fk.get("skipped") or []) if s]
    if skipped:
        blocks.append(model.Note(
            "Algunos pares se omitieron por tamaño: "
            + "; ".join(model._safe_str(s) for s in skipped) + "."))
    return blocks


# --------------------------------------------------------------------------- #
# Layer 3b — intra-table FK candidates (name+cardinality heuristic).
# --------------------------------------------------------------------------- #
def _intra_table_section(profile: dict, mark: bool) -> list:
    """Build the single-table FK heuristic layer.

    Asks ``suggest_intratable_fk_candidates`` for columns that look like
    foreign keys and renders them as a clearly-flagged suggestion: a heading,
    an explanatory paragraph, one table row per candidate and a closing note.

    Args:
        profile: TableProfile dict handed to the heuristic as-is.
        mark: whether the FK term is wrapped in a ``[[term:fk]]`` marker.

    Returns:
        The list of blocks, or ``[]`` when the heuristic is unavailable, the
        call raises, or it yields no dict candidates.
    """
    suggest = suggest_intratable_fk_candidates
    if suggest is None:
        return []
    try:
        raw = suggest(profile)
    except Exception:  # noqa: BLE001
        return []

    # Only dict entries can be rendered; anything else is dropped.
    suggestions = [item for item in (raw or []) if isinstance(item, dict)]
    if len(suggestions) == 0:
        return []

    if mark:
        fk_label = "[[term:fk]]**claves foráneas**[[/term]]"
    else:
        fk_label = "**claves foráneas**"

    explanation = (
        "No hay otras tablas que referenciar, pero algunas columnas **parecen** "
        + fk_label
        + " por su nombre (terminan en «id») y su cardinalidad (muchos "
        "valores repetidos, N:1). Es una **sugerencia heurística**, no una "
        "afirmación: el nombre de la tabla destino es una conjetura y no se "
        "comprueba inclusión de valores contra ninguna tabla real.")
    section = [
        model.Heading(
            text="Posibles claves foráneas (heurística de nombre)", level=2),
        model.Markdown(text=explanation),
    ]

    table_rows = [
        [
            model._safe_str(item.get("column")),
            model._safe_str(item.get("ref_table_guess") or "—"),
            _fmt_int(item.get("distinct_count")),
            _fmt_pct_fraction(item.get("unique_pct")),
            model._safe_str(
                item.get("inferred_type") or item.get("physical_type") or "—"),
            model._safe_str(item.get("reason") or ""),
        ]
        for item in suggestions
    ]
    section.append(model.DataTable(
        header=["Columna", "Posible tabla", "Valores distintos", "% único",
                "Tipo", "Motivo"],
        rows=table_rows,
        title="Posibles FK por nombre y cardinalidad",
        note="Heurística: posibles falsos positivos/negativos. "
             "No confirma containment."))
    section.append(model.Note(
        "Estas sugerencias se basan solo en el nombre y la cardinalidad. Para "
        "confirmarlas haría falta la tabla destino y comprobar la inclusión de "
        "valores (containment)."))
    return section


# --------------------------------------------------------------------------- #
# Entry point.
# --------------------------------------------------------------------------- #
def _intro_blocks(mark: bool) -> list:
    """Return the chapter's opening blocks: the level-1 title and the intro.

    Args:
        mark: whether the PK/FK terms are wrapped in ``[[term:…]]`` markers.
    """
    pk = "[[term:pk]]clave primaria[[/term]]" if mark else "clave primaria"
    # Plural, to agree with the article "las" that precedes it in the text.
    fk = "[[term:fk]]claves foráneas[[/term]]" if mark else "claves foráneas"
    text = (
        f"Este capítulo analiza las **relaciones de clave** de la tabla: qué columna "
        f"identifica cada fila (la {pk}) y qué columnas referencian a otra tabla (las "
        f"{fk}). Cuando la base las **declara** como restricciones del esquema, se "
        "muestran tal cual; cuando no, se proponen las más probables a partir de los "
        "datos —por inclusión de valores entre tablas (containment) o, en una sola "
        "tabla, por una heurística de nombre y cardinalidad— siempre marcadas como "
        "candidatas, nunca como hechos.")
    return [model.Heading(text=CHAPTER_TITLE, level=1), model.Markdown(text=text)]


def build_relaciones(profile: dict, ctx: dict):
    """Build the RELACIONES Chapter, or None if there is nothing to say.

    Args:
        profile: the ``eda`` group TableProfile dict (may be None/empty).
        ctx: presentation context. Consumes ``db_path`` + ``table`` (to read
            declared constraints, list sibling tables and run the containment FK
            inference) and ``glossary`` (to register the relational terms).

    Returns:
        A ``model.Chapter`` with the applicable relation layers; or ``None`` when
        the dataset has no declared key, no key candidates and no FK candidate
        (neither inter- nor intra-table).
    """
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx if isinstance(ctx, dict) else {}
    db_path = ctx.get("db_path")
    table = ctx.get("table")
    mark = _register_terms(ctx)

    # Build each layer; the chapter is the concatenation of the non-empty ones.
    declared = _declared_keys(db_path, table)
    declared_blocks = _declared_section(declared) if declared else []
    # Count only dict entries — the same filter the declared-keys layer applies
    # before rendering — so malformed entries that are never shown cannot
    # suppress the heuristic layer below.
    declared_has_fk = bool(declared) and any(
        isinstance(entry, dict)
        for entry in (declared.get("foreign_keys") or []))
    pk_blocks = _pk_candidates_section(profile, mark)
    tables = _list_source_tables(db_path)
    # NOTE(review): the inter-table inference runs even when FKs are declared,
    # while the module docstring describes FK candidates as shown "when none
    # are declared" — confirm which is intended.
    inter_blocks = _inter_table_section(db_path, tables, mark)

    # The intra-table heuristic only makes sense when no real FK is available for
    # this table — neither declared nor inferred inter-table. Otherwise the real
    # relations already answer the question and the heuristic is just noise.
    if declared_has_fk or inter_blocks:
        intra_blocks = []
    else:
        intra_blocks = _intra_table_section(profile, mark)

    body = declared_blocks + pk_blocks + inter_blocks + intra_blocks
    if not body:
        return None  # chapter does not apply: nothing to say about relations.

    blocks = _intro_blocks(mark) + body
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)