feat(eda): funciones de agregación/OLAP para AutomaticEDA (groupby/pivot push-down + selección LLM)

Cuatro funciones nuevas del grupo eda que nutren el capítulo AGREGACION: - select_groupby_keys (pure): elige categóricas agrupables + numéricas medida desde el TableProfile. - groupby_stats_duckdb (impure): GROUP BY push-down en DuckDB (count/mean/median/std/min/max por grupo). - pivot_table_duckdb (impure): pivot A×B push-down, limitado a top filas/cols para no cortar. - suggest_aggregations_llm (impure): el LLM elige las agregaciones interesantes con fallback determinista. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 15:33:55 +02:00
parent 415154d9a3
commit 96da9e3015
13 changed files with 2146 additions and 0 deletions
@@ -0,0 +1,310 @@
+"""Pure EDA helper: pick GROUP BY keys and measures from a TableProfile.
+
+Given a ``TableProfile`` of the ``eda`` group (the dict produced by, e.g.,
+``summarize_table_duckdb``), this function deterministically selects the most
+interesting categorical columns to group by (GROUP BY), the numeric measure
+columns to aggregate, and a couple of categorical x categorical pivot pairs.
+
+It is the quantitative backbone for the aggregation / OLAP chapter of an
+AutomaticEDA: a pure, deterministic ranking over the profile, with no I/O, no
+mutation of the input and no external dependencies (stdlib only). It never
+raises — a missing or malformed profile yields an empty, well-formed result.
+"""
+
+
+def select_groupby_keys(
+    profile: dict,
+    max_keys: int = 3,
+    max_card: int = 20,
+    max_measures: int = 4,
+) -> dict:
+    """Select GROUP BY keys, measures and pivot pairs from a TableProfile.
+
+    The profile is read defensively (``.get(...)``, ``or []``,
+    ``isinstance``); a non-dict or empty profile yields every list empty.
+    The limits are coerced with ``int(...)`` and fall back to their defaults
+    when the coercion fails, including for infinite values.
+
+    Selection rules (deterministic):
+
+    - **group_keys** (categorical columns to group by): candidates have
+      ``inferred_type`` in ``("categorical", "boolean")``. Discarded if they are
+      in ``profile['key_candidates']``, carry a ``possible_id`` /
+      ``high_cardinality`` / ``constant`` flag, have ``distinct_count`` outside
+      ``[2, max_card]``, or are all-null (``null_pct >= 0.999``). Each survivor
+      gets ``score = card_score * balance_score`` where ``card_score`` keeps a
+      plateau for moderate cardinality (2..12) and decays towards ``max_card``,
+      and ``balance_score = 1 / imbalance`` (``categorical.imbalance`` if
+      present, else approximated from ``mode_pct``, else a neutral default).
+      The top ``max_keys`` by score (desc, ties by column order) are returned.
+
+    - **measures** (numeric columns to aggregate): candidates have
+      ``inferred_type`` in ``("numeric", "integer", "float")``. Discarded if
+      id-like (``possible_id`` flag *and* ``unique_pct >= 0.99``) or constant
+      (``numeric.std`` is ``0`` or ``None``). Ranked by informative dispersion:
+      ``abs(cv)`` when available, else ``abs(std)``. The top ``max_measures``
+      **names** are returned.
+
+    - **pivots**: up to 2 ``(group_keys[i].col, group_keys[j].col)`` pairs with
+      ``i < j``, using the first measure as the aggregated value. Empty when
+      fewer than 2 group keys were selected.
+
+    Args:
+        profile: TableProfile dict of the ``eda`` group. Relevant keys:
+            ``columns`` (list[ColumnProfile]) and ``key_candidates`` (list of
+            column names or ``{name}`` dicts). Each ColumnProfile uses:
+            ``name``, ``inferred_type``, ``distinct_count``,
+            ``unique_pct`` (0..1), ``null_pct`` (0..1), ``flags`` (list[str]),
+            ``numeric`` ({std, cv, ...}|None), ``categorical``
+            ({imbalance, mode_pct, ...}|None).
+        max_keys: Maximum number of group-by keys to return. Default 3;
+            negative values are treated as 0.
+        max_card: Maximum cardinality (``distinct_count``) a categorical column
+            may have to still qualify as a group key. Default 20; values
+            below 2 are raised to 2.
+        max_measures: Maximum number of measure names to return. Default 4;
+            negative values are treated as 0.
+
+    Returns:
+        dict with:
+          group_keys (list[{col, cardinality, score}], ordered by score desc),
+          measures   (list[str], numeric column names ordered by dispersion),
+          pivots     (list[{index, columns, value}], up to 2 pairs),
+          note       (str, short summary of what was chosen).
+    """
+    if not isinstance(profile, dict):
+        profile = {}
+
+    # Coerce each limit leniently, then floor it: a negative count makes no
+    # sense, and a cardinality cap below 2 could never admit a group key.
+    max_keys = max(_int_or_default(max_keys, 3), 0)
+    max_card = max(_int_or_default(max_card, 20), 2)
+    max_measures = max(_int_or_default(max_measures, 4), 0)
+
+    columns = profile.get("columns") or []
+    if not isinstance(columns, (list, tuple)):
+        columns = []
+
+    key_names = _key_candidate_names(profile.get("key_candidates"))
+
+    group_keys = _select_group_keys(columns, key_names, max_keys, max_card)
+    measures = _select_measures(columns, max_measures)
+    pivots = _select_pivots(group_keys, measures)
+
+    return {
+        "group_keys": group_keys,
+        "measures": measures,
+        "pivots": pivots,
+        "note": _build_note(group_keys, measures, pivots),
+    }
+
+
+def _int_or_default(value, default: int) -> int:
+    """Return ``int(value)``, or ``default`` when the conversion fails.
+
+    ``OverflowError`` is caught alongside ``TypeError`` / ``ValueError``
+    because ``int(float("inf"))`` raises it.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+# ---------------------------------------------------------------------------
+# group_keys
+# ---------------------------------------------------------------------------
+
+# ``inferred_type`` values that make a column a GROUP BY candidate.
+_GROUP_TYPES = ("categorical", "boolean")
+# A column carrying any one of these flags is rejected as a group key.
+_DISQUALIFYING_FLAGS = frozenset({"possible_id", "high_cardinality", "constant"})
+_CARD_PLATEAU_HI = 12  # cardinalities 2..12 are all "moderate" (best).
+
+
+def _select_group_keys(columns, key_names, max_keys, max_card) -> list:
+    """Rank categorical/boolean columns suitable for GROUP BY.
+
+    Args:
+        columns: Iterable of column-profile dicts; non-dict entries are
+            skipped.
+        key_names: Set of column names already identified as key candidates;
+            these are never used as group keys.
+        max_keys: Maximum number of entries to return.
+        max_card: Largest ``distinct_count`` a column may have to qualify.
+
+    Returns:
+        List of ``{"col", "cardinality", "score"}`` dicts, highest score
+        first (ties keep the original column order), at most ``max_keys`` long.
+    """
+    scored = []
+    for idx, col in enumerate(columns):
+        if not isinstance(col, dict):
+            continue
+        if (col.get("inferred_type") or "") not in _GROUP_TYPES:
+            continue
+
+        name = col.get("name")
+        if name is None or name in key_names:
+            continue
+
+        if _as_set(col.get("flags")) & _DISQUALIFYING_FLAGS:
+            continue
+
+        # An (almost) entirely null column has nothing to group on.
+        if _num(col.get("null_pct"), 0.0) >= 0.999:
+            continue
+
+        card = _num(col.get("distinct_count"), 0.0)
+        # Positive range test on purpose: NaN fails every comparison, so it is
+        # rejected here instead of reaching int(), which would raise on NaN.
+        if not (2 <= card <= max_card):
+            continue
+        card_i = int(card)
+
+        score = _card_score(card_i, max_card) * _balance_score(col.get("categorical"))
+        scored.append((round(score, 6), idx, name, card_i))
+
+    # Deterministic: higher score first, ties broken by original column order.
+    scored.sort(key=lambda t: (-t[0], t[1]))
+
+    return [
+        {"col": name, "cardinality": card_i, "score": score}
+        for score, _idx, name, card_i in scored[:max_keys]
+    ]
+
+
+def _card_score(card: int, max_card: int) -> float:
+    """Score a cardinality: 0.0 for <= 1, 1.0 up to the plateau, then decay.
+
+    Above ``_CARD_PLATEAU_HI`` the score falls linearly with the distance from
+    the plateau, scaled by the gap between the plateau and ``max_card`` (at
+    least 1), and never drops below 0.1.
+    """
+    if card <= 1:
+        return 0.0
+    excess = card - _CARD_PLATEAU_HI
+    if excess <= 0:
+        return 1.0
+    span = max(max_card - _CARD_PLATEAU_HI, 1)
+    return max(0.1, 1.0 - excess / span)
+
+
+def _balance_score(categorical) -> float:
+    """1.0 for a perfectly balanced category, decaying as imbalance grows.
+
+    Resolution order:
+
+    1. ``categorical["imbalance"]`` when it is a number ``>= 1``: the score is
+       ``1 / imbalance``.
+    2. ``categorical["mode_pct"]`` when it is a number other than NaN: the
+       score is ``1 - mode_pct`` limited to ``[0, 1]``.
+    3. Otherwise (no dict, missing or unusable values) a neutral ``0.5`` so
+       the column is still selectable.
+
+    Args:
+        categorical: The ``categorical`` sub-profile of a column, or anything
+            else (treated as missing).
+
+    Returns:
+        A float in ``[0, 1]``; never NaN.
+    """
+    if isinstance(categorical, dict):
+        imbalance = categorical.get("imbalance")
+        # A NaN imbalance fails the >= comparison and falls through.
+        if isinstance(imbalance, (int, float)) and imbalance >= 1.0:
+            return 1.0 / float(imbalance)
+        mode_pct = categorical.get("mode_pct")
+        # ``mode_pct == mode_pct`` is False only for NaN; a NaN score would
+        # break the ranking sort downstream, so NaN is treated as missing.
+        if isinstance(mode_pct, (int, float)) and mode_pct == mode_pct:
+            return min(1.0, max(0.0, 1.0 - float(mode_pct)))
+    return 0.5
+
+
+# ---------------------------------------------------------------------------
+# measures
+# ---------------------------------------------------------------------------
+
+# ``inferred_type`` values that make a column a measure candidate.
+_NUMERIC_TYPES = ("numeric", "integer", "float")
+
+
+def _select_measures(columns, max_measures) -> list:
+    """Rank numeric columns by informative dispersion (cv, else std).
+
+    A column is skipped when it is not a dict, is not of a numeric inferred
+    type, has no name, looks like an identifier (``possible_id`` flag together
+    with ``unique_pct >= 0.99``), or has a ``numeric.std`` that is missing,
+    zero or not finite. The ranking value is ``abs(cv)`` when ``numeric.cv``
+    is a finite number, otherwise ``abs(std)``.
+
+    Args:
+        columns: Iterable of column-profile dicts.
+        max_measures: Maximum number of names to return.
+
+    Returns:
+        Column names ordered by dispersion (desc, ties by column order), at
+        most ``max_measures`` long.
+    """
+    scored = []
+    for idx, col in enumerate(columns):
+        if not isinstance(col, dict):
+            continue
+        if (col.get("inferred_type") or "") not in _NUMERIC_TYPES:
+            continue
+
+        name = col.get("name")
+        if name is None:
+            continue
+
+        flags = _as_set(col.get("flags"))
+        unique_pct = _num(col.get("unique_pct"), 0.0)
+        if "possible_id" in flags and unique_pct >= 0.99:
+            continue  # sequential id, not a measure.
+
+        numeric = col.get("numeric")
+        if not isinstance(numeric, dict):
+            continue
+
+        std = _finite_float(numeric.get("std"))
+        if std is None or std == 0:
+            continue  # constant or unknown spread -> not informative.
+
+        # cv is inf/NaN when the mean is 0; such a value would either always
+        # rank first or make the sort order meaningless, so fall back to std.
+        cv = _finite_float(numeric.get("cv"))
+        dispersion = abs(cv) if cv is not None else abs(std)
+
+        scored.append((dispersion, idx, name))
+
+    # Higher dispersion first, ties broken by original column order.
+    scored.sort(key=lambda t: (-t[0], t[1]))
+    return [name for _disp, _idx, name in scored[:max_measures]]
+
+
+def _finite_float(value):
+    """Return ``value`` as a finite float, or ``None`` when it is not one.
+
+    Non-numbers, NaN, +/-inf and ints too large for a float all yield ``None``.
+    """
+    import math  # local import keeps this helper self-contained.
+
+    if not isinstance(value, (int, float)):
+        return None
+    try:
+        as_float = float(value)
+    except OverflowError:
+        return None
+    return as_float if math.isfinite(as_float) else None
+
+
+# ---------------------------------------------------------------------------
+# pivots
+# ---------------------------------------------------------------------------
+
+
+def _select_pivots(group_keys, measures) -> list:
+    """Build at most two pivot specs from pairs of the selected group keys.
+
+    Pairs are taken in order ``(0, 1), (0, 2), ...``; each spec uses the first
+    measure (or ``None`` when there are no measures) as its ``value``. Returns
+    an empty list unless ``group_keys`` is a list with two or more entries.
+    """
+    if not (isinstance(group_keys, list) and len(group_keys) >= 2):
+        return []
+
+    agg_value = None if not measures else measures[0]
+    chosen = []
+    for left_pos, left in enumerate(group_keys):
+        for right in group_keys[left_pos + 1:]:
+            spec = {
+                "index": left.get("col"),
+                "columns": right.get("col"),
+                "value": agg_value,
+            }
+            chosen.append(spec)
+            if len(chosen) >= 2:
+                return chosen
+    return chosen
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _build_note(group_keys, measures, pivots) -> str:
+    """Compose the one-line Spanish summary of the selection.
+
+    The note always has a group-key segment and a measure segment, plus a
+    pivot segment when pivots exist; segments are joined with ``"; "`` and
+    terminated with a period.
+    """
+    if group_keys:
+        key_list = ", ".join([str(entry.get("col")) for entry in group_keys])
+        keys_part = f"{len(group_keys)} clave(s) de grupo: {key_list}"
+    else:
+        keys_part = "sin categóricas agrupables"
+
+    if measures:
+        measure_list = ", ".join(map(str, measures))
+        measures_part = f"{len(measures)} medida(s): {measure_list}"
+    else:
+        measures_part = "sin medidas numéricas"
+
+    segments = [keys_part, measures_part]
+    if pivots:
+        segments.append(f"{len(pivots)} pivot(s)")
+    return "; ".join(segments) + "."
+
+
+def _key_candidate_names(key_candidates) -> set:
+    """Normalize ``key_candidates`` (strings or ``{name}`` dicts) to a name set.
+
+    Args:
+        key_candidates: A list/tuple whose entries are column-name strings or
+            dicts carrying the name under ``"name"`` (or ``"col"`` when
+            ``"name"`` is missing or falsy). Anything else yields an empty set.
+
+    Returns:
+        Set of candidate names. Entries of other types, dicts without a
+        usable name, and unhashable name values are ignored.
+    """
+    names = set()
+    if not isinstance(key_candidates, (list, tuple)):
+        return names
+    for entry in key_candidates:
+        if isinstance(entry, str):
+            names.add(entry)
+            continue
+        if not isinstance(entry, dict):
+            continue
+        nm = entry.get("name") or entry.get("col")
+        if nm is None:
+            continue
+        try:
+            names.add(nm)
+        except TypeError:
+            # Unhashable value (e.g. a list): skip it rather than let a
+            # malformed profile abort the whole selection.
+            continue
+    return names
+
+
+def _as_set(flags) -> set:
+    """Coerce a flags value into a set, tolerating None / non-iterables.
+
+    Args:
+        flags: Expected to be a list, tuple, set or frozenset of flags. Any
+            other value (``None``, a bare string, a number, ...) gives an
+            empty set.
+
+    Returns:
+        A new ``set`` with the hashable elements of ``flags``; unhashable
+        elements are dropped instead of raising ``TypeError``.
+    """
+    if not isinstance(flags, (list, tuple, set, frozenset)):
+        return set()
+    try:
+        return set(flags)
+    except TypeError:
+        # At least one element is unhashable; keep only the hashable ones.
+        kept = set()
+        for flag in flags:
+            try:
+                kept.add(flag)
+            except TypeError:
+                continue
+        return kept
+
+
+def _num(value, default: float) -> float:
+    """Best-effort float conversion with a fallback default.
+
+    Args:
+        value: Anything ``float()`` might accept.
+        default: Returned when ``value`` is ``None`` or cannot be converted.
+
+    Returns:
+        ``float(value)``, or ``default`` when the conversion raises
+        ``TypeError``, ``ValueError`` or ``OverflowError`` (the latter for
+        ints too large to represent as a float).
+    """
+    if value is None:
+        return default
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def _clamp(x: float, lo: float, hi: float) -> float:
+    """Clamp ``x`` to the closed interval ``[lo, hi]``.
+
+    A value that compares neither below ``lo`` nor above ``hi`` is returned
+    unchanged.
+    """
+    return lo if x < lo else (hi if x > hi else x)