"""Pure EDA helper: cardinality metrics block from a `summarize_categorical` output. Part of the `eda` capability group. Consumes the per-column dict produced by ``summarize_categorical`` (for a single categorical/text column) plus the total row count of the dataset and derives render-ready cardinality metrics: distinct ratio, normalized entropy, singleton count, and the ``id_like`` / ``dominated`` flags. It does NOT recompute the entropy nor reimplement ``summarize_categorical`` — it only reads that function's output. Dict-no-throw style of the `eda` group: it never raises. Missing or malformed inputs yield ``None``/``False``/``0`` for the affected keys, never an exception. Stdlib only (``math.log2``). """ from math import log2 def _num(value): """Return ``value`` unchanged if it is a real (non-bool) number, else ``None``. ``bool`` is rejected on purpose: in Python ``True`` is an ``int`` but it is never a meaningful count/ratio here. """ if isinstance(value, bool): return None if isinstance(value, (int, float)): return value return None def categorical_cardinality_block(cat: dict, n_rows: int) -> dict: """Derive cardinality metrics for one categorical column. Args: cat: The per-column dict produced by ``summarize_categorical`` for a single categorical/text column. Expected (all optional, read defensively) keys: ``top`` (list of ``{value, count, pct}``), ``mode``, ``mode_pct``, ``n_distinct``, ``entropy`` (Shannon, bits), ``imbalance``, ``len_min``, ``len_mean``, ``len_max``. ``None`` or a non-dict is treated as ``{}``. n_rows: Total number of rows in the dataset (used for ``pct_distinct``). Returns: Dict with exactly these keys, every one always present: ``n_distinct``, ``n_rows``, ``pct_distinct``, ``entropy``, ``entropy_max``, ``entropy_norm``, ``mode``, ``mode_pct``, ``imbalance``, ``n_singletons``, ``n_singletons_partial``, ``len_min``, ``len_mean``, ``len_max``, ``id_like``, ``dominated``. Values are ``None``/``False`` when not derivable; the function never raises. """ cat = cat if isinstance(cat, dict) else {} # --- passthroughs (numeric-validated, type preserved) --- n_distinct = _num(cat.get("n_distinct")) n_rows_out = _num(n_rows) entropy = _num(cat.get("entropy")) imbalance = _num(cat.get("imbalance")) len_min = _num(cat.get("len_min")) len_mean = _num(cat.get("len_mean")) len_max = _num(cat.get("len_max")) mode = cat.get("mode") # any value (or None); passthrough as-is # --- pct_distinct --- if n_distinct is None or n_rows_out is None or n_rows_out == 0: pct_distinct = None else: pct_distinct = n_distinct / n_rows_out * 100.0 # --- entropy_max = log2(n_distinct) --- if n_distinct is None: entropy_max = None elif n_distinct > 1: entropy_max = log2(n_distinct) else: # n_distinct in {0, 1} entropy_max = 0.0 # --- entropy_norm = entropy / entropy_max, clipped to [0, 1] --- if entropy_max is not None and entropy_max > 0 and entropy is not None: entropy_norm = entropy / entropy_max entropy_norm = max(0.0, min(1.0, entropy_norm)) else: entropy_norm = None # --- mode_pct: prefer cat['mode_pct']; else derive from top[0].pct --- mode_pct = _num(cat.get("mode_pct")) top = cat.get("top") has_top = isinstance(top, (list, tuple)) and len(top) > 0 if mode_pct is None and has_top: first = top[0] if isinstance(first, dict): first_pct = _num(first.get("pct")) if first_pct is not None: # Normalize to 0-100: a fraction (<= 1) becomes a percentage. mode_pct = first_pct * 100.0 if first_pct <= 1 else first_pct # --- singletons (count == 1) within the visible top --- if has_top: n_singletons = sum( 1 for item in top if isinstance(item, dict) and _num(item.get("count")) == 1 ) else: n_singletons = None # The singleton count only covers the visible top; there may be more # distinct values (and thus more singletons) outside it. top_len = len(top) if isinstance(top, (list, tuple)) else 0 n_singletons_partial = bool(n_distinct is not None and n_distinct > top_len) # --- derived flags --- id_like = pct_distinct is not None and pct_distinct >= 99.0 dominated = mode_pct is not None and mode_pct >= 90.0 return { "n_distinct": n_distinct, "n_rows": n_rows_out, "pct_distinct": pct_distinct, "entropy": entropy, "entropy_max": entropy_max, "entropy_norm": entropy_norm, "mode": mode, "mode_pct": mode_pct, "imbalance": imbalance, "n_singletons": n_singletons, "n_singletons_partial": n_singletons_partial, "len_min": len_min, "len_mean": len_mean, "len_max": len_max, "id_like": id_like, "dominated": dominated, }