"""Pure EDA helper: categorical/text column profiling for the `eda` group. Computes the ``categorical`` sub-block of a ColumnProfile from a list of categorical or text values. No external dependencies (stdlib only). """ import math from collections import Counter def summarize_categorical(values: list, top_k: int = 10) -> dict: """Summarize a list of categorical/text values into an EDA profile block. ``None`` entries are dropped from every computation. An empty string (``""``) is treated as a regular value (it counts and has length 0). Args: values: List of categorical or text values. ``None`` is discarded; ``""`` is kept as the empty-string category. top_k: Maximum number of most-frequent values to include in ``top``. Returns: Dict with the exact keys of the `eda` group ``categorical_sub`` contract: ``top``, ``mode``, ``mode_pct``, ``n_distinct``, ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``. ``top`` is a list of ``{value, count, pct}`` sorted by ``count`` descending (``pct`` is over the non-null total). When there are no non-null values, ``top`` is ``[]`` and every other key is ``None``. """ non_null = [v for v in values if v is not None] total = len(non_null) if total == 0: return { "top": [], "mode": None, "mode_pct": None, "n_distinct": None, "entropy": None, "imbalance": None, "len_mean": None, "len_min": None, "len_max": None, } counter = Counter(non_null) # most_common is sorted by count descending (insertion order for ties). ordered = counter.most_common() top = [ {"value": value, "count": count, "pct": count / total} for value, count in ordered[:top_k] ] mode_value, mode_count = ordered[0] n_distinct = len(counter) # Shannon entropy (base 2) of the frequency distribution. if n_distinct <= 1: entropy = 0.0 else: entropy = 0.0 for count in counter.values(): p = count / total entropy -= p * math.log2(p) counts = list(counter.values()) max_count = max(counts) min_count = min(counts) imbalance = 1.0 if n_distinct <= 1 else max_count / min_count lengths = [len(str(v)) for v in non_null] len_mean = sum(lengths) / total len_min = min(lengths) len_max = max(lengths) return { "top": top, "mode": mode_value, "mode_pct": mode_count / total, "n_distinct": n_distinct, "entropy": entropy, "imbalance": imbalance, "len_mean": len_mean, "len_min": len_min, "len_max": len_max, }