# fn_registry/python/functions/datascience/summarize_categorical.py

"""Pure EDA helper: categorical/text column profiling for the `eda` group.

Computes the ``categorical`` sub-block of a ColumnProfile from a list of
categorical or text values. No external dependencies (stdlib only).
"""

import math
from collections import Counter


def summarize_categorical(values: list, top_k: int = 10) -> dict:
    """Summarize a list of categorical/text values into an EDA profile block.

    ``None`` entries are dropped from every computation. An empty string
    (``""``) is treated as a regular value (it counts and has length 0).
    Values are tallied with ``collections.Counter``, so they must be
    hashable, and values that compare equal share a single category.

    Args:
        values: List of categorical or text values. ``None`` is discarded;
            ``""`` is kept as the empty-string category.
        top_k: Maximum number of most-frequent values to include in ``top``.
            ``0`` or a negative number yields an empty ``top``; ``None``
            disables the limit. ``top_k`` only affects ``top`` -- every
            other statistic is computed over all non-null values.

    Returns:
        Dict with the exact keys of the `eda` group ``categorical_sub``
        contract: ``top``, ``mode``, ``mode_pct``, ``n_distinct``,
        ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``.
        ``top`` is a list of ``{value, count, pct}`` sorted by ``count``
        descending, ties in first-seen order (``pct`` is over the non-null
        total). ``entropy`` is the base-2 Shannon entropy of the frequency
        distribution, ``imbalance`` is the most-frequent count divided by
        the least-frequent count, and the ``len_*`` keys describe
        ``len(str(value))`` over the non-null values. When there are no
        non-null values, ``top`` is ``[]`` and every other key is ``None``.

    Raises:
        TypeError: If a non-null value is unhashable (raised by ``Counter``).
    """
    non_null = [v for v in values if v is not None]
    total = len(non_null)

    if total == 0:
        return {
            "top": [],
            "mode": None,
            "mode_pct": None,
            "n_distinct": None,
            "entropy": None,
            "imbalance": None,
            "len_mean": None,
            "len_min": None,
            "len_max": None,
        }

    # A negative slice bound would mean "all but the last |top_k| entries",
    # which is never what a "maximum number of entries" should do; clamp it
    # to zero so the result is an empty ``top`` instead.
    if top_k is not None and top_k < 0:
        top_k = 0

    counter = Counter(non_null)
    # most_common is sorted by count descending (insertion order for ties).
    ordered = counter.most_common()

    top = [
        {"value": value, "count": count, "pct": count / total}
        for value, count in ordered[:top_k]
    ]

    # total > 0 guarantees at least one category, so ordered[0] exists.
    mode_value, mode_count = ordered[0]
    n_distinct = len(counter)

    # Shannon entropy (base 2) of the frequency distribution. A single
    # category carries no information, so it is pinned to exactly 0.0.
    entropy = 0.0
    if n_distinct > 1:
        for count in counter.values():
            p = count / total
            entropy -= p * math.log2(p)

    # ``ordered`` is sorted by count, so its ends are the extreme counts.
    # Every count is >= 1, so the division is always defined.
    min_count = ordered[-1][1]
    imbalance = 1.0 if n_distinct <= 1 else mode_count / min_count

    # Lengths are measured on the string form so non-string categories
    # (numbers, booleans, ...) are profiled too.
    lengths = [len(str(v)) for v in non_null]

    return {
        "top": top,
        "mode": mode_value,
        "mode_pct": mode_count / total,
        "n_distinct": n_distinct,
        "entropy": entropy,
        "imbalance": imbalance,
        "len_mean": sum(lengths) / total,
        "len_min": min(lengths),
        "len_max": max(lengths),
    }