763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
88 lines
2.7 KiB
Python
88 lines
2.7 KiB
Python
"""Pure EDA helper: categorical/text column profiling for the `eda` group.

Computes the ``categorical`` sub-block of a ColumnProfile from a list of
categorical or text values. No external dependencies (stdlib only).
"""
|
|
|
|
import math
|
|
from collections import Counter
|
|
|
|
|
|
def summarize_categorical(values: list, top_k: int = 10) -> dict:
    """Summarize a list of categorical/text values into an EDA profile block.

    ``None`` entries are dropped from every computation. An empty string
    (``""``) is treated as a regular value (it counts and has length 0).

    Args:
        values: List of categorical or text values. ``None`` is discarded;
            ``""`` is kept as the empty-string category. Values are tallied
            with ``collections.Counter`` and must therefore be hashable.
        top_k: Maximum number of most-frequent values to include in ``top``.
            A negative value is treated as ``0`` (``top`` is empty) instead
            of being interpreted as a from-the-end slice bound.

    Returns:
        Dict with the exact keys of the `eda` group ``categorical_sub``
        contract: ``top``, ``mode``, ``mode_pct``, ``n_distinct``,
        ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``.

        * ``top`` is a list of ``{value, count, pct}`` sorted by ``count``
          descending, ties kept in first-seen order (``pct`` is over the
          non-null total).
        * ``mode`` / ``mode_pct`` are the most frequent value and its share
          of the non-null total.
        * ``entropy`` is the base-2 Shannon entropy of the frequency
          distribution (``0.0`` for a single distinct value).
        * ``imbalance`` is the largest count divided by the smallest count
          (``1.0`` for a single distinct value).
        * ``len_*`` are computed over ``len(str(v))`` of each non-null value.

        When there are no non-null values, ``top`` is ``[]`` and every other
        key is ``None``.
    """
    present = [v for v in values if v is not None]
    total = len(present)

    if not total:
        return {
            "top": [],
            "mode": None,
            "mode_pct": None,
            "n_distinct": None,
            "entropy": None,
            "imbalance": None,
            "len_mean": None,
            "len_min": None,
            "len_max": None,
        }

    counter = Counter(present)
    # most_common() sorts by count descending; the sort is stable, so ties
    # keep the order in which the values were first seen.
    ranked = counter.most_common()

    # A negative bound would slice from the end (``ranked[:-1]`` keeps all
    # but the last entry), which contradicts "maximum number of values".
    # ``None`` is passed through so an unbounded slice keeps working.
    limit = top_k if top_k is None else max(top_k, 0)
    top = [
        {"value": value, "count": count, "pct": count / total}
        for value, count in ranked[:limit]
    ]

    mode_value, mode_count = ranked[0]
    n_distinct = len(counter)
    frequencies = list(counter.values())

    if n_distinct <= 1:
        # A single category carries no information and is trivially balanced;
        # short-circuiting also avoids returning ``-0.0`` for the entropy.
        entropy = 0.0
        imbalance = 1.0
    else:
        # Shannon entropy (base 2) of the frequency distribution.
        entropy = 0.0
        for freq in frequencies:
            share = freq / total
            entropy -= share * math.log2(share)
        imbalance = max(frequencies) / min(frequencies)

    # Lengths are measured on the string form so non-string values work too.
    lengths = [len(str(v)) for v in present]

    return {
        "top": top,
        "mode": mode_value,
        "mode_pct": mode_count / total,
        "n_distinct": n_distinct,
        "entropy": entropy,
        "imbalance": imbalance,
        "len_mean": sum(lengths) / total,
        "len_min": min(lengths),
        "len_max": max(lengths),
    }
|