feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
"""Pure EDA helper: categorical/text column profiling for the `eda` group.
|
||||
|
||||
Computes the ``categorical`` sub-block of a ColumnProfile from a list of
|
||||
categorical or text values. No external dependencies (stdlib only).
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def summarize_categorical(values: list, top_k: int = 10) -> dict:
    """Summarize a list of categorical/text values into an EDA profile block.

    ``None`` entries are dropped from every computation. An empty string
    (``""``) is treated as a regular value (it counts and has length 0).

    Args:
        values: List of categorical or text values. ``None`` is discarded;
            ``""`` is kept as the empty-string category. Values are tallied
            with ``collections.Counter``, so they must be hashable.
        top_k: Maximum number of most-frequent values to include in ``top``.
            Zero or a negative number yields an empty ``top``; ``None``
            keeps every distinct value.

    Returns:
        Dict with the exact keys of the `eda` group ``categorical_sub``
        contract: ``top``, ``mode``, ``mode_pct``, ``n_distinct``,
        ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``.
        ``top`` is a list of ``{value, count, pct}`` sorted by ``count``
        descending (``pct`` is over the non-null total). ``entropy`` is the
        base-2 Shannon entropy of the value frequencies, ``imbalance`` is the
        ratio of the largest to the smallest category count (``1.0`` for a
        single category), and the ``len_*`` fields are computed on
        ``len(str(value))``. When there are no non-null values, ``top`` is
        ``[]`` and every other key is ``None``.

    Raises:
        TypeError: If a non-null value is unhashable (raised by ``Counter``).
    """
    non_null = [v for v in values if v is not None]
    total = len(non_null)

    if total == 0:
        return {
            "top": [],
            "mode": None,
            "mode_pct": None,
            "n_distinct": None,
            "entropy": None,
            "imbalance": None,
            "len_mean": None,
            "len_min": None,
            "len_max": None,
        }

    counter = Counter(non_null)
    # most_common is sorted by count descending (insertion order for ties).
    ordered = counter.most_common()

    # A negative limit used as a slice bound would drop items from the END of
    # the ranking instead of capping it, so clamp it to zero. ``None`` keeps
    # its slice meaning of "no limit".
    limit = None if top_k is None else max(top_k, 0)
    top = [
        {"value": value, "count": count, "pct": count / total}
        for value, count in ordered[:limit]
    ]

    mode_value, mode_count = ordered[0]
    n_distinct = len(counter)
    counts = list(counter.values())

    # Shannon entropy (base 2) of the frequency distribution. The single
    # category case is pinned to 0.0 explicitly.
    entropy = 0.0
    if n_distinct > 1:
        for count in counts:
            p = count / total
            entropy -= p * math.log2(p)

    # Every count is >= 1, so the division below cannot hit zero.
    imbalance = 1.0 if n_distinct <= 1 else max(counts) / min(counts)

    # Lengths are taken on the string form so non-string categories work too.
    lengths = [len(str(v)) for v in non_null]

    return {
        "top": top,
        "mode": mode_value,
        "mode_pct": mode_count / total,
        "n_distinct": n_distinct,
        "entropy": entropy,
        "imbalance": imbalance,
        "len_mean": sum(lengths) / total,
        "len_min": min(lengths),
        "len_max": max(lengths),
    }
|
||||
Reference in New Issue
Block a user