feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,87 @@
"""Pure EDA helper: categorical/text column profiling for the `eda` group.
Computes the ``categorical`` sub-block of a ColumnProfile from a list of
categorical or text values. No external dependencies (stdlib only).
"""
import math
from collections import Counter
def summarize_categorical(values: list, top_k: int = 10) -> dict:
    """Summarize a list of categorical/text values into an EDA profile block.

    ``None`` entries are dropped from every computation. An empty string
    (``""``) is treated as a regular value (it counts and has length 0).

    Args:
        values: List of categorical or text values. ``None`` is discarded;
            ``""`` is kept as the empty-string category. Values are counted
            with ``collections.Counter`` and therefore must be hashable.
        top_k: Maximum number of most-frequent values to include in ``top``.
            Zero or a negative number yields an empty ``top`` list.

    Returns:
        Dict with the exact keys of the `eda` group ``categorical_sub``
        contract: ``top``, ``mode``, ``mode_pct``, ``n_distinct``,
        ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``.

        * ``top`` is a list of ``{value, count, pct}`` sorted by ``count``
          descending (``pct`` is over the non-null total); ties keep the
          order in which the values were first seen.
        * ``mode`` / ``mode_pct`` describe the most frequent value.
        * ``entropy`` is the Shannon entropy (base 2) of the frequencies.
        * ``imbalance`` is the largest count divided by the smallest count
          (``1.0`` when there is a single distinct value).
        * ``len_*`` are computed over ``len(str(value))``.

        When there are no non-null values, ``top`` is ``[]`` and every
        other key is ``None``.
    """
    non_null = [v for v in values if v is not None]
    total = len(non_null)
    if total == 0:
        return {
            "top": [],
            "mode": None,
            "mode_pct": None,
            "n_distinct": None,
            "entropy": None,
            "imbalance": None,
            "len_mean": None,
            "len_min": None,
            "len_max": None,
        }

    counter = Counter(non_null)
    # most_common() is sorted by count descending (insertion order for ties).
    ordered = counter.most_common()

    # A negative slice bound would drop the *least* frequent entries instead
    # of limiting the list, so clamp it; ``None`` still means "no limit".
    limit = top_k if top_k is None else max(top_k, 0)
    top = [
        {"value": value, "count": count, "pct": count / total}
        for value, count in ordered[:limit]
    ]

    mode_value, mode_count = ordered[0]
    n_distinct = len(counter)
    counts = list(counter.values())

    # Shannon entropy (base 2) of the frequency distribution. The single
    # category case is short-circuited so the result is exactly 0.0.
    entropy = 0.0
    if n_distinct > 1:
        for count in counts:
            p = count / total
            entropy -= p * math.log2(p)

    imbalance = 1.0 if n_distinct <= 1 else max(counts) / min(counts)

    # Lengths are measured on the string form so non-str values are accepted.
    lengths = [len(str(v)) for v in non_null]

    return {
        "top": top,
        "mode": mode_value,
        "mode_pct": mode_count / total,
        "n_distinct": n_distinct,
        "entropy": entropy,
        "imbalance": imbalance,
        "len_mean": sum(lengths) / total,
        "len_min": min(lengths),
        "len_max": max(lengths),
    }