feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,87 @@
+"""Pure EDA helper: categorical/text column profiling for the `eda` group.
+
+Computes the ``categorical`` sub-block of a ColumnProfile from a list of
+categorical or text values. No external dependencies (stdlib only).
+"""
+
+import math
+from collections import Counter
+
+
def summarize_categorical(values: list, top_k: int = 10) -> dict:
    """Summarize categorical/text values into the EDA ``categorical`` block.

    ``None`` entries are dropped before anything is computed. The empty
    string (``""``) is an ordinary category: it is counted and contributes
    a length of 0.

    Args:
        values: Categorical or text values. Every non-``None`` value must
            be hashable, because frequencies are tallied with ``Counter``.
        top_k: Maximum number of most-frequent values to report in ``top``.
            A negative value is treated as 0 (empty ``top``) instead of
            being interpreted as a negative slice index.

    Returns:
        Dict with the keys ``top``, ``mode``, ``mode_pct``, ``n_distinct``,
        ``entropy``, ``imbalance``, ``len_mean``, ``len_min``, ``len_max``.

        * ``top``: list of ``{"value", "count", "pct"}`` dicts sorted by
          ``count`` descending; ``pct`` is a fraction of the non-null total.
        * ``mode`` / ``mode_pct``: most frequent value and its fraction.
        * ``entropy``: Shannon entropy (base 2) of the value frequencies.
        * ``imbalance``: highest count divided by lowest count (``1.0``
          when there is a single distinct value).
        * ``len_*``: statistics over ``len(str(value))``.

        When there are no non-null values, ``top`` is ``[]`` and every
        other key is ``None``.

    Raises:
        TypeError: If a non-``None`` value is unhashable.
    """
    non_null = [v for v in values if v is not None]
    total = len(non_null)

    if total == 0:
        return {
            "top": [],
            "mode": None,
            "mode_pct": None,
            "n_distinct": None,
            "entropy": None,
            "imbalance": None,
            "len_mean": None,
            "len_min": None,
            "len_max": None,
        }

    counter = Counter(non_null)
    # most_common() sorts by count descending; ties keep first-seen order.
    ordered = counter.most_common()

    # A negative top_k used as a slice bound would silently drop the *least*
    # frequent entries (e.g. -1 -> "all but the last"), which is never what
    # "at most top_k values" means. Clamp it to zero instead. ``None`` is
    # left alone so an unbounded slice keeps working for existing callers.
    limit = top_k if top_k is None else max(top_k, 0)
    top = [
        {"value": value, "count": count, "pct": count / total}
        for value, count in ordered[:limit]
    ]

    mode_value, mode_count = ordered[0]
    n_distinct = len(counter)

    if n_distinct <= 1:
        # Single category: no uncertainty and nothing to be imbalanced
        # against. Handled explicitly so entropy is +0.0, never -0.0.
        entropy = 0.0
        imbalance = 1.0
    else:
        probabilities = (count / total for count in counter.values())
        entropy = -sum(p * math.log2(p) for p in probabilities)
        # Every stored count is >= 1, so the division is always defined.
        imbalance = max(counter.values()) / min(counter.values())

    # Lengths are measured on the string form so non-str values still work.
    lengths = [len(str(v)) for v in non_null]

    return {
        "top": top,
        "mode": mode_value,
        "mode_pct": mode_count / total,
        "n_distinct": n_distinct,
        "entropy": entropy,
        "imbalance": imbalance,
        "len_mean": sum(lengths) / total,
        "len_min": min(lengths),
        "len_max": max(lengths),
    }