feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,159 @@
+"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile.
+
+Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a
+ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Non-numeric and
+missing values (None, NaN, non-numeric strings) are discarded before computing.
+
+Reuses registry functions instead of reimplementing their logic:
+  - detect_distribution_type (skew, kurtosis, distribution label)
+  - detect_outliers (z-score outlier flags)
+  - histogram (counts per equal-width bucket)
+"""
+
+import math
+import os
+import sys
+
+import numpy as np
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datascience import detect_outliers, histogram  # noqa: E402
+from detect_distribution_type import detect_distribution_type  # noqa: E402
+
+
+# Keys of the numeric sub-block contract for the eda group. describe_numeric()
+# maps each of these to None when no numeric value survives cleaning.
+# "histogram" is not listed here because describe_numeric() sets it to []
+# rather than None in that case.
+_NULL_KEYS = (
+    "min", "max", "mean", "median", "mode", "std", "variance", "cv",
+    "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
+    "skew", "kurtosis", "n_outliers", "outlier_pct",
+    "zero_pct", "negative_pct", "distribution_type",
+)
+
+
+def _clean(values: list) -> list:
+    """Return the finite numeric entries of ``values`` as floats.
+
+    Args:
+        values: Raw column values; may contain None, NaN, infinities,
+            strings, booleans or any other object.
+
+    Returns:
+        A new list of Python floats in their original order. Entries are
+        dropped when they are not ``int``/``float`` instances, when they are
+        ``bool``, when they are NaN or infinite, or when they are integers
+        too large to be represented as a float.
+    """
+    out: list = []
+    for v in values:
+        # bool is a subclass of int; treat True/False as non-numeric data.
+        if isinstance(v, bool) or not isinstance(v, (int, float)):
+            continue
+        try:
+            f = float(v)
+        except OverflowError:
+            # Arbitrary-precision ints beyond the float range (e.g. 10**400)
+            # have no finite float value, so discard them like infinities
+            # instead of letting one bad cell abort the whole profile.
+            continue
+        if math.isfinite(f):
+            out.append(f)
+    return out
+
+
+def _mode(values: list) -> float:
+    """Return the value that occurs most often in ``values``.
+
+    When several values share the highest frequency, the smallest of them
+    is returned.
+    """
+    frequency: dict = {}
+    for item in values:
+        frequency[item] = frequency.get(item, 0) + 1
+
+    highest = max(frequency.values())
+    # Collect every value tied for the top frequency, then pick the smallest.
+    tied = [candidate for candidate, seen in frequency.items() if seen == highest]
+    return min(tied)
+
+
+def describe_numeric(values: list, bins: int = 20) -> dict:
+    """Build the `numeric` statistics sub-block of an EDA ColumnProfile.
+
+    Meant to be run on a SAMPLE of one column rather than a full table. The
+    input is first filtered through _clean(); when nothing numeric is left,
+    every statistic is None and the histogram is an empty list.
+
+    Args:
+        values: Raw column values (may contain None/NaN/strings).
+        bins: Number of equal-width histogram buckets (default 20).
+
+    Returns:
+        Dict with the keys of the eda `numeric_sub` contract:
+        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
+         p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
+         negative_pct, distribution_type, histogram}, where histogram is a
+        list of {lo, hi, count} dicts.
+    """
+    sample = _clean(values)
+    size = len(sample)
+
+    # Nothing numeric to describe: all-None block with an empty histogram.
+    if not size:
+        return {**dict.fromkeys(_NULL_KEYS), "histogram": []}
+
+    data = np.array(sample, dtype=float)
+
+    lowest = float(np.min(data))
+    highest = float(np.max(data))
+    average = float(np.mean(data))
+    spread = float(np.std(data))
+    var = float(np.var(data))
+
+    # Coefficient of variation is undefined for a zero mean.
+    if average != 0:
+        coeff_var = spread / average
+    else:
+        coeff_var = None
+
+    # One lookup table for all requested percentiles, keyed by the rank.
+    pct = {q: float(np.percentile(data, q)) for q in (1, 5, 25, 50, 75, 95, 99)}
+
+    most_common = _mode(sample)
+
+    # Shape of the distribution comes from detect_distribution_type.
+    shape = detect_distribution_type(sample)
+    shape_label = shape.get("type")
+    shape_stats = shape.get("stats", {})
+    skewness = shape_stats.get("skew")
+    kurt = shape_stats.get("kurtosis")
+
+    # detect_outliers is called with threshold 3.0; count the truthy flags.
+    flags = detect_outliers(sample, 3.0)
+    outlier_total = len([flag for flag in flags if flag])
+
+    zero_total = sample.count(0)
+    negative_total = sum(x < 0 for x in sample)
+
+    # histogram() supplies per-bucket counts; the [lo, hi) edges are attached
+    # here so each bucket carries {lo, hi, count}.
+    bucket_counts = histogram(sample, bins)
+    buckets: list = []
+    if bucket_counts:
+        if highest == lowest:
+            # Degenerate range: every bucket reports the single value as
+            # both edges.
+            buckets = [
+                {"lo": lowest, "hi": highest, "count": int(total)}
+                for total in bucket_counts
+            ]
+        else:
+            step = (highest - lowest) / bins
+            buckets = [
+                {
+                    "lo": float(lowest + idx * step),
+                    "hi": float(lowest + (idx + 1) * step),
+                    "count": int(total),
+                }
+                for idx, total in enumerate(bucket_counts)
+            ]
+
+    return {
+        "min": lowest,
+        "max": highest,
+        "mean": average,
+        "median": pct[50],
+        "mode": most_common,
+        "std": spread,
+        "variance": var,
+        "cv": coeff_var,
+        "p1": pct[1],
+        "p5": pct[5],
+        "p25": pct[25],
+        "p50": pct[50],
+        "p75": pct[75],
+        "p95": pct[95],
+        "p99": pct[99],
+        "iqr": pct[75] - pct[25],
+        "skew": skewness,
+        "kurtosis": kurt,
+        "n_outliers": outlier_total,
+        "outlier_pct": 100.0 * outlier_total / size,
+        "zero_pct": 100.0 * zero_total / size,
+        "negative_pct": 100.0 * negative_total / size,
+        "distribution_type": shape_label,
+        "histogram": buckets,
+    }