feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,159 @@
"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile.
Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a
ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Non-numeric and
missing values (None, NaN, non-numeric strings) are discarded before computing.
Reuses registry functions instead of reimplementing their logic:
- detect_distribution_type (skew, kurtosis, distribution label)
- detect_outliers (z-score outlier flags)
- histogram (counts per equal-width bucket)
"""
import math
import os
import sys
import numpy as np
sys.path.insert(0, os.path.dirname(__file__))
from datascience import detect_outliers, histogram # noqa: E402
from detect_distribution_type import detect_distribution_type # noqa: E402
# Scalar keys of the numeric sub-block contract for the eda group.
# describe_numeric() sets every one of these to None when no numeric value
# survives cleaning. "histogram" is intentionally not listed here: its empty
# value is [] rather than None, so describe_numeric() adds it separately.
_NULL_KEYS = (
"min", "max", "mean", "median", "mode", "std", "variance", "cv",
"p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
"skew", "kurtosis", "n_outliers", "outlier_pct",
"zero_pct", "negative_pct", "distribution_type",
)
def _clean(values: list) -> list:
    """Return the finite numeric entries of ``values`` as Python floats.

    Kept: ``int`` / ``float`` instances and NumPy integer / floating scalars
    whose float conversion is finite.
    Dropped: ``None``, strings and any other non-numeric object, booleans,
    NaN, +/-infinity, and integers too large to be represented as a float.

    Args:
        values: Raw column values of arbitrary types.

    Returns:
        A new list of floats, in the original order.
    """
    out: list = []
    for v in values:
        # bool is a subclass of int; treat True/False as non-numeric data.
        # (np.bool_ is not an np.integer, so it is rejected by the next test.)
        if isinstance(v, bool):
            continue
        # NumPy scalars such as np.int64 / np.float32 are not subclasses of
        # int / float, so they must be listed explicitly or they would be
        # silently discarded.
        if not isinstance(v, (int, float, np.integer, np.floating)):
            continue
        try:
            f = float(v)
        except OverflowError:
            # Arbitrary-precision ints beyond the double range (e.g. 10**400)
            # cannot be converted; drop them instead of aborting the profile.
            continue
        if math.isfinite(f):
            out.append(f)
    return out
def _mode(values: list) -> float:
    """Return the value that occurs most often in ``values``.

    When several values share the highest frequency, the smallest of them is
    returned. An empty ``values`` raises ``ValueError``.
    """
    # Tally occurrences of each distinct value.
    tally: dict = {}
    for item in values:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1

    top = max(tally.values())
    # Every value that reaches the top frequency is a candidate; ties are
    # resolved in favour of the smallest candidate.
    candidates = [value for value, freq in tally.items() if freq == top]
    return min(candidates)
def describe_numeric(values: list, bins: int = 20) -> dict:
    """Compute the fine-grained numeric statistics block for an EDA ColumnProfile.

    Designed to run on a SAMPLE of a single column, not the whole table.
    The input is first filtered through ``_clean`` (None, NaN, infinities and
    non-numeric values are discarded). If no numeric value survives, every key
    in ``_NULL_KEYS`` is None and ``histogram`` is [].

    Args:
        values: List of raw column values (may contain None/NaN/strings).
        bins: Number of buckets requested from ``histogram`` (default 20).

    Returns:
        Dict with the exact keys of the eda `numeric_sub` contract:
        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
        p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
        negative_pct, distribution_type, histogram}.

        Notes on individual keys:
        - ``std`` / ``variance`` use NumPy's defaults (population, ddof=0).
        - ``cv`` is ``std / mean``, or None when the mean is exactly 0.
        - ``median`` is the same value as ``p50``.
        - ``mode`` is the most frequent value (smallest on ties).
        - ``outlier_pct``, ``zero_pct`` and ``negative_pct`` are percentages
          (0-100) of the cleaned sample.
        - ``histogram`` is a list of ``{"lo", "hi", "count"}`` dicts, one per
          bucket returned by ``histogram()``.
    """
    clean = _clean(values)
    n = len(clean)
    if n == 0:
        result = {k: None for k in _NULL_KEYS}
        result["histogram"] = []
        return result

    arr = np.array(clean, dtype=float)
    minimum = float(np.min(arr))
    maximum = float(np.max(arr))
    mean = float(np.mean(arr))
    std = float(np.std(arr))
    variance = float(np.var(arr))
    # Coefficient of variation is undefined for a zero mean.
    cv = (std / mean) if mean != 0 else None

    # One vectorised call instead of seven passes over the array.
    p1, p5, p25, p50, p75, p95, p99 = (
        float(p) for p in np.percentile(arr, [1, 5, 25, 50, 75, 95, 99])
    )
    median = p50
    iqr = p75 - p25
    mode = _mode(clean)

    # Distribution shape: reuse detect_distribution_type for skew/kurtosis/type.
    dist = detect_distribution_type(clean)
    distribution_type = dist.get("type")
    dist_stats = dist.get("stats", {})
    skew = dist_stats.get("skew")
    kurtosis = dist_stats.get("kurtosis")

    # Outliers: reuse detect_outliers (z-score, threshold 3.0). Count the True.
    outlier_flags = detect_outliers(clean, 3.0)
    n_outliers = sum(1 for flag in outlier_flags if flag)
    outlier_pct = 100.0 * n_outliers / n

    zero_pct = 100.0 * sum(1 for v in clean if v == 0) / n
    negative_pct = 100.0 * sum(1 for v in clean if v < 0) / n

    # Histogram: reuse histogram for the per-bucket counts, then attach the
    # equal-width edges so the eda contract gets {lo, hi, count}.
    counts = histogram(clean, bins)
    hist: list = []
    if counts:
        if maximum == minimum:
            # Degenerate range: every bucket collapses to the single value.
            # (Per the original note, histogram() places everything in
            # bucket 0 in this case.)
            hist = [
                {"lo": minimum, "hi": maximum, "count": int(count)}
                for count in counts
            ]
        else:
            # NOTE(review): assumes histogram() returns equal-width buckets
            # spanning [minimum, maximum] -- confirm against its contract.
            # The width is derived from the number of buckets actually
            # returned, which also avoids dividing by a zero `bins`.
            n_buckets = len(counts)
            last = n_buckets - 1
            width = (maximum - minimum) / n_buckets
            for i, count in enumerate(counts):
                lo = minimum + i * width
                # Pin the final edge to the true maximum: minimum + n * width
                # can land slightly below it because of float rounding, which
                # would leave the largest value outside the last bucket.
                hi = maximum if i == last else minimum + (i + 1) * width
                hist.append(
                    {"lo": float(lo), "hi": float(hi), "count": int(count)}
                )

    return {
        "min": minimum,
        "max": maximum,
        "mean": mean,
        "median": median,
        "mode": mode,
        "std": std,
        "variance": variance,
        "cv": cv,
        "p1": p1,
        "p5": p5,
        "p25": p25,
        "p50": p50,
        "p75": p75,
        "p95": p95,
        "p99": p99,
        "iqr": iqr,
        "skew": skew,
        "kurtosis": kurtosis,
        "n_outliers": n_outliers,
        "outlier_pct": outlier_pct,
        "zero_pct": zero_pct,
        "negative_pct": negative_pct,
        "distribution_type": distribution_type,
        "histogram": hist,
    }