feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,159 @@
|
||||
"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile.
|
||||
|
||||
Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a
|
||||
ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Non-numeric and
|
||||
missing values (None, NaN, infinities, bools, and all strings) are discarded before computing.
|
||||
|
||||
Reuses registry functions instead of reimplementing their logic:
|
||||
- detect_distribution_type (skew, kurtosis, distribution label)
|
||||
- detect_outliers (z-score outlier flags)
|
||||
- histogram (counts per equal-width bucket)
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from datascience import detect_outliers, histogram # noqa: E402
|
||||
from detect_distribution_type import detect_distribution_type # noqa: E402
|
||||
|
||||
|
||||
# Keys of the numeric sub-block contract for the eda group.
|
||||
_NULL_KEYS = (
|
||||
"min", "max", "mean", "median", "mode", "std", "variance", "cv",
|
||||
"p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
|
||||
"skew", "kurtosis", "n_outliers", "outlier_pct",
|
||||
"zero_pct", "negative_pct", "distribution_type",
|
||||
)
|
||||
|
||||
|
||||
def _clean(values: list) -> list:
|
||||
"""Keep only finite numeric values, discarding None/NaN/non-numeric/bool."""
|
||||
out: list = []
|
||||
for v in values:
|
||||
# bool is a subclass of int; treat True/False as non-numeric data.
|
||||
if isinstance(v, bool):
|
||||
continue
|
||||
if isinstance(v, (int, float)):
|
||||
f = float(v)
|
||||
if not math.isnan(f) and not math.isinf(f):
|
||||
out.append(f)
|
||||
return out
|
||||
|
||||
|
||||
def _mode(values: list) -> float:
|
||||
"""Most frequent value; on a tie, the smallest value wins."""
|
||||
counts: dict = {}
|
||||
for v in values:
|
||||
counts[v] = counts.get(v, 0) + 1
|
||||
best_count = max(counts.values())
|
||||
return min(v for v, c in counts.items() if c == best_count)
|
||||
|
||||
|
||||
def describe_numeric(values: list, bins: int = 20) -> dict:
    """Compute the fine-grained numeric statistics block for an EDA ColumnProfile.

    Designed to run on a SAMPLE of a single column, not the whole table.
    Values are first filtered through ``_clean``; if nothing survives, every
    scalar key is None and ``histogram`` is ``[]``.

    Notes on the individual statistics:
        - ``std`` / ``variance`` use NumPy's defaults (population, ddof=0).
        - ``cv`` is ``std / mean`` and is None when the mean is exactly 0.
        - Percentiles use ``np.percentile`` with its default interpolation;
          ``median`` is the same value as ``p50`` and ``iqr`` is ``p75 - p25``.
        - ``skew``, ``kurtosis`` and ``distribution_type`` are taken from the
          dict returned by ``detect_distribution_type`` and are None when the
          corresponding entry is absent from it.
        - ``n_outliers`` counts the truthy flags returned by
          ``detect_outliers(clean, 3.0)``; the ``*_pct`` values are on a
          0-100 scale relative to the number of cleaned values.
        - ``histogram`` is a list of ``{"lo", "hi", "count"}`` dicts, one per
          count returned by ``histogram(clean, bins)``.

    Args:
        values: List of raw column values (may contain None/NaN/strings).
        bins: Number of equal-width buckets for the histogram (default 20).

    Returns:
        Dict with the exact keys of the eda `numeric_sub` contract:
        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
        p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
        negative_pct, distribution_type, histogram}.
    """
    clean = _clean(values)
    n = len(clean)

    if n == 0:
        # Nothing numeric to describe: null every scalar, empty histogram.
        result = dict.fromkeys(_NULL_KEYS)
        result["histogram"] = []
        return result

    arr = np.array(clean, dtype=float)

    minimum = float(np.min(arr))
    maximum = float(np.max(arr))
    mean = float(np.mean(arr))
    std = float(np.std(arr))
    variance = float(np.var(arr))
    # The coefficient of variation is undefined for a zero mean.
    cv = (std / mean) if mean != 0 else None

    # A single vectorised call processes the sample once instead of once per
    # percentile; the values are the same as seven separate calls.
    p1, p5, p25, p50, p75, p95, p99 = (
        float(q) for q in np.percentile(arr, [1, 5, 25, 50, 75, 95, 99])
    )
    median = p50
    iqr = p75 - p25

    mode = _mode(clean)

    # Distribution shape: reuse detect_distribution_type for skew/kurtosis/type.
    dist = detect_distribution_type(clean)
    distribution_type = dist.get("type")
    dist_stats = dist.get("stats", {})
    skew = dist_stats.get("skew")
    kurtosis = dist_stats.get("kurtosis")

    # Outliers: reuse detect_outliers (z-score, threshold 3.0). Count the True.
    outlier_flags = detect_outliers(clean, 3.0)
    n_outliers = sum(1 for flag in outlier_flags if flag)
    outlier_pct = 100.0 * n_outliers / n

    zero_pct = 100.0 * sum(1 for v in clean if v == 0) / n
    negative_pct = 100.0 * sum(1 for v in clean if v < 0) / n

    # Histogram: reuse histogram for the per-bucket counts, then attach the
    # equal-width [lo, hi) edges so the eda contract gets {lo, hi, count}.
    counts = histogram(clean, bins)
    hist: list = []
    if counts:
        if maximum == minimum:
            # Degenerate range: every bucket collapses onto the single value.
            hist = [
                {"lo": minimum, "hi": maximum, "count": int(count)}
                for count in counts
            ]
        else:
            # NOTE(review): edges assume histogram() returns exactly `bins`
            # equal-width buckets spanning [min, max] - confirm against its
            # implementation.
            width = (maximum - minimum) / bins
            hist = [
                {
                    "lo": float(minimum + i * width),
                    "hi": float(minimum + (i + 1) * width),
                    "count": int(count),
                }
                for i, count in enumerate(counts)
            ]

    return {
        "min": minimum,
        "max": maximum,
        "mean": mean,
        "median": median,
        "mode": mode,
        "std": std,
        "variance": variance,
        "cv": cv,
        "p1": p1,
        "p5": p5,
        "p25": p25,
        "p50": p50,
        "p75": p75,
        "p95": p95,
        "p99": p99,
        "iqr": iqr,
        "skew": skew,
        "kurtosis": kurtosis,
        "n_outliers": n_outliers,
        "outlier_pct": outlier_pct,
        "zero_pct": zero_pct,
        "negative_pct": negative_pct,
        "distribution_type": distribution_type,
        "histogram": hist,
    }
|
||||
Reference in New Issue
Block a user