763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
160 lines
5.3 KiB
Python
160 lines
5.3 KiB
Python
"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile.
|
|
|
|
Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a
|
|
ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Non-numeric and
|
|
missing values (None, NaN, non-numeric strings) are discarded before computing.
|
|
|
|
Reuses registry functions instead of reimplementing their logic:
|
|
- detect_distribution_type (skew, kurtosis, distribution label)
|
|
- detect_outliers (z-score outlier flags)
|
|
- histogram (counts per equal-width bucket)
|
|
"""
|
|
|
|
import math
|
|
import os
|
|
import sys
|
|
|
|
import numpy as np
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from datascience import detect_outliers, histogram # noqa: E402
|
|
from detect_distribution_type import detect_distribution_type # noqa: E402
|
|
|
|
|
|
# Keys of the numeric sub-block contract for the eda group.
|
|
_NULL_KEYS = (
|
|
"min", "max", "mean", "median", "mode", "std", "variance", "cv",
|
|
"p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
|
|
"skew", "kurtosis", "n_outliers", "outlier_pct",
|
|
"zero_pct", "negative_pct", "distribution_type",
|
|
)
|
|
|
|
|
|
def _clean(values: list) -> list:
|
|
"""Keep only finite numeric values, discarding None/NaN/non-numeric/bool."""
|
|
out: list = []
|
|
for v in values:
|
|
# bool is a subclass of int; treat True/False as non-numeric data.
|
|
if isinstance(v, bool):
|
|
continue
|
|
if isinstance(v, (int, float)):
|
|
f = float(v)
|
|
if not math.isnan(f) and not math.isinf(f):
|
|
out.append(f)
|
|
return out
|
|
|
|
|
|
def _mode(values: list) -> float:
|
|
"""Most frequent value; on a tie, the smallest value wins."""
|
|
counts: dict = {}
|
|
for v in values:
|
|
counts[v] = counts.get(v, 0) + 1
|
|
best_count = max(counts.values())
|
|
return min(v for v, c in counts.items() if c == best_count)
|
|
|
|
|
|
def _attach_bucket_edges(counts: list, minimum: float, maximum: float) -> list:
    """Pair per-bucket counts with equal-width edges spanning [minimum, maximum].

    Args:
        counts: Per-bucket counts, in bucket order.
        minimum: Smallest value of the cleaned sample.
        maximum: Largest value of the cleaned sample.

    Returns:
        One ``{"lo", "hi", "count"}`` dict per entry of *counts*; ``[]`` when
        *counts* is empty.
    """
    if not counts:
        return []

    if maximum == minimum:
        # Degenerate range: every bucket collapses onto the single value.
        return [
            {"lo": minimum, "hi": maximum, "count": int(count)}
            for count in counts
        ]

    # Derive the width from the number of buckets actually returned so the
    # edges always tile [minimum, maximum], and so a caller-supplied bins of 0
    # can never cause a division by zero here.
    n_buckets = len(counts)
    width = (maximum - minimum) / n_buckets
    last = n_buckets - 1

    buckets: list = []
    for i, count in enumerate(counts):
        lo = minimum + i * width
        # Pin the final edge to the true maximum: minimum + n * width can
        # drift by a rounding error and leave the max outside the last bucket.
        hi = maximum if i == last else minimum + (i + 1) * width
        buckets.append({"lo": float(lo), "hi": float(hi), "count": int(count)})
    return buckets


def describe_numeric(values: list, bins: int = 20) -> dict:
    """Compute the fine-grained numeric statistics block for an EDA ColumnProfile.

    Designed to run on a SAMPLE of a single column, not the whole table.
    Values are first filtered through ``_clean``; if nothing survives, every
    scalar key is None and ``histogram`` is ``[]``.

    Notes:
        - ``std`` and ``variance`` use NumPy's defaults (population, ddof=0).
        - ``median`` is the 50th percentile (``p50``).
        - ``cv`` is ``std / mean`` and is None when the mean is exactly 0.
        - ``skew``, ``kurtosis`` and ``distribution_type`` are read from the
          dict returned by ``detect_distribution_type`` (keys ``stats`` and
          ``type``); a missing key yields None.
        - ``n_outliers`` counts the truthy flags returned by
          ``detect_outliers(clean, 3.0)``.
        - Percentage fields are on a 0-100 scale.

    Args:
        values: List of raw column values (may contain None/NaN/strings).
        bins: Number of equal-width buckets requested from ``histogram``
            (default 20).

    Returns:
        Dict with the exact keys of the eda `numeric_sub` contract:
        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
        p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
        negative_pct, distribution_type, histogram}. ``histogram`` is a list
        of ``{"lo", "hi", "count"}`` dicts.
    """
    clean = _clean(values)
    n = len(clean)

    if n == 0:
        result = {k: None for k in _NULL_KEYS}
        result["histogram"] = []
        return result

    arr = np.array(clean, dtype=float)

    minimum = float(np.min(arr))
    maximum = float(np.max(arr))
    mean = float(np.mean(arr))
    std = float(np.std(arr))
    variance = float(np.var(arr))
    cv = (std / mean) if mean != 0 else None

    # One vectorised call instead of seven separate passes over the sample.
    p1, p5, p25, p50, p75, p95, p99 = (
        float(q) for q in np.percentile(arr, [1, 5, 25, 50, 75, 95, 99])
    )
    median = p50
    iqr = p75 - p25

    mode = _mode(clean)

    # Distribution shape: reuse detect_distribution_type for skew/kurtosis/type.
    dist = detect_distribution_type(clean)
    distribution_type = dist.get("type")
    dist_stats = dist.get("stats", {})
    skew = dist_stats.get("skew")
    kurtosis = dist_stats.get("kurtosis")

    # Outliers: reuse detect_outliers (z-score, threshold 3.0). Count the True.
    outlier_flags = detect_outliers(clean, 3.0)
    n_outliers = sum(1 for flag in outlier_flags if flag)
    outlier_pct = 100.0 * n_outliers / n

    zero_pct = 100.0 * sum(1 for v in clean if v == 0) / n
    negative_pct = 100.0 * sum(1 for v in clean if v < 0) / n

    # Histogram: reuse histogram for the per-bucket counts, then attach the
    # equal-width edges so the eda contract gets {lo, hi, count}.
    hist = _attach_bucket_edges(histogram(clean, bins), minimum, maximum)

    return {
        "min": minimum,
        "max": maximum,
        "mean": mean,
        "median": median,
        "mode": mode,
        "std": std,
        "variance": variance,
        "cv": cv,
        "p1": p1,
        "p5": p5,
        "p25": p25,
        "p50": p50,
        "p75": p75,
        "p95": p95,
        "p99": p99,
        "iqr": iqr,
        "skew": skew,
        "kurtosis": kurtosis,
        "n_outliers": n_outliers,
        "outlier_pct": outlier_pct,
        "zero_pct": zero_pct,
        "negative_pct": negative_pct,
        "distribution_type": distribution_type,
        "histogram": hist,
    }
|