"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile. Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Non-numeric and missing values (None, NaN, non-numeric strings) are discarded before computing. Reuses registry functions instead of reimplementing their logic: - detect_distribution_type (skew, kurtosis, distribution label) - detect_outliers (z-score outlier flags) - histogram (counts per equal-width bucket) """ import math import os import sys import numpy as np sys.path.insert(0, os.path.dirname(__file__)) from datascience import detect_outliers, histogram # noqa: E402 from detect_distribution_type import detect_distribution_type # noqa: E402 # Keys of the numeric sub-block contract for the eda group. _NULL_KEYS = ( "min", "max", "mean", "median", "mode", "std", "variance", "cv", "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr", "skew", "kurtosis", "n_outliers", "outlier_pct", "zero_pct", "negative_pct", "distribution_type", ) def _clean(values: list) -> list: """Keep only finite numeric values, discarding None/NaN/non-numeric/bool.""" out: list = [] for v in values: # bool is a subclass of int; treat True/False as non-numeric data. if isinstance(v, bool): continue if isinstance(v, (int, float)): f = float(v) if not math.isnan(f) and not math.isinf(f): out.append(f) return out def _mode(values: list) -> float: """Most frequent value; on a tie, the smallest value wins.""" counts: dict = {} for v in values: counts[v] = counts.get(v, 0) + 1 best_count = max(counts.values()) return min(v for v, c in counts.items() if c == best_count) def describe_numeric(values: list, bins: int = 20) -> dict: """Compute the fine-grained numeric statistics block for an EDA ColumnProfile. Designed to run on a SAMPLE of a single column, not the whole table. None, NaN, infinities and non-numeric values are discarded first. If no numeric value survives the cleaning, every key is None and histogram is []. Args: values: List of raw column values (may contain None/NaN/strings). bins: Number of equal-width buckets for the histogram (default 20). Returns: Dict with the exact keys of the eda `numeric_sub` contract: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram}. """ clean = _clean(values) n = len(clean) if n == 0: result = {k: None for k in _NULL_KEYS} result["histogram"] = [] return result arr = np.array(clean, dtype=float) minimum = float(np.min(arr)) maximum = float(np.max(arr)) mean = float(np.mean(arr)) std = float(np.std(arr)) variance = float(np.var(arr)) cv = (std / mean) if mean != 0 else None p1 = float(np.percentile(arr, 1)) p5 = float(np.percentile(arr, 5)) p25 = float(np.percentile(arr, 25)) p50 = float(np.percentile(arr, 50)) p75 = float(np.percentile(arr, 75)) p95 = float(np.percentile(arr, 95)) p99 = float(np.percentile(arr, 99)) median = p50 iqr = p75 - p25 mode = _mode(clean) # Distribution shape: reuse detect_distribution_type for skew/kurtosis/type. dist = detect_distribution_type(clean) distribution_type = dist.get("type") dist_stats = dist.get("stats", {}) skew = dist_stats.get("skew") kurtosis = dist_stats.get("kurtosis") # Outliers: reuse detect_outliers (z-score, threshold 3.0). Count the True. outlier_flags = detect_outliers(clean, 3.0) n_outliers = sum(1 for flag in outlier_flags if flag) outlier_pct = 100.0 * n_outliers / n zero_pct = 100.0 * sum(1 for v in clean if v == 0) / n negative_pct = 100.0 * sum(1 for v in clean if v < 0) / n # Histogram: reuse histogram for the per-bucket counts, then attach the # equal-width [lo, hi) edges so the eda contract gets {lo, hi, count}. counts = histogram(clean, bins) hist: list = [] if counts: if maximum == minimum: # Degenerate range: histogram() places everything in bucket 0. for i, count in enumerate(counts): hist.append({"lo": minimum, "hi": maximum, "count": int(count)}) else: width = (maximum - minimum) / bins for i, count in enumerate(counts): lo = minimum + i * width hi = minimum + (i + 1) * width hist.append({"lo": float(lo), "hi": float(hi), "count": int(count)}) return { "min": minimum, "max": maximum, "mean": mean, "median": median, "mode": mode, "std": std, "variance": variance, "cv": cv, "p1": p1, "p5": p5, "p25": p25, "p50": p50, "p75": p75, "p95": p95, "p99": p99, "iqr": iqr, "skew": skew, "kurtosis": kurtosis, "n_outliers": n_outliers, "outlier_pct": outlier_pct, "zero_pct": zero_pct, "negative_pct": negative_pct, "distribution_type": distribution_type, "histogram": hist, }