fn_registry/python/functions/datascience/describe_numeric.py

"""describe_numeric — Fine-grained numeric statistics block for an EDA ColumnProfile.

Pure function: no I/O, deterministic. Computes the `numeric` sub-block of a
ColumnProfile (group `eda`) over a SAMPLE of a numeric column. Anything that is
not a finite number (None, NaN, infinities, booleans, strings — including
numeric-looking strings) is discarded before computing.

Reuses registry functions instead of reimplementing their logic:
  - detect_distribution_type (skew, kurtosis, distribution label)
  - detect_outliers (z-score outlier flags)
  - histogram (counts per equal-width bucket)
"""

import math
import os
import sys

import numpy as np

sys.path.insert(0, os.path.dirname(__file__))

from datascience import detect_outliers, histogram  # noqa: E402
from detect_distribution_type import detect_distribution_type  # noqa: E402


# Keys of the numeric sub-block contract for the eda group.
_NULL_KEYS = (
    "min", "max", "mean", "median", "mode", "std", "variance", "cv",
    "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
    "skew", "kurtosis", "n_outliers", "outlier_pct",
    "zero_pct", "negative_pct", "distribution_type",
)


def _clean(values: list) -> list:
    """Keep only finite numeric values, discarding None/NaN/non-numeric/bool."""
    out: list = []
    for v in values:
        # bool is a subclass of int; treat True/False as non-numeric data.
        if isinstance(v, bool):
            continue
        if isinstance(v, (int, float)):
            f = float(v)
            if not math.isnan(f) and not math.isinf(f):
                out.append(f)
    return out


def _mode(values: list) -> float:
    """Most frequent value; on a tie, the smallest value wins."""
    counts: dict = {}
    for v in values:
        counts[v] = counts.get(v, 0) + 1
    best_count = max(counts.values())
    return min(v for v, c in counts.items() if c == best_count)


def _histogram_buckets(counts: list, minimum: float, maximum: float) -> list:
    """Attach equal-width ``lo``/``hi`` edges to per-bucket counts.

    Args:
        counts: Per-bucket counts, ordered from the lowest bucket upwards.
        minimum: Smallest value of the data the counts were built from.
        maximum: Largest value of the data the counts were built from.

    Returns:
        One ``{"lo", "hi", "count"}`` dict per entry of ``counts``; an empty
        list when ``counts`` is empty.
    """
    if not counts:
        return []

    if maximum == minimum:
        # Degenerate range: every bucket collapses onto the single value.
        return [
            {"lo": minimum, "hi": maximum, "count": int(count)}
            for count in counts
        ]

    # Derive the width from the number of buckets actually present so the
    # edges always span [minimum, maximum], whatever was requested.
    n_buckets = len(counts)
    width = (maximum - minimum) / n_buckets
    last = n_buckets - 1
    buckets: list = []
    for i, count in enumerate(counts):
        lo = minimum + i * width
        # Pin the final edge to the true maximum: minimum + n * width can be
        # off by a rounding error, which would leave max outside the bucket.
        hi = maximum if i == last else minimum + (i + 1) * width
        buckets.append({"lo": float(lo), "hi": float(hi), "count": int(count)})
    return buckets


def describe_numeric(values: list, bins: int = 20) -> dict:
    """Compute the fine-grained numeric statistics block for an EDA ColumnProfile.

    Designed to run on a SAMPLE of a single column, not the whole table.
    Values are first filtered by ``_clean``; if nothing survives, every scalar
    key is None and ``histogram`` is ``[]``.

    Standard deviation and variance are the NumPy defaults (population,
    ``ddof=0``); percentiles use NumPy's default interpolation. ``cv`` is
    ``std / mean`` and is None when the mean is exactly 0.

    Args:
        values: List of raw column values (may contain None/NaN/strings).
        bins: Number of equal-width buckets requested from ``histogram``
            (default 20).

    Returns:
        Dict with the exact keys of the eda `numeric_sub` contract:
        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
         p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
         negative_pct, distribution_type, histogram}. The ``*_pct`` values are
        percentages in the 0-100 range; ``histogram`` is a list of
        ``{lo, hi, count}`` dicts.
    """
    clean = _clean(values)
    n = len(clean)

    if n == 0:
        result = {k: None for k in _NULL_KEYS}
        result["histogram"] = []
        return result

    arr = np.array(clean, dtype=float)

    minimum = float(np.min(arr))
    maximum = float(np.max(arr))
    mean = float(np.mean(arr))
    std = float(np.std(arr))
    variance = float(np.var(arr))
    cv = (std / mean) if mean != 0 else None

    # One vectorised call instead of seven separate passes over the data.
    p1, p5, p25, p50, p75, p95, p99 = (
        float(q) for q in np.percentile(arr, [1, 5, 25, 50, 75, 95, 99])
    )
    median = p50
    iqr = p75 - p25

    mode = _mode(clean)

    # Distribution shape: reuse detect_distribution_type for skew/kurtosis/type.
    dist = detect_distribution_type(clean)
    distribution_type = dist.get("type")
    # `or {}` also covers a "stats" key that is present but None.
    dist_stats = dist.get("stats") or {}
    skew = dist_stats.get("skew")
    kurtosis = dist_stats.get("kurtosis")

    # Outliers: reuse detect_outliers (z-score, threshold 3.0). Count the True.
    outlier_flags = detect_outliers(clean, 3.0)
    n_outliers = sum(1 for flag in outlier_flags if flag)
    outlier_pct = 100.0 * n_outliers / n

    zero_pct = 100.0 * sum(1 for v in clean if v == 0) / n
    negative_pct = 100.0 * sum(1 for v in clean if v < 0) / n

    # Histogram: reuse histogram for the per-bucket counts, then attach the
    # equal-width edges so the eda contract gets {lo, hi, count}.
    hist = _histogram_buckets(histogram(clean, bins), minimum, maximum)

    return {
        "min": minimum,
        "max": maximum,
        "mean": mean,
        "median": median,
        "mode": mode,
        "std": std,
        "variance": variance,
        "cv": cv,
        "p1": p1,
        "p5": p5,
        "p25": p25,
        "p50": p50,
        "p75": p75,
        "p95": p95,
        "p99": p99,
        "iqr": iqr,
        "skew": skew,
        "kurtosis": kurtosis,
        "n_outliers": n_outliers,
        "outlier_pct": outlier_pct,
        "zero_pct": zero_pct,
        "negative_pct": negative_pct,
        "distribution_type": distribution_type,
        "histogram": hist,
    }