# fn_registry/python/functions/datascience/compute_text_length_stats.py

"""Pure EDA helper: document length distribution for the `eda` group.

Given a list of text documents, computes the length distribution along three
axes (characters, words and sentences) plus an equal-width histogram of the
per-document word counts. Stdlib only (``math`` + ``re``); percentiles use a
hand-rolled nearest-rank method rather than ``statistics``. No numpy, no
sklearn.

The function is dict-no-throw: it never raises. On any unexpected input it
degrades to the empty-shape result.
"""

import math
import re

_WORD_RE = re.compile(r"\w+", re.UNICODE)
_SENT_RE = re.compile(r"[.!?…]+")


def _empty_axis() -> dict:
    """Return an axis sub-dict with every statistic set to ``None``."""
    return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None}


def _pct(sorted_vals, q):
    """Nearest-rank percentile of an already-sorted list.

    Args:
        sorted_vals: List of numbers sorted ascending.
        q: Percentile in the 0..100 range.

    Returns:
        The value at the nearest rank, or ``None`` for an empty list.
    """
    n = len(sorted_vals)
    if n == 0:
        return None
    if q <= 0:
        return sorted_vals[0]
    rank = math.ceil(q / 100.0 * n)
    if rank < 1:
        rank = 1
    if rank > n:
        rank = n
    return sorted_vals[rank - 1]


def _axis_stats(values) -> dict:
    """Summarize a list of integer counts as mean/p50/p90/p99/min/max.

    The ``mean`` is rounded to 2 decimals. The percentiles, ``min`` and
    ``max`` are cast to ``int`` since the inputs are counts. An empty (or
    otherwise falsy) ``values`` produces the all-``None`` axis.
    """
    if not values:
        return _empty_axis()

    ordered = sorted(values)
    stats = {"mean": round(sum(ordered) / len(ordered), 2)}
    # Percentiles come from the shared nearest-rank helper.
    for key, q in (("p50", 50), ("p90", 90), ("p99", 99)):
        stats[key] = int(_pct(ordered, q))
    # The list is sorted ascending, so the extremes sit at its two ends.
    stats["min"] = int(ordered[0])
    stats["max"] = int(ordered[-1])
    return stats


def _word_hist(word_counts, n_bins) -> list:
    """Equal-width histogram of per-document word counts.

    Builds ``n_bins`` bins between ``min`` and ``max`` of the word counts. When
    every document has the same number of words, there are fewer than 2
    documents, or ``n_bins`` is not at least 1, a single covering bin is
    returned. With no documents the result is ``[]``. The sum of bin ``count``
    always equals ``len(word_counts)``.
    """
    if not word_counts:
        return []
    wmin = min(word_counts)
    wmax = max(word_counts)
    if wmax == wmin or len(word_counts) < 2 or n_bins < 1:
        return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}]

    width = (wmax - wmin) / n_bins
    bins = []
    for i in range(n_bins):
        lo = wmin + i * width
        hi = wmin + (i + 1) * width
        bins.append({"lo": float(lo), "hi": float(hi), "count": 0})
    # Pin the last upper edge to the real maximum to avoid float drift.
    bins[-1]["hi"] = float(wmax)

    for wc in word_counts:
        if wc >= wmax:
            idx = n_bins - 1
        else:
            idx = int((wc - wmin) / width)
            if idx < 0:
                idx = 0
            elif idx >= n_bins:
                idx = n_bins - 1
        bins[idx]["count"] += 1
    return bins


def compute_text_length_stats(texts, n_bins=20) -> dict:
    """Summarize the length distribution of a corpus of text documents.

    For each document three lengths are measured: characters (``len(doc)``),
    words (count of ``\\w+`` unicode tokens) and sentences (non-empty segments
    after splitting on ``.!?…``, with a minimum of 1 for any non-empty
    document). For each axis the mean, p50, p90, p99, min and max are reported,
    plus an equal-width histogram of the per-document word counts.

    ``None`` entries and any non-``str`` items in ``texts`` are discarded.
    The function never raises: on empty/``None`` input or any internal error it
    returns the empty-shape result (``n_docs`` 0, all-``None`` axes, ``[]``
    histogram). A bare ``str`` or ``bytes`` passed as ``texts`` is also
    treated as unexpected input rather than being iterated one character at
    a time.

    Args:
        texts: List of text documents (``str``). ``None`` and non-``str``
            items are dropped.
        n_bins: Number of equal-width bins for the word-count histogram.
            Default 20.

    Returns:
        Dict with keys ``n_docs``, ``chars``, ``words``, ``sentences`` and
        ``word_hist``. Each of the three axes is a sub-dict with ``mean``
        (float, 2 decimals), ``p50``, ``p90``, ``p99``, ``min`` and ``max``
        (ints), all ``None`` when there are no documents. ``word_hist`` is a
        list of ``{lo, hi, count}`` bins whose ``count`` sums to ``n_docs``.
    """
    empty_axis = _empty_axis()
    # Each axis gets its own copy so callers can mutate one safely.
    fallback = {
        "n_docs": 0,
        "chars": dict(empty_axis),
        "words": dict(empty_axis),
        "sentences": dict(empty_axis),
        "word_hist": [],
    }
    try:
        if not texts:
            return fallback
        # Iterating a bare string yields its characters, which would be
        # counted as one-character "documents" and give misleading stats.
        if isinstance(texts, (str, bytes)):
            return fallback

        docs = [t for t in texts if isinstance(t, str)]
        n_docs = len(docs)
        if n_docs == 0:
            return fallback

        char_counts = [len(d) for d in docs]
        word_counts = [len(_WORD_RE.findall(d)) for d in docs]

        sent_counts = []
        for d in docs:
            n_segments = sum(1 for s in _SENT_RE.split(d) if s.strip())
            if d and n_segments == 0:
                # Non-empty document with no detectable sentence: count as 1.
                n_segments = 1
            sent_counts.append(n_segments)

        return {
            "n_docs": n_docs,
            "chars": _axis_stats(char_counts),
            "words": _axis_stats(word_counts),
            "sentences": _axis_stats(sent_counts),
            "word_hist": _word_hist(word_counts, n_bins),
        }
    except Exception:
        # Deliberate best-effort boundary: the contract is to never raise.
        return fallback