"""Pure EDA helper: document length distribution for the `eda` group. Given a list of text documents, computes the length distribution along three axes (characters, words and sentences) plus an equal-width histogram of the per-document word counts. Stdlib only (``re`` + ``statistics`` semantics via a hand-rolled nearest-rank percentile). No numpy, no sklearn. The function is dict-no-throw: it never raises. On any unexpected input it degrades to the empty-shape result. """ import math import re _WORD_RE = re.compile(r"\w+", re.UNICODE) _SENT_RE = re.compile(r"[.!?…]+") def _empty_axis() -> dict: """Return an axis sub-dict with every statistic set to ``None``.""" return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None} def _pct(sorted_vals, q): """Nearest-rank percentile of an already-sorted list. Args: sorted_vals: List of numbers sorted ascending. q: Percentile in the 0..100 range. Returns: The value at the nearest rank, or ``None`` for an empty list. """ n = len(sorted_vals) if n == 0: return None if q <= 0: return sorted_vals[0] rank = math.ceil(q / 100.0 * n) if rank < 1: rank = 1 if rank > n: rank = n return sorted_vals[rank - 1] def _axis_stats(values) -> dict: """Compute mean/p50/p90/p99/min/max over a list of integer counts. ``mean`` is rounded to 2 decimals; every other statistic is an integer (they are counts). Returns an all-``None`` axis for an empty list. """ if not values: return _empty_axis() sv = sorted(values) return { "mean": round(sum(sv) / len(sv), 2), "p50": int(_pct(sv, 50)), "p90": int(_pct(sv, 90)), "p99": int(_pct(sv, 99)), "min": int(sv[0]), "max": int(sv[-1]), } def _word_hist(word_counts, n_bins) -> list: """Equal-width histogram of per-document word counts. Builds ``n_bins`` bins between ``min`` and ``max`` of the word counts. When every document has the same number of words, there are fewer than 2 documents, or ``n_bins`` is not at least 1, a single covering bin is returned. With no documents the result is ``[]``. The sum of bin ``count`` always equals ``len(word_counts)``. """ if not word_counts: return [] wmin = min(word_counts) wmax = max(word_counts) if wmax == wmin or len(word_counts) < 2 or n_bins < 1: return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}] width = (wmax - wmin) / n_bins bins = [] for i in range(n_bins): lo = wmin + i * width hi = wmin + (i + 1) * width bins.append({"lo": float(lo), "hi": float(hi), "count": 0}) # Pin the last upper edge to the real maximum to avoid float drift. bins[-1]["hi"] = float(wmax) for wc in word_counts: if wc >= wmax: idx = n_bins - 1 else: idx = int((wc - wmin) / width) if idx < 0: idx = 0 elif idx >= n_bins: idx = n_bins - 1 bins[idx]["count"] += 1 return bins def compute_text_length_stats(texts, n_bins=20) -> dict: """Summarize the length distribution of a corpus of text documents. For each document three lengths are measured: characters (``len(doc)``), words (count of ``\\w+`` unicode tokens) and sentences (non-empty segments after splitting on ``.!?…``, with a minimum of 1 for any non-empty document). For each axis the mean, p50, p90, p99, min and max are reported, plus an equal-width histogram of the per-document word counts. ``None`` entries and any non-``str`` items in ``texts`` are discarded. The function never raises: on empty/``None`` input or any internal error it returns the empty-shape result (``n_docs`` 0, all-``None`` axes, ``[]`` histogram). Args: texts: List of text documents (``str``). ``None`` and non-``str`` items are dropped. n_bins: Number of equal-width bins for the word-count histogram. Default 20. Returns: Dict with keys ``n_docs``, ``chars``, ``words``, ``sentences`` and ``word_hist``. Each of the three axes is a sub-dict with ``mean`` (float, 2 decimals), ``p50``, ``p90``, ``p99``, ``min`` and ``max`` (ints), all ``None`` when there are no documents. ``word_hist`` is a list of ``{lo, hi, count}`` bins whose ``count`` sums to ``n_docs``. """ empty_axis = _empty_axis() fallback = { "n_docs": 0, "chars": dict(empty_axis), "words": dict(empty_axis), "sentences": dict(empty_axis), "word_hist": [], } try: if not texts: return fallback docs = [t for t in texts if isinstance(t, str)] n_docs = len(docs) if n_docs == 0: return fallback char_counts = [len(d) for d in docs] word_counts = [len(_WORD_RE.findall(d)) for d in docs] sent_counts = [] for d in docs: segments = [s for s in _SENT_RE.split(d) if s.strip()] n = len(segments) if d and n == 0: # Non-empty document with no detectable sentence: count as 1. n = 1 sent_counts.append(n) return { "n_docs": n_docs, "chars": _axis_stats(char_counts), "words": _axis_stats(word_counts), "sentences": _axis_stats(sent_counts), "word_hist": _word_hist(word_counts, n_bins), } except Exception: return fallback