105e56cf05
Añade el capítulo `text_distr` al motor AutomaticEDA: perfila columnas de texto libre largo (reseñas, descripciones, comentarios) que la distribución categórica no resume bien. Sigue el patrón de cat_distr/num_distr (build_text_distr(profile, ctx) -> Chapter | None) y se registra en CHAPTER_ORDER tras cat_distr. Activación en dos fases: gate barato desde el perfil (columna no numérica con len_mean >= 50 chars) + confirmación con muestra cruda (mediana de palabras >= 20). Un dataset sin texto largo (p.ej. titanic) devuelve None sin tocar el informe. Bloques por columna (Group con page_break): resumen (longitudes, vocabulario con TTR y % hapax, idioma dominante, % duplicados, legibilidad), histograma de longitudes, top términos (tabla + barras), bigramas/trigramas, idiomas detectados y nube de palabras opcional. Términos ttr/hapax enganchados al glosario clicable. Lógica delegada a 7 funciones nuevas del registry (datascience, tag eda), estilo dict-no-throw: - extract_text_sample (impura, push-down SQL DuckDB/Postgres) - compute_text_length_stats, compute_vocabulary_stats, compute_top_ngrams (puras, stdlib) - detect_corpus_language (langdetect opcional), compute_text_readability (textstat opcional), compute_text_duplicates (hash + datasketch opcional) Versión barata sin modelos pesados: las piezas que dependen de una librería opcional (langdetect, textstat, wordcloud, datasketch) degradan a omitidas sin lanzar. Añade langdetect y textstat (ligeras) al pyproject + uv.lock. Verificado: golden sobre dataset de reviews multi-idioma (capítulo presente en PDF+PPTX+MD con métricas reales), titanic sin capítulo (None), degradación sin libs, suite automatic_eda + pipeline verde (128 passed), fn index OK. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
169 lines
5.5 KiB
Python
169 lines
5.5 KiB
Python
"""Pure EDA helper: document length distribution for the `eda` group.
|
|
|
|
Given a list of text documents, computes the length distribution along three
|
|
axes (characters, words and sentences) plus an equal-width histogram of the
|
|
per-document word counts. Stdlib only (``re`` + ``statistics`` semantics via a
|
|
hand-rolled nearest-rank percentile). No numpy, no sklearn.
|
|
|
|
The function is dict-no-throw: it never raises. On any unexpected input it
|
|
degrades to the empty-shape result.
|
|
"""
|
|
|
|
import math
|
|
import re
|
|
|
|
_WORD_RE = re.compile(r"\w+", re.UNICODE)
|
|
_SENT_RE = re.compile(r"[.!?…]+")
|
|
|
|
|
|
def _empty_axis() -> dict:
|
|
"""Return an axis sub-dict with every statistic set to ``None``."""
|
|
return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None}
|
|
|
|
|
|
def _pct(sorted_vals, q):
|
|
"""Nearest-rank percentile of an already-sorted list.
|
|
|
|
Args:
|
|
sorted_vals: List of numbers sorted ascending.
|
|
q: Percentile in the 0..100 range.
|
|
|
|
Returns:
|
|
The value at the nearest rank, or ``None`` for an empty list.
|
|
"""
|
|
n = len(sorted_vals)
|
|
if n == 0:
|
|
return None
|
|
if q <= 0:
|
|
return sorted_vals[0]
|
|
rank = math.ceil(q / 100.0 * n)
|
|
if rank < 1:
|
|
rank = 1
|
|
if rank > n:
|
|
rank = n
|
|
return sorted_vals[rank - 1]
|
|
|
|
|
|
def _axis_stats(values) -> dict:
    """Summarize a list of integer counts as one axis of the result.

    The returned dict carries ``mean`` (rounded to 2 decimals) followed by
    ``p50``, ``p90``, ``p99``, ``min`` and ``max``, all cast to ``int`` since
    they are counts. An empty ``values`` yields the all-``None`` axis.
    """
    if not values:
        return _empty_axis()

    ordered = sorted(values)
    stats = {"mean": round(sum(ordered) / len(ordered), 2)}
    for key, q in (("p50", 50), ("p90", 90), ("p99", 99)):
        stats[key] = int(_pct(ordered, q))
    # The extremes come straight from the ends of the sorted list.
    stats["min"] = int(ordered[0])
    stats["max"] = int(ordered[-1])
    return stats
|
|
|
|
|
|
def _word_hist(word_counts, n_bins) -> list:
|
|
"""Equal-width histogram of per-document word counts.
|
|
|
|
Builds ``n_bins`` bins between ``min`` and ``max`` of the word counts. When
|
|
every document has the same number of words, there are fewer than 2
|
|
documents, or ``n_bins`` is not at least 1, a single covering bin is
|
|
returned. With no documents the result is ``[]``. The sum of bin ``count``
|
|
always equals ``len(word_counts)``.
|
|
"""
|
|
if not word_counts:
|
|
return []
|
|
wmin = min(word_counts)
|
|
wmax = max(word_counts)
|
|
if wmax == wmin or len(word_counts) < 2 or n_bins < 1:
|
|
return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}]
|
|
|
|
width = (wmax - wmin) / n_bins
|
|
bins = []
|
|
for i in range(n_bins):
|
|
lo = wmin + i * width
|
|
hi = wmin + (i + 1) * width
|
|
bins.append({"lo": float(lo), "hi": float(hi), "count": 0})
|
|
# Pin the last upper edge to the real maximum to avoid float drift.
|
|
bins[-1]["hi"] = float(wmax)
|
|
|
|
for wc in word_counts:
|
|
if wc >= wmax:
|
|
idx = n_bins - 1
|
|
else:
|
|
idx = int((wc - wmin) / width)
|
|
if idx < 0:
|
|
idx = 0
|
|
elif idx >= n_bins:
|
|
idx = n_bins - 1
|
|
bins[idx]["count"] += 1
|
|
return bins
|
|
|
|
|
|
def compute_text_length_stats(texts, n_bins=20) -> dict:
    """Summarize the length distribution of a corpus of text documents.

    For each document three lengths are measured: characters (``len(doc)``),
    words (count of ``\\w+`` unicode tokens) and sentences (non-empty segments
    after splitting on ``.!?…``, with a minimum of 1 for any non-empty
    document). For each axis the mean, p50, p90, p99, min and max are reported,
    plus an equal-width histogram of the per-document word counts.

    ``None`` entries and any non-``str`` items in ``texts`` are discarded.
    The function never raises: on ``None``/empty input, on a bare ``str``
    passed instead of a collection, or on any internal error it returns the
    empty-shape result (``n_docs`` 0, all-``None`` axes, ``[]`` histogram).

    Args:
        texts: Iterable of text documents (``str``), typically a list.
            ``None`` and non-``str`` items are dropped. A single ``str`` is
            rejected rather than being iterated character by character.
        n_bins: Number of equal-width bins for the word-count histogram.
            Default 20.

    Returns:
        Dict with keys ``n_docs``, ``chars``, ``words``, ``sentences`` and
        ``word_hist``. Each of the three axes is a sub-dict with ``mean``
        (float, 2 decimals), ``p50``, ``p90``, ``p99``, ``min`` and ``max``
        (ints), all ``None`` when there are no documents. ``word_hist`` is a
        list of ``{lo, hi, count}`` bins whose ``count`` sums to ``n_docs``.
    """
    fallback = {
        "n_docs": 0,
        "chars": _empty_axis(),
        "words": _empty_axis(),
        "sentences": _empty_axis(),
        "word_hist": [],
    }
    try:
        # Explicit checks instead of ``not texts``: truthiness is ambiguous for
        # some array-like containers (it raises), and a bare string would
        # otherwise be treated as a corpus of one-character documents.
        if texts is None or isinstance(texts, str):
            return fallback

        docs = [t for t in texts if isinstance(t, str)]
        if not docs:
            return fallback

        char_counts = [len(d) for d in docs]
        word_counts = [len(_WORD_RE.findall(d)) for d in docs]

        sent_counts = []
        for d in docs:
            n_segments = sum(1 for s in _SENT_RE.split(d) if s.strip())
            # Non-empty document with no detectable sentence: count as 1.
            sent_counts.append(max(n_segments, 1) if d else n_segments)

        return {
            "n_docs": len(docs),
            "chars": _axis_stats(char_counts),
            "words": _axis_stats(word_counts),
            "sentences": _axis_stats(sent_counts),
            "word_hist": _word_hist(word_counts, n_bins),
        }
    except Exception:
        # Deliberate best-effort boundary: this helper is dict-no-throw, so
        # any unexpected failure degrades to the empty-shape result.
        return fallback
|