105e56cf05
Añade el capítulo `text_distr` al motor AutomaticEDA: perfila columnas de texto libre largo (reseñas, descripciones, comentarios) que la distribución categórica no resume bien. Sigue el patrón de cat_distr/num_distr (build_text_distr(profile, ctx) -> Chapter | None) y se registra en CHAPTER_ORDER tras cat_distr. Activación en dos fases: gate barato desde el perfil (columna no numérica con len_mean >= 50 chars) + confirmación con muestra cruda (mediana de palabras >= 20). Un dataset sin texto largo (p.ej. titanic) devuelve None sin tocar el informe. Bloques por columna (Group con page_break): resumen (longitudes, vocabulario con TTR y % hapax, idioma dominante, % duplicados, legibilidad), histograma de longitudes, top términos (tabla + barras), bigramas/trigramas, idiomas detectados y nube de palabras opcional. Términos ttr/hapax enganchados al glosario clicable. Lógica delegada a 7 funciones nuevas del registry (datascience, tag eda), estilo dict-no-throw: - extract_text_sample (impura, push-down SQL DuckDB/Postgres) - compute_text_length_stats, compute_vocabulary_stats, compute_top_ngrams (puras, stdlib) - detect_corpus_language (langdetect opcional), compute_text_readability (textstat opcional), compute_text_duplicates (hash + datasketch opcional) Versión barata sin modelos pesados: las piezas que dependen de una librería opcional (langdetect, textstat, wordcloud, datasketch) degradan a omitidas sin lanzar. Añade langdetect y textstat (ligeras) al pyproject + uv.lock. Verificado: golden sobre dataset de reviews multi-idioma (capítulo presente en PDF+PPTX+MD con métricas reales), titanic sin capítulo (None), degradación sin libs, suite automatic_eda + pipeline verde (128 passed), fn index OK. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
100 lines
3.9 KiB
Python
100 lines
3.9 KiB
Python
"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only).

Tokenises a list of documents, counts term frequencies and derives lexical
richness measures (type-token ratio, hapax legomena) plus the top-k terms.
No external NLP dependencies (no nltk, no sklearn) — only ``re`` and
``collections`` from the standard library.
"""
|
|
|
|
import re
|
|
from collections import Counter
|
|
|
|
# Common Spanish + English stopwords. Inline, lowercase, no accents stripped
|
|
# beyond what already appears here. Filtering is opt-in via remove_stopwords.
|
|
_STOPWORDS = {
|
|
# Spanish
|
|
"de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
|
|
"un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas",
|
|
"más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque",
|
|
"esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también",
|
|
"me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
|
|
"todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
|
|
"ellos", "e", "esto", "antes", "algunos", "que", "unos", "yo", "otro",
|
|
"otras", "otra", "el", "tanto", "esa", "estos", "mucho", "nada", "muchos",
|
|
# English
|
|
"the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
|
|
"was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at",
|
|
"from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its",
|
|
"our", "their", "what", "which", "who", "whom", "has", "have", "had", "do",
|
|
"does", "did", "will", "would", "can", "could", "should", "may", "might",
|
|
"must", "if", "then", "than", "so", "too", "very", "just", "also", "were",
|
|
"been", "being", "there", "here", "all", "any", "some", "more", "most",
|
|
"out", "up", "down", "into", "over", "such", "only", "own", "same",
|
|
}
|
|
|
|
|
|
def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict:
    """Profile the vocabulary of a corpus of documents.

    Each document is lowercased and split into runs of word characters
    (``\\w+``: Unicode letters, digits and underscore). Tokens made only of
    digits are dropped, and so are ES+EN stopwords when ``remove_stopwords``
    is truthy. All measures are computed on the remaining tokens.

    Args:
        texts: Iterable of documents (normally a list of strings). Entries
            that are None or not a string are discarded silently. ``None``
            is treated as an empty corpus, and a single bare string is
            treated as a one-document corpus rather than being iterated
            character by character.
        top_k: Maximum number of most-frequent terms to include in
            ``top_terms``. Default 20. Does not affect the other measures.
            It is forwarded to ``Counter.most_common``, so ``None`` returns
            every term; ties keep first-seen order.
        remove_stopwords: When True (default) common ES+EN stopwords are
            dropped from the token stream before any counting.

    Returns:
        A dict with the exact keys ``n_tokens``, ``n_types``, ``ttr``,
        ``n_hapax``, ``hapax_pct`` and ``top_terms``:

        * ``n_tokens``: number of tokens kept.
        * ``n_types``: number of distinct tokens.
        * ``ttr``: ``n_types / n_tokens`` rounded to 4 decimals.
        * ``n_hapax``: number of terms that occur exactly once.
        * ``hapax_pct``: ``n_hapax`` as a percentage of ``n_types``,
          rounded to 2 decimals.
        * ``top_terms``: list of ``{"term", "count", "pct"}`` dicts where
          ``pct`` is the term's share of ``n_tokens`` (2 decimals).

        For an empty corpus (no tokens after filtering): n_tokens=0,
        n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[].
        Never raises — any exception degrades to the empty-corpus result.
    """
    empty = {
        "n_tokens": 0,
        "n_types": 0,
        "ttr": None,
        "n_hapax": 0,
        "hapax_pct": None,
        "top_terms": [],
    }
    try:
        # Compare against None instead of truth-testing the container:
        # array-like inputs raise on bool() and would otherwise be reported
        # as an empty corpus by the catch-all handler below.
        if texts is None:
            docs = ()
        elif isinstance(texts, str):
            # A lone string is one document, not a sequence of characters.
            docs = (texts,)
        else:
            docs = texts

        stopwords = _STOPWORDS if remove_stopwords else ()
        find_words = re.compile(r"\w+").findall

        # Stream tokens straight into the counter; insertion order (and thus
        # most_common tie order) matches the order of first appearance.
        counts = Counter()
        for doc in docs:
            if not isinstance(doc, str):
                continue
            counts.update(
                tok
                for tok in find_words(doc.lower())
                if not tok.isdigit() and tok not in stopwords
            )

        n_tokens = sum(counts.values())
        if n_tokens == 0:
            return dict(empty)

        n_types = len(counts)
        n_hapax = sum(1 for freq in counts.values() if freq == 1)

        top_terms = [
            {"term": term, "count": freq, "pct": round(freq / n_tokens * 100, 2)}
            for term, freq in counts.most_common(top_k)
        ]

        return {
            "n_tokens": n_tokens,
            "n_types": n_types,
            "ttr": round(n_types / n_tokens, 4),
            "n_hapax": n_hapax,
            "hapax_pct": round(n_hapax / n_types * 100, 2),
            "top_terms": top_terms,
        }
    except Exception:
        # Deliberate best-effort contract: callers expect a dict, never an
        # exception, so any failure degrades to the empty-corpus result.
        return dict(empty)
|