"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only). Tokenises a list of documents, counts term frequencies and derives lexical richness measures (type-token ratio, hapax legomena) plus the top-k terms. No external NLP dependencies (no nltk, no sklearn) — only ``re`` and ``collections`` from the standard library. """ import re from collections import Counter # Common Spanish + English stopwords. Inline, lowercase, no accents stripped # beyond what already appears here. Filtering is opt-in via remove_stopwords. _STOPWORDS = { # Spanish "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas", "más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también", "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", "e", "esto", "antes", "algunos", "que", "unos", "yo", "otro", "otras", "otra", "el", "tanto", "esa", "estos", "mucho", "nada", "muchos", # English "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as", "was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at", "from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its", "our", "their", "what", "which", "who", "whom", "has", "have", "had", "do", "does", "did", "will", "would", "can", "could", "should", "may", "might", "must", "if", "then", "than", "so", "too", "very", "just", "also", "were", "been", "being", "there", "here", "all", "any", "some", "more", "most", "out", "up", "down", "into", "over", "such", "only", "own", "same", } def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict: """Profile the vocabulary of a corpus of documents. Args: texts: List of strings (the corpus). Entries that are None or not a string are discarded silently. top_k: Maximum number of most-frequent terms to include in ``top_terms``. Default 20. Does not affect the other measures. remove_stopwords: When True (default) common ES+EN stopwords are dropped from the token stream before any counting. Returns: A dict with the exact keys ``n_tokens``, ``n_types``, ``ttr``, ``n_hapax``, ``hapax_pct`` and ``top_terms``. For an empty corpus (no tokens after filtering): n_tokens=0, n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[]. Never raises — any exception degrades to the empty-corpus result. """ empty = { "n_tokens": 0, "n_types": 0, "ttr": None, "n_hapax": 0, "hapax_pct": None, "top_terms": [], } try: tokens = [] for doc in texts or []: if not isinstance(doc, str): continue for tok in re.findall(r"\w+", doc.lower(), re.UNICODE): if tok.isdigit(): continue if remove_stopwords and tok in _STOPWORDS: continue tokens.append(tok) n_tokens = len(tokens) if n_tokens == 0: return dict(empty) counts = Counter(tokens) n_types = len(counts) ttr = round(n_types / n_tokens, 4) n_hapax = sum(1 for c in counts.values() if c == 1) hapax_pct = round(n_hapax / n_types * 100, 2) top_terms = [ {"term": term, "count": count, "pct": round(count / n_tokens * 100, 2)} for term, count in counts.most_common(top_k) ] return { "n_tokens": n_tokens, "n_types": n_types, "ttr": ttr, "n_hapax": n_hapax, "hapax_pct": hapax_pct, "top_terms": top_terms, } except Exception: return dict(empty)