# fn_registry/python/functions/datascience/compute_vocabulary_stats.py

"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only).

Tokenises a list of documents, counts term frequencies and derives lexical
richness measures (type-token ratio, hapax legomena) plus the top-k terms.
No external NLP dependencies (no nltk, no sklearn) — only ``re`` and
``collections`` from the standard library.
"""

import re
from collections import Counter

# Common Spanish and English stopwords, all lowercase. Tokens are compared
# verbatim (no accent folding), so a few words that are often typed without
# their accent are listed in both spellings (e.g. "mas"/"más"). Filtering is
# controlled by the ``remove_stopwords`` argument of compute_vocabulary_stats
# (enabled by default). Immutable so no caller can alter the shared table.
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas",
    "más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque",
    "esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también",
    "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "antes", "algunos", "unos", "yo", "otro",
    "otras", "otra", "tanto", "esa", "estos", "mucho", "nada", "muchos",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at",
    "from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its",
    "our", "their", "what", "which", "who", "whom", "has", "have", "had", "do",
    "does", "did", "will", "would", "can", "could", "should", "may", "might",
    "must", "if", "then", "than", "so", "too", "very", "just", "also", "were",
    "been", "being", "there", "here", "all", "any", "some", "more", "most",
    "out", "up", "down", "into", "over", "such", "only", "own", "same",
})


# Compiled once at import time. For str patterns ``\w`` is Unicode-aware by
# default, so accented letters stay inside their token.
_TOKEN_RE = re.compile(r"\w+")


def _iter_tokens(texts, remove_stopwords):
    """Yield the lowercase, filtered tokens of every string in *texts*."""
    if texts is None:
        return
    if isinstance(texts, str):
        # A bare string is one document; iterating it would yield characters.
        texts = [texts]
    # Only look at the stopword table when filtering was requested.
    blocked = _STOPWORDS if remove_stopwords else ()
    for doc in texts:
        if not isinstance(doc, str):
            continue
        for tok in _TOKEN_RE.findall(doc.lower()):
            if tok.isdigit() or tok in blocked:
                continue
            yield tok


def _coerce_top_k(top_k):
    """Return *top_k* as a non-negative int, or None meaning "all terms"."""
    if top_k is None:
        return None
    try:
        return max(int(top_k), 0)
    except (TypeError, ValueError, OverflowError):
        # Unusable value: fall back to the documented default rather than
        # letting it discard the whole result.
        return 20


def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict:
    """Profile the vocabulary of a corpus of documents.

    Each document is lowercased and split into runs of word characters
    (letters, digits, underscore). Purely numeric tokens are always dropped.

    Args:
        texts: Iterable of strings (the corpus). A bare string is treated as
            a single document. Entries that are None or not a string are
            discarded silently; ``None`` as the corpus gives the empty result.
        top_k: Maximum number of most-frequent terms to include in
            ``top_terms``. Default 20. ``None`` returns every term and a
            negative value returns none. Other values are converted with
            ``int()``; a value that cannot be converted falls back to 20.
            Never affects the other measures.
        remove_stopwords: When True (default) common ES+EN stopwords are
            dropped from the token stream before any counting.

    Returns:
        A dict with the exact keys:

        * ``n_tokens``: number of tokens kept after filtering.
        * ``n_types``: number of distinct tokens.
        * ``ttr``: type-token ratio ``n_types / n_tokens``, rounded to 4
          decimals.
        * ``n_hapax``: number of types that occur exactly once.
        * ``hapax_pct``: ``n_hapax`` as a percentage of ``n_types``, rounded
          to 2 decimals.
        * ``top_terms``: list of ``{"term", "count", "pct"}`` dicts, most
          frequent first (ties keep first-appearance order); ``pct`` is the
          share of ``n_tokens``, rounded to 2 decimals.

        For an empty corpus (no tokens after filtering): n_tokens=0,
        n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[].
        Never raises — any exception degrades to the empty-corpus result.
    """
    empty = {
        "n_tokens": 0,
        "n_types": 0,
        "ttr": None,
        "n_hapax": 0,
        "hapax_pct": None,
        "top_terms": [],
    }
    try:
        # Counter keeps first-seen order, which decides ties in most_common.
        counts = Counter(_iter_tokens(texts, remove_stopwords))
        n_tokens = sum(counts.values())
        if n_tokens == 0:
            return empty

        n_types = len(counts)
        n_hapax = sum(1 for c in counts.values() if c == 1)

        top_terms = [
            {"term": term, "count": count, "pct": round(count / n_tokens * 100, 2)}
            for term, count in counts.most_common(_coerce_top_k(top_k))
        ]

        return {
            "n_tokens": n_tokens,
            "n_types": n_types,
            "ttr": round(n_types / n_tokens, 4),
            "n_hapax": n_hapax,
            "hapax_pct": round(n_hapax / n_types * 100, 2),
            "top_terms": top_terms,
        }
    except Exception:
        # Deliberate best-effort contract: EDA helpers must not abort a
        # pipeline, so any failure (e.g. a non-iterable corpus) degrades to
        # the empty-corpus result.
        return empty