diff --git a/python/functions/datascience/automatic_eda/chapters/text_distr.py b/python/functions/datascience/automatic_eda/chapters/text_distr.py new file mode 100644 index 00000000..d0b69c2c --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/text_distr.py @@ -0,0 +1,559 @@ +"""Free-text / NLP distributions chapter (TEXT DISTR) for AutomaticEDA. + +First chapter for **non-tabular** content: it profiles the linguistic content of +any column holding long free text (reviews, descriptions, comments, tickets) that +the categorical chapter cannot meaningfully summarize (high cardinality, many +words per value). It is the cheap, model-free counterpart to ``cat_distr`` for +columns that are prose rather than discrete labels. + +Activation (returns ``None`` when it does not apply): + +1. Cheap gate from the aggregated profile: at least one non-numeric column whose + ``categorical.len_mean`` (mean character length) is ``>= _MIN_LEN_CHARS``. + A dataset whose only string columns are short labels (e.g. titanic's + ``Name``, ~27 chars) never passes this gate, so the chapter disappears with + zero extra work and the existing report is untouched. +2. Confirmation from a raw sample: each candidate column is sampled (push-down + ``extract_text_sample`` over ``ctx['db_path']``/``ctx['table']``, or an + in-memory ``ctx['text_raw']`` for tests) and kept only if the **median word + count is ``>= _MIN_WORDS``** — i.e. it is genuinely long text, not a long + single token. If no column survives, the chapter returns ``None``. 
+ +Per surviving column the chapter emits, kept together on its own page/slide +(``Group(page_break_before=...)``): + +- a key/value summary (documents, length percentiles, vocabulary richness with + **[[term:ttr]]TTR[[/term]]** and **[[term:hapax]]hapax legomena[[/term]]**, + dominant language, exact-duplicate %, readability when available); +- a word-count histogram figure; +- a top-terms table + a horizontal bar figure; +- bigram and trigram frequency tables; +- a detected-language bar figure (when ``langdetect`` is available); +- an optional word-cloud figure (only when ``wordcloud`` is installed); +- a closing note on duplicates / readability degradation. + +Every metric is delegated to pure ``eda`` registry functions +(``compute_text_length_stats``, ``compute_vocabulary_stats``, +``compute_top_ngrams``, ``detect_corpus_language``, ``compute_text_duplicates``, +``compute_text_readability``) and the raw sample to ``extract_text_sample``; all +are imported defensively so a missing function or optional library degrades that +single piece to a note instead of aborting the chapter. Optional libraries +(``langdetect``, ``textstat``, ``wordcloud``, ``datasketch``) are never required: +the piece is silently omitted when they are absent. + +Contract: build_text_distr(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "text_distr" +CHAPTER_TITLE = "Texto libre (NLP)" + +# Cheap activation gate (characters): a non-numeric column whose mean string +# length reaches this is a candidate for "long text". Short labels (titanic's +# Name ≈ 27 chars) stay below it, so the chapter does not fire on them. +_MIN_LEN_CHARS = 50 +# Confirmation gate (words): a candidate is kept only if its median document has +# at least this many words — genuine prose, not a long id/URL token. +_MIN_WORDS = 20 +# Bound the document so very wide datasets stay readable. 
def _fmt_int(value) -> str:
    """Format *value* as an integer using ``.`` as the thousands separator.

    ``None`` renders as an em dash. Anything ``int()`` cannot convert (a
    non-numeric object, NaN or an infinity) falls back to ``str(value)``
    instead of raising, so one odd statistic never aborts the chapter.
    """
    if value is None:
        return "—"
    try:
        # int(float("nan")) raises ValueError, int(float("inf")) OverflowError.
        return f"{int(value):,}".replace(",", ".")
    except (TypeError, ValueError, OverflowError):
        return str(value)


def _fmt_num(value, decimals: int = 2) -> str:
    """Format a number compactly for the summary tables.

    - ``None`` -> em dash; ``bool`` -> ``"True"``/``"False"``.
    - ``int`` -> grouped with ``.`` as the thousands separator.
    - ``float`` -> fixed-point with *decimals* places and trailing zeros
      removed; NaN -> ``"NaN"``; infinities -> ``str(value)``.
    - anything else -> ``str(value)``.
    """
    if value is None:
        return "—"
    if isinstance(value, bool):
        # bool is an int subclass; show True/False rather than 1/0.
        return str(value)
    if isinstance(value, int):
        return f"{value:,}".replace(",", ".")
    if isinstance(value, float):
        if value != value:  # NaN is the only value that differs from itself
            return "NaN"
        if value in (float("inf"), float("-inf")):
            return str(value)
        text = f"{value:.{decimals}f}"
        if "." in text:
            # Only strip zeros that follow a decimal point: with decimals=0
            # there is none, and stripping would turn "100" into "1".
            text = text.rstrip("0").rstrip(".")
        # A tiny negative value rounds to "-0"; display it as plain zero.
        return "0" if text in ("", "-0") else text
    return str(value)


def _fmt_pct(value, decimals: int = 1) -> str:
    """Format *value* as a percentage string with *decimals* places.

    ``None`` renders as an em dash; values ``float()`` cannot convert fall
    back to ``str(value)`` instead of raising.
    """
    if value is None:
        return "—"
    try:
        # float() raises OverflowError for ints too large to represent.
        return f"{float(value):.{decimals}f}%"
    except (TypeError, ValueError, OverflowError):
        return str(value)
def _length_stats(texts) -> dict:
    """Length statistics for *texts* via ``compute_text_length_stats``.

    The registry function is imported at call time. If the import or the call
    raises, or the result is not a dict, an empty dict is returned.
    """
    try:
        from datascience.compute_text_length_stats import (
            compute_text_length_stats as _compute,
        )
        result = _compute(texts)
    except Exception:  # noqa: BLE001 — best effort: any failure yields {}.
        return {}
    return result if isinstance(result, dict) else {}


def _vocab_stats(texts) -> dict:
    """Vocabulary statistics for *texts* via ``compute_vocabulary_stats``.

    Requests the ``_TOP_TERMS`` most frequent terms. Returns an empty dict on
    any failure or non-dict result.
    """
    try:
        from datascience.compute_vocabulary_stats import (
            compute_vocabulary_stats as _compute,
        )
        result = _compute(texts, top_k=_TOP_TERMS)
    except Exception:  # noqa: BLE001 — best effort: any failure yields {}.
        return {}
    return result if isinstance(result, dict) else {}


def _ngrams(texts, n) -> list:
    """Top *n*-grams of *texts* via ``compute_top_ngrams``.

    Returns the ``"top"`` entry of the result dict (or an empty list when it
    is missing/falsy), and an empty list on any failure or non-dict result.
    """
    try:
        from datascience.compute_top_ngrams import compute_top_ngrams as _compute
        result = _compute(texts, n=n, top_k=_TOP_NGRAMS)
        if not isinstance(result, dict):
            return []
        return result.get("top") or []
    except Exception:  # noqa: BLE001 — best effort: any failure yields [].
        return []


def _language(texts) -> dict:
    """Language distribution of *texts* via ``detect_corpus_language``.

    On any failure or non-dict result, returns a fresh "not available"
    placeholder dict.
    """
    try:
        from datascience.detect_corpus_language import (
            detect_corpus_language as _detect,
        )
        result = _detect(texts)
        if isinstance(result, dict):
            return result
    except Exception:  # noqa: BLE001 — best effort: fall through to default.
        pass
    return {"available": False, "distribution": [], "dominant": None}


def _duplicates(texts) -> dict:
    """Duplicate statistics for *texts* via ``compute_text_duplicates``.

    Returns an empty dict on any failure or non-dict result.
    """
    try:
        from datascience.compute_text_duplicates import (
            compute_text_duplicates as _compute,
        )
        result = _compute(texts)
    except Exception:  # noqa: BLE001 — best effort: any failure yields {}.
        return {}
    return result if isinstance(result, dict) else {}
def _candidate_columns(profile: dict) -> list:
    """Cheap gate: names of non-numeric columns that may hold long text.

    A column qualifies when its ``inferred_type`` is not ``"numeric"`` and its
    ``categorical.len_mean`` is a real number ``>= _MIN_LEN_CHARS``. Returns
    the column names as strings, in profile order (possibly empty).
    """
    names = []
    for col in profile.get("columns") or []:
        if not isinstance(col, dict) or col.get("inferred_type") == "numeric":
            continue
        cat = col.get("categorical")
        if not isinstance(cat, dict):
            continue
        len_mean = cat.get("len_mean")
        # bool is an int subclass, so it is excluded explicitly. The test is
        # kept in its positive form so a NaN mean never qualifies.
        is_number = (isinstance(len_mean, (int, float))
                     and not isinstance(len_mean, bool))
        if is_number and len_mean >= _MIN_LEN_CHARS:
            name = col.get("name")
            if name:
                names.append(str(name))
    return names


def _as_documents(values) -> list:
    """Normalise one in-memory column sample into a list of strings.

    A bare string counts as a single document (iterating it would yield its
    characters). ``None`` items are dropped, other items are passed through
    ``str()``, and a non-iterable value yields an empty list.
    """
    if isinstance(values, str):
        return [values] if values else []
    try:
        return [str(v) for v in values if v is not None]
    except TypeError:
        return []


def _get_samples(profile: dict, ctx: dict, columns: list) -> dict:
    """Return ``{col: [str, ...]}`` raw text samples for the candidate columns.

    Prefers an in-memory ``ctx['text_raw']`` dict (used by tests); when that
    is present and non-empty the database is never queried. Otherwise a sample
    is pushed down via ``extract_text_sample`` using ``ctx['db_path']`` and
    ``ctx['table']`` (``ctx['backend']`` defaults to ``"duckdb"``,
    ``ctx['sample']`` to ``_SAMPLE_ROWS``).

    Never raises: returns ``{}`` when no sample can be obtained. Columns with
    no usable documents are left out of the result on both paths.
    *profile* is accepted for interface compatibility and is not read.
    """
    text_raw = ctx.get("text_raw")
    if isinstance(text_raw, dict) and text_raw:
        samples = {}
        for col in columns:
            docs = _as_documents(text_raw.get(col))
            if docs:
                samples[col] = docs
        return samples

    db_path = ctx.get("db_path")
    table = ctx.get("table")
    if not db_path or not table:
        return {}
    backend = ctx.get("backend") or "duckdb"
    sample = ctx.get("sample") or _SAMPLE_ROWS
    try:
        from datascience.extract_text_sample import extract_text_sample
        out = extract_text_sample(db_path, table, columns, backend=backend,
                                  sample=sample)
        if isinstance(out, dict) and out.get("status") == "ok":
            cols = out.get("columns")
            if isinstance(cols, dict):
                return {c: list(v) for c, v in cols.items() if v}
    except Exception:  # noqa: BLE001 — dict-no-throw: no sample → chapter omits.
        pass
    return {}


def _confirm_long_text(samples: dict) -> dict:
    """Keep only columns whose median word count reaches ``_MIN_WORDS``.

    Returns ``{col: length_stats_dict}`` for the survivors, in input order.
    The median is read from ``length_stats["words"]["p50"]``; a missing or
    non-numeric median rejects the column.
    """
    survivors = {}
    for col, texts in samples.items():
        stats = _length_stats(texts)
        words = stats.get("words") if isinstance(stats, dict) else None
        median = words.get("p50") if isinstance(words, dict) else None
        is_number = (isinstance(median, (int, float))
                     and not isinstance(median, bool))
        if is_number and median >= _MIN_WORDS:
            survivors[col] = stats
    return survivors
def _hist_figure(name: str, length_stats: dict):
    """Return a zero-argument callable that draws the word-count histogram.

    matplotlib is imported only when the callable runs. Every dict in
    ``length_stats["word_hist"]`` with non-None ``lo``/``hi`` becomes one bar
    of height ``count``; when nothing can be drawn a placeholder text is shown.
    """
    def make():
        import matplotlib
        matplotlib.use("Agg")
        from matplotlib.figure import Figure

        figure = Figure(figsize=(6.2, 3.0))
        axes = figure.add_subplot(111)

        bars = []
        for entry in (length_stats or {}).get("word_hist") or []:
            if not isinstance(entry, dict):
                continue
            left, right = entry.get("lo"), entry.get("hi")
            if left is None or right is None:
                continue
            # A bin with hi <= lo still gets a thin, visible bar.
            if right > left:
                span = right - left
            else:
                span = max(abs(left) * 1e-3, 1e-6)
            bars.append((left, entry.get("count") or 0, span))

        for left, height, span in bars:
            axes.bar(left, height, width=span, align="edge", color="#9ec6df",
                     edgecolor="#5b8aa6", linewidth=0.4)
        if not bars:
            axes.text(0.5, 0.5, "(sin datos de longitud)", ha="center",
                      va="center", color="#8a8a8a", transform=axes.transAxes)

        axes.set_xlabel("palabras por documento", fontsize=8)
        axes.set_ylabel("nº de documentos", fontsize=8)
        axes.tick_params(labelsize=7)
        for side in ("top", "right"):
            axes.spines[side].set_visible(False)
        axes.set_title(f"Longitud de «{_truncate(name, 30)}»", fontsize=10,
                       loc="left")
        figure.tight_layout()
        return figure
    return make


def _barh_figure(title: str, items: list, label_key: str, value_key: str,
                 xlabel: str):
    """Return a callable drawing a horizontal bar chart of *items*.

    *items* is ``[{label_key: ..., value_key: ...}, ...]``. Only dicts whose
    *value_key* is an int/float are used, capped at the first 12, with the
    first item drawn at the top. With no usable item a placeholder is shown.
    """
    def make():
        import matplotlib
        matplotlib.use("Agg")
        from matplotlib.figure import Figure

        usable = []
        for item in items or []:
            if isinstance(item, dict) and isinstance(item.get(value_key),
                                                     (int, float)):
                usable.append(item)
        usable = usable[:12]

        # Height grows with the number of bars, with a fixed minimum.
        figure = Figure(figsize=(6.2, max(2.2, 0.32 * len(usable) + 0.8)))
        axes = figure.add_subplot(111)
        if not usable:
            axes.text(0.5, 0.5, "(sin datos)", ha="center", va="center",
                      color="#8a8a8a", transform=axes.transAxes)
            axes.axis("off")
            return figure

        # barh stacks upwards from y=0, so reverse to keep item 0 on top.
        bottom_up = usable[::-1]
        labels = [_truncate(row.get(label_key), 28) for row in bottom_up]
        values = [float(row.get(value_key) or 0) for row in bottom_up]
        positions = list(range(len(usable)))

        axes.barh(positions, values, color="#9ec6df", edgecolor="#5b8aa6",
                  linewidth=0.4)
        axes.set_yticks(positions)
        axes.set_yticklabels(labels, fontsize=7)
        axes.set_xlabel(xlabel, fontsize=8)
        axes.tick_params(labelsize=7)
        for side in ("top", "right"):
            axes.spines[side].set_visible(False)
        axes.set_title(_truncate(title, 44), fontsize=10, loc="left")
        figure.tight_layout()
        return figure
    return make


def _wordcloud_figure(texts):
    """Return a word-cloud figure callable, or None without ``wordcloud``.

    The availability check runs immediately; the cloud itself is built only
    when the returned callable runs. If generating or drawing the cloud
    raises, the figure shows a placeholder text instead.
    """
    try:
        import wordcloud  # noqa: F401
    except Exception:  # noqa: BLE001 — optional dependency: omit the figure.
        return None

    def make():
        import matplotlib
        matplotlib.use("Agg")
        from matplotlib.figure import Figure
        from wordcloud import WordCloud

        figure = Figure(figsize=(6.2, 3.2))
        axes = figure.add_subplot(111)
        corpus = " ".join(doc for doc in texts if isinstance(doc, str))
        try:
            cloud = WordCloud(width=800, height=400, background_color="white",
                              colormap="viridis").generate(corpus)
            axes.imshow(cloud, interpolation="bilinear")
        except Exception:  # noqa: BLE001
            axes.text(0.5, 0.5, "(nube de palabras no disponible)",
                      ha="center", va="center", color="#8a8a8a",
                      transform=axes.transAxes)
        axes.axis("off")
        figure.tight_layout()
        return figure
    return make
def _percentile_line(stats: dict) -> str:
    """Render ``mean · p50 · p90 · p99`` from one length-statistics dict."""
    return (f"{_fmt_num(stats.get('mean'))} · {_fmt_int(stats.get('p50'))} · "
            f"{_fmt_int(stats.get('p90'))} · {_fmt_int(stats.get('p99'))}")


def _summary_kv(n_docs, length_stats, vocab, lang, dup, read):
    """Build the key/value summary table for one text column.

    Always includes document count, character/word/sentence lengths and the
    vocabulary rows. The language, exact-duplicate and readability rows are
    added only when the corresponding result dict reports usable data.
    Missing inputs (``None``) are treated as empty dicts.
    """
    length_stats = length_stats or {}
    vocab = vocab or {}  # same None-tolerance as length_stats
    chars = length_stats.get("chars") or {}
    words = length_stats.get("words") or {}
    sents = length_stats.get("sentences") or {}
    rows = [
        ("Documentos", _fmt_int(n_docs)),
        ("Caracteres (media · p50 · p90 · p99)", _percentile_line(chars)),
        ("Palabras (media · p50 · p90 · p99)", _percentile_line(words)),
        ("Frases (media · máx)",
         f"{_fmt_num(sents.get('mean'))} · {_fmt_int(sents.get('max'))}"),
        ("Vocabulario (tokens · tipos · TTR)",
         f"{_fmt_int(vocab.get('n_tokens'))} · {_fmt_int(vocab.get('n_types'))} "
         f"· {_fmt_num(vocab.get('ttr'), 3)}"),
        ("Hapax legomena",
         f"{_fmt_int(vocab.get('n_hapax'))} ({_fmt_pct(vocab.get('hapax_pct'))})"),
    ]
    if isinstance(lang, dict) and lang.get("available"):
        dom = lang.get("dominant")
        n_langs = len(lang.get("distribution") or [])
        rows.append(("Idioma dominante · nº idiomas",
                     f"{model._safe_str(dom) or '—'} · {_fmt_int(n_langs)}"))
    if isinstance(dup, dict) and dup.get("n_docs"):
        rows.append(("Duplicados exactos",
                     f"{_fmt_int(dup.get('n_exact_dup'))} "
                     f"({_fmt_pct(dup.get('exact_dup_pct'))})"))
    if isinstance(read, dict) and read.get("available"):
        flesch = read.get("flesch") or {}
        rows.append(("Legibilidad Flesch (media)",
                     _fmt_num(flesch.get("mean"), 1)))
    return model.KVTable(rows=rows, title="Resumen del texto")


def _terms_table(vocab) -> "model.DataTable | None":
    """Top-terms table from ``vocab["top_terms"]``, or None when empty."""
    top = (vocab or {}).get("top_terms") or []
    if not top:
        return None
    rows = [[_truncate(t.get("term"), 32), _fmt_int(t.get("count")),
             _fmt_pct(t.get("pct"))]
            for t in top[:_TOP_TERMS] if isinstance(t, dict)]
    if not rows:
        return None
    return model.DataTable(header=["Término", "Conteo", "% tokens"], rows=rows,
                           title="Términos más frecuentes",
                           note="stopwords ES+EN eliminadas")


def _ngram_table(items, n_label) -> "model.DataTable | None":
    """N-gram frequency table labelled *n_label*, or None when empty."""
    if not items:
        return None
    rows = [[_truncate(it.get("ngram"), 40), _fmt_int(it.get("count"))]
            for it in items[:_TOP_NGRAMS] if isinstance(it, dict)]
    if not rows:
        return None
    return model.DataTable(header=[n_label, "Conteo"], rows=rows,
                           title=f"{n_label} más frecuentes")


def _dup_note(dup, lang, read) -> "model.Note | None":
    """Closing note on near-duplicates and omitted optional pieces.

    Returns None when there is nothing to report. The near-duplicate remark
    is emitted only when *dup* actually carries results: an empty dict means
    the duplicate computation produced nothing, so saying that exact
    duplicates are reported would be false.
    """
    bits = []
    if isinstance(dup, dict) and dup:
        nd = dup.get("near_dup")
        if not isinstance(nd, dict):
            nd = {}
        if nd.get("available"):
            bits.append(
                f"casi-duplicados detectados (MinHash, umbral "
                f"{_fmt_num(nd.get('threshold'))}): "
                f"{_fmt_int(nd.get('n_near_dup_docs'))} documentos")
        else:
            bits.append("near-duplicados no calculados (datasketch no instalado; "
                        "se reportan solo los duplicados exactos por hash)")
    if isinstance(lang, dict) and not lang.get("available"):
        bits.append("detección de idioma omitida (langdetect no instalado)")
    if isinstance(read, dict) and not read.get("available"):
        bits.append("legibilidad omitida (textstat no instalado)")
    if not bits:
        return None
    return model.Note(" · ".join(bits))
def _intro_blocks(n_cols, mark_terms):
    """Return the chapter heading plus the introductory paragraph.

    When *mark_terms* is true, TTR and hapax legomena are wrapped in
    ``[[term:...]]`` glossary markers; otherwise they appear as plain text.
    *n_cols* is accepted but not used.
    """
    if mark_terms:
        ttr = "[[term:ttr]]TTR[[/term]]"
        hapax = "[[term:hapax]]hapax legomena[[/term]]"
    else:
        ttr = "TTR"
        hapax = "hapax legomena"
    text = (
        "Este capítulo perfila las columnas de **texto libre largo** del "
        "dataset (reseñas, descripciones, comentarios): contenido lingüístico "
        "que la distribución categórica no resume bien. Para cada columna se "
        "muestran la longitud de los documentos, la riqueza de vocabulario "
        f"(incluido el {ttr} y el porcentaje de {hapax}), los términos y "
        "n-gramas más frecuentes, los idiomas detectados y el nivel de "
        "duplicación. Las métricas son baratas y sin modelos pesados; las "
        "piezas que dependen de una librería opcional se omiten si no está "
        "instalada.")
    heading = model.Heading(text=CHAPTER_TITLE, level=1)
    return [heading, model.Markdown(text=text)]


def build_text_distr(profile: dict, ctx: dict):
    """Build the free-text Chapter, or None if no long-text column applies.

    Steps: (1) cheap gate on the profile, (2) raw sample for the candidates,
    (3) confirmation by median word count. Glossary terms are registered only
    after the chapter is known to apply and only when ``ctx['glossary']`` is a
    ``GlossaryCollector``. At most ``_MAX_TEXT_COLS`` columns are rendered; a
    note reports how many were left out.
    """
    profile, ctx = profile or {}, ctx or {}

    # 1) Cheap gate from the profile (no DB access yet).
    candidates = _candidate_columns(profile)
    if not candidates:
        return None

    # 2) Raw sample.
    samples = _get_samples(profile, ctx, candidates)
    if not samples:
        return None

    # 3) Confirm genuine long text (median words >= threshold).
    survivors = _confirm_long_text(samples)
    if not survivors:
        return None

    # Register the clickable glossary terms now that the chapter applies.
    glossary = ctx.get("glossary")
    mark_terms = isinstance(glossary, model.GlossaryCollector)
    if mark_terms:
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)

    blocks = list(_intro_blocks(len(survivors), mark_terms))

    shown = list(survivors.items())[:_MAX_TEXT_COLS]
    for position, (name, stats) in enumerate(shown):
        column_texts = samples.get(name) or []
        blocks.append(
            _column_group(name, column_texts, stats, position, mark_terms))

    hidden = len(survivors) - len(shown)
    if hidden > 0:
        blocks.append(model.Note(
            f"Se muestran las primeras {len(shown)} columnas de texto; "
            f"quedan {hidden} sin mostrar para mantener acotado el informe."))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
Verifies that ``build_text_distr``: + +- GOLDEN: with a long-text column, emits the chapter with its key blocks + (length summary, word histogram, top-terms table, n-gram tables, language + bars) and registers the clickable glossary terms; and that it renders inside + the full document to both PDF and PPTX showing that content. +- EDGE (None): a dataset whose only string column is short labels (titanic-like + ``Name``) yields ``None`` without raising — the existing report is untouched. +- EDGE (None): a column that passes the cheap char gate but whose documents are + short (median words below the threshold) is rejected at the confirmation step. +- DEGRADATION: with ``langdetect`` / ``textstat`` / ``wordcloud`` unavailable, + the chapter still builds (those pieces are omitted) and never raises. +""" + +import builtins +import os +import tempfile + +from pypdf import PdfReader +from pptx import Presentation + +from datascience.automatic_eda.model import ( + DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown, + Note, +) +from datascience.automatic_eda.chapters.text_distr import ( + CHAPTER_ID, CHAPTER_VERSION, build_text_distr, +) +from datascience.automatic_eda.chapters_registry import build_document +from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf +from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx + + +# --------------------------------------------------------------------------- # +# Synthetic corpus + profiles. 
+# --------------------------------------------------------------------------- # +_ES = [ + "El producto llegó en perfecto estado y mucho antes de lo previsto por la tienda", + "La calidad de los materiales es realmente excelente y se nota la diferencia al usarlo", + "No me convenció del todo porque esperaba bastante más por el precio que pagué finalmente", + "El servicio de atención al cliente fue rápido amable y resolvió mi problema sin demora", + "Lo recomiendo totalmente ya que ha superado con creces todas mis expectativas iniciales", +] +_EN = [ + "The product arrived in perfect condition and much earlier than the store had promised me", + "The build quality is genuinely outstanding and you can really feel the difference using it", + "I was not fully convinced because I expected quite a lot more for the price i finally paid", + "Customer support was fast friendly and solved my whole problem without any delay at all", + "I highly recommend it since it has exceeded by far every one of my initial expectations", +] + + +def _long_reviews(n=40) -> list: + """A corpus of long multi-sentence reviews (>= 20 words each), mixing two + languages and including a few exact duplicates.""" + out = [] + for i in range(n): + base = _ES if i % 3 != 0 else _EN # mostly ES, some EN + a = base[i % len(base)] + b = base[(i + 2) % len(base)] + out.append(f"{a}. {b}.") + # Inject a couple of exact duplicates. 
+ out.append(out[0]) + out.append(out[1]) + return out + + +def _text_profile() -> dict: + """Profile with a long free-text column (review) + a numeric + a short cat.""" + return { + "table": "reviews", + "source": "/data/reviews.duckdb", + "profiled_at": "2026-06-30T10:00:00+00:00", + "n_rows": 42, + "n_cols": 3, + "quality_score": 88.0, + "columns": [ + { + "name": "review", + "inferred_type": "categorical", + "categorical": { + "top": [{"value": "x", "count": 2, "pct": 0.05}], + "n_distinct": 40, + "len_mean": 180.0, + "len_min": 80, + "len_max": 220, + }, + }, + { + "name": "rating", + "inferred_type": "numeric", + "numeric": {"mean": 3.1, "median": 3.0, "std": 1.2, + "min": 1, "max": 5}, + }, + { + "name": "product", + "inferred_type": "categorical", + "categorical": { + "top": [{"value": "teclado", "count": 10, "pct": 0.25}], + "n_distinct": 6, + "len_mean": 7.0, + "len_min": 5, "len_max": 11, + }, + }, + ], + } + + +def _no_text_profile() -> dict: + """titanic-like: the only string column is short labels (Name ≈ 27 chars).""" + return { + "table": "titanic", + "n_rows": 891, + "n_cols": 3, + "columns": [ + {"name": "Age", "inferred_type": "numeric", + "numeric": {"mean": 29.7, "median": 28.0, "std": 14.5}}, + {"name": "Name", "inferred_type": "categorical", + "categorical": {"top": [{"value": "Braund, Mr. Owen Harris", + "count": 1, "pct": 0.001}], + "n_distinct": 891, "len_mean": 27.0, + "len_min": 12, "len_max": 82}}, + {"name": "Sex", "inferred_type": "categorical", + "categorical": {"top": [{"value": "male", "count": 577, + "pct": 0.65}], + "n_distinct": 2, "len_mean": 4.6, + "len_min": 4, "len_max": 6}}, + ], + } + + +def _flatten(blocks) -> list: + """Recursively flatten Group blocks so tests can inspect leaf blocks.""" + out = [] + for b in blocks: + if isinstance(b, Group): + out.extend(_flatten(b.blocks)) + else: + out.append(b) + return out + + +# --------------------------------------------------------------------------- # +# Golden. 
+# --------------------------------------------------------------------------- # +def test_golden_activa_con_texto(): + glossary = GlossaryCollector() + ctx = {"text_raw": {"review": _long_reviews()}, "glossary": glossary} + ch = build_text_distr(_text_profile(), ctx) + + assert ch is not None, "el capítulo debe activarse con una columna de texto largo" + assert ch.id == CHAPTER_ID + assert ch.version == CHAPTER_VERSION + leaves = _flatten(ch.blocks) + kinds = [b.kind for b in leaves] + assert "heading" in kinds + assert "kv_table" in kinds # summary + assert "figure" in kinds # histogram / bars + assert "data_table" in kinds # top terms + n-grams + + # KV summary mentions vocabulary metrics. + kv = next(b for b in leaves if isinstance(b, KVTable)) + labels = " ".join(str(r[0]) for r in kv.rows) + assert "TTR" in labels + assert "Hapax" in labels or "hapax" in labels + + # There is a terms table and at least one n-gram table. + titles = [getattr(b, "title", "") or "" for b in leaves + if isinstance(b, DataTable)] + assert any("Términos" in t for t in titles) + assert any("Bigrama" in t for t in titles) + + # Glossary terms were registered (clickable destinations). 
+ assert glossary.has("ttr") + assert glossary.has("hapax") + + +def test_golden_render_pdf_pptx(): + profile = _text_profile() + ctx = {"text_raw": {"review": _long_reviews()}, + "dataset_name": "reviews"} + chapters = build_document(profile, ctx) + ids = [c.id for c in chapters] + assert "text_distr" in ids, f"text_distr ausente en {ids}" + + with tempfile.TemporaryDirectory() as d: + pdf = os.path.join(d, "t.pdf") + pptx = os.path.join(d, "t.pptx") + rp = render_automatic_eda_pdf(profile, pdf, {"title": "EDA", "ctx": ctx}) + rx = render_automatic_eda_pptx(profile, pptx, {"title": "EDA", "ctx": ctx}) + assert rp.get("path") and os.path.exists(pdf) + assert rx.get("path") and os.path.exists(pptx) + + text = "\n".join(p.extract_text() or "" for p in PdfReader(pdf).pages) + assert "Texto libre" in text or "TTR" in text + + prs = Presentation(pptx) + ptext = [] + for slide in prs.slides: + for shp in slide.shapes: + if shp.has_text_frame: + ptext.append(shp.text_frame.text) + joined = "\n".join(ptext) + assert "Texto libre" in joined or "TTR" in joined + + +# --------------------------------------------------------------------------- # +# Edges — None. +# --------------------------------------------------------------------------- # +def test_edge_none_sin_texto_largo(): + # titanic-like: short labels only → chapter must not apply. + assert build_text_distr(_no_text_profile(), {}) is None + + +def test_edge_none_palabras_cortas(): + # Char gate passes (len_mean high) but documents are short → confirmation + # rejects them (median words below threshold). 
+ profile = _text_profile() + short = ["palabra " * 3] * 30 # 3 words each, < _MIN_WORDS + ctx = {"text_raw": {"review": short}} + assert build_text_distr(profile, ctx) is None + + +def test_edge_none_empty_profile(): + assert build_text_distr({}, {}) is None + assert build_text_distr(None, None) is None + + +# --------------------------------------------------------------------------- # +# Degradation — optional libs absent. +# --------------------------------------------------------------------------- # +def test_degradacion_sin_libs(monkeypatch): + real_import = builtins.__import__ + blocked = ("langdetect", "textstat", "wordcloud", "datasketch") + + def fake_import(name, *a, **k): + if name in blocked or any(name.startswith(b + ".") for b in blocked): + raise ImportError(f"simulado: {name}") + return real_import(name, *a, **k) + + monkeypatch.setattr(builtins, "__import__", fake_import) + + ctx = {"text_raw": {"review": _long_reviews()}} + ch = build_text_distr(_text_profile(), ctx) + # Still builds (the cheap, stdlib-only pieces remain) and never raises. + assert ch is not None + leaves = _flatten(ch.blocks) + assert any(isinstance(b, KVTable) for b in leaves) + assert any(isinstance(b, DataTable) for b in leaves) + # A degradation note is present mentioning the missing optional libs. 
+ notes = " ".join(b.text for b in leaves if isinstance(b, Note)) + assert "langdetect" in notes or "textstat" in notes or "datasketch" in notes diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py index d9030999..70351b6d 100644 --- a/python/functions/datascience/automatic_eda/chapters_registry.py +++ b/python/functions/datascience/automatic_eda/chapters_registry.py @@ -31,6 +31,7 @@ CHAPTER_ORDER = [ "analisis_llm", # LLM interpretation — sits next to overview (user request) "num_distr", # numeric distributions "cat_distr", # categorical distributions + "text_distr", # free-text / NLP distributions (non-tabular content) "calidad", # data quality "correlacion", # correlations / associations "relaciones", # key relations: declared/candidate PK + FK (inter/intra-table) diff --git a/python/functions/datascience/compute_text_duplicates.md b/python/functions/datascience/compute_text_duplicates.md new file mode 100644 index 00000000..735eef17 --- /dev/null +++ b/python/functions/datascience/compute_text_duplicates.md @@ -0,0 +1,102 @@ +--- +id: compute_text_duplicates_py_datascience +name: compute_text_duplicates +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict" +description: "Detecta documentos duplicados en un corpus de texto. Los duplicados EXACTOS se calculan siempre con la stdlib: cada documento se normaliza (colapsa espacios, strip, lower) y se hashea con SHA-1; n_exact_dup es cuántos docs repiten uno ya visto y exact_dup_pct su porcentaje. Los CASI-duplicados (near-dup) usan la dependencia OPCIONAL datasketch (MinHash + LSH sobre 3-shingles de palabras); si no está instalada, esa parte degrada a available:False sin afectar al resto. Estilo dict-no-throw del grupo eda — nunca lanza." 
+tags: [eda, datascience, text, nlp, duplicates, minhash, pure, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [hashlib, re] +example: | + from datascience.compute_text_duplicates import compute_text_duplicates + texts = ["El gato come pescado", "El gato come pescado", "Un perro ladra"] + result = compute_text_duplicates(texts) + # {"n_docs": 3, "n_exact_dup": 1, "exact_dup_pct": 33.33, "n_unique": 2, + # "near_dup": {"available": False, "n_near_dup_docs": 0}} +tested: true +tests: + - "test_duplicados_exactos" + - "test_sin_duplicados" + - "test_vacio" + - "test_near_dup_degrada" +test_file_path: "python/functions/datascience/compute_text_duplicates_test.py" +file_path: "python/functions/datascience/compute_text_duplicates.py" +params: + - name: texts + desc: "Lista de documentos de texto. Los elementos None o que no sean str se descartan silenciosamente; n_docs cuenta solo los documentos válidos. None como argumento se trata como lista vacía." + - name: near_threshold + desc: "Umbral de similitud Jaccard (0–1) para considerar dos documentos casi-duplicados en el cálculo near-dup vía MinHashLSH. Solo aplica si datasketch está instalada. Default 0.85." + - name: sample_max + desc: "Número máximo de documentos muestreados (los primeros) para el cálculo near-dup, que es O(n) en memoria de MinHashes. No afecta al conteo de duplicados exactos, que siempre recorre todo el corpus. Default 2000." +output: "Dict con exactamente 5 claves, siempre presentes: n_docs (int, docs válidos), n_exact_dup (int, docs que repiten un texto normalizado ya visto = n_docs - n_unique), exact_dup_pct (float a 2 decimales = n_exact_dup/n_docs*100, o None si el corpus está vacío), n_unique (int, nº de textos normalizados distintos), y near_dup (sub-dict con available:bool y n_near_dup_docs:int; cuando available es True incluye además threshold con el near_threshold usado). La función nunca lanza: captura toda excepción y degrada." 
+--- + +## Ejemplo + +```python +from datascience.compute_text_duplicates import compute_text_duplicates + +# Tres copias del mismo texto (con espacios/casing distintos) + dos únicos. +texts = [ + "El gato come pescado", + "El gato come pescado", + "el GATO come pescado", # mismo tras normalizar + "Un perro ladra", + "La luna brilla", +] + +compute_text_duplicates(texts) +# { +# "n_docs": 5, +# "n_exact_dup": 2, # 3 copias del primer texto => 2 repeticiones +# "exact_dup_pct": 40.0, # 2 / 5 * 100 +# "n_unique": 3, # 3 textos normalizados distintos +# "near_dup": {"available": False, "n_near_dup_docs": 0}, # datasketch ausente +# } + +# Corpus vacío: contrato estable, exact_dup_pct None, sin excepción. +compute_text_duplicates([]) +# {"n_docs": 0, "n_exact_dup": 0, "exact_dup_pct": None, "n_unique": 0, +# "near_dup": {"available": False, "n_near_dup_docs": 0}} +``` + +## Cuando usarla + +Úsala en la fase de calidad de un EDA de texto, cuando quieras saber cuánto de +tu corpus es ruido duplicado antes de entrenar, vectorizar o muestrear: te da +el porcentaje de duplicados exactos (`exact_dup_pct`), el número de documentos +únicos (`n_unique`) y, si tienes `datasketch` instalada, una estimación de +casi-duplicados (paráfrasis, copias con pequeñas ediciones) vía MinHash + LSH. +Pásale directamente la columna/lista de textos crudos; la función filtra None y +no-str por ti y nunca lanza, así que es segura para encadenar en pipelines de +perfilado. + +## Gotchas + +- **Near-dup requiere `datasketch` (opcional).** Si la librería no está + instalada, `near_dup` degrada a `{"available": False, "n_near_dup_docs": 0}` + (sin clave `threshold`) y el resto del resultado se calcula igual. Los + duplicados **exactos** funcionan siempre porque solo usan la stdlib (hash). 
"""Duplicate-document detection for a text corpus.

Pure helper in the dict-no-throw style of the ``eda`` group: it never raises
and always returns the same set of keys. EXACT duplicates are always computed
with the standard library (whitespace/case normalization + SHA-1 digest).
NEAR duplicates need the optional ``datasketch`` dependency; when it is not
installed that part degrades to ``available: False`` without affecting the
rest of the result.
"""

import hashlib
import re


def _compute_near_dup(valid, near_threshold, sample_max):
    """Count documents that have at least one near-duplicate (MinHash + LSH).

    ``datasketch`` is imported lazily. If the import or any later step fails,
    the function returns ``{"available": False, "n_near_dup_docs": 0}``
    instead of propagating the exception.

    Args:
        valid: list of ``str`` documents, already filtered.
        near_threshold: Jaccard similarity threshold handed to ``MinHashLSH``.
        sample_max: maximum number of leading documents to examine. ``None``
            means "all"; negative values are clamped to 0.

    Returns:
        dict with ``available`` (bool) and ``n_near_dup_docs`` (int); when
        ``available`` is True it also carries ``threshold``.
    """
    try:
        from datasketch import MinHash, MinHashLSH
    except Exception:
        return {"available": False, "n_near_dup_docs": 0}

    try:
        # A negative bound would otherwise slice "everything but the tail",
        # which is never what a sampling cap means.
        limit = None if sample_max is None else max(int(sample_max), 0)
        docs = valid[:limit]
        num_perm = 128
        lsh = MinHashLSH(threshold=near_threshold, num_perm=num_perm)
        minhashes = {}

        for i, doc in enumerate(docs):
            tokens = re.findall(r"\w+", doc.lower())
            shingles = {" ".join(tokens[j:j + 3]) for j in range(len(tokens) - 2)}
            # Fewer than 3 tokens yields no 3-shingles: fall back to the bare
            # tokens so short documents are not lost entirely.
            if not shingles:
                shingles = set(tokens)
            if not shingles:
                # No tokens at all (empty string / symbols only): skip it.
                continue
            m = MinHash(num_perm=num_perm)
            for sh in shingles:
                m.update(sh.encode("utf-8"))
            key = "d{}".format(i)
            minhashes[key] = m
            lsh.insert(key, m)

        # Every document matches itself, so "> 1" means "someone else too".
        n_near = sum(1 for m in minhashes.values() if len(lsh.query(m)) > 1)

        return {
            "available": True,
            "n_near_dup_docs": int(n_near),
            "threshold": near_threshold,
        }
    except Exception:
        return {"available": False, "n_near_dup_docs": 0}


def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict:
    """Detect exact and near-duplicate documents in a text corpus.

    Two documents are the same exact duplicate when they are equal after
    collapsing runs of whitespace, trimming both ends and lower-casing.

    Args:
        texts: iterable of documents. ``None`` items and non-``str`` items
            are dropped; ``None`` as the argument is treated as empty.
        near_threshold: Jaccard threshold for the near-duplicate pass (only
            used when ``datasketch`` is installed).
        sample_max: cap on the documents sampled for the near-duplicate pass.
            It does not limit the exact-duplicate count.

    Returns:
        dict with ``n_docs``, ``n_exact_dup`` (``n_docs - n_unique``),
        ``exact_dup_pct`` (float rounded to 2 decimals, ``None`` for an empty
        corpus), ``n_unique`` and ``near_dup``. Never raises.
    """
    # Defensive filtering: a non-iterable argument degrades to an empty corpus.
    try:
        valid = [t for t in texts if isinstance(t, str)] if texts is not None else []
    except Exception:
        valid = []

    n_docs = len(valid)

    try:
        seen = set()
        for doc in valid:
            # join(split()) already removes leading/trailing whitespace.
            norm = " ".join(doc.split()).lower()
            # "surrogatepass" keeps lone surrogates encodable; with the strict
            # default a single such document raised and zeroed the counts for
            # the whole corpus.
            seen.add(hashlib.sha1(norm.encode("utf-8", "surrogatepass")).hexdigest())
        n_unique = len(seen)
        n_exact_dup = n_docs - n_unique
    except Exception:
        n_exact_dup = 0
        n_unique = 0

    exact_dup_pct = round(n_exact_dup / n_docs * 100, 2) if n_docs > 0 else None

    # Near-duplicates are optional and degrade on their own.
    near_dup = _compute_near_dup(valid, near_threshold, sample_max)

    return {
        "n_docs": n_docs,
        "n_exact_dup": n_exact_dup,
        "exact_dup_pct": exact_dup_pct,
        "n_unique": n_unique,
        "near_dup": near_dup,
    }
+datasketch normalmente NO esta instalada en el venv, asi que near_dup +degrada a available=False; los tests no requieren la libreria. +""" + +from datascience.compute_text_duplicates import compute_text_duplicates + + +EXPECTED_KEYS = {"n_docs", "n_exact_dup", "exact_dup_pct", "n_unique", "near_dup"} + + +def test_duplicados_exactos(): + """3 copias del mismo texto + 2 únicos: n_exact_dup=2, pct>0.""" + texts = [ + "El gato come pescado", + "El gato come pescado", + "el GATO come pescado", # mismo tras normalizar (espacios + case) + "Un perro ladra", + "La luna brilla", + ] + result = compute_text_duplicates(texts) + + assert set(result.keys()) == EXPECTED_KEYS + assert result["n_docs"] == 5 + # 3 copias del primer texto (2 son repeticion) + 2 textos unicos. + assert result["n_exact_dup"] == 2 + assert result["n_unique"] == 3 + assert result["exact_dup_pct"] is not None + assert result["exact_dup_pct"] > 0 + # 2 / 5 * 100 = 40.0 + assert abs(result["exact_dup_pct"] - 40.0) < 1e-9 + + +def test_sin_duplicados(): + """Corpus sin repeticiones: n_exact_dup=0, n_unique==n_docs.""" + texts = [ + "primero documento distinto", + "segundo documento distinto", + "tercero documento distinto", + ] + result = compute_text_duplicates(texts) + + assert result["n_docs"] == 3 + assert result["n_exact_dup"] == 0 + assert result["n_unique"] == 3 + assert abs(result["exact_dup_pct"] - 0.0) < 1e-9 + + +def test_vacio(): + """Corpus vacio: n_docs 0, exact_dup_pct None, no lanza.""" + result = compute_text_duplicates([]) + + assert set(result.keys()) == EXPECTED_KEYS + assert result["n_docs"] == 0 + assert result["n_exact_dup"] == 0 + assert result["exact_dup_pct"] is None + assert result["n_unique"] == 0 + assert result["near_dup"]["n_near_dup_docs"] == 0 + + +def test_near_dup_degrada(): + """near_dup expone 'available' (bool) y no lanza aunque falte datasketch.""" + texts = ["uno dos tres cuatro", "uno dos tres cuatro cinco", "algo distinto"] + result = compute_text_duplicates(texts) 
+ + near = result["near_dup"] + assert "available" in near + assert isinstance(near["available"], bool) + assert "n_near_dup_docs" in near + assert isinstance(near["n_near_dup_docs"], int) + # Tambien tolera None y entradas no-str sin lanzar. + mixed = compute_text_duplicates(["hola", None, 123, "hola"]) + assert mixed["n_docs"] == 2 + assert mixed["n_exact_dup"] == 1 diff --git a/python/functions/datascience/compute_text_length_stats.md b/python/functions/datascience/compute_text_length_stats.md new file mode 100644 index 00000000..92ab1246 --- /dev/null +++ b/python/functions/datascience/compute_text_length_stats.md @@ -0,0 +1,86 @@ +--- +id: compute_text_length_stats_py_datascience +name: compute_text_length_stats +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def compute_text_length_stats(texts, n_bins=20) -> dict" +description: "Profiles the length distribution of a corpus of text documents for EDA: per-document characters, words (unicode \\w+ tokens) and sentences (segments split on .!?… with a minimum of 1 per non-empty doc), each summarized with mean/p50/p90/p99/min/max (nearest-rank percentiles), plus an equal-width histogram of per-document word counts. None and non-str items are discarded. Dict-no-throw: never raises. Stdlib only (re)." 
+tags: [eda, datascience, text, nlp, length, statistics, pure, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re, math] +example: | + from datascience.compute_text_length_stats import compute_text_length_stats + result = compute_text_length_stats(["Hola mundo.", "Una frase mas larga aqui."], n_bins=5) +tested: true +tests: + - "test_basico" + - "test_vacio" + - "test_descarta_none" + - "test_un_documento" +test_file_path: "python/functions/datascience/compute_text_length_stats_test.py" +file_path: "python/functions/datascience/compute_text_length_stats.py" +params: + - name: texts + desc: "List of text documents (str). None entries and any non-str items (ints, floats, etc.) are discarded before any computation. An empty string \"\" is kept (chars 0, words 0, sentences 0)." + - name: n_bins + desc: "Number of equal-width bins for the per-document word-count histogram. Default 20. When all docs have the same word count, there are <2 docs, or n_bins < 1, a single covering bin is returned instead." +output: "Dict with keys n_docs (int), chars, words, sentences and word_hist. Each of the three axis sub-dicts has the exact keys mean (float, 2 decimals), p50, p90, p99, min, max (ints). When there are no valid documents, n_docs is 0, every axis statistic is None and word_hist is []. word_hist is a list of {lo: float, hi: float, count: int} bins; the sum of all bin counts equals n_docs." +--- + +## Ejemplo + +```python +from datascience.compute_text_length_stats import compute_text_length_stats + +compute_text_length_stats( + [ + "Hola mundo.", + "Una frase mas larga con varias palabras aqui.", + "Esto. Tiene. 
Tres frases distintas!", + ], + n_bins=5, +) +# { +# "n_docs": 3, +# "chars": {"mean": 30.33, "p50": 35, "p90": 45, "p99": 45, "min": 11, "max": 45}, +# "words": {"mean": 5.0, "p50": 5, "p90": 8, "p99": 8, "min": 2, "max": 8}, +# "sentences": {"mean": 1.67, "p50": 1, "p90": 3, "p99": 3, "min": 1, "max": 3}, +# "word_hist": [ +# {"lo": 2.0, "hi": 3.2, "count": 1}, +# {"lo": 3.2, "hi": 4.4, "count": 0}, +# {"lo": 4.4, "hi": 5.6, "count": 1}, +# {"lo": 5.6, "hi": 6.8, "count": 0}, +# {"lo": 6.8, "hi": 8.0, "count": 1}, +# ], +# } +``` + +## Cuando usarla + +Úsala al perfilar una columna o corpus de texto libre en un EDA: cuando +necesites saber lo largos que son los documentos (en caracteres, palabras y +frases) y cómo se reparte esa longitud antes de tokenizar, vectorizar o decidir +truncados/ventanas para un modelo. Pásale la lista de strings crudos de la +columna; `None` y valores no-texto se descartan solos. Encaja en el grupo `eda` +como bloque de longitud junto a `summarize_categorical`. + +## Gotchas + +- Función pura, solo stdlib (`re`, `math`). No usa numpy, pandas ni sklearn. +- Percentiles por método **nearest-rank** (devuelven un valor real de la lista, + no interpolan); por eso p50/p90/p99/min/max son enteros y `mean` es el único + float (redondeado a 2 decimales). +- El conteo de frases es una **aproximación** por puntuación (`.!?…`): un texto + sin esa puntuación cuenta como 1 frase si no está vacío; abreviaturas o + ellipsis pueden inflar o reducir el conteo. +- `word_hist` es equal-width entre min y max de palabras: con todos los docs + del mismo tamaño, menos de 2 docs, o `n_bins < 1`, devuelve un único bin. +- Dict-no-throw: ante input inesperado devuelve la forma vacía + (`n_docs` 0, ejes `None`, `word_hist` []) en vez de lanzar. 
"""Pure EDA helper: document length distribution for the `eda` group.

Given a collection of text documents, computes the length distribution along
three axes (characters, words and sentences) plus an equal-width histogram of
the per-document word counts. Standard library only (``re`` and ``math``),
with a hand-rolled nearest-rank percentile.

The entry point is dict-no-throw: it never raises. On unusable input it
degrades to the empty-shape result.
"""

import math
import re

_WORD_RE = re.compile(r"\w+", re.UNICODE)
_SENT_RE = re.compile(r"[.!?…]+")


def _empty_axis() -> dict:
    """Return an axis sub-dict with every statistic set to ``None``."""
    return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None}


def _empty_result() -> dict:
    """Return a fresh empty-shape result (no documents)."""
    return {
        "n_docs": 0,
        "chars": _empty_axis(),
        "words": _empty_axis(),
        "sentences": _empty_axis(),
        "word_hist": [],
    }


def _pct(sorted_vals, q):
    """Nearest-rank percentile of an already-sorted list.

    Args:
        sorted_vals: list of numbers sorted ascending.
        q: percentile in the 0..100 range.

    Returns:
        The value at rank ``ceil(q / 100 * n)`` (clamped to ``1..n``), or
        ``None`` for an empty list.
    """
    n = len(sorted_vals)
    if n == 0:
        return None
    if q <= 0:
        return sorted_vals[0]
    rank = min(max(math.ceil(q / 100.0 * n), 1), n)
    return sorted_vals[rank - 1]


def _axis_stats(values) -> dict:
    """Compute mean/p50/p90/p99/min/max over a list of integer counts.

    ``mean`` is rounded to 2 decimals; the other statistics are ints. An
    empty list yields the all-``None`` axis.
    """
    if not values:
        return _empty_axis()
    sv = sorted(values)
    return {
        "mean": round(sum(sv) / len(sv), 2),
        "p50": int(_pct(sv, 50)),
        "p90": int(_pct(sv, 90)),
        "p99": int(_pct(sv, 99)),
        "min": int(sv[0]),
        "max": int(sv[-1]),
    }


def _coerce_bins(n_bins) -> int:
    """Turn ``n_bins`` into an int; unusable values become 0 (single bin)."""
    try:
        return int(n_bins)
    except (TypeError, ValueError, OverflowError):
        return 0


def _word_hist(word_counts, n_bins) -> list:
    """Equal-width histogram of per-document word counts.

    Builds ``n_bins`` bins between the minimum and maximum word count. A
    single covering bin is returned when every document has the same count,
    there are fewer than 2 documents, or ``n_bins`` is below 1 or cannot be
    converted to an int. With no documents the result is ``[]``. Bin counts
    always sum to ``len(word_counts)``.
    """
    if not word_counts:
        return []
    # Coerce first: a float or None here used to raise and, through the
    # caller's blanket except, discard the whole (otherwise valid) result.
    n_bins = _coerce_bins(n_bins)
    wmin = min(word_counts)
    wmax = max(word_counts)
    if wmax == wmin or len(word_counts) < 2 or n_bins < 1:
        return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}]

    width = (wmax - wmin) / n_bins
    bins = [
        {"lo": float(wmin + i * width), "hi": float(wmin + (i + 1) * width), "count": 0}
        for i in range(n_bins)
    ]
    # Pin the last upper edge to the real maximum to avoid float drift.
    bins[-1]["hi"] = float(wmax)

    for wc in word_counts:
        if wc >= wmax:
            idx = n_bins - 1
        else:
            # Clamp in case rounding pushes the index out of range.
            idx = min(max(int((wc - wmin) / width), 0), n_bins - 1)
        bins[idx]["count"] += 1
    return bins


def compute_text_length_stats(texts, n_bins=20) -> dict:
    """Summarize the length distribution of a corpus of text documents.

    Per document three lengths are measured: characters (``len(doc)``), words
    (``\\w+`` unicode tokens) and sentences (non-blank segments after
    splitting on ``.!?…``, with a minimum of 1 for any non-empty document).

    Args:
        texts: iterable of documents. ``None`` and non-``str`` items are
            dropped. Any iterable is accepted, including array-like columns
            whose truth value is ambiguous.
        n_bins: number of equal-width bins for the word-count histogram
            (default 20). Values that cannot be converted to an int produce a
            single covering bin.

    Returns:
        Dict with ``n_docs``, ``chars``, ``words``, ``sentences`` and
        ``word_hist``. Each axis holds ``mean`` (float, 2 decimals) and
        ``p50``/``p90``/``p99``/``min``/``max`` (ints), all ``None`` when
        there are no documents. ``word_hist`` is a list of ``{lo, hi, count}``
        bins whose counts sum to ``n_docs``. Never raises.
    """
    try:
        # Only None short-circuits: testing truthiness raises on array-likes
        # and used to turn a valid column into the empty result.
        if texts is None:
            return _empty_result()

        docs = [t for t in texts if isinstance(t, str)]
        if not docs:
            return _empty_result()

        char_counts = [len(d) for d in docs]
        word_counts = [len(_WORD_RE.findall(d)) for d in docs]

        sent_counts = []
        for d in docs:
            n_segments = sum(1 for s in _SENT_RE.split(d) if s.strip())
            if d and n_segments == 0:
                # Non-empty document with no detectable sentence: count as 1.
                n_segments = 1
            sent_counts.append(n_segments)

        return {
            "n_docs": len(docs),
            "chars": _axis_stats(char_counts),
            "words": _axis_stats(word_counts),
            "sentences": _axis_stats(sent_counts),
            "word_hist": _word_hist(word_counts, n_bins),
        }
    except Exception:
        # Dict-no-throw contract of the eda group.
        return _empty_result()
+ +Inserta `python/functions` en sys.path (relativo a este archivo) para importar +el modulo hoja por su paquete `datascience`, sin depender de que el paquete lo +reexporte en su __init__. +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datascience.compute_text_length_stats import compute_text_length_stats + + +def test_basico(): + """Varios textos de longitudes distintas: stats y histograma coherentes.""" + texts = [ + "Hola mundo.", # 2 words, 1 sentence + "Una frase mas larga con varias palabras aqui.", # 8 words, 1 sentence + "Corto.", # 1 word, 1 sentence + "Esto. Tiene. Tres frases distintas!", # 5 words, 3 sentences + ] + result = compute_text_length_stats(texts) + + assert result["n_docs"] == 4 + # Diferentes longitudes en palabras -> max estrictamente mayor que min. + assert result["words"]["max"] > result["words"]["min"] + # El histograma de palabras no esta vacio. + assert result["word_hist"] != [] + # La suma de counts del histograma cubre todos los documentos. + assert sum(b["count"] for b in result["word_hist"]) == result["n_docs"] + # mean es float redondeado; min/max son enteros. + assert isinstance(result["words"]["mean"], float) + assert isinstance(result["words"]["min"], int) + assert isinstance(result["words"]["max"], int) + # El documento con 3 frases empuja el max de sentences a >= 3. + assert result["sentences"]["max"] >= 3 + + +def test_vacio(): + """Lista vacia: n_docs 0, subdicts None, word_hist [].""" + result = compute_text_length_stats([]) + assert result["n_docs"] == 0 + for axis in ("chars", "words", "sentences"): + for key in ("mean", "p50", "p90", "p99", "min", "max"): + assert result[axis][key] is None + assert result["word_hist"] == [] + + +def test_descarta_none(): + """None y valores no-str se descartan del computo.""" + result = compute_text_length_stats(["hello world", None, 123, 4.5, "foo bar baz"]) + # Solo dos strings validos. 
+ assert result["n_docs"] == 2 + assert result["words"]["min"] == 2 # "hello world" + assert result["words"]["max"] == 3 # "foo bar baz" + assert sum(b["count"] for b in result["word_hist"]) == 2 + + +def test_un_documento(): + """Un solo documento: word_hist tiene exactamente un bin con count 1.""" + result = compute_text_length_stats(["solo un documento aqui"]) + assert result["n_docs"] == 1 + assert len(result["word_hist"]) == 1 + assert result["word_hist"][0]["count"] == 1 + # Con un unico documento, p50 == min == max == su numero de palabras (4). + assert result["words"]["min"] == 4 + assert result["words"]["max"] == 4 + assert result["words"]["p50"] == 4 diff --git a/python/functions/datascience/compute_text_readability.md b/python/functions/datascience/compute_text_readability.md new file mode 100644 index 00000000..b6deaf07 --- /dev/null +++ b/python/functions/datascience/compute_text_readability.md @@ -0,0 +1,88 @@ +--- +id: compute_text_readability_py_datascience +name: compute_text_readability +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def compute_text_readability(texts, sample_max=500) -> dict" +description: "Calcula la legibilidad Flesch Reading Ease de un corpus de texto usando textstat con import perezoso y degradación. Filtra None/no-str/vacíos, muestrea hasta sample_max documentos (los primeros) y agrega los scores Flesch en {mean, p50, min, max}. Si textstat no está instalada devuelve available=False sin lanzar. Estilo dict-no-throw del grupo eda — nunca lanza." +tags: [eda, datascience, text, nlp, readability, flesch, textstat, pure, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [math, textstat] +example: | + from datascience.compute_text_readability import compute_text_readability + out = compute_text_readability(["The cat sat on the mat. 
It was warm and sunny."]) + # {"available": True, "n_scored": 1, "flesch": {"mean": 109.0, "p50": 109.0, "min": 108.96..., "max": 108.96...}} +tested: true +tests: + - "test_prosa_ingles" + - "test_vacio" + - "test_degradacion" +test_file_path: "python/functions/datascience/compute_text_readability_test.py" +file_path: "python/functions/datascience/compute_text_readability.py" +params: + - name: texts + desc: "Lista de str (documentos del corpus). Los elementos None, no-str o vacíos tras strip() se descartan silenciosamente. El orden se respeta: el muestreo toma los primeros documentos válidos." + - name: sample_max + desc: "Número máximo de documentos válidos a puntuar (los primeros). Default 500. Acota el coste en corpus grandes. Valores no convertibles a int caen a 500; negativos se tratan como 0." +output: "Dict con exactamente 3 claves siempre presentes: available (bool: True si textstat se pudo importar), n_scored (int: nº de documentos efectivamente puntuados), flesch (dict con mean, p50, min, max). mean y p50 redondeados a 1 decimal; p50 por nearest-rank sobre los scores ordenados; min/max son los scores extremos sin redondear. Todos los valores de flesch son None cuando n_scored es 0. La función nunca lanza: cualquier excepción global (incluida ImportError de textstat) degrada a available=False, n_scored=0 y flesch todo None." +--- + +## Ejemplo + +```python +from datascience.compute_text_readability import compute_text_readability + +textos = [ + "The cat sat on the mat. It was a warm and sunny day in the park.", + "Reading is a wonderful habit. Books open doors to new worlds and ideas.", + "He ran quickly to the store to buy some fresh bread and a bottle of milk.", +] + +compute_text_readability(textos) +# { +# "available": True, +# "n_scored": 3, +# "flesch": {"mean": 91.4, "p50": 95.4, "min": 70.08..., "max": 108.83...} +# } + +# Corpus vacío (textstat presente): available True pero nada que puntuar. 
+compute_text_readability([]) +# {"available": True, "n_scored": 0, +# "flesch": {"mean": None, "p50": None, "min": None, "max": None}} +``` + +## Cuando usarla + +Úsala en un EDA de texto cuando necesites una métrica única y comparable de +**lo fácil que es de leer** un corpus de documentos (descripciones, reviews, +artículos, tickets). Devuelve el resumen Flesch Reading Ease agregado +(`mean`/`p50`/`min`/`max`) listo para un report o un bloque del notebook, sin +tener que iterar `textstat` a mano. Pásale la lista de textos crudos y, si el +corpus es grande, limita el coste con `sample_max`. El estilo dict-no-throw +permite incrustarla en pipelines del grupo `eda` sin envolver en try/except. + +## Gotchas + +- **`textstat` es una dependencia opcional.** Si no está instalada (o falla al + importar) la función NO lanza: devuelve `available=False`, `n_scored=0` y + `flesch` todo `None`. Comprueba `available` antes de interpretar los números. +- **Flesch Reading Ease está pensado para prosa en inglés.** Aplicado a otros + idiomas o a texto no-prosa (código, listas, tablas, cadenas muy cortas) los + scores no son interpretables, aunque se calculen sin error. +- **Escala Flesch:** valores **altos** = más fácil de leer (≈90–100 muy fácil), + valores **bajos** = más difícil (puede ser negativo en texto muy denso). No + se recortan a ningún rango: se reportan tal cual los devuelve `textstat`. +- **`available=True` con `n_scored=0`** significa que `textstat` está presente + pero el corpus no aportó documentos puntuables (vacío, solo None/no-str, o + todos los docs fallaron al puntuar). Es distinto de `available=False`. +- **Muestreo = los primeros `sample_max`**, no aleatorio. Si el orden del corpus + está sesgado, el resumen reflejará ese sesgo. +- **`mean` y `p50` redondean a 1 decimal**; `min`/`max` se devuelven sin + redondear (los scores extremos reales). 
"""Flesch Reading Ease readability of a text corpus.

Pure helper of the `eda` group in dict-no-throw style: it never raises. The
optional ``textstat`` library is imported lazily; when it is missing (or
fails to import) the result carries ``available=False`` instead of
propagating the error.
"""

import math

_DEFAULT_SAMPLE_MAX = 500


def _result(available, n_scored=0, flesch=None) -> dict:
    """Build the fixed-shape result; ``flesch`` defaults to all-``None``."""
    if flesch is None:
        flesch = {"mean": None, "p50": None, "min": None, "max": None}
    return {"available": available, "n_scored": n_scored, "flesch": flesch}


def _percentile_nearest_rank(sorted_values, pct):
    """Nearest-rank percentile of a list already sorted ascending.

    ``rank = ceil(pct / 100 * n)``, 1-based and clamped to ``[1, n]``.
    Returns ``None`` for an empty list.
    """
    n = len(sorted_values)
    if n == 0:
        return None
    rank = min(max(math.ceil((pct / 100.0) * n), 1), n)
    return sorted_values[rank - 1]


def _select_docs(texts, sample_max) -> list:
    """Return the first ``sample_max`` non-blank ``str`` items of ``texts``.

    A ``sample_max`` that cannot be converted to an int falls back to 500;
    negative values are treated as 0.
    """
    if texts is None:
        return []
    try:
        limit = int(sample_max)
    except (TypeError, ValueError, OverflowError):
        limit = _DEFAULT_SAMPLE_MAX
    limit = max(limit, 0)

    docs = []
    for item in texts:
        # Check the cap BEFORE appending: checking afterwards let one
        # document through even when the cap was 0.
        if len(docs) >= limit:
            break
        if isinstance(item, str) and item.strip():
            docs.append(item)
    return docs


def compute_text_readability(texts, sample_max=500) -> dict:
    """Compute the Flesch Reading Ease summary of a corpus.

    Args:
        texts: iterable of ``str``. ``None``, non-``str`` and blank (after
            ``strip``) items are dropped.
        sample_max: maximum number of valid documents to score, taken in
            order from the start. ``0`` scores nothing.

    Returns:
        Dict of the exact shape::

            {"available": bool, "n_scored": int,
             "flesch": {"mean": float|None, "p50": float|None,
                        "min": float|None, "max": float|None}}

        ``available`` is True when ``textstat`` could be imported. ``mean``
        and ``p50`` are rounded to 1 decimal; ``min``/``max`` are the raw
        extreme scores. Never raises: any unexpected error degrades to
        ``available=False``.
    """
    try:
        # Lazy import: textstat is an optional dependency.
        try:
            import textstat
        except Exception:
            return _result(False)

        scores = []
        for doc in _select_docs(texts, sample_max):
            # A document that fails to score is skipped, not fatal.
            try:
                scores.append(float(textstat.flesch_reading_ease(doc)))
            except Exception:
                continue

        if not scores:
            # textstat is present but nothing could be scored.
            return _result(True)

        ordered = sorted(scores)
        return _result(
            True,
            len(ordered),
            {
                "mean": round(sum(ordered) / len(ordered), 1),
                "p50": round(_percentile_nearest_rank(ordered, 50), 1),
                "min": ordered[0],
                "max": ordered[-1],
            },
        )
    except Exception:
        return _result(False)
"p50", "min", "max"} + + +def test_prosa_ingles(): + """Varios textos en prosa inglesa: available True, n_scored>0, mean no None.""" + texts = [ + "The cat sat on the mat. It was a warm and sunny day in the park.", + "She sells sea shells by the sea shore. The shells she sells are surely sea shells.", + "Reading is a wonderful habit. Books open doors to new worlds and ideas.", + "He ran quickly to the store to buy some fresh bread and a bottle of milk.", + ] + out = compute_text_readability(texts) + + assert set(out.keys()) == EXPECTED_KEYS + assert out["available"] is True + assert out["n_scored"] > 0 + assert set(out["flesch"].keys()) == FLESCH_KEYS + assert out["flesch"]["mean"] is not None + assert out["flesch"]["p50"] is not None + assert out["flesch"]["min"] is not None + assert out["flesch"]["max"] is not None + # min <= mean/p50 <= max coherente. + assert out["flesch"]["min"] <= out["flesch"]["max"] + + +def test_vacio(): + """Corpus vacío con textstat presente: available True, n_scored 0, flesch None.""" + out = compute_text_readability([]) + + assert set(out.keys()) == EXPECTED_KEYS + assert out["available"] is True + assert out["n_scored"] == 0 + assert out["flesch"]["mean"] is None + assert out["flesch"]["p50"] is None + assert out["flesch"]["min"] is None + assert out["flesch"]["max"] is None + + # Elementos no-str / vacíos también se descartan -> n_scored 0. + out2 = compute_text_readability([None, "", " ", 123]) + assert out2["available"] is True + assert out2["n_scored"] == 0 + + +def test_degradacion(monkeypatch): + """Sin textstat (ImportError forzado): degrada a available False sin lanzar.""" + import datascience.compute_text_readability as m + + real = builtins.__import__ + + def fake(name, *a, **k): + if name == "textstat" or name.startswith("textstat."): + raise ImportError("simulado") + return real(name, *a, **k) + + monkeypatch.setattr(builtins, "__import__", fake) + out = m.compute_text_readability(["The cat sat on the mat. 
It was happy and warm."]) + assert out["available"] is False + assert out["n_scored"] == 0 + assert out["flesch"]["mean"] is None + assert out["flesch"]["p50"] is None + assert out["flesch"]["min"] is None + assert out["flesch"]["max"] is None diff --git a/python/functions/datascience/compute_top_ngrams.md b/python/functions/datascience/compute_top_ngrams.md new file mode 100644 index 00000000..81c4d504 --- /dev/null +++ b/python/functions/datascience/compute_top_ngrams.md @@ -0,0 +1,103 @@ +--- +id: compute_top_ngrams_py_datascience +name: compute_top_ngrams +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict" +description: "Calcula los n-gramas de palabras más frecuentes de un corpus de texto (n=1 unigramas, 2 bigramas, 3 trigramas...). Tokeniza a minúsculas con re.findall(r'\\w+', ...), descarta tokens numéricos y, si remove_stopwords=True, elimina stopwords ES+EN ANTES de formar los n-gramas (n-gramas contiguos sobre la secuencia de tokens de contenido, sin cruzar documentos). Pura y autocontenida con collections.Counter, sin sklearn. Estilo dict-no-throw del grupo eda: nunca lanza." +tags: [eda, datascience, text, nlp, ngrams, bigrams, trigrams, pure, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re, collections] +example: | + from datascience.compute_top_ngrams import compute_top_ngrams + texts = ["machine learning rocks", "we love machine learning"] + compute_top_ngrams(texts, n=2, top_k=5) + # {"n": 2, "top": [{"ngram": "machine learning", "count": 2}, ...]} +tested: true +tests: + - "test_bigramas" + - "test_trigramas" + - "test_vacio" + - "test_stopwords" +test_file_path: "python/functions/datascience/compute_top_ngrams_test.py" +file_path: "python/functions/datascience/compute_top_ngrams.py" +params: + - name: texts + desc: "Lista (o tupla) de cadenas. 
Los elementos None o que no sean str se descartan silenciosamente. Cada documento se tokeniza por separado; los n-gramas no cruzan la frontera entre documentos." + - name: n + desc: "Tamaño del n-grama: 1 unigramas, 2 bigramas, 3 trigramas, etc. Valores < 1 o no enteros producen top vacío (se conserva tal cual en la clave 'n' del retorno)." + - name: top_k + desc: "Número máximo de n-gramas a devolver, ordenados por frecuencia descendente con desempate alfabético determinista. Default 15. Valores negativos se tratan como 0." + - name: remove_stopwords + desc: "Si True (default) elimina las stopwords ES+EN de una lista inline (~130 términos de altísima frecuencia) ANTES de formar los n-gramas, de modo que los n-gramas se construyen sobre la secuencia de tokens de contenido." +output: "Dict con exactamente 2 claves: n (el n recibido, sin normalizar) y top (lista de dicts {'ngram': str, 'count': int} ordenada por count descendente, longitud <= top_k). ngram es la unión de los tokens del n-grama por un espacio. Corpus vacío, tokens insuficientes para formar n-gramas o cualquier excepción interna degradan a {'n': n, 'top': []}. La función nunca lanza." +--- + +## Ejemplo + +```python +from datascience.compute_top_ngrams import compute_top_ngrams + +texts = [ + "machine learning rocks", + "machine learning is fun", + "we love machine learning", +] + +# Bigramas (n=2): "machine learning" aparece en los 3 documentos. +compute_top_ngrams(texts, n=2, top_k=5) +# { +# "n": 2, +# "top": [ +# {"ngram": "machine learning", "count": 3}, +# {"ngram": "learning fun", "count": 1}, +# {"ngram": "learning rocks", "count": 1}, +# {"ngram": "love machine", "count": 1}, +# ], +# } + +# Unigramas con stopwords fuera (default): solo palabras de contenido. 
+compute_top_ngrams(["the cat sat on the mat"], n=1, top_k=3) +# {"n": 1, "top": [{"ngram": "cat", "count": 1}, +# {"ngram": "mat", "count": 1}, +# {"ngram": "sat", "count": 1}]} +``` + +## Cuando usarla + +Úsala en la fase de EDA de texto cuando, además del vocabulario suelto, necesites +ver qué **combinaciones de palabras contiguas** dominan un corpus: colocaciones, +frases técnicas recurrentes ("machine learning", "data analyst"), o patrones de +trigramas en titulares/descripciones. Es el complemento natural de un perfil de +vocabulario: pasa de "qué palabras aparecen" a "qué secuencias aparecen". Llámala +con `n=1` para unigramas, `n=2` para bigramas y `n=3` para trigramas, y ajusta +`top_k` al tamaño de la tabla que vas a renderizar. Deja `remove_stopwords=True` +para que los n-gramas reflejen contenido y no conectores gramaticales. + +## Gotchas + +- **Las stopwords se eliminan ANTES de formar los n-gramas.** Con + `remove_stopwords=True` la frase "data of analysis" produce el bigrama + "data analysis" (el "of" intermedio desaparece y los tokens de contenido se + vuelven contiguos), no "data of" ni "of analysis". Si quieres preservar la + adyacencia literal del texto original, pasa `remove_stopwords=False`. +- **Los n-gramas NO cruzan documentos.** Cada elemento de `texts` se tokeniza y + recorre por separado; el último token de un documento nunca se combina con el + primero del siguiente. +- **Tokens puramente numéricos se descartan** (`tok.isdigit()`), pero los + alfanuméricos mixtos no: "3d" o "covid19" sí cuentan como tokens. Un decimal + como "3.5" se parte en "3" y "5" por `\w+` y ambos se descartan por numéricos. +- **La lista de stopwords es inline ES+EN**, pensada para textos generales en + esos dos idiomas. Para otros idiomas o jerga específica de dominio puede dejar + pasar conectores; en ese caso filtra el corpus aguas arriba o usa + `remove_stopwords=False` y posfiltra. 
+- **`top` puede tener menos de `top_k` elementos** si el corpus no tiene tantos + n-gramas distintos. El desempate entre n-gramas de igual frecuencia es alfabético (determinista), + no por orden de aparición. diff --git a/python/functions/datascience/compute_top_ngrams.py b/python/functions/datascience/compute_top_ngrams.py new file mode 100644 index 00000000..0afb41b0 --- /dev/null +++ b/python/functions/datascience/compute_top_ngrams.py @@ -0,0 +1,94 @@ +"""Top n-gramas de palabras más frecuentes de un corpus de texto. + +Función pura, autocontenida (solo stdlib: re + collections.Counter). No depende +de scikit-learn ni de ninguna otra librería externa. Estilo dict-no-throw del +grupo `eda`: ante cualquier entrada degenerada o excepción interna devuelve +``{"n": n, "top": []}`` en vez de lanzar. +""" + +import re +from collections import Counter + +# Lista inline de stopwords ES + EN (~150 términos de altísima frecuencia). +# Se eliminan ANTES de formar los n-gramas: los n-gramas se construyen sobre la +# secuencia de tokens de contenido, no sobre el texto original. 
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "mas",
    "pero", "sus", "le", "ya", "o", "este", "sí", "si", "porque", "esta",
    "entre", "cuando", "muy", "sin", "sobre", "también", "tambien", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes",
    "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas",
    "algo", "nosotros",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "are", "was", "be", "this", "that", "by", "an", "or", "at", "from", "but",
    "not", "have", "has", "had", "they", "you", "we", "he", "she", "his",
    "her", "their", "its", "i", "my", "me", "our", "us", "do", "does", "did",
    "will", "would", "can", "could", "should", "there", "which", "who", "what",
    "when", "where", "how", "all", "if", "so", "than", "then", "out", "up",
})


def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict:
    """Return the most frequent word n-grams of a corpus.

    Each document is lowercased and tokenised with ``\\w+``; purely numeric
    tokens are dropped. N-grams are built per document, so they never span
    two documents.

    Args:
        texts: list or tuple of strings. Elements that are not ``str`` are
            skipped silently; any other container type yields an empty
            ``top``.
        n: n-gram size (1 = unigrams, 2 = bigrams, 3 = trigrams...). A value
            that is not an ``int`` or is below 1 yields an empty ``top``.
        top_k: maximum number of n-grams returned. Values ``int()`` rejects
            with TypeError/ValueError, and negative values, count as 0.
        remove_stopwords: when truthy, ES+EN stopwords are removed BEFORE the
            n-grams are formed, so n-grams are contiguous over the remaining
            content tokens rather than over the original text.

    Returns:
        ``{"n": n, "top": [{"ngram": "w1 w2", "count": int}, ...]}`` sorted by
        count descending, ties broken alphabetically. An empty corpus, too few
        tokens, or any internal exception gives ``{"n": n, "top": []}``. Never
        raises.
    """
    try:
        if not (isinstance(n, int) and n >= 1):
            return {"n": n, "top": []}

        try:
            max_items = max(int(top_k), 0)
        except (TypeError, ValueError):
            max_items = 0

        if not isinstance(texts, (list, tuple)):
            return {"n": n, "top": []}

        frequencies = Counter()
        for document in texts:
            if not isinstance(document, str):
                continue
            words = [
                word
                for word in re.findall(r"\w+", document.lower(), re.UNICODE)
                if not word.isdigit()
                and not (remove_stopwords and word in _STOPWORDS)
            ]
            # The range is empty when the document has fewer than n tokens.
            frequencies.update(
                " ".join(words[start:start + n])
                for start in range(len(words) - n + 1)
            )

        if not frequencies:
            return {"n": n, "top": []}

        ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))
        return {
            "n": n,
            "top": [
                {"ngram": phrase, "count": times}
                for phrase, times in ranked[:max_items]
            ],
        }
    except Exception:
        return {"n": n, "top": []}
+ texts = [ + "machine learning rocks", + "machine learning is fun", + "we love machine learning", + ] + result = compute_top_ngrams(texts, n=2, top_k=5) + assert result["n"] == 2 + assert result["top"], "esperaba al menos un bigrama" + assert result["top"][0]["ngram"] == "machine learning" + assert result["top"][0]["count"] == 3 + # Cada entrada respeta el contrato {"ngram": str, "count": int}. + for item in result["top"]: + assert isinstance(item["ngram"], str) + assert isinstance(item["count"], int) + + +def test_trigramas(): + texts = [ + "alpha beta gamma delta", + "alpha beta gamma omega", + ] + # Con stopwords desactivadas para no descartar tokens de contenido. + result = compute_top_ngrams(texts, n=3, top_k=5, remove_stopwords=False) + assert result["n"] == 3 + ngrams = {item["ngram"]: item["count"] for item in result["top"]} + # "alpha beta gamma" aparece en ambos documentos. + assert ngrams.get("alpha beta gamma") == 2 + # Trigramas únicos de cada documento. + assert ngrams.get("beta gamma delta") == 1 + assert ngrams.get("beta gamma omega") == 1 + + +def test_vacio(): + assert compute_top_ngrams([], n=2) == {"n": 2, "top": []} + # Documentos no-str / None se descartan -> corpus efectivamente vacío. + assert compute_top_ngrams([None, 123, {"a": 1}], n=2) == {"n": 2, "top": []} + + +def test_stopwords(): + # "the cat" debería desaparecer al quitar stopwords ("the" es stopword EN). + texts = ["the cat the cat the cat"] + con = compute_top_ngrams(texts, n=2, top_k=10, remove_stopwords=True) + sin = compute_top_ngrams(texts, n=2, top_k=10, remove_stopwords=False) + + con_ngrams = {item["ngram"] for item in con["top"]} + sin_ngrams = {item["ngram"] for item in sin["top"]} + + # Sin filtrar, el bigrama dominante es "the cat". + assert "the cat" in sin_ngrams + # Al filtrar stopwords, ya no aparece "the cat" (queda solo "cat cat"). 
+ assert "the cat" not in con_ngrams + assert con_ngrams != sin_ngrams diff --git a/python/functions/datascience/compute_vocabulary_stats.md b/python/functions/datascience/compute_vocabulary_stats.md new file mode 100644 index 00000000..fb2ada89 --- /dev/null +++ b/python/functions/datascience/compute_vocabulary_stats.md @@ -0,0 +1,91 @@ +--- +id: compute_vocabulary_stats_py_datascience +name: compute_vocabulary_stats +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def compute_vocabulary_stats(texts: list, top_k: int = 20, remove_stopwords: bool = True) -> dict" +description: "Profiles the vocabulary of a text corpus for EDA: tokenises a list of documents, counts term frequencies and derives lexical-richness measures — total tokens, unique types, type-token ratio (TTR), hapax legomena and the top-k most frequent terms. Pure, stdlib only (re + collections.Counter); no nltk, no sklearn. Inline ES+EN stopword list, opt-out via remove_stopwords. Never raises: empty/degenerate input returns the zeroed result." +tags: [eda, datascience, text, nlp, vocabulary, ttr, hapax, pure, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [re, collections] +example: | + from datascience.compute_vocabulary_stats import compute_vocabulary_stats + result = compute_vocabulary_stats(["el gato y el perro", "gato veloz"], top_k=5) +tested: true +tests: + - "test_basico" + - "test_vacio" + - "test_stopwords_quitadas" + - "test_stopwords_conservadas" +test_file_path: "python/functions/datascience/compute_vocabulary_stats_test.py" +file_path: "python/functions/datascience/compute_vocabulary_stats.py" +params: + - name: texts + desc: "List of documents (strings) forming the corpus. Entries that are None or not a str are silently discarded. Tokens are extracted per document with re.findall(r'\\w+', doc.lower(), re.UNICODE); purely numeric tokens (tok.isdigit()) are dropped." 
+ - name: top_k + desc: "Maximum number of most-frequent terms to return in top_terms. Default 20. Does not affect n_tokens/n_types/ttr/hapax — only the length of the top_terms list." + - name: remove_stopwords + desc: "When True (default) common Spanish+English stopwords from the inline _STOPWORDS set (~150 entries) are removed from the token stream before any counting. Set False to keep every word (raw lexical profile)." +output: "Dict with the exact keys n_tokens (int), n_types (int), ttr (float|None, n_types/n_tokens rounded to 4 dp), n_hapax (int, terms occurring exactly once), hapax_pct (float|None, n_hapax/n_types*100 rounded to 2 dp) and top_terms (list of {term, count, pct} sorted by count descending, pct = count/n_tokens*100 rounded to 2 dp). For an empty corpus (no tokens after filtering): n_tokens=0, n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[]. Any exception degrades to that same empty result — the function never throws." +--- + +## Ejemplo + +```python +from datascience.compute_vocabulary_stats import compute_vocabulary_stats + +compute_vocabulary_stats( + ["el gato y el perro", "gato veloz corre", "perro perro perro"], + top_k=5, +) +# { +# "n_tokens": 8, # stopwords (el, y) eliminadas por defecto +# "n_types": 4, # gato, perro, veloz, corre -> tras quitar stopwords +# "ttr": 0.5, # n_types / n_tokens +# "n_hapax": 2, # veloz, corre (1 aparicion cada uno) +# "hapax_pct": 50.0, # n_hapax / n_types * 100 +# "top_terms": [ +# {"term": "perro", "count": 4, "pct": 50.0}, +# {"term": "gato", "count": 2, "pct": 25.0}, +# ... 
+# ], +# } + +# Perfil lexico crudo (sin filtrar stopwords): +compute_vocabulary_stats(["the cat and the dog"], remove_stopwords=False) +``` + +## Cuando usarla + +Úsala al perfilar una columna o corpus de texto libre en un EDA del grupo `eda`: +cuando necesites medir la riqueza léxica (cuántos tokens y cuántas palabras +distintas, type-token ratio, porcentaje de palabras que solo aparecen una vez) y +ver qué términos dominan el vocabulario (top-k frecuencias). Pásale la lista de +documentos crudos (filas de la columna); `None` y valores no-string se ignoran +solos. Es el equivalente para texto largo de `summarize_categorical`, que perfila +categorías cortas. + +## Gotchas + +- Función pura y stdlib-only, pero el resultado depende del **idioma**: la lista + `_STOPWORDS` cubre español e inglés. Para otros idiomas pon + `remove_stopwords=False` o filtra fuera, o el perfil mezclará stopwords no + reconocidas en `top_terms`. +- La tokenización es `\w+` con `re.UNICODE`: separa por puntuación y conserva + acentos/ñ, pero NO hace stemming ni lematización — "gato" y "gatos" cuentan + como tipos distintos. Tampoco hace stripping de acentos, así que "más" (con + tilde) y "mas" son tokens diferentes (ambos están en la stoplist). +- Los tokens **puramente numéricos** (`"123"`) se descartan siempre; un token + alfanumérico mixto (`"covid19"`) se conserva. +- `ttr` baja artificialmente en corpus grandes (más texto, más repetición): no + compares TTR entre corpus de tamaños muy distintos sin normalizar. +- Nunca lanza: entrada vacía, `None`, o cualquier excepción interna devuelven el + resultado con ceros/`None`/`[]`. Comprueba `n_tokens == 0` para detectar el + caso degenerado. 
"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only).

Tokenises a list of documents, counts term frequencies and derives lexical
richness measures (type-token ratio, hapax legomena) plus the top-k terms.
No external NLP dependencies (no nltk, no sklearn) — only ``re`` and
``collections`` from the standard library.
"""

import re
from collections import Counter

# Common Spanish + English stopwords. Inline and lowercase; accents are NOT
# stripped from tokens, so both the accented and the unaccented spelling must
# be listed for a word to be filtered in either form. Filtering is on by
# default and can be disabled with remove_stopwords=False.
_STOPWORDS = {
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas",
    "más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque",
    "esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también",
    "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "antes", "algunos", "qué", "unos", "yo", "otro",
    "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "nada", "muchos",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at",
    "from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its",
    "our", "their", "what", "which", "who", "whom", "has", "have", "had", "do",
    "does", "did", "will", "would", "can", "could", "should", "may", "might",
    "must", "if", "then", "than", "so", "too", "very", "just", "also", "were",
    "been", "being", "there", "here", "all", "any", "some", "more", "most",
    "out", "up", "down", "into", "over", "such", "only", "own", "same",
}

# Compiled once at import time; used for every document.
_TOKEN_RE = re.compile(r"\w+", re.UNICODE)


def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict:
    """Profile the vocabulary of a corpus of documents.

    Each document is lowercased and tokenised with ``\\w+``; purely numeric
    tokens are always dropped.

    Args:
        texts: List of strings (the corpus). Entries that are None or not a
            string are discarded silently. A falsy value (``None``, ``[]``)
            is an empty corpus.
        top_k: Maximum number of most-frequent terms to include in
            ``top_terms``. Default 20. ``None`` returns every term. Negative
            values, and values ``int()`` cannot convert, give an empty
            ``top_terms``. It never affects the other measures.
        remove_stopwords: When True (default) common ES+EN stopwords are
            dropped from the token stream before any counting.

    Returns:
        A dict with the exact keys ``n_tokens``, ``n_types``, ``ttr``
        (n_types / n_tokens, 4 dp), ``n_hapax`` (terms seen exactly once),
        ``hapax_pct`` (n_hapax / n_types * 100, 2 dp) and ``top_terms`` (list
        of ``{"term", "count", "pct"}`` by count descending, ties in order of
        first appearance; pct = count / n_tokens * 100, 2 dp). For an empty
        corpus (no tokens after filtering): n_tokens=0, n_types=0, ttr=None,
        n_hapax=0, hapax_pct=None, top_terms=[]. Never raises — any exception
        degrades to the empty-corpus result.

    Example:
        >>> r = compute_vocabulary_stats(
        ...     ["el gato y el perro", "gato veloz corre", "perro perro perro"])
        >>> r["n_tokens"], r["n_types"], r["ttr"], r["n_hapax"]
        (8, 4, 0.5, 2)
    """
    empty = {
        "n_tokens": 0,
        "n_types": 0,
        "ttr": None,
        "n_hapax": 0,
        "hapax_pct": None,
        "top_terms": [],
    }
    try:
        # Sanitise top_k up front so that a bad value only empties top_terms
        # instead of raising later and wiping every statistic.
        if top_k is None:
            limit = None
        else:
            try:
                limit = max(int(top_k), 0)
            except (TypeError, ValueError, OverflowError):
                limit = 0

        # Stream tokens straight into the counter; no corpus-sized list.
        counts = Counter()
        for doc in texts or []:
            if not isinstance(doc, str):
                continue
            counts.update(
                tok
                for tok in _TOKEN_RE.findall(doc.lower())
                if not tok.isdigit()
                and not (remove_stopwords and tok in _STOPWORDS)
            )

        n_tokens = sum(counts.values())
        if n_tokens == 0:
            return empty

        n_types = len(counts)
        n_hapax = sum(1 for c in counts.values() if c == 1)

        return {
            "n_tokens": n_tokens,
            "n_types": n_types,
            "ttr": round(n_types / n_tokens, 4),
            "n_hapax": n_hapax,
            "hapax_pct": round(n_hapax / n_types * 100, 2),
            "top_terms": [
                {
                    "term": term,
                    "count": count,
                    "pct": round(count / n_tokens * 100, 2),
                }
                for term, count in counts.most_common(limit)
            ],
        }
    except Exception:
        return dict(empty)
"""Tests for compute_vocabulary_stats."""

import os
import sys

sys.path.insert(
    0, os.path.join(os.path.dirname(__file__), "..", "..", "functions")
)

from datascience.compute_vocabulary_stats import compute_vocabulary_stats


def test_basico():
    # Corpus with repetitions and hapax legomena. Stopword filtering is off so
    # the test controls exactly which tokens are counted.
    corpus = ["gato gato perro", "perro perro raton", "elefante"]
    stats = compute_vocabulary_stats(corpus, top_k=10, remove_stopwords=False)
    top = stats["top_terms"]

    # Repetitions make the number of types smaller than the number of tokens.
    assert stats["n_types"] < stats["n_tokens"]
    assert stats["n_tokens"] == 7
    assert stats["n_types"] == 4  # gato, perro, raton, elefante

    # The type-token ratio lies in (0, 1].
    assert 0 < stats["ttr"] <= 1
    assert stats["ttr"] == round(4 / 7, 4)

    # top_terms is ordered by descending count.
    frequencies = [entry["count"] for entry in top]
    assert frequencies == sorted(frequencies, reverse=True)
    assert top[0]["term"] == "perro"
    assert top[0]["count"] == 3

    # Hapax legomena: raton and elefante occur exactly once.
    assert stats["n_hapax"] == 2
    assert stats["hapax_pct"] == round(2 / 4 * 100, 2)

    # pct agrees with count / n_tokens.
    assert top[0]["pct"] == round(3 / 7 * 100, 2)


def test_vacio():
    # No valid documents -> zeros / None / [].
    degenerate_inputs = ([], None, [None, 123, ""], ["123 456"])
    for corpus in degenerate_inputs:
        stats = compute_vocabulary_stats(corpus)
        assert stats["n_tokens"] == 0
        assert stats["n_types"] == 0
        assert stats["ttr"] is None
        assert stats["n_hapax"] == 0
        assert stats["hapax_pct"] is None
        assert stats["top_terms"] == []


def test_stopwords_quitadas():
    texts = ["the gato the perro", "de la casa azul"]
    r = compute_vocabulary_stats(texts, remove_stopwords=True)
    terms = {t["term"] for t in r["top_terms"]}
    # ES+EN stopwords must not appear.
+ assert "the" not in terms + assert "de" not in terms + assert "la" not in terms + # Palabras de contenido si. + assert "gato" in terms + assert "casa" in terms + + +def test_stopwords_conservadas(): + texts = ["the gato the perro", "de la casa azul"] + r = compute_vocabulary_stats(texts, remove_stopwords=False) + terms = {t["term"] for t in r["top_terms"]} + # Con el filtro desactivado, las stopwords se conservan. + assert "the" in terms + assert "de" in terms + assert "la" in terms diff --git a/python/functions/datascience/detect_corpus_language.md b/python/functions/datascience/detect_corpus_language.md new file mode 100644 index 00000000..0daba9ee --- /dev/null +++ b/python/functions/datascience/detect_corpus_language.md @@ -0,0 +1,80 @@ +--- +name: detect_corpus_language +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict" +description: "Estima la distribucion de idiomas de un corpus de textos con la libreria langdetect (import perezoso). Funcion pura y defensiva del grupo eda: filtra documentos None/no-str/vacios, muestrea hasta sample_max docs, clasifica cada uno con detect() ignorando los que langdetect no puede resolver (LangDetectException), y devuelve la distribucion top_k por frecuencia mas el idioma dominante. Si langdetect no esta instalada o algo falla, degrada a {available: False, ...} y NUNCA lanza (dict-no-throw). Seed fija (DetectorFactory.seed=0) para deteccion determinista." +tags: [eda, datascience, text, nlp, language-detection, langdetect, pure, python] +params: + - name: texts + desc: "Lista de strings (documentos). Los elementos None, no-str o vacios tras strip se descartan antes de clasificar." + - name: top_k + desc: "Numero maximo de idiomas a devolver en distribution, ordenados por count descendente (desempate por codigo ISO ascendente). Default 10." 
+ - name: sample_max + desc: "Numero maximo de documentos a clasificar (se toman los primeros del corpus) para acotar el coste. Default 1000." +output: > + Dict con forma fija (dict-no-throw, nunca lanza): + {"available": bool, "n_detected": int, + "distribution": [{"lang": str, "count": int, "pct": float}, ...], + "dominant": str|None}. + available=True si langdetect es importable; lang son codigos ISO 639-1 ("es","en","fr",...); + pct = count/n_detected*100 redondeado a 2 decimales; n_detected = docs clasificados con exito; + dominant = idioma mas frecuente (None si no hubo detecciones). Corpus vacio con langdetect + presente -> available True, n_detected 0, distribution [], dominant None. Sin langdetect (o + fallo global) -> available False y el resto de campos a su valor vacio. +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [langdetect] +tested: true +tests: ["test_mixto_es_en", "test_vacio", "test_degradacion"] +test_file_path: "python/functions/datascience/detect_corpus_language_test.py" +file_path: "python/functions/datascience/detect_corpus_language.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience.detect_corpus_language import detect_corpus_language + +corpus = [ + "este es un texto bastante largo en español para detectar el idioma correctamente", + "la inteligencia artificial transforma la manera en que trabajamos cada dia", + "this is a fairly long english text to detect the language correctly without issues", +] +out = detect_corpus_language(corpus) +# {"available": True, "n_detected": 3, +# "distribution": [{"lang": "es", "count": 2, "pct": 66.67}, +# {"lang": "en", "count": 1, "pct": 33.33}], +# "dominant": "es"} +``` + +## Cuando usarla + +Cuando perfiles una columna o corpus de texto en un EDA y necesites saber en +que idioma(s) esta escrito antes de elegir tokenizadores, stopwords, modelos +NLP o stemmers. 
"""Detect the language distribution of a text corpus.

The optional `langdetect` library is imported lazily inside the function. If
it is not installed, or any step fails, the function degrades to
``available=False`` and never raises.
"""


def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict:
    """Estimate the language distribution of a corpus with `langdetect`.

    Args:
        texts: list of strings (documents). Elements that are not ``str`` or
            are empty after ``strip()`` are discarded; documents are
            classified in their stripped form.
        top_k: maximum number of languages returned in ``distribution``.
            ``None`` or a negative value returns every detected language.
        sample_max: maximum number of valid documents classified (the first
            ones). ``None`` or a negative value disables the cap.

    Returns:
        A fixed-shape dict (dict-no-throw)::

            {
                "available": bool,   # True when langdetect could be imported
                "n_detected": int,   # documents classified successfully
                "distribution": [{"lang": str, "count": int, "pct": float}, ...],
                "dominant": str | None,
            }

        ``distribution`` is sorted by count descending, ties by language code
        ascending; ``pct`` is count / n_detected * 100 rounded to 2 decimals.
        ``dominant`` is the first language of the full ranking, before the
        ``top_k`` cut, or None when nothing was detected.
    """
    fallback = {
        "available": False,
        "n_detected": 0,
        "distribution": [],
        "dominant": None,
    }
    try:
        # Lazy import: without langdetect return the degraded dict.
        try:
            from langdetect import detect, DetectorFactory

            # The module sets a fixed seed before every run of detections.
            DetectorFactory.seed = 0
        except Exception:
            return dict(fallback)

        # Keep only non-empty strings, stripped.
        cleaned = []
        if texts:
            stripped = (item.strip() for item in texts if isinstance(item, str))
            cleaned = [text for text in stripped if text]

        # Cap the sample at the first `sample_max` documents.
        if sample_max is not None and sample_max >= 0:
            cleaned = cleaned[:sample_max]

        # Tally per language; a document whose detection raises is skipped.
        tally = {}
        for text in cleaned:
            try:
                code = detect(text)
            except Exception:
                continue
            tally[code] = tally.get(code, 0) + 1

        total = sum(tally.values())
        ranking = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))

        if top_k is not None and top_k >= 0:
            shown = ranking[:top_k]
        else:
            shown = ranking

        return {
            "available": True,
            "n_detected": total,
            "distribution": [
                {
                    "lang": code,
                    "count": hits,
                    "pct": round(hits / total * 100, 2) if total else 0.0,
                }
                for code, hits in shown
            ],
            "dominant": ranking[0][0] if ranking else None,
        }
    except Exception:
        # Any global failure degrades to available=False without raising.
        return dict(fallback)
# Put python/functions on sys.path so the `datascience` package is importable.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from datascience.detect_corpus_language import detect_corpus_language

# Clearly Spanish documents, long enough for a reliable detection.
_ES = [
    "este es un texto bastante largo en español para detectar el idioma correctamente sin problemas",
    "la inteligencia artificial transforma la manera en que trabajamos cada dia en muchos sectores",
]
# Clearly English documents, long enough for a reliable detection.
_EN = [
    "this is a fairly long english text to detect the language correctly without any length issues",
    "machine learning models can classify documents into many different categories quite reliably",
]


def test_mixto_es_en():
    """Golden: mixed ES+EN corpus -> available, >= 2 languages, coherent counts."""
    result = detect_corpus_language(_ES + _EN)
    distribution = result["distribution"]
    assert result["available"] is True
    assert result["dominant"] in {"es", "en"}
    assert len(distribution) >= 2
    counted = sum(entry["count"] for entry in distribution)
    assert counted == result["n_detected"]
    assert result["n_detected"] == 4


def test_vacio():
    """Edge: empty list with langdetect present -> available, nothing detected."""
    result = detect_corpus_language([])
    assert result["available"] is True
    assert result["n_detected"] == 0
    assert result["distribution"] == []
    assert result["dominant"] is None


def test_degradacion(monkeypatch):
    """Error path: langdetect not importable -> degrades to available False."""
    import datascience.detect_corpus_language as module

    original_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        # Fail only for langdetect (and its submodules); delegate the rest.
        if name.split(".", 1)[0] == "langdetect":
            raise ImportError("simulado")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", blocking_import)
    result = module.detect_corpus_language(["hola mundo", "hello world"])
    assert result["available"] is False
    assert result["n_detected"] == 0
    assert result["distribution"] == []
    assert result["dominant"] is None
b/python/functions/datascience/extract_text_sample.md new file mode 100644 index 00000000..aec24232 --- /dev/null +++ b/python/functions/datascience/extract_text_sample.md @@ -0,0 +1,102 @@ +--- +name: extract_text_sample +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def extract_text_sample(db_path: str, table: str, columns: list, backend: str = 'duckdb', sample: int = 2000) -> dict" +description: "Muestrea columnas de texto de una tabla DuckDB/Postgres con push-down SQL (LIMIT sample), SIN traer la tabla entera a RAM. Funcion impura del grupo de capacidad `eda`: la usan los capitulos de texto/NLP del AutomaticEDA que necesitan valores crudos de texto (longitudes, tokens, ejemplos) sobre una muestra acotada. Construye el lector read-only query_fn(sql)->dict igual que build_eda_render_ctx (closure sobre duckdb_query_readonly / pg_query importados perezosamente desde infra). Escapa los identificadores con comillas dobles y lanza una sola query SELECT \"c1\", \"c2\" FROM \"table\" LIMIT n. Por columna, la lista de strings solo contiene valores NO None y NO vacios: cada celda no nula se convierte con str(...) y se descarta si queda cadena vacia. Estilo dict-no-throw del grupo eda: NUNCA lanza; ante cualquier fallo (query, conversion, backend desconocido) devuelve {status:'error', error:str, columns:{}, n:0}. La clave n reporta el numero de FILAS leidas por la query (antes de filtrar None/vacios)." +tags: [eda, datascience, text, nlp, extraction, read-only, duckdb, postgres, python] +uses_functions: [duckdb_query_readonly_py_infra, pg_query_py_infra] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +params: + - name: db_path + desc: "ruta al archivo DuckDB, o DSN PostgreSQL si backend='postgres'. Se inyecta en el closure query_fn. No se valida aqui: si la base no existe o el DSN es invalido, la query devuelve status error y el resultado es {status:'error', ...} (no lanza)." 
+ - name: table + desc: "nombre de la tabla. Se escapa con comillas dobles en la query (SELECT ... FROM \"table\")." + - name: columns + desc: "lista de nombres de columna de texto a muestrear. Se filtra a las entradas que sean str no vacio; cada nombre se escapa con comillas dobles. Si tras filtrar queda vacia -> {status:'ok', columns:{}, n:0} sin tocar la base." + - name: backend + desc: "'duckdb' (default) o 'postgres'. Selecciona el lector read-only del registry (duckdb_query_readonly / pg_query). Cualquier otro valor -> {status:'error', error:'backend desconocido: ', columns:{}, n:0}." + - name: sample + desc: "maximo de filas a muestrear (clausula LIMIT). Default 2000. Acota memoria y tiempo: con tablas grandes obtienes el primer tramo por orden fisico (sin ORDER BY), no un muestreo uniforme." +output: "dict dict-no-throw (NUNCA lanza): {status:'ok'|'error', columns:{col_name:[str,...]}, n:int, error:str}. En exito (status='ok') columns mapea cada columna pedida a la lista de sus valores de texto NO None y NO vacios (cada celda convertida con str(...)); n es el numero de FILAS leidas por la query (antes de filtrar None/vacios). columns vacio -> {status:'ok', columns:{}, n:0}. En error (backend desconocido, query con status!='ok', o cualquier excepcion) -> {status:'error', error:str, columns:{}, n:0}; la clave error solo aparece en este caso." +tested: true +tests: ["test_extract_basic", "test_backend_desconocido", "test_columns_vacio", "test_sample_limit"] +test_file_path: "python/functions/datascience/extract_text_sample_test.py" +file_path: "python/functions/datascience/extract_text_sample.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +# Import directo del submodulo (no requiere export en datascience/__init__.py). +from datascience.extract_text_sample import extract_text_sample + +# Muestrea hasta 2000 filas de dos columnas de texto de una tabla DuckDB. 
+res = extract_text_sample( + "data/reviews.duckdb", "reviews", ["title", "body"], + backend="duckdb", sample=2000, +) +# res == { +# "status": "ok", +# "columns": { +# "title": ["Gran producto", "No funciona", ...], # solo no-None, no-"" +# "body": ["Lo uso a diario...", ...], +# }, +# "n": 2000, # filas leidas por la query (antes de filtrar None/vacios) +# } + +# Postgres: db_path es el DSN. +res_pg = extract_text_sample( + "postgresql://user:pass@localhost:5433/trends", "comentarios", ["texto"], + backend="postgres", sample=500, +) +``` + +## Cuando usarla + +Cuando necesites valores CRUDOS de texto de una o varias columnas para analisis +NLP/texto (distribucion de longitudes, conteo de tokens, ejemplos representativos, +deteccion de idioma) pero NO quieras cargar la tabla entera en memoria. Es el +muestreador de texto del grupo `eda`: una sola llamada con push-down `LIMIT` +devuelve listas de strings por columna, limpias de None y vacios, listas para +alimentar un capitulo de texto del AutomaticEDA o cualquier rutina de tokenizado. +Usala junto a `profile_table` / `build_eda_render_ctx` cuando el perfil agregado +no basta y hace falta el texto real. + +## Gotchas + +- **Impura**: lee de la base de datos a traves de `query_fn` (closure sobre + `duckdb_query_readonly` / `pg_query`). No abre conexiones fuera de esos wrappers + del registry. Estilo dict-no-throw del grupo `eda`: NUNCA lanza; ante cualquier + fallo devuelve `{status:'error', error:str, columns:{}, n:0}`. +- **`error_type` en el frontmatter es `error_go_core` por convencion del registry** + (toda funcion impura debe declararlo y el indexer lo exige), pero el codigo NO + lanza esa excepcion: degrada al dict de error. Es metadata, no comportamiento. +- **Backend desconocido**: con un `backend` que no sea `duckdb` ni `postgres` + devuelve `{status:'error', error:'backend desconocido: ', columns:{}, + n:0}` sin tocar la base. 
+- **Las listas NO incluyen None ni cadenas vacias**: cada celda no nula se pasa + por `str(...)` y se descarta si queda `""`. Por eso `len(columns[col])` puede ser + menor que `n` (que cuenta las filas leidas). Si necesitas alineacion por fila + (una entrada por fila aunque sea None), usa `build_eda_render_ctx` (raw_numeric), + no esta funcion. +- **`LIMIT sample` sin `ORDER BY`**: con tablas grandes obtienes el primer tramo + por orden fisico del backend, no un muestreo uniforme ni reproducible. Sube + `sample` para mas cobertura, o pre-ordena/aleatoriza la tabla si necesitas + representatividad. +- **DuckDB en sandbox por defecto**: `duckdb_query_readonly` abre la conexion con + `enable_external_access=False`, asi que la query solo puede leer la propia base + (no `read_csv`/`httpfs`/`ATTACH` a paths externos). Lee tablas ya existentes en + el archivo DuckDB sin problema. +- **No loguear los datos crudos**: las listas de `columns` pueden contener texto + sensible (reviews, comentarios, PII). En trazas usa solo conteos (`n`, + `len(columns[col])`) y nombres de columna, no el dict completo. diff --git a/python/functions/datascience/extract_text_sample.py b/python/functions/datascience/extract_text_sample.py new file mode 100644 index 00000000..d44b9b95 --- /dev/null +++ b/python/functions/datascience/extract_text_sample.py @@ -0,0 +1,112 @@ +"""extract_text_sample — muestrea columnas de texto de una tabla sin cargarla en RAM. + +Funcion impura (lee de la base de datos) del grupo de capacidad `eda`. Dado un +``db_path`` + ``table`` (DuckDB o PostgreSQL) y una lista de ``columns`` de texto, +trae una MUESTRA de esas columnas con push-down SQL (``LIMIT sample``), nunca la +tabla entera. La usan los capitulos de texto/NLP del AutomaticEDA que necesitan +valores crudos de texto (longitudes, tokens, ejemplos) sin materializar millones +de filas en memoria. 
def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def extract_text_sample(db_path, table, columns, backend="duckdb", sample=2000):
    """Sample text columns from a DuckDB/PostgreSQL table with SQL push-down.

    A single ``SELECT ... LIMIT sample`` query is issued through the read-only
    wrapper of the selected backend, so the whole table is never loaded.

    Args:
        db_path: path to the DuckDB file, or the PostgreSQL DSN when
            ``backend="postgres"``. Passed as-is to the backend reader; it is
            not validated here.
        table: table name. Emitted as one double-quoted identifier.
        columns: list/tuple of text column names to sample. Only non-empty
            ``str`` entries are kept and duplicates are dropped (first
            occurrence wins). If nothing remains the result is
            ``{"status": "ok", "columns": {}, "n": 0}`` and neither the backend
            reader nor the database is touched.
        backend: ``"duckdb"`` (default) or ``"postgres"``. Any other value
            yields an error dict.
        sample: maximum number of rows to read (``LIMIT`` clause). There is no
            ``ORDER BY``, so this is the first slice the backend returns, not
            a uniform sample.

    Returns:
        dict (this function never raises):
            {"status": "ok" | "error",
             "columns": {col_name: [str, ...]},  # only non-None, non-"" values
             "n": int,      # rows read by the query, before filtering
             "error": str}  # only present when status == "error"
    """
    try:
        # 1) Validate the backend first: an unknown one is an error regardless
        #    of the other arguments.
        if backend not in ("duckdb", "postgres"):
            return {
                "status": "error",
                "error": f"backend desconocido: {backend}",
                "columns": {},
                "n": 0,
            }

        # 2) Valid columns (non-empty str). De-duplicated while preserving
        #    order: a repeated name would otherwise append every value twice.
        cols = []
        if isinstance(columns, (list, tuple)):
            cols = list(
                dict.fromkeys(c for c in columns if isinstance(c, str) and c != "")
            )
        if not cols:
            return {"status": "ok", "columns": {}, "n": 0}

        # 3) Read-only reader of the active backend. Imported lazily (and only
        #    once there is something to query) so loading this module does not
        #    import `infra` at module level.
        if backend == "duckdb":
            from infra import duckdb_query_readonly as run_query
        else:
            from infra import pg_query as run_query

        # 4) Push-down: one query with LIMIT. Identifiers are quoted AND their
        #    embedded double quotes doubled, so a name containing '"' cannot
        #    terminate the identifier early.
        cols_sql = ", ".join(_quote_identifier(c) for c in cols)
        sql = f"SELECT {cols_sql} FROM {_quote_identifier(table)} LIMIT {int(sample)}"
        q = run_query(db_path, sql)
        if not isinstance(q, dict) or q.get("status") != "ok":
            err = q.get("error") if isinstance(q, dict) else "query sin resultado"
            return {"status": "error", "error": str(err), "columns": {}, "n": 0}

        # 5) Collect per-column strings, dropping None and empty values.
        rows = q.get("rows") or []
        out = {c: [] for c in cols}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for c in cols:
                value = row.get(c)
                if value is None:
                    continue
                text = str(value)
                if text:
                    out[c].append(text)

        return {"status": "ok", "columns": out, "n": len(rows)}
    except Exception as exc:  # noqa: BLE001 - dict-no-throw style of the eda group
        return {"status": "error", "error": str(exc), "columns": {}, "n": 0}
+_ROWS = [ + ("alpha", 1), + ("beta", 2), + (None, 3), + ("gamma", 4), + (None, 5), + ("delta", 6), +] +_TXT_NON_NULL = {"alpha", "beta", "gamma", "delta"} + + +def _make_db(tmp_path): + """Crea un DuckDB temporal con la tabla de prueba y devuelve su ruta.""" + db_path = os.path.join(str(tmp_path), "text_sample.duckdb") + con = duckdb.connect(db_path) + try: + con.execute(f'CREATE TABLE "{_TABLE}" (txt VARCHAR, other INTEGER)') + con.executemany(f'INSERT INTO "{_TABLE}" VALUES (?, ?)', _ROWS) + finally: + con.close() + return db_path + + +def test_extract_basic(tmp_path): + db_path = _make_db(tmp_path) + res = extract_text_sample(db_path, _TABLE, ["txt"]) + assert res["status"] == "ok" + # n = filas leidas por la query (6), antes de filtrar None. + assert res["n"] == len(_ROWS) + # columns["txt"] trae solo los strings no nulos (los dos NULL fuera). + assert "txt" in res["columns"] + assert set(res["columns"]["txt"]) == _TXT_NON_NULL + assert len(res["columns"]["txt"]) == len(_TXT_NON_NULL) + # No se pidio "other", no debe aparecer. + assert "other" not in res["columns"] + + +def test_backend_desconocido(tmp_path): + db_path = _make_db(tmp_path) + res = extract_text_sample(db_path, _TABLE, ["txt"], backend="mysql") + assert res["status"] == "error" + assert "backend desconocido" in res["error"] + assert res["columns"] == {} + assert res["n"] == 0 + + +def test_columns_vacio(tmp_path): + db_path = _make_db(tmp_path) + res = extract_text_sample(db_path, _TABLE, []) + assert res["status"] == "ok" + assert res["columns"] == {} + assert res["n"] == 0 + + +def test_sample_limit(tmp_path): + db_path = _make_db(tmp_path) + res = extract_text_sample(db_path, _TABLE, ["txt"], sample=2) + assert res["status"] == "ok" + # sample=2 -> la query lee como mucho 2 filas. 
+ assert res["n"] == 2 + assert len(res["columns"]["txt"]) <= 2 diff --git a/python/pyproject.toml b/python/pyproject.toml index 052f7280..f0fed9a1 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "google-cloud-bigquery-storage>=2.27", "google-cloud-storage>=3.10.1", "httpx", + "langdetect>=1.0.9", "matplotlib>=3.10.9", "opencv-contrib-python-headless>=4.13.0.92", "openpyxl>=3.1.5", @@ -40,6 +41,7 @@ dependencies = [ "seaborn>=0.13.2", "shapely>=2.1.2", "statsmodels>=0.14.6", + "textstat>=0.7.13", "trimesh>=4.12.2", "xlrd>=2.0.2", ] diff --git a/python/uv.lock b/python/uv.lock index d46df6d9..be3188f3 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -899,6 +899,7 @@ dependencies = [ { name = "google-cloud-bigquery-storage" }, { name = "google-cloud-storage" }, { name = "httpx" }, + { name = "langdetect" }, { name = "matplotlib" }, { name = "opencv-contrib-python-headless" }, { name = "openpyxl" }, @@ -906,9 +907,11 @@ dependencies = [ { name = "polars" }, { name = "pymeshlab" }, { name = "pymssql" }, + { name = "pymupdf" }, { name = "pypdf" }, { name = "pyproj" }, { name = "python-docx" }, + { name = "python-pptx" }, { name = "pyyaml" }, { name = "qrcode", extra = ["pil"] }, { name = "rapidfuzz" }, @@ -919,6 +922,7 @@ dependencies = [ { name = "seaborn" }, { name = "shapely" }, { name = "statsmodels" }, + { name = "textstat" }, { name = "trimesh" }, { name = "xlrd" }, ] @@ -959,6 +963,7 @@ requires-dist = [ { name = "jupyter-collaboration", marker = "extra == 'jupyter'", specifier = ">=2.0" }, { name = "jupyter-mcp-server", marker = "extra == 'jupyter'" }, { name = "jupyterlab", marker = "extra == 'jupyter'", specifier = ">=4.0" }, + { name = "langdetect", specifier = ">=1.0.9" }, { name = "matplotlib", specifier = ">=3.10.9" }, { name = "opencv-contrib-python-headless", specifier = ">=4.13.0.92" }, { name = "openpyxl", specifier = ">=3.1.5" }, @@ -966,9 +971,11 @@ requires-dist = [ { name = "polars", specifier = 
">=1.40.1" }, { name = "pymeshlab", specifier = ">=2025.7.post1" }, { name = "pymssql", specifier = ">=2.3.13" }, + { name = "pymupdf", specifier = ">=1.28.0" }, { name = "pypdf", specifier = ">=6.10.0" }, { name = "pyproj", specifier = ">=3.7.2" }, { name = "python-docx", specifier = ">=1.2.0" }, + { name = "python-pptx", specifier = ">=1.0.2" }, { name = "pyyaml", specifier = ">=6.0.3" }, { name = "qrcode", extras = ["pil"], specifier = ">=8.2" }, { name = "rapidfuzz", specifier = ">=3.14.5" }, @@ -979,6 +986,7 @@ requires-dist = [ { name = "seaborn", specifier = ">=0.13.2" }, { name = "shapely", specifier = ">=2.1.2" }, { name = "statsmodels", specifier = ">=0.14.6" }, + { name = "textstat", specifier = ">=0.7.13" }, { name = "trimesh", specifier = ">=4.12.2" }, { name = "xlrd", specifier = ">=2.0.2" }, ] @@ -2198,6 +2206,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/91/53255615acd2a1eaca307ede3c90eb550bae9c94581f8c00081b6b1c8f44/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:1f1489f769582498610e015a8ef2d36f28f505ab3096d0e16b4858a9ec214f57", size = 75987, upload-time = "2026-03-09T13:15:39.65Z" }, ] +[[package]] +name = "langdetect" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0", size = 981474, upload-time = "2021-05-07T07:54:13.562Z" } + [[package]] name = "lark" version = "1.3.1" @@ -2699,6 +2716,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ] +[[package]] 
+name = "nltk" +version = "3.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "joblib" }, + { name = "regex" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/74/a1/b3b4adf15585a5bc4c357adde150c01ebeeb642173ded4d871e89468767c/nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0", size = 2946864, upload-time = "2026-03-24T06:13:40.641Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/91/04e965f8e717ba0ab4bdca5c112deeab11c9e750d94c4d4602f050295d39/nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f", size = 1552087, upload-time = "2026-03-24T06:13:38.47Z" }, +] + [[package]] name = "notebook-shim" version = "0.2.4" @@ -3750,6 +3782,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/50/4be9bd9cf4b43208a7175117a533ece200cfe4131a39f9909bdc7560ddeb/pymssql-2.3.13-cp314-cp314-win_amd64.whl", hash = "sha256:7d7037d2b5b907acc7906d0479924db2935a70c720450c41339146a4ada2b93d", size = 2049139, upload-time = "2026-02-14T05:00:23.951Z" }, ] +[[package]] +name = "pymupdf" +version = "1.28.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/e9/6d6c5d6c0a3551bffd47681a6240caf941727f195b45593cf20ab36f018f/pymupdf-1.28.0.tar.gz", hash = "sha256:e53f3567403a92da15caa9e7ae0164327fff48817e9f40175367fb9de524258d", size = 87637751, upload-time = "2026-06-29T09:08:47.547Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/b7/88043e38cc7529de070f0c9bd267fa258035cca0b4ad5260536b994594a7/pymupdf-1.28.0-cp310-abi3-macosx_10_15_x86_64.whl", hash = "sha256:892b89ba88e8f98b53133b62877a9dc9b5e7dc6a4aeb837b612db56a8d2e03ac", size = 24597385, upload-time = "2026-06-29T09:03:30.608Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/f4/23775bbda0781b61fc398cc75079a2b0e64696d8fcf93271748883e9627e/pymupdf-1.28.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d692dcf44d3566ae96bc6f6346c6ad432274a29ba617bf7a9fe18009e24adb4", size = 23828292, upload-time = "2026-06-29T09:03:46.129Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f5/bf75fc7a415722f8b33662054f82d88520c0cbfd4c36d0e08aeaec605e49/pymupdf-1.28.0-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:47a5c29ed4eb0744de9c4e37bb49b1259b18d4d75fcc8a7c130f7c9fa15956f6", size = 25045507, upload-time = "2026-06-29T09:04:03.86Z" }, + { url = "https://files.pythonhosted.org/packages/58/69/5d12c9f1f2d76f28383d6110a069c79fbfced5a4f97bb1ee6e8354f52bb7/pymupdf-1.28.0-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44f0973f5e5edbaec95bc34b64e71d1959d4ee90b1328de1b4f4f5b4fa78673f", size = 25716599, upload-time = "2026-06-29T09:04:19.367Z" }, + { url = "https://files.pythonhosted.org/packages/4d/b4/ec0e017bc42857cc86bd651441dbc41cc18be48d4698ecd27aac491e0c9a/pymupdf-1.28.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4d61ec323a706e153a12e262e51febfb43eeaa20977785ace135d18d48bcdc83", size = 25940489, upload-time = "2026-06-29T09:04:36.624Z" }, + { url = "https://files.pythonhosted.org/packages/06/86/f831fef09013f33b3c9c09fb3923f2ff53e1e437f6ace14b8ae46392f558/pymupdf-1.28.0-cp310-abi3-win32.whl", hash = "sha256:caea2b3b67347fd79e5d15ed7929b0e886aac594ea228073b6d39de0078189da", size = 18489703, upload-time = "2026-06-29T20:50:30.599Z" }, + { url = "https://files.pythonhosted.org/packages/2e/5d/1a03f53eb0449900469335fcfc742ca28e3ba159b7d650e0921d50b8b308/pymupdf-1.28.0-cp310-abi3-win_amd64.whl", hash = "sha256:e01e90fd86abfeb37ceb921eddb951f988a11d45ff6ce6b7664f2039849068ec", size = 19773102, upload-time = "2026-06-29T09:04:49.773Z" }, + { url = 
"https://files.pythonhosted.org/packages/72/f6/1e52ce243ca792254f6223b4017c5667194c146ce9b88baf37bc5eb3d1c9/pymupdf-1.28.0-cp313-abi3-pyemscripten_2025_0_wasm32.whl", hash = "sha256:74c6d00ba2a9aad3a635db73b07c15db462b480741d831a34a75a56535ebc22b", size = 18357011, upload-time = "2026-06-29T20:50:50.353Z" }, + { url = "https://files.pythonhosted.org/packages/62/b1/46b5b3d8ef3cc71114667cf10c4d8b33f39af97253af32e9a0986775b638/pymupdf-1.28.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:b3e1399c7a64c6914239116a369efcdaac4cfb9e838bde2656d7accc4a85c72d", size = 25753599, upload-time = "2026-06-29T09:05:09.398Z" }, +] + [[package]] name = "pyogrio" version = "0.12.1" @@ -3811,6 +3860,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/55/f2/7ebe366f633f30a6ad105f650f44f24f98cb1335c4157d21ae47138b3482/pypdf-6.10.0-py3-none-any.whl", hash = "sha256:90005e959e1596c6e6c84c8b0ad383285b3e17011751cedd17f2ce8fcdfc86de", size = 334459, upload-time = "2026-04-10T09:34:54.966Z" }, ] +[[package]] +name = "pyphen" +version = "0.17.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/56/e4d7e1bd70d997713649c5ce530b2d15a5fc2245a74ca820fc2d51d89d4d/pyphen-0.17.2.tar.gz", hash = "sha256:f60647a9c9b30ec6c59910097af82bc5dd2d36576b918e44148d8b07ef3b4aa3", size = 2079470, upload-time = "2025-01-20T13:18:36.296Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/1f/c2142d2edf833a90728e5cdeb10bdbdc094dde8dbac078cee0cf33f5e11b/pyphen-0.17.2-py3-none-any.whl", hash = "sha256:3a07fb017cb2341e1d9ff31b8634efb1ae4dc4b130468c7c39dd3d32e7c3affd", size = 2079358, upload-time = "2025-01-20T13:18:29.629Z" }, +] + [[package]] name = "pyproj" version = "3.7.2" @@ -3935,6 +3993,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/fd/0318007beb234790993d3ec5afd051d1dbceb733e81e3afe2b981ece3f37/python_multipart-0.0.30-py3-none-any.whl", hash = 
"sha256:830964def8c90607ac5daa00514e3987815865713ade8d20febc9177ac0c3c5b", size = 29730, upload-time = "2026-05-31T19:24:53.814Z" }, ] +[[package]] +name = "python-pptx" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "pillow" }, + { name = "typing-extensions" }, + { name = "xlsxwriter" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, +] + [[package]] name = "pywin32" version = "311" @@ -4936,6 +5009,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" }, ] +[[package]] +name = "textstat" +version = "0.7.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nltk" }, + { name = "pyphen" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8c/0f/b673fcec5ad6e976b2e8368ef3651fe0fea3348a1191bacfcd41a17ddec6/textstat-0.7.13.tar.gz", hash = "sha256:a88d1da76287cd27ca4ce7bcba1ebaf2890544a5f0bb6a5758fa84cef3bceccb", size = 138932, upload-time = "2026-02-18T21:07:39.525Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/0eb4cc5bb021b4ceaaa602c59ba16ce99256b9dd30981bef3f3a53d8555f/textstat-0.7.13-py3-none-any.whl", 
hash = "sha256:04b1ec995d1e8b2e628759497e6b23204a9ec91dcd652447d8cbba9478f25471", size = 177050, upload-time = "2026-02-18T21:07:38.163Z" }, +] + [[package]] name = "threadpoolctl" version = "3.6.0" @@ -5312,6 +5399,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/62/c8d562e7766786ba6587d09c5a8ba9f718ed3fa8af7f4553e8f91c36f302/xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9", size = 96555, upload-time = "2025-06-14T08:46:37.766Z" }, ] +[[package]] +name = "xlsxwriter" +version = "3.2.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, +] + [[package]] name = "xxhash" version = "3.7.0"