Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 105e56cf05 |
@@ -0,0 +1,559 @@
|
||||
"""Free-text / NLP distributions chapter (TEXT DISTR) for AutomaticEDA.
|
||||
|
||||
First chapter for **non-tabular** content: it profiles the linguistic content of
|
||||
any column holding long free text (reviews, descriptions, comments, tickets) that
|
||||
the categorical chapter cannot meaningfully summarize (high cardinality, many
|
||||
words per value). It is the cheap, model-free counterpart to ``cat_distr`` for
|
||||
columns that are prose rather than discrete labels.
|
||||
|
||||
Activation (returns ``None`` when it does not apply):
|
||||
|
||||
1. Cheap gate from the aggregated profile: at least one non-numeric column whose
|
||||
``categorical.len_mean`` (mean character length) is ``>= _MIN_LEN_CHARS``.
|
||||
A dataset whose only string columns are short labels (e.g. titanic's
|
||||
``Name``, ~27 chars) never passes this gate, so the chapter disappears with
|
||||
zero extra work and the existing report is untouched.
|
||||
2. Confirmation from a raw sample: each candidate column is sampled (push-down
|
||||
``extract_text_sample`` over ``ctx['db_path']``/``ctx['table']``, or an
|
||||
in-memory ``ctx['text_raw']`` for tests) and kept only if the **median word
|
||||
count is ``>= _MIN_WORDS``** — i.e. it is genuinely long text, not a long
|
||||
single token. If no column survives, the chapter returns ``None``.
|
||||
|
||||
Per surviving column the chapter emits, kept together on its own page/slide
|
||||
(``Group(page_break_before=...)``):
|
||||
|
||||
- a key/value summary (documents, length percentiles, vocabulary richness with
|
||||
**[[term:ttr]]TTR[[/term]]** and **[[term:hapax]]hapax legomena[[/term]]**,
|
||||
dominant language, exact-duplicate %, readability when available);
|
||||
- a word-count histogram figure;
|
||||
- a top-terms table + a horizontal bar figure;
|
||||
- bigram and trigram frequency tables;
|
||||
- a detected-language bar figure (when ``langdetect`` is available);
|
||||
- an optional word-cloud figure (only when ``wordcloud`` is installed);
|
||||
- a closing note on duplicates / readability degradation.
|
||||
|
||||
Every metric is delegated to pure ``eda`` registry functions
|
||||
(``compute_text_length_stats``, ``compute_vocabulary_stats``,
|
||||
``compute_top_ngrams``, ``detect_corpus_language``, ``compute_text_duplicates``,
|
||||
``compute_text_readability``) and the raw sample to ``extract_text_sample``; all
|
||||
are imported defensively so a missing function or optional library degrades that
|
||||
single piece to a note instead of aborting the chapter. Optional libraries
|
||||
(``langdetect``, ``textstat``, ``wordcloud``, ``datasketch``) are never required:
|
||||
the dependent piece is omitted when they are absent (the closing note reports it, except for the word cloud, which is dropped without a note).
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "text_distr"
|
||||
CHAPTER_TITLE = "Texto libre (NLP)"
|
||||
|
||||
# Cheap activation gate (characters): a non-numeric column whose mean string
|
||||
# length reaches this is a candidate for "long text". Short labels (titanic's
|
||||
# Name ≈ 27 chars) stay below it, so the chapter does not fire on them.
|
||||
_MIN_LEN_CHARS = 50
|
||||
# Confirmation gate (words): a candidate is kept only if its median document has
|
||||
# at least this many words — genuine prose, not a long id/URL token.
|
||||
_MIN_WORDS = 20
|
||||
# Bound the document so very wide datasets stay readable.
|
||||
_MAX_TEXT_COLS = 5
|
||||
# Raw text rows to sample per column when the chapter must extract them itself.
|
||||
_SAMPLE_ROWS = 2000
|
||||
# Rows shown in the frequency tables.
|
||||
_TOP_TERMS = 15
|
||||
_TOP_NGRAMS = 10
|
||||
|
||||
# Glossary terms this chapter explains (registered in the shared collector and
|
||||
# marked clickable on first appearance — same mechanism as cat_distr's entropía).
|
||||
_TERMS = {
|
||||
"ttr": (
|
||||
"TTR (type-token ratio)",
|
||||
"Riqueza léxica de un texto: número de palabras distintas (tipos) "
|
||||
"dividido por el número total de palabras (tokens). Vale 1 cuando no se "
|
||||
"repite ninguna palabra (máxima variedad) y baja hacia 0 cuando el "
|
||||
"vocabulario se repite mucho. Depende de la longitud del corpus, así que "
|
||||
"compara mejor textos de tamaño parecido."),
|
||||
"hapax": (
|
||||
"Hapax legomena",
|
||||
"Palabras que aparecen una sola vez en todo el corpus. Un porcentaje "
|
||||
"alto de hapax indica vocabulario muy variado o, a veces, ruido "
|
||||
"(erratas, identificadores, tokens raros). Se expresa como porcentaje "
|
||||
"sobre el número de palabras distintas."),
|
||||
}
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(value):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 2) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return str(value)
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}".replace(",", ".")
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "NaN"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 1) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}%"
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _truncate(text, limit: int = 40) -> str:
    """Return ``model._safe_str(text)`` shortened to roughly *limit* characters.

    Strings that already fit are returned unchanged. Longer ones are cut to
    ``max(1, limit - 1)`` characters, right-stripped, and suffixed with "…".
    """
    rendered = model._safe_str(text)
    if len(rendered) <= limit:
        return rendered
    keep = max(1, limit - 1)
    return rendered[:keep].rstrip() + "…"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Defensive wrappers around the registry functions: each returns the function's
|
||||
# output dict or a safe empty default, never raising and never importing at
|
||||
# module load (so the chapter stays importable even if a function is missing).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _length_stats(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
out = compute_text_length_stats(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _vocab_stats(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
out = compute_vocabulary_stats(texts, top_k=_TOP_TERMS)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _ngrams(texts, n) -> list:
|
||||
try:
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
out = compute_top_ngrams(texts, n=n, top_k=_TOP_NGRAMS)
|
||||
if isinstance(out, dict):
|
||||
return out.get("top") or []
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return []
|
||||
|
||||
|
||||
def _language(texts) -> dict:
|
||||
try:
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
out = detect_corpus_language(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {"available": False, "distribution": [], "dominant": None}
|
||||
|
||||
|
||||
def _duplicates(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
out = compute_text_duplicates(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _readability(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
out = compute_text_readability(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {"available": False, "flesch": {}}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Candidate detection + raw sample acquisition.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _candidate_columns(profile: dict) -> list:
|
||||
"""Cheap gate: non-numeric columns whose mean char length reaches the
|
||||
threshold. Returns the list of column names (possibly empty)."""
|
||||
out = []
|
||||
for col in profile.get("columns") or []:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
if col.get("inferred_type") == "numeric":
|
||||
continue
|
||||
cat = col.get("categorical")
|
||||
if not isinstance(cat, dict):
|
||||
continue
|
||||
len_mean = cat.get("len_mean")
|
||||
if isinstance(len_mean, (int, float)) and not isinstance(len_mean, bool) \
|
||||
and len_mean >= _MIN_LEN_CHARS:
|
||||
name = col.get("name")
|
||||
if name:
|
||||
out.append(str(name))
|
||||
return out
|
||||
|
||||
|
||||
def _get_samples(profile: dict, ctx: dict, columns: list) -> dict:
|
||||
"""Return {col: [str, ...]} raw text samples for the candidate columns.
|
||||
|
||||
Prefers an in-memory ``ctx['text_raw']`` (used by tests); otherwise pushes a
|
||||
sample down to the database via ``extract_text_sample`` using ctx db_path /
|
||||
table. Never raises: returns {} when no sample can be obtained."""
|
||||
text_raw = ctx.get("text_raw")
|
||||
if isinstance(text_raw, dict) and text_raw:
|
||||
return {c: [str(v) for v in (text_raw.get(c) or []) if v is not None]
|
||||
for c in columns if text_raw.get(c)}
|
||||
|
||||
db_path = ctx.get("db_path")
|
||||
table = ctx.get("table")
|
||||
if not db_path or not table:
|
||||
return {}
|
||||
backend = ctx.get("backend") or "duckdb"
|
||||
sample = ctx.get("sample") or _SAMPLE_ROWS
|
||||
try:
|
||||
from datascience.extract_text_sample import extract_text_sample
|
||||
out = extract_text_sample(db_path, table, columns, backend=backend,
|
||||
sample=sample)
|
||||
if isinstance(out, dict) and out.get("status") == "ok":
|
||||
cols = out.get("columns")
|
||||
if isinstance(cols, dict):
|
||||
return {c: list(v) for c, v in cols.items() if v}
|
||||
except Exception: # noqa: BLE001 — dict-no-throw: no sample → chapter omits.
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _confirm_long_text(samples: dict) -> dict:
|
||||
"""Keep only columns whose median word count reaches _MIN_WORDS. Returns
|
||||
{col: length_stats_dict} for the survivors, in input order."""
|
||||
survivors = {}
|
||||
for col, texts in samples.items():
|
||||
stats = _length_stats(texts)
|
||||
words = stats.get("words") if isinstance(stats, dict) else None
|
||||
median = words.get("p50") if isinstance(words, dict) else None
|
||||
if isinstance(median, (int, float)) and not isinstance(median, bool) \
|
||||
and median >= _MIN_WORDS:
|
||||
survivors[col] = stats
|
||||
return survivors
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Figures (lazy matplotlib, scaled by the renderers — same style as num_distr).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _hist_figure(name: str, length_stats: dict):
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
fig = Figure(figsize=(6.2, 3.0))
|
||||
ax = fig.add_subplot(111)
|
||||
bins = (length_stats or {}).get("word_hist") or []
|
||||
drew = False
|
||||
for b in bins:
|
||||
if not isinstance(b, dict):
|
||||
continue
|
||||
lo, hi, count = b.get("lo"), b.get("hi"), b.get("count") or 0
|
||||
if lo is None or hi is None:
|
||||
continue
|
||||
width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
|
||||
ax.bar(lo, count, width=width, align="edge", color="#9ec6df",
|
||||
edgecolor="#5b8aa6", linewidth=0.4)
|
||||
drew = True
|
||||
if not drew:
|
||||
ax.text(0.5, 0.5, "(sin datos de longitud)", ha="center",
|
||||
va="center", color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.set_xlabel("palabras por documento", fontsize=8)
|
||||
ax.set_ylabel("nº de documentos", fontsize=8)
|
||||
ax.tick_params(labelsize=7)
|
||||
for spine in ("top", "right"):
|
||||
ax.spines[spine].set_visible(False)
|
||||
ax.set_title(f"Longitud de «{_truncate(name, 30)}»", fontsize=10,
|
||||
loc="left")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
def _barh_figure(title: str, items: list, label_key: str, value_key: str,
|
||||
xlabel: str):
|
||||
"""Horizontal bar chart from [{label_key:..., value_key:...}, ...]."""
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
rows = [it for it in (items or []) if isinstance(it, dict)
|
||||
and isinstance(it.get(value_key), (int, float))]
|
||||
rows = rows[:12]
|
||||
fig = Figure(figsize=(6.2, max(2.2, 0.32 * len(rows) + 0.8)))
|
||||
ax = fig.add_subplot(111)
|
||||
if not rows:
|
||||
ax.text(0.5, 0.5, "(sin datos)", ha="center", va="center",
|
||||
color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.axis("off")
|
||||
return fig
|
||||
labels = [_truncate(r.get(label_key), 28) for r in rows][::-1]
|
||||
values = [float(r.get(value_key) or 0) for r in rows][::-1]
|
||||
ypos = range(len(rows))
|
||||
ax.barh(list(ypos), values, color="#9ec6df", edgecolor="#5b8aa6",
|
||||
linewidth=0.4)
|
||||
ax.set_yticks(list(ypos))
|
||||
ax.set_yticklabels(labels, fontsize=7)
|
||||
ax.set_xlabel(xlabel, fontsize=8)
|
||||
ax.tick_params(labelsize=7)
|
||||
for spine in ("top", "right"):
|
||||
ax.spines[spine].set_visible(False)
|
||||
ax.set_title(_truncate(title, 44), fontsize=10, loc="left")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
def _wordcloud_figure(texts):
|
||||
"""Word-cloud figure callable, or None if wordcloud is not installed."""
|
||||
try:
|
||||
import wordcloud # noqa: F401
|
||||
except Exception: # noqa: BLE001 — optional dependency: omit the figure.
|
||||
return None
|
||||
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
from wordcloud import WordCloud
|
||||
fig = Figure(figsize=(6.2, 3.2))
|
||||
ax = fig.add_subplot(111)
|
||||
joined = " ".join(t for t in texts if isinstance(t, str))
|
||||
try:
|
||||
wc = WordCloud(width=800, height=400, background_color="white",
|
||||
colormap="viridis").generate(joined)
|
||||
ax.imshow(wc, interpolation="bilinear")
|
||||
except Exception: # noqa: BLE001
|
||||
ax.text(0.5, 0.5, "(nube de palabras no disponible)", ha="center",
|
||||
va="center", color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.axis("off")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-column block assembly.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _summary_kv(n_docs, length_stats, vocab, lang, dup, read):
    """Build the per-column key/value summary table.

    Always emits rows for document count, character / word / sentence length
    and vocabulary (tokens, types, TTR, hapax). Adds a language row when
    ``lang["available"]`` is truthy, an exact-duplicates row when
    ``dup["n_docs"]`` is truthy, and a Flesch row when ``read["available"]``
    is truthy. Returns a ``model.KVTable``.
    """
    stats = length_stats or {}
    chars = stats.get("chars") or {}
    words = stats.get("words") or {}
    sents = stats.get("sentences") or {}

    def _spread(dist):
        # "mean · p50 · p90 · p99" for one length distribution.
        return " · ".join([
            _fmt_num(dist.get("mean")),
            _fmt_int(dist.get("p50")),
            _fmt_int(dist.get("p90")),
            _fmt_int(dist.get("p99")),
        ])

    rows = []
    rows.append(("Documentos", _fmt_int(n_docs)))
    rows.append(("Caracteres (media · p50 · p90 · p99)", _spread(chars)))
    rows.append(("Palabras (media · p50 · p90 · p99)", _spread(words)))
    rows.append(("Frases (media · máx)",
                 f"{_fmt_num(sents.get('mean'))} · {_fmt_int(sents.get('max'))}"))
    rows.append(("Vocabulario (tokens · tipos · TTR)",
                 " · ".join([_fmt_int(vocab.get("n_tokens")),
                             _fmt_int(vocab.get("n_types")),
                             _fmt_num(vocab.get("ttr"), 3)])))
    rows.append(("Hapax legomena",
                 f"{_fmt_int(vocab.get('n_hapax'))} "
                 f"({_fmt_pct(vocab.get('hapax_pct'))})"))

    if isinstance(lang, dict) and lang.get("available"):
        dominant = model._safe_str(lang.get("dominant")) or "—"
        language_count = len(lang.get("distribution") or [])
        rows.append(("Idioma dominante · nº idiomas",
                     f"{dominant} · {_fmt_int(language_count)}"))

    if isinstance(dup, dict) and dup.get("n_docs"):
        exact = _fmt_int(dup.get("n_exact_dup"))
        exact_pct = _fmt_pct(dup.get("exact_dup_pct"))
        rows.append(("Duplicados exactos", f"{exact} ({exact_pct})"))

    if isinstance(read, dict) and read.get("available"):
        flesch = read.get("flesch") or {}
        rows.append(("Legibilidad Flesch (media)",
                     _fmt_num(flesch.get("mean"), 1)))

    return model.KVTable(rows=rows, title="Resumen del texto")
|
||||
|
||||
|
||||
def _terms_table(vocab) -> "model.DataTable | None":
    """Build the top-terms table from ``vocab["top_terms"]``.

    Uses at most the first ``_TOP_TERMS`` entries and ignores non-dict ones.
    Each row is (truncated term, count, percentage). Returns ``None`` when no
    row can be built.
    """
    entries = (vocab or {}).get("top_terms") or []
    rows = []
    for entry in entries[:_TOP_TERMS]:
        if not isinstance(entry, dict):
            continue
        rows.append([
            _truncate(entry.get("term"), 32),
            _fmt_int(entry.get("count")),
            _fmt_pct(entry.get("pct")),
        ])
    if rows:
        return model.DataTable(header=["Término", "Conteo", "% tokens"],
                               rows=rows,
                               title="Términos más frecuentes",
                               note="stopwords ES+EN eliminadas")
    return None
|
||||
|
||||
|
||||
def _ngram_table(items, n_label) -> "model.DataTable | None":
    """Build an n-gram frequency table titled after *n_label*.

    Uses at most the first ``_TOP_NGRAMS`` entries of *items* and ignores
    non-dict ones. Each row is (truncated n-gram, count). Returns ``None``
    when no row can be built.
    """
    rows = []
    for entry in (items or [])[:_TOP_NGRAMS]:
        if not isinstance(entry, dict):
            continue
        rows.append([_truncate(entry.get("ngram"), 40),
                     _fmt_int(entry.get("count"))])
    if rows:
        return model.DataTable(header=[n_label, "Conteo"], rows=rows,
                               title=f"{n_label} más frecuentes")
    return None
|
||||
|
||||
|
||||
def _dup_note(dup, lang, read) -> "model.Note | None":
|
||||
bits = []
|
||||
if isinstance(dup, dict):
|
||||
nd = dup.get("near_dup") or {}
|
||||
if nd.get("available"):
|
||||
bits.append(
|
||||
f"casi-duplicados detectados (MinHash, umbral "
|
||||
f"{_fmt_num(nd.get('threshold'))}): "
|
||||
f"{_fmt_int(nd.get('n_near_dup_docs'))} documentos")
|
||||
else:
|
||||
bits.append("near-duplicados no calculados (datasketch no instalado; "
|
||||
"se reportan solo los duplicados exactos por hash)")
|
||||
if isinstance(lang, dict) and not lang.get("available"):
|
||||
bits.append("detección de idioma omitida (langdetect no instalado)")
|
||||
if isinstance(read, dict) and not read.get("available"):
|
||||
bits.append("legibilidad omitida (textstat no instalado)")
|
||||
if not bits:
|
||||
return None
|
||||
return model.Note(" · ".join(bits))
|
||||
|
||||
|
||||
def _column_group(name, texts, length_stats, idx, mark_terms):
    """Assemble every block for one text column into a ``model.Group``.

    Order of blocks: heading, summary table, word-count histogram, then —
    each only when it has content — top-terms table + bar figure, bigram
    table, trigram table, detected-language bar figure, word cloud and the
    closing note. The group starts on a new page for every column but the
    first (``idx > 0``). *mark_terms* is accepted but not read here.
    """
    vocab = _vocab_stats(texts)
    lang = _language(texts)
    dup = _duplicates(texts)
    read = _readability(texts)
    doc_count = (length_stats or {}).get("n_docs")

    heading = model.Heading(text=str(name), level=2)
    summary = _summary_kv(doc_count, length_stats, vocab, lang, dup, read)
    histogram = model.Figure(
        make=_hist_figure(name, length_stats),
        caption=f"Distribución de la longitud (palabras) de "
                f"«{_truncate(name, 30)}».")
    blocks = [heading, summary, histogram]

    terms = _terms_table(vocab)
    if terms is not None:
        terms_chart = _barh_figure(f"Top términos de «{_truncate(name, 24)}»",
                                   vocab.get("top_terms"), "term", "count",
                                   "conteo")
        blocks.extend([
            terms,
            model.Figure(make=terms_chart,
                         caption="Términos más frecuentes (barras)."),
        ])

    for size, label in ((2, "Bigrama"), (3, "Trigrama")):
        table = _ngram_table(_ngrams(texts, size), label)
        if table is not None:
            blocks.append(table)

    has_languages = (isinstance(lang, dict) and lang.get("available")
                     and lang.get("distribution"))
    if has_languages:
        lang_chart = _barh_figure(
            f"Idiomas detectados en «{_truncate(name, 24)}»",
            lang.get("distribution"), "lang", "count", "documentos")
        blocks.append(model.Figure(
            make=lang_chart,
            caption="Distribución de idiomas detectados (langdetect)."))

    cloud = _wordcloud_figure(texts)
    if cloud is not None:
        blocks.append(model.Figure(
            make=cloud, caption=f"Nube de palabras de «{_truncate(name, 30)}»."))

    closing = _dup_note(dup, lang, read)
    if closing is not None:
        blocks.append(closing)

    return model.Group(blocks=blocks, page_break_before=(idx > 0))
|
||||
|
||||
|
||||
def _intro_blocks(n_cols, mark_terms):
    """Return the chapter's opening blocks: level-1 heading + intro paragraph.

    When *mark_terms* is truthy the TTR and hapax mentions are wrapped in
    ``[[term:...]]`` glossary markers; otherwise they are plain text.
    *n_cols* is accepted but not read.
    """
    if mark_terms:
        ttr = "[[term:ttr]]TTR[[/term]]"
        hapax = "[[term:hapax]]hapax legomena[[/term]]"
    else:
        ttr = "TTR"
        hapax = "hapax legomena"

    pieces = [
        "Este capítulo perfila las columnas de **texto libre largo** del ",
        "dataset (reseñas, descripciones, comentarios): contenido lingüístico ",
        "que la distribución categórica no resume bien. Para cada columna se ",
        "muestran la longitud de los documentos, la riqueza de vocabulario ",
        f"(incluido el {ttr} y el porcentaje de {hapax}), los términos y ",
        "n-gramas más frecuentes, los idiomas detectados y el nivel de ",
        "duplicación. Las métricas son baratas y sin modelos pesados; las ",
        "piezas que dependen de una librería opcional se omiten si no está ",
        "instalada.",
    ]
    intro = "".join(pieces)
    return [
        model.Heading(text=CHAPTER_TITLE, level=1),
        model.Markdown(text=intro),
    ]
|
||||
|
||||
|
||||
def build_text_distr(profile: dict, ctx: dict):
    """Build the free-text chapter, or return ``None`` when it does not apply.

    Steps: (1) pick candidate columns from the profile alone; (2) obtain raw
    samples for them; (3) keep the columns confirmed as long text. Any empty
    step short-circuits to ``None``. Glossary terms are registered only after
    the chapter is known to apply, and only when ``ctx['glossary']`` is a
    ``model.GlossaryCollector``. At most ``_MAX_TEXT_COLS`` columns are
    rendered; a trailing note reports how many were left out.
    """
    profile = profile or {}
    ctx = ctx or {}

    # 1) Profile-only gate — no database access yet.
    candidates = _candidate_columns(profile)
    if not candidates:
        return None

    # 2) Raw samples, then 3) confirmation by median word count.
    samples = _get_samples(profile, ctx, candidates)
    survivors = _confirm_long_text(samples) if samples else None
    if not survivors:
        return None

    # The chapter applies: register the clickable glossary terms.
    glossary = ctx.get("glossary")
    mark_terms = isinstance(glossary, model.GlossaryCollector)
    if mark_terms:
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)

    blocks = list(_intro_blocks(len(survivors), mark_terms))

    shown = list(survivors.items())[:_MAX_TEXT_COLS]
    for position, (column, column_stats) in enumerate(shown):
        column_texts = samples.get(column) or []
        blocks.append(_column_group(column, column_texts, column_stats,
                                    position, mark_terms))

    omitted = len(survivors) - len(shown)
    if omitted > 0:
        blocks.append(model.Note(
            f"Se muestran las primeras {len(shown)} columnas de texto; "
            f"quedan {omitted} sin mostrar para mantener acotado el informe."))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -0,0 +1,256 @@
|
||||
"""Tests for the TEXT DISTR chapter — DoD: golden + edges + degradation.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles and feeds the raw text sample
|
||||
in-memory through ``ctx['text_raw']`` (no DuckDB needed), so the suite is fast
|
||||
and deterministic. Verifies that ``build_text_distr``:
|
||||
|
||||
- GOLDEN: with a long-text column, emits the chapter with its key blocks
|
||||
(length summary, word histogram, top-terms table, n-gram tables, language
|
||||
bars) and registers the clickable glossary terms; and that it renders inside
|
||||
the full document to both PDF and PPTX showing that content.
|
||||
- EDGE (None): a dataset whose only string column is short labels (titanic-like
|
||||
``Name``) yields ``None`` without raising — the existing report is untouched.
|
||||
- EDGE (None): a column that passes the cheap char gate but whose documents are
|
||||
short (median words below the threshold) is rejected at the confirmation step.
|
||||
- DEGRADATION: with ``langdetect`` / ``textstat`` / ``wordcloud`` unavailable,
|
||||
the chapter still builds (those pieces are omitted) and never raises.
|
||||
"""
|
||||
|
||||
import builtins
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import (
|
||||
DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown,
|
||||
Note,
|
||||
)
|
||||
from datascience.automatic_eda.chapters.text_distr import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_text_distr,
|
||||
)
|
||||
from datascience.automatic_eda.chapters_registry import build_document
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Synthetic corpus + profiles.
|
||||
# --------------------------------------------------------------------------- #
|
||||
_ES = [
|
||||
"El producto llegó en perfecto estado y mucho antes de lo previsto por la tienda",
|
||||
"La calidad de los materiales es realmente excelente y se nota la diferencia al usarlo",
|
||||
"No me convenció del todo porque esperaba bastante más por el precio que pagué finalmente",
|
||||
"El servicio de atención al cliente fue rápido amable y resolvió mi problema sin demora",
|
||||
"Lo recomiendo totalmente ya que ha superado con creces todas mis expectativas iniciales",
|
||||
]
|
||||
_EN = [
|
||||
"The product arrived in perfect condition and much earlier than the store had promised me",
|
||||
"The build quality is genuinely outstanding and you can really feel the difference using it",
|
||||
"I was not fully convinced because I expected quite a lot more for the price i finally paid",
|
||||
"Customer support was fast friendly and solved my whole problem without any delay at all",
|
||||
"I highly recommend it since it has exceeded by far every one of my initial expectations",
|
||||
]
|
||||
|
||||
|
||||
def _long_reviews(n=40) -> list:
    """Synthesize *n* two-sentence reviews plus two exact duplicates.

    Every third review (index divisible by 3) is taken from the English pool,
    the rest from the Spanish one. Each review joins two sentences of the same
    pool. The first two reviews are appended again at the end as duplicates.
    """
    reviews = []
    for idx in range(n):
        pool = _EN if idx % 3 == 0 else _ES  # mostly ES, some EN
        first = pool[idx % len(pool)]
        second = pool[(idx + 2) % len(pool)]
        reviews.append(f"{first}. {second}.")
    # Exact duplicates for the duplicate-detection metrics.
    reviews.append(reviews[0])
    reviews.append(reviews[1])
    return reviews
|
||||
|
||||
|
||||
def _text_profile() -> dict:
|
||||
"""Profile with a long free-text column (review) + a numeric + a short cat."""
|
||||
return {
|
||||
"table": "reviews",
|
||||
"source": "/data/reviews.duckdb",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 42,
|
||||
"n_cols": 3,
|
||||
"quality_score": 88.0,
|
||||
"columns": [
|
||||
{
|
||||
"name": "review",
|
||||
"inferred_type": "categorical",
|
||||
"categorical": {
|
||||
"top": [{"value": "x", "count": 2, "pct": 0.05}],
|
||||
"n_distinct": 40,
|
||||
"len_mean": 180.0,
|
||||
"len_min": 80,
|
||||
"len_max": 220,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "rating",
|
||||
"inferred_type": "numeric",
|
||||
"numeric": {"mean": 3.1, "median": 3.0, "std": 1.2,
|
||||
"min": 1, "max": 5},
|
||||
},
|
||||
{
|
||||
"name": "product",
|
||||
"inferred_type": "categorical",
|
||||
"categorical": {
|
||||
"top": [{"value": "teclado", "count": 10, "pct": 0.25}],
|
||||
"n_distinct": 6,
|
||||
"len_mean": 7.0,
|
||||
"len_min": 5, "len_max": 11,
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _no_text_profile() -> dict:
|
||||
"""titanic-like: the only string column is short labels (Name ≈ 27 chars)."""
|
||||
return {
|
||||
"table": "titanic",
|
||||
"n_rows": 891,
|
||||
"n_cols": 3,
|
||||
"columns": [
|
||||
{"name": "Age", "inferred_type": "numeric",
|
||||
"numeric": {"mean": 29.7, "median": 28.0, "std": 14.5}},
|
||||
{"name": "Name", "inferred_type": "categorical",
|
||||
"categorical": {"top": [{"value": "Braund, Mr. Owen Harris",
|
||||
"count": 1, "pct": 0.001}],
|
||||
"n_distinct": 891, "len_mean": 27.0,
|
||||
"len_min": 12, "len_max": 82}},
|
||||
{"name": "Sex", "inferred_type": "categorical",
|
||||
"categorical": {"top": [{"value": "male", "count": 577,
|
||||
"pct": 0.65}],
|
||||
"n_distinct": 2, "len_mean": 4.6,
|
||||
"len_min": 4, "len_max": 6}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _flatten(blocks) -> list:
|
||||
"""Recursively flatten Group blocks so tests can inspect leaf blocks."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if isinstance(b, Group):
|
||||
out.extend(_flatten(b.blocks))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_activa_con_texto():
    """Golden path: a long-text column activates the chapter with its blocks."""
    collector = GlossaryCollector()
    chapter = build_text_distr(
        _text_profile(),
        {"text_raw": {"review": _long_reviews()}, "glossary": collector},
    )

    assert chapter is not None, "el capítulo debe activarse con una columna de texto largo"
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION

    leaves = _flatten(chapter.blocks)
    kinds = [leaf.kind for leaf in leaves]
    # heading, summary, histogram / bars, top terms + n-grams
    for expected_kind in ("heading", "kv_table", "figure", "data_table"):
        assert expected_kind in kinds

    # The key/value summary names the vocabulary metrics.
    summary = next(leaf for leaf in leaves if isinstance(leaf, KVTable))
    labels = " ".join(str(row[0]) for row in summary.rows)
    assert "TTR" in labels
    assert "Hapax" in labels or "hapax" in labels

    # A terms table and at least one n-gram table are present.
    table_titles = [getattr(leaf, "title", "") or ""
                    for leaf in leaves if isinstance(leaf, DataTable)]
    assert any("Términos" in title for title in table_titles)
    assert any("Bigrama" in title for title in table_titles)

    # Both glossary terms were registered (clickable destinations).
    assert collector.has("ttr")
    assert collector.has("hapax")
|
||||
|
||||
|
||||
def test_golden_render_pdf_pptx():
    """Golden path: the chapter is in the document and renders to PDF and PPTX."""
    profile = _text_profile()
    ctx = {"text_raw": {"review": _long_reviews()},
           "dataset_name": "reviews"}

    chapter_ids = [chapter.id for chapter in build_document(profile, ctx)]
    assert "text_distr" in chapter_ids, f"text_distr ausente en {chapter_ids}"

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "t.pdf")
        pptx_path = os.path.join(workdir, "t.pptx")
        pdf_result = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA", "ctx": ctx})
        pptx_result = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA", "ctx": ctx})
        assert pdf_result.get("path") and os.path.exists(pdf_path)
        assert pptx_result.get("path") and os.path.exists(pptx_path)

        pdf_text = "\n".join(page.extract_text() or ""
                             for page in PdfReader(pdf_path).pages)
        assert "Texto libre" in pdf_text or "TTR" in pdf_text

        slide_texts = [shape.text_frame.text
                       for slide in Presentation(pptx_path).slides
                       for shape in slide.shapes
                       if shape.has_text_frame]
        pptx_text = "\n".join(slide_texts)
        assert "Texto libre" in pptx_text or "TTR" in pptx_text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges — None.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_edge_none_sin_texto_largo():
    """A titanic-like profile with only short labels must not produce the chapter."""
    chapter = build_text_distr(_no_text_profile(), {})
    assert chapter is None
|
||||
|
||||
|
||||
def test_edge_none_palabras_cortas():
    """The raw-sample confirmation must reject documents with too few words.

    The profile passes the character-length gate, but every sampled document
    has only three words, so the builder is expected to return ``None``.
    """
    three_word_doc = "palabra " * 3  # 3 words each, < _MIN_WORDS
    sample = {"review": [three_word_doc] * 30}
    chapter = build_text_distr(_text_profile(), {"text_raw": sample})
    assert chapter is None
|
||||
|
||||
|
||||
def test_edge_none_empty_profile():
    """Empty or missing profile/context must yield ``None`` rather than raise."""
    for profile, ctx in (({}, {}), (None, None)):
        assert build_text_distr(profile, ctx) is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Degradation — optional libs absent.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_degradacion_sin_libs(monkeypatch):
    """The chapter must still build when every optional NLP library is missing.

    ``builtins.__import__`` is patched so that importing any of the optional
    packages (or one of their submodules) raises ``ImportError``. The chapter
    must still contain a key/value table and a data table, plus a note that
    names at least one of the missing libraries.
    """
    optional_libs = ("langdetect", "textstat", "wordcloud", "datasketch")
    original_import = builtins.__import__

    def fake_import(name, *a, **k):
        # The top-level package name decides: this blocks "pkg" and "pkg.sub".
        if name.partition(".")[0] in optional_libs:
            raise ImportError(f"simulado: {name}")
        return original_import(name, *a, **k)

    monkeypatch.setattr(builtins, "__import__", fake_import)

    ctx = {"text_raw": {"review": _long_reviews()}}
    chapter = build_text_distr(_text_profile(), ctx)

    # The stdlib-only pieces remain, so the chapter is still produced.
    assert chapter is not None
    leaves = _flatten(chapter.blocks)
    for block_type in (KVTable, DataTable):
        assert any(isinstance(leaf, block_type) for leaf in leaves)

    # A degradation note mentions the missing optional libraries.
    note_text = " ".join(leaf.text for leaf in leaves if isinstance(leaf, Note))
    assert any(lib in note_text for lib in ("langdetect", "textstat", "datasketch"))
|
||||
@@ -31,6 +31,7 @@ CHAPTER_ORDER = [
|
||||
"analisis_llm", # LLM interpretation — sits next to overview (user request)
|
||||
"num_distr", # numeric distributions
|
||||
"cat_distr", # categorical distributions
|
||||
"text_distr", # free-text / NLP distributions (non-tabular content)
|
||||
"calidad", # data quality
|
||||
"correlacion", # correlations / associations
|
||||
"relaciones", # key relations: declared/candidate PK + FK (inter/intra-table)
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
---
|
||||
id: compute_text_duplicates_py_datascience
|
||||
name: compute_text_duplicates
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict"
|
||||
description: "Detecta documentos duplicados en un corpus de texto. Los duplicados EXACTOS se calculan siempre con la stdlib: cada documento se normaliza (colapsa espacios, strip, lower) y se hashea con SHA-1; n_exact_dup es cuántos docs repiten uno ya visto y exact_dup_pct su porcentaje. Los CASI-duplicados (near-dup) usan la dependencia OPCIONAL datasketch (MinHash + LSH sobre 3-shingles de palabras); si no está instalada, esa parte degrada a available:False sin afectar al resto. Estilo dict-no-throw del grupo eda — nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, duplicates, minhash, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [hashlib, re]
|
||||
example: |
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
texts = ["El gato come pescado", "El gato come pescado", "Un perro ladra"]
|
||||
result = compute_text_duplicates(texts)
|
||||
# {"n_docs": 3, "n_exact_dup": 1, "exact_dup_pct": 33.33, "n_unique": 2,
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_duplicados_exactos"
|
||||
- "test_sin_duplicados"
|
||||
- "test_vacio"
|
||||
- "test_near_dup_degrada"
|
||||
test_file_path: "python/functions/datascience/compute_text_duplicates_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_duplicates.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de documentos de texto. Los elementos None o que no sean str se descartan silenciosamente; n_docs cuenta solo los documentos válidos. None como argumento se trata como lista vacía."
|
||||
- name: near_threshold
|
||||
desc: "Umbral de similitud Jaccard (0–1) para considerar dos documentos casi-duplicados en el cálculo near-dup vía MinHashLSH. Solo aplica si datasketch está instalada. Default 0.85."
|
||||
- name: sample_max
|
||||
desc: "Número máximo de documentos muestreados (los primeros) para el cálculo near-dup, que es O(n) en memoria de MinHashes. No afecta al conteo de duplicados exactos, que siempre recorre todo el corpus. Default 2000."
|
||||
output: "Dict con exactamente 5 claves, siempre presentes: n_docs (int, docs válidos), n_exact_dup (int, docs que repiten un texto normalizado ya visto = n_docs - n_unique), exact_dup_pct (float a 2 decimales = n_exact_dup/n_docs*100, o None si el corpus está vacío), n_unique (int, nº de textos normalizados distintos), y near_dup (sub-dict con available:bool y n_near_dup_docs:int; cuando available es True incluye además threshold con el near_threshold usado). La función nunca lanza: captura toda excepción y degrada."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
|
||||
# Tres copias del mismo texto (con espacios/casing distintos) + dos únicos.
|
||||
texts = [
|
||||
"El gato come pescado",
|
||||
"El gato come pescado",
|
||||
"el GATO come pescado", # mismo tras normalizar
|
||||
"Un perro ladra",
|
||||
"La luna brilla",
|
||||
]
|
||||
|
||||
compute_text_duplicates(texts)
|
||||
# {
|
||||
# "n_docs": 5,
|
||||
# "n_exact_dup": 2, # 3 copias del primer texto => 2 repeticiones
|
||||
# "exact_dup_pct": 40.0, # 2 / 5 * 100
|
||||
# "n_unique": 3, # 3 textos normalizados distintos
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}, # datasketch ausente
|
||||
# }
|
||||
|
||||
# Corpus vacío: contrato estable, exact_dup_pct None, sin excepción.
|
||||
compute_text_duplicates([])
|
||||
# {"n_docs": 0, "n_exact_dup": 0, "exact_dup_pct": None, "n_unique": 0,
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en la fase de calidad de un EDA de texto, cuando quieras saber cuánto de
|
||||
tu corpus es ruido duplicado antes de entrenar, vectorizar o muestrear: te da
|
||||
el porcentaje de duplicados exactos (`exact_dup_pct`), el número de documentos
|
||||
únicos (`n_unique`) y, si tienes `datasketch` instalada, una estimación de
|
||||
casi-duplicados (paráfrasis, copias con pequeñas ediciones) vía MinHash + LSH.
|
||||
Pásale directamente la columna/lista de textos crudos; la función filtra None y
|
||||
no-str por ti y nunca lanza, así que es segura para encadenar en pipelines de
|
||||
perfilado.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Near-dup requiere `datasketch` (opcional).** Si la librería no está
|
||||
instalada, `near_dup` degrada a `{"available": False, "n_near_dup_docs": 0}`
|
||||
(sin clave `threshold`) y el resto del resultado se calcula igual. Los
|
||||
duplicados **exactos** funcionan siempre porque solo usan la stdlib (hash).
|
||||
- **Normalización de exactos.** Dos textos cuentan como el mismo duplicado
|
||||
exacto si coinciden tras `" ".join(doc.split()).strip().lower()`: se colapsan
|
||||
espacios/tabuladores/saltos, se recortan extremos y se ignora el caso. Cambios
|
||||
de puntuación o acentos SÍ los distinguen (no se eliminan).
|
||||
- **`n_exact_dup` cuenta repeticiones, no grupos.** Con 3 copias de un mismo
|
||||
texto, `n_exact_dup` es 2 (las dos copias extra), no 1. Equivale a
|
||||
`n_docs - n_unique`.
|
||||
- **`exact_dup_pct` es `None` con corpus vacío** (no `ZeroDivisionError`); en
|
||||
cualquier otro caso es un float redondeado a 2 decimales.
|
||||
- **`sample_max` solo limita el near-dup.** El conteo de duplicados exactos
|
||||
recorre todo el corpus; el near-dup muestrea los primeros `sample_max`
|
||||
documentos para acotar memoria. Si el corpus está ordenado, considera barajar
|
||||
antes para que la muestra sea representativa.
|
||||
- **Elementos no-str se descartan.** `True`/`False` no cuentan como str y se
|
||||
ignoran igual que `None`; `n_docs` refleja solo los documentos válidos.
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Detección de documentos duplicados en un corpus de texto.
|
||||
|
||||
Función pura, estilo dict-no-throw del grupo `eda`: nunca lanza, siempre
|
||||
devuelve el mismo contrato de claves. Los duplicados EXACTOS se calculan
|
||||
siempre con la stdlib (normalización + hash SHA-1). Los CASI-duplicados
|
||||
(near-dup) requieren la dependencia opcional `datasketch`; si no está
|
||||
instalada, esa parte degrada limpiamente a ``available: False`` sin afectar
|
||||
al resto del cálculo.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
|
||||
def _compute_near_dup(valid, near_threshold, sample_max):
|
||||
"""Cuenta documentos con al menos otro casi-duplicado vía MinHash + LSH.
|
||||
|
||||
Import perezoso de ``datasketch``. Si la librería no está disponible (o
|
||||
cualquier paso falla), degrada a ``{"available": False, "n_near_dup_docs": 0}``
|
||||
sin propagar la excepción.
|
||||
|
||||
Args:
|
||||
valid: lista de str ya filtrada (sin None ni no-str).
|
||||
near_threshold: umbral de similitud Jaccard para LSH.
|
||||
sample_max: número máximo de documentos a muestrear.
|
||||
|
||||
Returns:
|
||||
dict con ``available`` (bool) y ``n_near_dup_docs`` (int). Cuando
|
||||
``available`` es True, incluye además ``threshold``.
|
||||
"""
|
||||
try:
|
||||
from datasketch import MinHash, MinHashLSH
|
||||
except Exception:
|
||||
return {"available": False, "n_near_dup_docs": 0}
|
||||
|
||||
try:
|
||||
docs = valid[:sample_max]
|
||||
num_perm = 128
|
||||
lsh = MinHashLSH(threshold=near_threshold, num_perm=num_perm)
|
||||
minhashes = {}
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
tokens = re.findall(r"\w+", doc.lower())
|
||||
shingles = set()
|
||||
for j in range(len(tokens) - 2):
|
||||
shingles.add(" ".join(tokens[j:j + 3]))
|
||||
# Documentos con menos de 3 tokens no generan 3-shingles: caemos a
|
||||
# los tokens sueltos para no perderlos del todo.
|
||||
if not shingles:
|
||||
shingles = set(tokens)
|
||||
if not shingles:
|
||||
# Documento sin tokens (cadena vacía / solo símbolos): se omite.
|
||||
continue
|
||||
m = MinHash(num_perm=num_perm)
|
||||
for sh in shingles:
|
||||
m.update(sh.encode("utf-8"))
|
||||
key = "d{}".format(i)
|
||||
minhashes[key] = m
|
||||
lsh.insert(key, m)
|
||||
|
||||
n_near = 0
|
||||
for key, m in minhashes.items():
|
||||
matches = lsh.query(m)
|
||||
if len(matches) > 1:
|
||||
n_near += 1
|
||||
|
||||
return {
|
||||
"available": True,
|
||||
"n_near_dup_docs": int(n_near),
|
||||
"threshold": near_threshold,
|
||||
}
|
||||
except Exception:
|
||||
return {"available": False, "n_near_dup_docs": 0}
|
||||
|
||||
|
||||
def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict:
    """Detect exact and near-duplicate documents in a text corpus.

    Exact duplicates are always computed with the standard library: each
    document is normalized (whitespace runs collapsed, ends trimmed,
    lower-cased) and hashed with SHA-1. Near-duplicates are delegated to
    ``_compute_near_dup``, which degrades on its own when ``datasketch`` is
    not installed.

    Args:
        texts: iterable of documents. ``None`` entries and non-``str`` items
            are dropped; ``n_docs`` counts only the kept ones. ``None`` as the
            argument is treated as an empty corpus.
        near_threshold: similarity threshold for the near-duplicate pass.
        sample_max: cap on the documents examined by the near-duplicate pass.
            It does not affect the exact-duplicate count.

    Returns:
        dict with the keys ``n_docs``, ``n_exact_dup``, ``exact_dup_pct``
        (float rounded to 2 decimals, or ``None`` for an empty corpus),
        ``n_unique`` and ``near_dup`` (sub-dict with ``available`` and
        ``n_near_dup_docs``, plus ``threshold`` when available).
        The function never raises: failures degrade instead.
    """
    # Defensive filtering: a non-iterable argument degrades to an empty corpus.
    try:
        valid = [t for t in texts if isinstance(t, str)] if texts is not None else []
    except Exception:
        valid = []

    n_docs = len(valid)

    # Exact duplicates: normalize + SHA-1 digest (stdlib, always available).
    try:
        seen = set()
        for doc in valid:
            # split()/join already trims both ends, so no extra strip is needed.
            norm = " ".join(doc.split()).lower()
            # "surrogatepass" lets strings holding lone surrogates (e.g. text
            # decoded with surrogateescape) be hashed instead of raising
            # UnicodeEncodeError, which used to zero the counts for the whole
            # corpus. Bytes for well-formed text are unchanged.
            encoded = norm.encode("utf-8", "surrogatepass")
            seen.add(hashlib.sha1(encoded).hexdigest())
        n_unique = len(seen)
        # Every document beyond the first of its group is one repetition.
        n_exact_dup = n_docs - n_unique
    except Exception:
        n_exact_dup = 0
        n_unique = 0

    exact_dup_pct = round(n_exact_dup / n_docs * 100, 2) if n_docs > 0 else None

    # Near-duplicates: optional, degrades by itself.
    near_dup = _compute_near_dup(valid, near_threshold, sample_max)

    return {
        "n_docs": n_docs,
        "n_exact_dup": n_exact_dup,
        "exact_dup_pct": exact_dup_pct,
        "n_unique": n_unique,
        "near_dup": near_dup,
    }
|
||||
@@ -0,0 +1,77 @@
|
||||
"""Tests para compute_text_duplicates.
|
||||
|
||||
Importa el modulo hoja directamente (`datascience.compute_text_duplicates`)
|
||||
para no depender de que el paquete reexporte la funcion en su __init__.
|
||||
datasketch normalmente NO esta instalada en el venv, asi que near_dup
|
||||
degrada a available=False; los tests no requieren la libreria.
|
||||
"""
|
||||
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
|
||||
|
||||
EXPECTED_KEYS = {"n_docs", "n_exact_dup", "exact_dup_pct", "n_unique", "near_dup"}
|
||||
|
||||
|
||||
def test_duplicados_exactos():
    """Three copies of one text plus two distinct texts give 2 repeats (40 %)."""
    corpus = [
        "El gato come pescado",
        "El gato come pescado",
        "el GATO come pescado",  # same text once normalized
        "Un perro ladra",
        "La luna brilla",
    ]
    outcome = compute_text_duplicates(corpus)

    assert set(outcome.keys()) == EXPECTED_KEYS
    # Three copies of the first text (two of them repeats) + two unique texts.
    counts = (outcome["n_docs"], outcome["n_exact_dup"], outcome["n_unique"])
    assert counts == (5, 2, 3)

    pct = outcome["exact_dup_pct"]
    assert pct is not None
    assert pct > 0
    # 2 / 5 * 100 = 40.0
    assert abs(pct - 40.0) < 1e-9
|
||||
|
||||
|
||||
def test_sin_duplicados():
    """A corpus without repetitions reports zero duplicates and all-unique docs."""
    corpus = [
        "primero documento distinto",
        "segundo documento distinto",
        "tercero documento distinto",
    ]
    outcome = compute_text_duplicates(corpus)

    counts = (outcome["n_docs"], outcome["n_exact_dup"], outcome["n_unique"])
    assert counts == (3, 0, 3)
    assert abs(outcome["exact_dup_pct"] - 0.0) < 1e-9
|
||||
|
||||
|
||||
def test_vacio():
    """An empty corpus keeps the key contract, with zero counts and a None pct."""
    outcome = compute_text_duplicates([])

    assert set(outcome.keys()) == EXPECTED_KEYS
    for key in ("n_docs", "n_exact_dup"):
        assert outcome[key] == 0
    assert outcome["exact_dup_pct"] is None
    assert outcome["n_unique"] == 0
    assert outcome["near_dup"]["n_near_dup_docs"] == 0
|
||||
|
||||
|
||||
def test_near_dup_degrada():
    """``near_dup`` always exposes typed fields, and bad items are tolerated."""
    corpus = ["uno dos tres cuatro", "uno dos tres cuatro cinco", "algo distinto"]
    near = compute_text_duplicates(corpus)["near_dup"]

    for field, expected_type in (("available", bool), ("n_near_dup_docs", int)):
        assert field in near
        assert isinstance(near[field], expected_type)

    # None and non-str entries are dropped without raising.
    mixed = compute_text_duplicates(["hola", None, 123, "hola"])
    assert mixed["n_docs"] == 2
    assert mixed["n_exact_dup"] == 1
|
||||
@@ -0,0 +1,86 @@
|
||||
---
|
||||
id: compute_text_length_stats_py_datascience
|
||||
name: compute_text_length_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_length_stats(texts, n_bins=20) -> dict"
|
||||
description: "Profiles the length distribution of a corpus of text documents for EDA: per-document characters, words (unicode \\w+ tokens) and sentences (segments split on .!?… with a minimum of 1 per non-empty doc), each summarized with mean/p50/p90/p99/min/max (nearest-rank percentiles), plus an equal-width histogram of per-document word counts. None and non-str items are discarded. Dict-no-throw: never raises. Stdlib only (re)."
|
||||
tags: [eda, datascience, text, nlp, length, statistics, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, math]
|
||||
example: |
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
result = compute_text_length_stats(["Hola mundo.", "Una frase mas larga aqui."], n_bins=5)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_basico"
|
||||
- "test_vacio"
|
||||
- "test_descarta_none"
|
||||
- "test_un_documento"
|
||||
test_file_path: "python/functions/datascience/compute_text_length_stats_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_length_stats.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "List of text documents (str). None entries and any non-str items (ints, floats, etc.) are discarded before any computation. An empty string \"\" is kept (chars 0, words 0, sentences 0)."
|
||||
- name: n_bins
|
||||
desc: "Number of equal-width bins for the per-document word-count histogram. Default 20. When all docs have the same word count, there are <2 docs, or n_bins < 1, a single covering bin is returned instead."
|
||||
output: "Dict with keys n_docs (int), chars, words, sentences and word_hist. Each of the three axis sub-dicts has the exact keys mean (float, 2 decimals), p50, p90, p99, min, max (ints). When there are no valid documents, n_docs is 0, every axis statistic is None and word_hist is []. word_hist is a list of {lo: float, hi: float, count: int} bins; the sum of all bin counts equals n_docs."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
|
||||
compute_text_length_stats(
|
||||
[
|
||||
"Hola mundo.",
|
||||
"Una frase mas larga con varias palabras aqui.",
|
||||
"Esto. Tiene. Tres frases distintas!",
|
||||
],
|
||||
n_bins=5,
|
||||
)
|
||||
# {
|
||||
# "n_docs": 3,
|
||||
# "chars": {"mean": 30.33, "p50": 35, "p90": 45, "p99": 45, "min": 11, "max": 45},
|
||||
# "words": {"mean": 5.0, "p50": 5, "p90": 8, "p99": 8, "min": 2, "max": 8},
|
||||
# "sentences": {"mean": 1.67, "p50": 1, "p90": 3, "p99": 3, "min": 1, "max": 3},
|
||||
# "word_hist": [
|
||||
# {"lo": 2.0, "hi": 3.2, "count": 1},
|
||||
# {"lo": 3.2, "hi": 4.4, "count": 0},
|
||||
# {"lo": 4.4, "hi": 5.6, "count": 1},
|
||||
# {"lo": 5.6, "hi": 6.8, "count": 0},
|
||||
# {"lo": 6.8, "hi": 8.0, "count": 1},
|
||||
# ],
|
||||
# }
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al perfilar una columna o corpus de texto libre en un EDA: cuando
|
||||
necesites saber lo largos que son los documentos (en caracteres, palabras y
|
||||
frases) y cómo se reparte esa longitud antes de tokenizar, vectorizar o decidir
|
||||
truncados/ventanas para un modelo. Pásale la lista de strings crudos de la
|
||||
columna; `None` y valores no-texto se descartan solos. Encaja en el grupo `eda`
|
||||
como bloque de longitud junto a `summarize_categorical`.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Función pura, solo stdlib (`re` y `math`). No usa numpy, pandas ni sklearn.
|
||||
- Percentiles por método **nearest-rank** (devuelven un valor real de la lista,
|
||||
no interpolan); por eso p50/p90/p99/min/max son enteros y `mean` es el único
|
||||
float (redondeado a 2 decimales).
|
||||
- El conteo de frases es una **aproximación** por puntuación (`.!?…`): un texto
|
||||
sin esa puntuación cuenta como 1 frase si no está vacío; abreviaturas o
|
||||
elipsis pueden inflar o reducir el conteo.
|
||||
- `word_hist` es equal-width entre min y max de palabras: con todos los docs
|
||||
del mismo tamaño, menos de 2 docs, o `n_bins < 1`, devuelve un único bin.
|
||||
- Dict-no-throw: ante input inesperado devuelve la forma vacía
|
||||
(`n_docs` 0, ejes `None`, `word_hist` []) en vez de lanzar.
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Pure EDA helper: document length distribution for the `eda` group.
|
||||
|
||||
Given a list of text documents, computes the length distribution along three
|
||||
axes (characters, words and sentences) plus an equal-width histogram of the
|
||||
per-document word counts. Stdlib only (``re`` + ``statistics`` semantics via a
|
||||
hand-rolled nearest-rank percentile). No numpy, no sklearn.
|
||||
|
||||
The function is dict-no-throw: it never raises. On any unexpected input it
|
||||
degrades to the empty-shape result.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
|
||||
_WORD_RE = re.compile(r"\w+", re.UNICODE)
|
||||
_SENT_RE = re.compile(r"[.!?…]+")
|
||||
|
||||
|
||||
def _empty_axis() -> dict:
|
||||
"""Return an axis sub-dict with every statistic set to ``None``."""
|
||||
return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None}
|
||||
|
||||
|
||||
def _pct(sorted_vals, q):
|
||||
"""Nearest-rank percentile of an already-sorted list.
|
||||
|
||||
Args:
|
||||
sorted_vals: List of numbers sorted ascending.
|
||||
q: Percentile in the 0..100 range.
|
||||
|
||||
Returns:
|
||||
The value at the nearest rank, or ``None`` for an empty list.
|
||||
"""
|
||||
n = len(sorted_vals)
|
||||
if n == 0:
|
||||
return None
|
||||
if q <= 0:
|
||||
return sorted_vals[0]
|
||||
rank = math.ceil(q / 100.0 * n)
|
||||
if rank < 1:
|
||||
rank = 1
|
||||
if rank > n:
|
||||
rank = n
|
||||
return sorted_vals[rank - 1]
|
||||
|
||||
|
||||
def _axis_stats(values) -> dict:
    """Summarize a list of integer counts as mean/p50/p90/p99/min/max.

    ``mean`` is rounded to 2 decimals; the remaining statistics are cast to
    ``int``. An empty list yields the all-``None`` axis.
    """
    if not values:
        return _empty_axis()

    ordered = sorted(values)
    stats = {"mean": round(sum(ordered) / len(ordered), 2)}
    for label, quantile in (("p50", 50), ("p90", 90), ("p99", 99)):
        stats[label] = int(_pct(ordered, quantile))
    stats["min"] = int(ordered[0])
    stats["max"] = int(ordered[-1])
    return stats
|
||||
|
||||
|
||||
def _word_hist(word_counts, n_bins) -> list:
|
||||
"""Equal-width histogram of per-document word counts.
|
||||
|
||||
Builds ``n_bins`` bins between ``min`` and ``max`` of the word counts. When
|
||||
every document has the same number of words, there are fewer than 2
|
||||
documents, or ``n_bins`` is not at least 1, a single covering bin is
|
||||
returned. With no documents the result is ``[]``. The sum of bin ``count``
|
||||
always equals ``len(word_counts)``.
|
||||
"""
|
||||
if not word_counts:
|
||||
return []
|
||||
wmin = min(word_counts)
|
||||
wmax = max(word_counts)
|
||||
if wmax == wmin or len(word_counts) < 2 or n_bins < 1:
|
||||
return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}]
|
||||
|
||||
width = (wmax - wmin) / n_bins
|
||||
bins = []
|
||||
for i in range(n_bins):
|
||||
lo = wmin + i * width
|
||||
hi = wmin + (i + 1) * width
|
||||
bins.append({"lo": float(lo), "hi": float(hi), "count": 0})
|
||||
# Pin the last upper edge to the real maximum to avoid float drift.
|
||||
bins[-1]["hi"] = float(wmax)
|
||||
|
||||
for wc in word_counts:
|
||||
if wc >= wmax:
|
||||
idx = n_bins - 1
|
||||
else:
|
||||
idx = int((wc - wmin) / width)
|
||||
if idx < 0:
|
||||
idx = 0
|
||||
elif idx >= n_bins:
|
||||
idx = n_bins - 1
|
||||
bins[idx]["count"] += 1
|
||||
return bins
|
||||
|
||||
|
||||
def compute_text_length_stats(texts, n_bins=20) -> dict:
    """Summarize the length distribution of a corpus of text documents.

    Three lengths are measured per document: characters (``len(doc)``), words
    (matches of the module's ``\\w+`` pattern) and sentences (non-blank
    segments after splitting on ``.!?…``, with a minimum of 1 for any
    non-empty document). Each axis reports mean, p50, p90, p99, min and max,
    and the word counts are also binned into an equal-width histogram.

    ``None`` entries and any non-``str`` items in ``texts`` are discarded.
    The function never raises: on ``None``/empty/non-iterable input or any
    internal error it returns the empty-shape result (``n_docs`` 0,
    all-``None`` axes, ``[]`` histogram).

    Args:
        texts: Iterable of text documents (``str``). Any iterable is accepted,
            including containers whose truth value is ambiguous (such as
            numpy arrays).
        n_bins: Number of equal-width bins for the word-count histogram.
            Default 20.

    Returns:
        Dict with keys ``n_docs``, ``chars``, ``words``, ``sentences`` and
        ``word_hist``. Each axis is a sub-dict with ``mean`` (float, 2
        decimals), ``p50``, ``p90``, ``p99``, ``min`` and ``max`` (ints), all
        ``None`` when there are no documents. ``word_hist`` is a list of
        ``{lo, hi, count}`` bins whose ``count`` sums to ``n_docs``.
    """
    fallback = {
        "n_docs": 0,
        "chars": _empty_axis(),
        "words": _empty_axis(),
        "sentences": _empty_axis(),
        "word_hist": [],
    }
    try:
        # Only None is special-cased. ``not texts`` would raise for array-like
        # containers with an ambiguous truth value and, through the blanket
        # except, silently report an empty corpus. Empty or non-iterable
        # inputs still reach the fallback below.
        if texts is None:
            return fallback

        docs = [t for t in texts if isinstance(t, str)]
        n_docs = len(docs)
        if n_docs == 0:
            return fallback

        char_counts = [len(d) for d in docs]
        word_counts = [len(_WORD_RE.findall(d)) for d in docs]

        sent_counts = []
        for d in docs:
            n = sum(1 for s in _SENT_RE.split(d) if s.strip())
            # A non-empty document with no detectable sentence counts as 1;
            # the empty string stays at 0.
            sent_counts.append(max(n, 1) if d else n)

        return {
            "n_docs": n_docs,
            "chars": _axis_stats(char_counts),
            "words": _axis_stats(word_counts),
            "sentences": _axis_stats(sent_counts),
            "word_hist": _word_hist(word_counts, n_bins),
        }
    except Exception:
        return fallback
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Tests para compute_text_length_stats.
|
||||
|
||||
Inserta `python/functions` en sys.path (relativo a este archivo) para importar
|
||||
el modulo hoja por su paquete `datascience`, sin depender de que el paquete lo
|
||||
reexporte en su __init__.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
|
||||
|
||||
def test_basico():
    """Texts of different lengths give coherent statistics and histogram."""
    corpus = [
        "Hola mundo.",  # 2 words, 1 sentence
        "Una frase mas larga con varias palabras aqui.",  # 8 words, 1 sentence
        "Corto.",  # 1 word, 1 sentence
        "Esto. Tiene. Tres frases distintas!",  # 5 words, 3 sentences
    ]
    stats = compute_text_length_stats(corpus)
    words = stats["words"]
    hist = stats["word_hist"]

    assert stats["n_docs"] == 4
    # Word counts differ, so max is strictly greater than min.
    assert words["max"] > words["min"]
    # The histogram is populated and its counts cover every document.
    assert hist != []
    assert sum(bucket["count"] for bucket in hist) == stats["n_docs"]
    # mean is a rounded float; min/max are integers.
    assert isinstance(words["mean"], float)
    for key in ("min", "max"):
        assert isinstance(words[key], int)
    # The three-sentence document pushes the sentence max to at least 3.
    assert stats["sentences"]["max"] >= 3
|
||||
|
||||
|
||||
def test_vacio():
    """An empty list yields n_docs 0, all-None axes and an empty histogram."""
    stats = compute_text_length_stats([])

    assert stats["n_docs"] == 0
    stat_names = ("mean", "p50", "p90", "p99", "min", "max")
    for axis in ("chars", "words", "sentences"):
        for name in stat_names:
            assert stats[axis][name] is None
    assert stats["word_hist"] == []
|
||||
|
||||
|
||||
def test_descarta_none():
    """None and non-str values are excluded from every computation."""
    mixed = ["hello world", None, 123, 4.5, "foo bar baz"]
    stats = compute_text_length_stats(mixed)

    # Only the two strings are valid documents.
    assert stats["n_docs"] == 2
    words = stats["words"]
    assert words["min"] == 2  # "hello world"
    assert words["max"] == 3  # "foo bar baz"
    assert sum(bucket["count"] for bucket in stats["word_hist"]) == 2
|
||||
|
||||
|
||||
def test_un_documento():
    """A single document gives exactly one histogram bin holding one count."""
    stats = compute_text_length_stats(["solo un documento aqui"])

    assert stats["n_docs"] == 1
    hist = stats["word_hist"]
    assert len(hist) == 1
    assert hist[0]["count"] == 1
    # With one document, min, max and p50 all equal its word count (4).
    for key in ("min", "max", "p50"):
        assert stats["words"][key] == 4
|
||||
@@ -0,0 +1,88 @@
|
||||
---
|
||||
id: compute_text_readability_py_datascience
|
||||
name: compute_text_readability
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_readability(texts, sample_max=500) -> dict"
|
||||
description: "Calcula la legibilidad Flesch Reading Ease de un corpus de texto usando textstat con import perezoso y degradación. Filtra None/no-str/vacíos, muestrea hasta sample_max documentos (los primeros) y agrega los scores Flesch en {mean, p50, min, max}. Si textstat no está instalada devuelve available=False sin lanzar. Estilo dict-no-throw del grupo eda — nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, readability, flesch, textstat, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [math, textstat]
|
||||
example: |
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
out = compute_text_readability(["The cat sat on the mat. It was warm and sunny."])
|
||||
# {"available": True, "n_scored": 1, "flesch": {"mean": 109.0, "p50": 109.0, "min": 108.96..., "max": 108.96...}}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_prosa_ingles"
|
||||
- "test_vacio"
|
||||
- "test_degradacion"
|
||||
test_file_path: "python/functions/datascience/compute_text_readability_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_readability.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de str (documentos del corpus). Los elementos None, no-str o vacíos tras strip() se descartan silenciosamente. El orden se respeta: el muestreo toma los primeros documentos válidos."
|
||||
- name: sample_max
|
||||
desc: "Número máximo de documentos válidos a puntuar (los primeros). Default 500. Acota el coste en corpus grandes. Valores no convertibles a int caen a 500; negativos se tratan como 0."
|
||||
output: "Dict con exactamente 3 claves siempre presentes: available (bool: True si textstat se pudo importar), n_scored (int: nº de documentos efectivamente puntuados), flesch (dict con mean, p50, min, max). mean y p50 redondeados a 1 decimal; p50 por nearest-rank sobre los scores ordenados; min/max son los scores extremos sin redondear. Todos los valores de flesch son None cuando n_scored es 0. La función nunca lanza: cualquier excepción global (incluida ImportError de textstat) degrada a available=False, n_scored=0 y flesch todo None."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
|
||||
textos = [
|
||||
"The cat sat on the mat. It was a warm and sunny day in the park.",
|
||||
"Reading is a wonderful habit. Books open doors to new worlds and ideas.",
|
||||
"He ran quickly to the store to buy some fresh bread and a bottle of milk.",
|
||||
]
|
||||
|
||||
compute_text_readability(textos)
|
||||
# {
|
||||
# "available": True,
|
||||
# "n_scored": 3,
|
||||
# "flesch": {"mean": 91.4, "p50": 95.4, "min": 70.08..., "max": 108.83...}
|
||||
# }
|
||||
|
||||
# Corpus vacío (textstat presente): available True pero nada que puntuar.
|
||||
compute_text_readability([])
|
||||
# {"available": True, "n_scored": 0,
|
||||
# "flesch": {"mean": None, "p50": None, "min": None, "max": None}}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en un EDA de texto cuando necesites una métrica única y comparable de
|
||||
**lo fácil que es de leer** un corpus de documentos (descripciones, reviews,
|
||||
artículos, tickets). Devuelve el resumen Flesch Reading Ease agregado
|
||||
(`mean`/`p50`/`min`/`max`) listo para un report o un bloque del notebook, sin
|
||||
tener que iterar `textstat` a mano. Pásale la lista de textos crudos y, si el
|
||||
corpus es grande, limita el coste con `sample_max`. El estilo dict-no-throw
|
||||
permite incrustarla en pipelines del grupo `eda` sin envolver en try/except.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **`textstat` es una dependencia opcional.** Si no está instalada (o falla al
|
||||
importar) la función NO lanza: devuelve `available=False`, `n_scored=0` y
|
||||
`flesch` todo `None`. Comprueba `available` antes de interpretar los números.
|
||||
- **Flesch Reading Ease está pensado para prosa en inglés.** Aplicado a otros
|
||||
idiomas o a texto no-prosa (código, listas, tablas, cadenas muy cortas) los
|
||||
scores no son interpretables, aunque se calculen sin error.
|
||||
- **Escala Flesch:** valores **altos** = más fácil de leer (≈90–100 muy fácil),
|
||||
valores **bajos** = más difícil (puede ser negativo en texto muy denso). No
|
||||
se recortan a ningún rango: se reportan tal cual los devuelve `textstat`.
|
||||
- **`available=True` con `n_scored=0`** significa que `textstat` está presente
|
||||
pero el corpus no aportó documentos puntuables (vacío, solo None/no-str, o
|
||||
todos los docs fallaron al puntuar). Es distinto de `available=False`.
|
||||
- **Muestreo = los primeros `sample_max`**, no aleatorio. Si el orden del corpus
|
||||
está sesgado, el resumen reflejará ese sesgo.
|
||||
- **`mean` y `p50` redondean a 1 decimal**; `min`/`max` se devuelven sin
|
||||
redondear (los scores extremos reales).
|
||||
@@ -0,0 +1,121 @@
|
||||
"""Legibilidad Flesch Reading Ease de un corpus de texto.
|
||||
|
||||
Función pura del grupo `eda`, estilo dict-no-throw: nunca lanza. Usa la
|
||||
librería `textstat` con import perezoso y degradación: si `textstat` no está
|
||||
instalada (o falla al importar), devuelve un resultado con `available=False`
|
||||
en lugar de propagar el error.
|
||||
"""
|
||||
|
||||
|
||||
def _percentile_nearest_rank(sorted_values, pct):
    """Return the nearest-rank percentile of an ascending-sorted list.

    The 1-based rank is ``ceil(pct / 100 * n)`` clamped to ``[1, n]``.

    Args:
        sorted_values: Values already sorted in ascending order.
        pct: Percentile to look up, on a 0-100 scale.

    Returns:
        The element at the computed rank, or ``None`` when the list is empty.
    """
    n = len(sorted_values)
    if n == 0:
        return None
    import math

    rank = math.ceil((pct / 100.0) * n)
    rank = min(max(rank, 1), n)
    return sorted_values[rank - 1]


def _readability_result(available, n_scored=0, mean=None, p50=None,
                        low=None, high=None):
    """Build the fixed-shape result dict returned by compute_text_readability."""
    return {
        "available": available,
        "n_scored": n_scored,
        "flesch": {"mean": mean, "p50": p50, "min": low, "max": high},
    }


def compute_text_readability(texts, sample_max=500) -> dict:
    """Summarise the Flesch Reading Ease of a corpus.

    ``textstat`` is an optional dependency and is imported lazily; when the
    import fails the function degrades to ``available=False`` instead of
    raising.

    Args:
        texts: Iterable of documents. Entries that are not ``str`` or that are
            blank after ``strip()`` are discarded. ``None`` means no documents.
        sample_max: Maximum number of valid documents to score, taken in
            input order (no random sampling). Values that ``int()`` cannot
            convert fall back to 500; zero or negative values score nothing.

    Returns:
        A dict with exactly this shape::

            {"available": bool, "n_scored": int,
             "flesch": {"mean": float|None, "p50": float|None,
                        "min": float|None, "max": float|None}}

        ``mean`` and ``p50`` are rounded to one decimal; ``min`` and ``max``
        are the unrounded extreme scores. Every ``flesch`` value is ``None``
        when ``n_scored`` is 0. The function never raises: any unexpected
        exception degrades to ``available=False`` with ``n_scored=0``.
    """
    try:
        try:
            import textstat
        except Exception:
            return _readability_result(False)

        try:
            limit = int(sample_max)
        except Exception:
            limit = 500

        docs = []
        # The limit is checked before collecting anything so that a zero or
        # negative sample_max really scores no document at all.
        if texts is not None and limit > 0:
            for item in texts:
                if not isinstance(item, str) or not item.strip():
                    continue
                docs.append(item)
                if len(docs) >= limit:
                    break

        scores = []
        for doc in docs:
            # A document that fails to score is skipped, not fatal.
            try:
                scores.append(float(textstat.flesch_reading_ease(doc)))
            except Exception:
                continue

        n_scored = len(scores)
        if n_scored == 0:
            # textstat is present but nothing could be scored.
            return _readability_result(True)

        ordered = sorted(scores)
        return _readability_result(
            True,
            n_scored=n_scored,
            mean=round(sum(ordered) / n_scored, 1),
            p50=round(_percentile_nearest_rank(ordered, 50), 1),
            low=ordered[0],
            high=ordered[-1],
        )
    except Exception:
        return _readability_result(False)
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Tests para compute_text_readability."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import builtins
|
||||
|
||||
# Put `python/functions/` (the parent of this file's `datascience/` directory)
# on sys.path so `datascience.<module>` resolves through its root package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
|
||||
|
||||
EXPECTED_KEYS = {"available", "n_scored", "flesch"}
|
||||
FLESCH_KEYS = {"mean", "p50", "min", "max"}
|
||||
|
||||
|
||||
def test_prosa_ingles():
    """English prose corpus: textstat available, documents scored, summary filled."""
    corpus = [
        "The cat sat on the mat. It was a warm and sunny day in the park.",
        "She sells sea shells by the sea shore. The shells she sells are surely sea shells.",
        "Reading is a wonderful habit. Books open doors to new worlds and ideas.",
        "He ran quickly to the store to buy some fresh bread and a bottle of milk.",
    ]
    result = compute_text_readability(corpus)

    assert set(result.keys()) == EXPECTED_KEYS
    assert result["available"] is True
    assert result["n_scored"] > 0

    flesch = result["flesch"]
    assert set(flesch.keys()) == FLESCH_KEYS
    for key in ("mean", "p50", "min", "max"):
        assert flesch[key] is not None
    # Only the two extremes are compared with each other here.
    assert flesch["min"] <= flesch["max"]
|
||||
|
||||
|
||||
def test_vacio():
    """Empty corpus with textstat present: available, nothing scored, all None."""
    result = compute_text_readability([])

    assert set(result.keys()) == EXPECTED_KEYS
    assert result["available"] is True
    assert result["n_scored"] == 0
    for key in ("mean", "p50", "min", "max"):
        assert result["flesch"][key] is None

    # Non-string and blank entries are filtered out, so nothing is scored.
    filtered = compute_text_readability([None, "", " ", 123])
    assert filtered["available"] is True
    assert filtered["n_scored"] == 0
|
||||
|
||||
|
||||
def test_degradacion(monkeypatch):
    """With the textstat import forced to fail, the result degrades without raising."""
    import datascience.compute_text_readability as module_under_test

    original_import = builtins.__import__

    def _blocked_import(name, *args, **kwargs):
        # Fail only for textstat (and its submodules); delegate everything else.
        if name == "textstat" or name.startswith("textstat."):
            raise ImportError("simulado")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", _blocked_import)

    result = module_under_test.compute_text_readability(
        ["The cat sat on the mat. It was happy and warm."]
    )

    assert result["available"] is False
    assert result["n_scored"] == 0
    for key in ("mean", "p50", "min", "max"):
        assert result["flesch"][key] is None
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
id: compute_top_ngrams_py_datascience
|
||||
name: compute_top_ngrams
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict"
|
||||
description: "Calcula los n-gramas de palabras más frecuentes de un corpus de texto (n=1 unigramas, 2 bigramas, 3 trigramas...). Tokeniza a minúsculas con re.findall(r'\\w+', ...), descarta tokens numéricos y, si remove_stopwords=True, elimina stopwords ES+EN ANTES de formar los n-gramas (n-gramas contiguos sobre la secuencia de tokens de contenido, sin cruzar documentos). Pura y autocontenida con collections.Counter, sin sklearn. Estilo dict-no-throw del grupo eda: nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, ngrams, bigrams, trigrams, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, collections]
|
||||
example: |
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
texts = ["machine learning rocks", "we love machine learning"]
|
||||
compute_top_ngrams(texts, n=2, top_k=5)
|
||||
# {"n": 2, "top": [{"ngram": "machine learning", "count": 2}, ...]}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_bigramas"
|
||||
- "test_trigramas"
|
||||
- "test_vacio"
|
||||
- "test_stopwords"
|
||||
test_file_path: "python/functions/datascience/compute_top_ngrams_test.py"
|
||||
file_path: "python/functions/datascience/compute_top_ngrams.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista (o tupla) de cadenas. Los elementos None o que no sean str se descartan silenciosamente. Cada documento se tokeniza por separado; los n-gramas no cruzan la frontera entre documentos."
|
||||
- name: n
|
||||
desc: "Tamaño del n-grama: 1 unigramas, 2 bigramas, 3 trigramas, etc. Valores < 1 o no enteros producen top vacío (se conserva tal cual en la clave 'n' del retorno)."
|
||||
- name: top_k
|
||||
desc: "Número máximo de n-gramas a devolver, ordenados por frecuencia descendente con desempate alfabético determinista. Default 15. Valores negativos se tratan como 0."
|
||||
- name: remove_stopwords
|
||||
desc: "Si True (default) elimina las stopwords ES+EN de una lista inline (~130 términos de altísima frecuencia) ANTES de formar los n-gramas, de modo que los n-gramas se construyen sobre la secuencia de tokens de contenido."
|
||||
output: "Dict con exactamente 2 claves: n (el n recibido, sin normalizar) y top (lista de dicts {'ngram': str, 'count': int} ordenada por count descendente, longitud <= top_k). ngram es la unión de los tokens del n-grama por un espacio. Corpus vacío, tokens insuficientes para formar n-gramas o cualquier excepción interna degradan a {'n': n, 'top': []}. La función nunca lanza."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
|
||||
texts = [
|
||||
"machine learning rocks",
|
||||
"machine learning is fun",
|
||||
"we love machine learning",
|
||||
]
|
||||
|
||||
# Bigramas (n=2): "machine learning" aparece en los 3 documentos.
|
||||
compute_top_ngrams(texts, n=2, top_k=5)
|
||||
# {
|
||||
# "n": 2,
|
||||
# "top": [
|
||||
# {"ngram": "machine learning", "count": 3},
|
||||
# {"ngram": "learning fun", "count": 1},
|
||||
# {"ngram": "learning rocks", "count": 1},
|
||||
# {"ngram": "love machine", "count": 1},
|
||||
# ],
|
||||
# }
|
||||
|
||||
# Unigramas con stopwords fuera (default): solo palabras de contenido.
|
||||
compute_top_ngrams(["the cat sat on the mat"], n=1, top_k=3)
|
||||
# {"n": 1, "top": [{"ngram": "cat", "count": 1},
|
||||
# {"ngram": "mat", "count": 1},
|
||||
# {"ngram": "sat", "count": 1}]}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en la fase de EDA de texto cuando, además del vocabulario suelto, necesites
|
||||
ver qué **combinaciones de palabras contiguas** dominan un corpus: colocaciones,
|
||||
frases técnicas recurrentes ("machine learning", "data analyst"), o patrones de
|
||||
trigramas en titulares/descripciones. Es el complemento natural de un perfil de
|
||||
vocabulario: pasa de "qué palabras aparecen" a "qué secuencias aparecen". Llámala
|
||||
con `n=1` para unigramas, `n=2` para bigramas y `n=3` para trigramas, y ajusta
|
||||
`top_k` al tamaño de la tabla que vas a renderizar. Deja `remove_stopwords=True`
|
||||
para que los n-gramas reflejen contenido y no conectores gramaticales.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Las stopwords se eliminan ANTES de formar los n-gramas.** Con
|
||||
`remove_stopwords=True` la frase "data of analysis" produce el bigrama
|
||||
"data analysis" (el "of" intermedio desaparece y los tokens de contenido se
|
||||
vuelven contiguos), no "data of" ni "of analysis". Si quieres preservar la
|
||||
adyacencia literal del texto original, pasa `remove_stopwords=False`.
|
||||
- **Los n-gramas NO cruzan documentos.** Cada elemento de `texts` se tokeniza y
|
||||
recorre por separado; el último token de un documento nunca se combina con el
|
||||
primero del siguiente.
|
||||
- **Tokens puramente numéricos se descartan** (`tok.isdigit()`), pero los
|
||||
alfanuméricos mixtos no: "3d" o "covid19" sí cuentan como tokens. Un decimal
|
||||
como "3.5" se parte en "3" y "5" por `\w+` y ambos se descartan por numéricos.
|
||||
- **La lista de stopwords es inline ES+EN**, pensada para textos generales en
|
||||
esos dos idiomas. Para otros idiomas o jerga específica de dominio puede dejar
|
||||
pasar conectores; en ese caso filtra el corpus aguas arriba o usa
|
||||
`remove_stopwords=False` y posfiltra.
|
||||
- **`top` puede tener menos de `top_k` elementos** si el corpus no tiene tantos
|
||||
n-gramas distintos. El desempate entre n-gramas de igual frecuencia es alfabético (determinista),
|
||||
no por orden de aparición.
|
||||
@@ -0,0 +1,94 @@
|
||||
"""Top n-gramas de palabras más frecuentes de un corpus de texto.
|
||||
|
||||
Función pura, autocontenida (solo stdlib: re + collections.Counter). No depende
|
||||
de scikit-learn ni de ninguna otra librería externa. Estilo dict-no-throw del
|
||||
grupo `eda`: ante cualquier entrada degenerada o excepción interna devuelve
|
||||
``{"n": n, "top": []}`` en vez de lanzar.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# Inline list of very high-frequency Spanish + English stopwords.
# They are removed BEFORE the n-grams are formed: n-grams are built over the
# sequence of content tokens, not over the original text.
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "mas",
    "pero", "sus", "le", "ya", "o", "este", "sí", "si", "porque", "esta",
    "entre", "cuando", "muy", "sin", "sobre", "también", "tambien", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes",
    "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas",
    "algo", "nosotros",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "are", "was", "be", "this", "that", "by", "an", "or", "at", "from", "but",
    "not", "have", "has", "had", "they", "you", "we", "he", "she", "his",
    "her", "their", "its", "i", "my", "me", "our", "us", "do", "does", "did",
    "will", "would", "can", "could", "should", "there", "which", "who", "what",
    "when", "where", "how", "all", "if", "so", "than", "then", "out", "up",
})
|
||||
|
||||
|
||||
def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict:
    """Return the most frequent word n-grams of a corpus.

    Each document is lower-cased and tokenised with ``\\w+``; purely numeric
    tokens are dropped. N-grams are contiguous runs of ``n`` remaining tokens
    inside a single document (they never span two documents).

    Args:
        texts: Iterable of documents (list, tuple, generator, ...). Entries
            that are not ``str`` are skipped. ``None`` or a bare ``str`` /
            ``bytes`` (which would otherwise be walked character by
            character) yields an empty ``top``.
        n: N-gram size (1 = unigrams, 2 = bigrams, ...). Non-int values or
            values below 1 yield an empty ``top``.
        top_k: Maximum number of n-grams to return. Negative or
            non-convertible values are treated as 0.
        remove_stopwords: When true, ES+EN stopwords are removed BEFORE the
            n-grams are formed, so n-grams are built over content tokens only.

    Returns:
        ``{"n": n, "top": [{"ngram": "w1 w2", "count": int}, ...]}`` ordered by
        descending count, ties broken alphabetically. ``n`` is echoed back
        as received. Degenerate input or any internal exception yields
        ``{"n": n, "top": []}``; the function never raises.
    """
    try:
        if not isinstance(n, int) or n < 1:
            return {"n": n, "top": []}

        try:
            limit = max(int(top_k), 0)
        except (TypeError, ValueError, OverflowError):
            limit = 0

        # Nothing to report: skip tokenising the corpus altogether.
        if limit == 0 or texts is None or isinstance(texts, (str, bytes)):
            return {"n": n, "top": []}

        counter = Counter()
        for doc in texts:
            if not isinstance(doc, str):
                continue
            tokens = [
                tok
                for tok in re.findall(r"\w+", doc.lower(), re.UNICODE)
                if not tok.isdigit()
            ]
            if remove_stopwords:
                tokens = [tok for tok in tokens if tok not in _STOPWORDS]
            # The range is empty when the document has fewer than n tokens.
            counter.update(
                " ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
            )

        # Negated count sorts by frequency descending; the n-gram text is the
        # deterministic alphabetical tie-break.
        ordered = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
        top = [{"ngram": ngram, "count": count} for ngram, count in ordered[:limit]]
        return {"n": n, "top": top}
    except Exception:
        return {"n": n, "top": []}
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Tests para compute_top_ngrams."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Standard sys.path bootstrap: add `python/functions/` (the parent of this
# file's `datascience/` directory) so the module imports via its root package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
|
||||
|
||||
def test_bigramas():
    """The bigram repeated in every document must rank first with count 3."""
    corpus = [
        "machine learning rocks",
        "machine learning is fun",
        "we love machine learning",
    ]
    outcome = compute_top_ngrams(corpus, n=2, top_k=5)

    assert outcome["n"] == 2
    assert outcome["top"], "esperaba al menos un bigrama"

    leader = outcome["top"][0]
    assert leader["ngram"] == "machine learning"
    assert leader["count"] == 3

    # Every entry honours the {"ngram": str, "count": int} contract.
    for entry in outcome["top"]:
        assert isinstance(entry["ngram"], str)
        assert isinstance(entry["count"], int)
|
||||
|
||||
|
||||
def test_trigramas():
    """Trigram counts: one shared by both documents, one unique to each."""
    corpus = [
        "alpha beta gamma delta",
        "alpha beta gamma omega",
    ]
    # Stopword filtering is disabled so every token takes part in the n-grams.
    outcome = compute_top_ngrams(corpus, n=3, top_k=5, remove_stopwords=False)
    assert outcome["n"] == 3

    counts_by_ngram = {}
    for entry in outcome["top"]:
        counts_by_ngram[entry["ngram"]] = entry["count"]

    # Shared by both documents.
    assert counts_by_ngram.get("alpha beta gamma") == 2
    # Unique to each document.
    assert counts_by_ngram.get("beta gamma delta") == 1
    assert counts_by_ngram.get("beta gamma omega") == 1
|
||||
|
||||
|
||||
def test_vacio():
    """Empty or all-invalid corpora produce an empty ``top`` list."""
    nothing = {"n": 2, "top": []}
    assert compute_top_ngrams([], n=2) == nothing
    # Non-str entries are discarded, leaving an effectively empty corpus.
    assert compute_top_ngrams([None, 123, {"a": 1}], n=2) == nothing
|
||||
|
||||
|
||||
def test_stopwords():
    """Stopword removal changes which bigrams are produced ("the" is EN stopword)."""
    corpus = ["the cat the cat the cat"]
    filtered = compute_top_ngrams(corpus, n=2, top_k=10, remove_stopwords=True)
    unfiltered = compute_top_ngrams(corpus, n=2, top_k=10, remove_stopwords=False)

    filtered_ngrams = set(entry["ngram"] for entry in filtered["top"])
    unfiltered_ngrams = set(entry["ngram"] for entry in unfiltered["top"])

    # Without filtering, "the cat" is present.
    assert "the cat" in unfiltered_ngrams
    # With filtering it is gone (only "cat cat" remains).
    assert "the cat" not in filtered_ngrams
    assert filtered_ngrams != unfiltered_ngrams
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
id: compute_vocabulary_stats_py_datascience
|
||||
name: compute_vocabulary_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_vocabulary_stats(texts: list, top_k: int = 20, remove_stopwords: bool = True) -> dict"
|
||||
description: "Profiles the vocabulary of a text corpus for EDA: tokenises a list of documents, counts term frequencies and derives lexical-richness measures — total tokens, unique types, type-token ratio (TTR), hapax legomena and the top-k most frequent terms. Pure, stdlib only (re + collections.Counter); no nltk, no sklearn. Inline ES+EN stopword list, opt-out via remove_stopwords. Never raises: empty/degenerate input returns the zeroed result."
|
||||
tags: [eda, datascience, text, nlp, vocabulary, ttr, hapax, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, collections]
|
||||
example: |
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
result = compute_vocabulary_stats(["el gato y el perro", "gato veloz"], top_k=5)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_basico"
|
||||
- "test_vacio"
|
||||
- "test_stopwords_quitadas"
|
||||
- "test_stopwords_conservadas"
|
||||
test_file_path: "python/functions/datascience/compute_vocabulary_stats_test.py"
|
||||
file_path: "python/functions/datascience/compute_vocabulary_stats.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "List of documents (strings) forming the corpus. Entries that are None or not a str are silently discarded. Tokens are extracted per document with re.findall(r'\\w+', doc.lower(), re.UNICODE); purely numeric tokens (tok.isdigit()) are dropped."
|
||||
- name: top_k
|
||||
desc: "Maximum number of most-frequent terms to return in top_terms. Default 20. Does not affect n_tokens/n_types/ttr/hapax — only the length of the top_terms list."
|
||||
- name: remove_stopwords
|
||||
desc: "When True (default) common Spanish+English stopwords from the inline _STOPWORDS set (~120 entries) are removed from the token stream before any counting. Set False to keep every word (raw lexical profile)."
|
||||
output: "Dict with the exact keys n_tokens (int), n_types (int), ttr (float|None, n_types/n_tokens rounded to 4 dp), n_hapax (int, terms occurring exactly once), hapax_pct (float|None, n_hapax/n_types*100 rounded to 2 dp) and top_terms (list of {term, count, pct} sorted by count descending, pct = count/n_tokens*100 rounded to 2 dp). For an empty corpus (no tokens after filtering): n_tokens=0, n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[]. Any exception degrades to that same empty result — the function never throws."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
|
||||
compute_vocabulary_stats(
|
||||
["el gato y el perro", "gato veloz corre", "perro perro perro"],
|
||||
top_k=5,
|
||||
)
|
||||
# {
|
||||
# "n_tokens": 8, # stopwords (el, y) eliminadas por defecto
|
||||
# "n_types": 4, # gato, perro, veloz, corre -> tras quitar stopwords
|
||||
# "ttr": 0.5, # n_types / n_tokens
|
||||
# "n_hapax": 2, # veloz, corre (1 aparicion cada uno)
|
||||
# "hapax_pct": 50.0, # n_hapax / n_types * 100
|
||||
# "top_terms": [
|
||||
# {"term": "perro", "count": 4, "pct": 50.0},
|
||||
# {"term": "gato", "count": 2, "pct": 25.0},
|
||||
# ...
|
||||
# ],
|
||||
# }
|
||||
|
||||
# Perfil lexico crudo (sin filtrar stopwords):
|
||||
compute_vocabulary_stats(["the cat and the dog"], remove_stopwords=False)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al perfilar una columna o corpus de texto libre en un EDA del grupo `eda`:
|
||||
cuando necesites medir la riqueza léxica (cuántos tokens y cuántas palabras
|
||||
distintas, type-token ratio, porcentaje de palabras que solo aparecen una vez) y
|
||||
ver qué términos dominan el vocabulario (top-k frecuencias). Pásale la lista de
|
||||
documentos crudos (filas de la columna); `None` y valores no-string se ignoran
|
||||
solos. Es el equivalente para texto largo de `summarize_categorical`, que perfila
|
||||
categorías cortas.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Función pura y stdlib-only, pero el resultado depende del **idioma**: la lista
|
||||
`_STOPWORDS` cubre español e inglés. Para otros idiomas pon
|
||||
`remove_stopwords=False` o filtra fuera, o el perfil mezclará stopwords no
|
||||
reconocidas en `top_terms`.
|
||||
- La tokenización es `\w+` con `re.UNICODE`: separa por puntuación y conserva
|
||||
acentos/ñ, pero NO hace stemming ni lematización — "gato" y "gatos" cuentan
|
||||
como tipos distintos. Tampoco hace stripping de acentos, así que "más" (con
|
||||
tilde) y "mas" son tokens diferentes (ambos están en la stoplist).
|
||||
- Los tokens **puramente numéricos** (`"123"`) se descartan siempre; un token
|
||||
alfanumérico mixto (`"covid19"`) se conserva.
|
||||
- `ttr` baja artificialmente en corpus grandes (más texto, más repetición): no
|
||||
compares TTR entre corpus de tamaños muy distintos sin normalizar.
|
||||
- Nunca lanza: entrada vacía, `None`, o cualquier excepción interna devuelven el
|
||||
resultado con ceros/`None`/`[]`. Comprueba `n_tokens == 0` para detectar el
|
||||
caso degenerado.
|
||||
@@ -0,0 +1,99 @@
|
||||
"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only).
|
||||
|
||||
Tokenises a list of documents, counts term frequencies and derives lexical
|
||||
richness measures (type-token ratio, hapax legomena) plus the top-k terms.
|
||||
No external NLP dependencies (no nltk, no sklearn) — only ``re`` and
|
||||
``collections`` from the standard library.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# Common Spanish + English stopwords, lowercase. Tokenisation does not strip
# accents, so accented and unaccented spellings are listed separately
# ("que"/"qué", "el"/"él", "mas"/"más", ...). Filtering is on by default and
# can be disabled with remove_stopwords=False.
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas",
    "más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque",
    "esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también",
    "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "antes", "algunos", "qué", "unos", "yo", "otro",
    "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "nada", "muchos",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at",
    "from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its",
    "our", "their", "what", "which", "who", "whom", "has", "have", "had", "do",
    "does", "did", "will", "would", "can", "could", "should", "may", "might",
    "must", "if", "then", "than", "so", "too", "very", "just", "also", "were",
    "been", "being", "there", "here", "all", "any", "some", "more", "most",
    "out", "up", "down", "into", "over", "such", "only", "own", "same",
})
|
||||
|
||||
|
||||
def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict:
    """Profile the vocabulary of a corpus of documents.

    Each document is lower-cased and tokenised with ``\\w+``; purely numeric
    tokens are always dropped.

    Args:
        texts: Iterable of documents. Entries that are not ``str`` are
            skipped. ``None`` means an empty corpus.
        top_k: Maximum number of most-frequent terms in ``top_terms``.
            ``None`` returns every term; negative values return none; values
            that ``int()`` cannot convert fall back to the default of 20.
            It only affects ``top_terms``, never the other measures.
        remove_stopwords: When true (default), ES+EN stopwords are dropped
            from the token stream before any counting.

    Returns:
        A dict with the keys ``n_tokens``, ``n_types``, ``ttr``
        (``n_types / n_tokens``, 4 dp), ``n_hapax`` (terms seen exactly once),
        ``hapax_pct`` (``n_hapax / n_types * 100``, 2 dp) and ``top_terms``
        (list of ``{"term", "count", "pct"}`` by descending count). With no
        tokens left after filtering: zeros, ``None`` ratios and an empty
        list. Never raises; any exception degrades to that empty result.
    """
    empty = {
        "n_tokens": 0,
        "n_types": 0,
        "ttr": None,
        "n_hapax": 0,
        "hapax_pct": None,
        "top_terms": [],
    }
    try:
        # Normalise top_k up front so a float or string value cannot raise
        # later and wipe out the measures that do not depend on it.
        if top_k is None:
            limit = None
        else:
            try:
                limit = max(int(top_k), 0)
            except Exception:
                limit = 20

        # Explicit None check instead of `texts or []`: truth-testing some
        # array-like containers raises, which used to zero the whole profile.
        corpus = () if texts is None else texts

        # Count straight into the Counter; no intermediate token list needed.
        counts = Counter()
        for doc in corpus:
            if not isinstance(doc, str):
                continue
            for tok in re.findall(r"\w+", doc.lower(), re.UNICODE):
                if tok.isdigit():
                    continue
                if remove_stopwords and tok in _STOPWORDS:
                    continue
                counts[tok] += 1

        n_tokens = sum(counts.values())
        if n_tokens == 0:
            return dict(empty)

        n_types = len(counts)
        n_hapax = sum(1 for c in counts.values() if c == 1)

        top_terms = [
            {"term": term, "count": count, "pct": round(count / n_tokens * 100, 2)}
            for term, count in counts.most_common(limit)
        ]

        return {
            "n_tokens": n_tokens,
            "n_types": n_types,
            "ttr": round(n_types / n_tokens, 4),
            "n_hapax": n_hapax,
            "hapax_pct": round(n_hapax / n_types * 100, 2),
            "top_terms": top_terms,
        }
    except Exception:
        return dict(empty)
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Tests para compute_vocabulary_stats."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.join(os.path.dirname(__file__), "..", "..", "functions")
|
||||
)
|
||||
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
|
||||
|
||||
def test_basico():
    """Check counts, TTR, hapax figures and ranking on a small known corpus."""
    # Stopword filtering is disabled so the token stream is fully predictable.
    corpus = ["gato gato perro", "perro perro raton", "elefante"]
    stats = compute_vocabulary_stats(corpus, top_k=10, remove_stopwords=False)

    # Repeated words imply fewer distinct types than tokens.
    total_tokens = stats["n_tokens"]
    distinct_types = stats["n_types"]
    assert distinct_types < total_tokens
    assert total_tokens == 7
    assert distinct_types == 4  # gato, perro, raton, elefante

    # The type/token ratio lies in (0, 1].
    ratio = stats["ttr"]
    assert 0 < ratio <= 1
    assert ratio == round(4 / 7, 4)

    # top_terms is ranked by descending count.
    ranking = stats["top_terms"]
    observed_counts = [entry["count"] for entry in ranking]
    assert observed_counts == sorted(observed_counts, reverse=True)
    leader = ranking[0]
    assert leader["term"] == "perro"
    assert leader["count"] == 3

    # Hapax legomena: "raton" and "elefante" occur exactly once.
    assert stats["n_hapax"] == 2
    assert stats["hapax_pct"] == round(2 / 4 * 100, 2)

    # pct agrees with count / n_tokens.
    assert leader["pct"] == round(3 / 7 * 100, 2)
|
||||
|
||||
|
||||
def test_vacio():
    """Inputs without any usable token must produce the empty-corpus result."""
    zero_valued = {"n_tokens": 0, "n_types": 0, "n_hapax": 0}
    degenerate_inputs = ([], None, [None, 123, ""], ["123 456"])
    for corpus in degenerate_inputs:
        stats = compute_vocabulary_stats(corpus)
        assert stats["n_tokens"] == zero_valued["n_tokens"]
        assert stats["n_types"] == zero_valued["n_types"]
        assert stats["ttr"] is None
        assert stats["n_hapax"] == zero_valued["n_hapax"]
        assert stats["hapax_pct"] is None
        assert stats["top_terms"] == []
|
||||
|
||||
|
||||
def test_stopwords_quitadas():
    """With the filter enabled, ES+EN stopwords are dropped from top_terms."""
    corpus = ["the gato the perro", "de la casa azul"]
    stats = compute_vocabulary_stats(corpus, remove_stopwords=True)
    seen_terms = set()
    for entry in stats["top_terms"]:
        seen_terms.add(entry["term"])

    # Spanish and English stopwords must not be reported.
    assert "the" not in seen_terms
    assert "de" not in seen_terms
    assert "la" not in seen_terms

    # Content words must still be reported.
    assert "gato" in seen_terms
    assert "casa" in seen_terms
|
||||
|
||||
|
||||
def test_stopwords_conservadas():
    """With the filter disabled, stopwords are kept in top_terms."""
    corpus = ["the gato the perro", "de la casa azul"]
    stats = compute_vocabulary_stats(corpus, remove_stopwords=False)
    seen_terms = set()
    for entry in stats["top_terms"]:
        seen_terms.add(entry["term"])

    # Stopwords survive when filtering is switched off.
    assert "the" in seen_terms
    assert "de" in seen_terms
    assert "la" in seen_terms
|
||||
@@ -0,0 +1,80 @@
|
||||
---
|
||||
name: detect_corpus_language
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict"
|
||||
description: "Estima la distribucion de idiomas de un corpus de textos con la libreria langdetect (import perezoso). Funcion pura y defensiva del grupo eda: filtra documentos None/no-str/vacios, muestrea hasta sample_max docs, clasifica cada uno con detect() ignorando los que langdetect no puede resolver (LangDetectException), y devuelve la distribucion top_k por frecuencia mas el idioma dominante. Si langdetect no esta instalada o algo falla, degrada a {available: False, ...} y NUNCA lanza (dict-no-throw). Seed fija (DetectorFactory.seed=0) para deteccion determinista."
|
||||
tags: [eda, datascience, text, nlp, language-detection, langdetect, pure, python]
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de strings (documentos). Los elementos None, no-str o vacios tras strip se descartan antes de clasificar."
|
||||
- name: top_k
|
||||
desc: "Numero maximo de idiomas a devolver en distribution, ordenados por count descendente (desempate por codigo ISO ascendente). Default 10."
|
||||
- name: sample_max
|
||||
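desc: "Numero maximo de documentos a clasificar para acotar el coste. El limite se aplica DESPUES de filtrar: se toman los primeros sample_max documentos validos (str no vacio tras strip), en el orden del corpus. Default 1000."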
|
||||
output: >
|
||||
Dict con forma fija (dict-no-throw, nunca lanza):
|
||||
{"available": bool, "n_detected": int,
|
||||
"distribution": [{"lang": str, "count": int, "pct": float}, ...],
|
||||
"dominant": str|None}.
|
||||
available=True si langdetect es importable; lang son codigos ISO 639-1 ("es","en","fr",...);
|
||||
pct = count/n_detected*100 redondeado a 2 decimales; n_detected = docs clasificados con exito;
|
||||
dominant = idioma mas frecuente (None si no hubo detecciones). Corpus vacio con langdetect
|
||||
presente -> available True, n_detected 0, distribution [], dominant None. Sin langdetect (o
|
||||
fallo global) -> available False y el resto de campos a su valor vacio.
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [langdetect]
|
||||
tested: true
|
||||
tests: ["test_mixto_es_en", "test_vacio", "test_degradacion"]
|
||||
test_file_path: "python/functions/datascience/detect_corpus_language_test.py"
|
||||
file_path: "python/functions/datascience/detect_corpus_language.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
|
||||
corpus = [
|
||||
"este es un texto bastante largo en español para detectar el idioma correctamente",
|
||||
"la inteligencia artificial transforma la manera en que trabajamos cada dia",
|
||||
"this is a fairly long english text to detect the language correctly without issues",
|
||||
]
|
||||
out = detect_corpus_language(corpus)
|
||||
# {"available": True, "n_detected": 3,
|
||||
# "distribution": [{"lang": "es", "count": 2, "pct": 66.67},
|
||||
# {"lang": "en", "count": 1, "pct": 33.33}],
|
||||
# "dominant": "es"}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando perfiles una columna o corpus de texto en un EDA y necesites saber en
|
||||
que idioma(s) esta escrito antes de elegir tokenizadores, stopwords, modelos
|
||||
NLP o stemmers. Util tambien como check de calidad: detectar corpus mezclados
|
||||
o un idioma inesperado. Llamala con la lista de textos crudos; la funcion
|
||||
limpia, muestrea y resume sola.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- `langdetect` es **opcional**: si no esta instalada, la funcion no lanza —
|
||||
devuelve `{"available": False, "n_detected": 0, "distribution": [], "dominant": None}`.
|
||||
Comprueba `out["available"]` antes de usar la distribucion.
|
||||
- **Textos cortos** (pocas palabras o sin features lingüisticas) pueden no
|
||||
detectarse: langdetect lanza `LangDetectException`, que se ignora y el doc no
|
||||
cuenta en `n_detected`. Pasa frases razonablemente largas para resultados fiables.
|
||||
- **Determinismo**: se fija `DetectorFactory.seed = 0` en cada llamada para que la
|
||||
deteccion sea reproducible; sin esa semilla langdetect puede dar resultados
|
||||
ligeramente distintos entre ejecuciones.
|
||||
- `distribution` esta truncada a `top_k`; si el corpus tiene mas idiomas que
|
||||
`top_k`, la suma de los `count` mostrados puede ser menor que `n_detected`
|
||||
(pero `dominant` siempre refleja el idioma mas frecuente del corpus completo).
|
||||
@@ -0,0 +1,91 @@
|
||||
"""Detecta la distribucion de idiomas de un corpus de textos.
|
||||
|
||||
Funcion pura y defensiva: el computo es determinista y local (sin I/O de red).
|
||||
La libreria opcional `langdetect` se importa de forma perezosa dentro de la
|
||||
funcion; si no esta instalada (o cualquier paso falla), la funcion degrada
|
||||
limpiamente a `available=False` y NUNCA lanza excepciones.
|
||||
"""
|
||||
|
||||
|
||||
def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict:
    """Estimate the language distribution of a corpus with ``langdetect``.

    Args:
        texts: iterable of documents (list, tuple, generator, pandas Series,
            numpy array, ...). Items that are not ``str`` or that are empty
            after ``strip()`` are discarded. ``None`` and non-iterable values
            are treated as an empty corpus; a bare ``str`` is treated as a
            single document rather than iterated character by character.
        top_k: maximum number of languages returned in ``distribution``,
            ordered by descending count (ties broken by ascending language
            code). ``None`` or a negative value returns every language.
        sample_max: maximum number of valid documents to classify; the first
            ones in corpus order are taken. ``None`` or a negative value
            disables the cap.

    Returns:
        A dict with a fixed shape; the function never raises:
            {
                "available": bool,     # True when langdetect could be imported
                "n_detected": int,     # documents classified successfully
                "distribution": [{"lang": str, "count": int, "pct": float}, ...],
                "dominant": str | None,
            }
        ``pct`` is ``count / n_detected * 100`` rounded to 2 decimals. When
        ``langdetect`` cannot be imported, or any unexpected error occurs, the
        result is ``available=False`` with the remaining fields empty.
    """
    from collections import Counter
    from itertools import islice

    degraded = {
        "available": False,
        "n_detected": 0,
        "distribution": [],
        "dominant": None,
    }
    try:
        # Lazy import: the dependency is optional, so a failed import degrades
        # to the `degraded` result instead of raising.
        try:
            from langdetect import detect, DetectorFactory

            # Fixed seed, assigned on every call, for reproducible detection.
            DetectorFactory.seed = 0
        except Exception:
            return dict(degraded)

        # Never evaluate the truthiness of `texts`: containers such as pandas
        # Series or numpy arrays raise on bool(), which used to be reported
        # as "langdetect unavailable".
        if texts is None:
            candidates = iter(())
        elif isinstance(texts, str):
            candidates = iter((texts,))
        else:
            try:
                candidates = iter(texts)
            except TypeError:
                candidates = iter(())

        # Lazy clean-up pipeline: strip, drop non-str and empty documents.
        stripped = (t.strip() for t in candidates if isinstance(t, str))
        docs = (d for d in stripped if d)

        # Sample the first `sample_max` valid documents without materialising
        # the rest of the corpus.
        if sample_max is not None and sample_max >= 0:
            docs = islice(docs, sample_max)

        # Per-language tally. langdetect raises on texts without detectable
        # features; those documents are skipped and do not count.
        counts = Counter()
        for doc in docs:
            try:
                counts[detect(doc)] += 1
            except Exception:
                continue

        n_detected = sum(counts.values())

        # Stable order: descending count, then ascending language code.
        ordered = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))

        k = top_k if (top_k is not None and top_k >= 0) else len(ordered)
        distribution = [
            {"lang": lang, "count": count, "pct": round(count / n_detected * 100, 2)}
            for lang, count in ordered[:k]
        ]

        # `dominant` comes from the full ranking, not the top_k-truncated one.
        dominant = ordered[0][0] if ordered else None

        return {
            "available": True,
            "n_detected": n_detected,
            "distribution": distribution,
            "dominant": dominant,
        }
    except Exception:
        # Any unexpected failure degrades to available=False without raising.
        return dict(degraded)
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests para detect_corpus_language."""
|
||||
|
||||
import builtins
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Anade python/functions a sys.path para importar el paquete `datascience`.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
|
||||
_ES = [
|
||||
"este es un texto bastante largo en español para detectar el idioma correctamente sin problemas",
|
||||
"la inteligencia artificial transforma la manera en que trabajamos cada dia en muchos sectores",
|
||||
]
|
||||
_EN = [
|
||||
"this is a fairly long english text to detect the language correctly without any length issues",
|
||||
"machine learning models can classify documents into many different categories quite reliably",
|
||||
]
|
||||
|
||||
|
||||
def test_mixto_es_en():
    """Golden case: a clear ES+EN mix yields available=True, >=2 languages and coherent counts."""
    mixed_corpus = _ES + _EN
    result = detect_corpus_language(mixed_corpus)
    assert result["available"] is True
    assert result["dominant"] in {"es", "en"}
    languages = result["distribution"]
    assert len(languages) >= 2
    # The per-language counts must add up to the number of detections.
    counted = 0
    for entry in languages:
        counted += entry["count"]
    assert counted == result["n_detected"]
    assert result["n_detected"] == 4
|
||||
|
||||
|
||||
def test_vacio():
    """Edge case: an empty list with langdetect installed gives available=True and no detections."""
    result = detect_corpus_language([])
    available = result["available"]
    assert available is True
    assert result["n_detected"] == 0
    assert result["distribution"] == []
    dominant = result["dominant"]
    assert dominant is None
|
||||
|
||||
|
||||
def test_degradacion(monkeypatch):
    """Error path: when langdetect cannot be imported the result degrades to available=False without raising."""
    import datascience.detect_corpus_language as module_under_test

    original_import = builtins.__import__

    def _blocking_import(name, *args, **kwargs):
        # Let every import through except langdetect and its submodules.
        is_langdetect = name == "langdetect" or name.startswith("langdetect.")
        if not is_langdetect:
            return original_import(name, *args, **kwargs)
        raise ImportError("simulado")

    monkeypatch.setattr(builtins, "__import__", _blocking_import)
    result = module_under_test.detect_corpus_language(["hola mundo", "hello world"])
    assert result["available"] is False
    assert result["n_detected"] == 0
    assert result["distribution"] == []
    assert result["dominant"] is None
|
||||
@@ -0,0 +1,102 @@
|
||||
---
|
||||
name: extract_text_sample
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_text_sample(db_path: str, table: str, columns: list, backend: str = 'duckdb', sample: int = 2000) -> dict"
|
||||
description: "Muestrea columnas de texto de una tabla DuckDB/Postgres con push-down SQL (LIMIT sample), SIN traer la tabla entera a RAM. Funcion impura del grupo de capacidad `eda`: la usan los capitulos de texto/NLP del AutomaticEDA que necesitan valores crudos de texto (longitudes, tokens, ejemplos) sobre una muestra acotada. Construye el lector read-only query_fn(sql)->dict igual que build_eda_render_ctx (closure sobre duckdb_query_readonly / pg_query importados perezosamente desde infra). Escapa los identificadores con comillas dobles y lanza una sola query SELECT \"c1\", \"c2\" FROM \"table\" LIMIT n. Por columna, la lista de strings solo contiene valores NO None y NO vacios: cada celda no nula se convierte con str(...) y se descarta si queda cadena vacia. Estilo dict-no-throw del grupo eda: NUNCA lanza; ante cualquier fallo (query, conversion, backend desconocido) devuelve {status:'error', error:str, columns:{}, n:0}. La clave n reporta el numero de FILAS leidas por la query (antes de filtrar None/vacios)."
|
||||
tags: [eda, datascience, text, nlp, extraction, read-only, duckdb, postgres, python]
|
||||
uses_functions: [duckdb_query_readonly_py_infra, pg_query_py_infra]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
params:
|
||||
- name: db_path
|
||||
desc: "ruta al archivo DuckDB, o DSN PostgreSQL si backend='postgres'. Se inyecta en el closure query_fn. No se valida aqui: si la base no existe o el DSN es invalido, la query devuelve status error y el resultado es {status:'error', ...} (no lanza)."
|
||||
- name: table
|
||||
desc: "nombre de la tabla. Se escapa con comillas dobles en la query (SELECT ... FROM \"table\")."
|
||||
- name: columns
|
||||
desc: "lista de nombres de columna de texto a muestrear. Se filtra a las entradas que sean str no vacio; cada nombre se escapa con comillas dobles. Si tras filtrar queda vacia -> {status:'ok', columns:{}, n:0} sin tocar la base."
|
||||
- name: backend
|
||||
desc: "'duckdb' (default) o 'postgres'. Selecciona el lector read-only del registry (duckdb_query_readonly / pg_query). Cualquier otro valor -> {status:'error', error:'backend desconocido: <valor>', columns:{}, n:0}."
|
||||
- name: sample
|
||||
desc: "maximo de filas a muestrear (clausula LIMIT). Default 2000. Acota memoria y tiempo: con tablas grandes obtienes el primer tramo por orden fisico (sin ORDER BY), no un muestreo uniforme."
|
||||
output: "dict dict-no-throw (NUNCA lanza): {status:'ok'|'error', columns:{col_name:[str,...]}, n:int, error:str}. En exito (status='ok') columns mapea cada columna pedida a la lista de sus valores de texto NO None y NO vacios (cada celda convertida con str(...)); n es el numero de FILAS leidas por la query (antes de filtrar None/vacios). columns vacio -> {status:'ok', columns:{}, n:0}. En error (backend desconocido, query con status!='ok', o cualquier excepcion) -> {status:'error', error:str, columns:{}, n:0}; la clave error solo aparece en este caso."
|
||||
tested: true
|
||||
tests: ["test_extract_basic", "test_backend_desconocido", "test_columns_vacio", "test_sample_limit"]
|
||||
test_file_path: "python/functions/datascience/extract_text_sample_test.py"
|
||||
file_path: "python/functions/datascience/extract_text_sample.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
# Import directo del submodulo (no requiere export en datascience/__init__.py).
|
||||
from datascience.extract_text_sample import extract_text_sample
|
||||
|
||||
# Muestrea hasta 2000 filas de dos columnas de texto de una tabla DuckDB.
|
||||
res = extract_text_sample(
|
||||
"data/reviews.duckdb", "reviews", ["title", "body"],
|
||||
backend="duckdb", sample=2000,
|
||||
)
|
||||
# res == {
|
||||
# "status": "ok",
|
||||
# "columns": {
|
||||
# "title": ["Gran producto", "No funciona", ...], # solo no-None, no-""
|
||||
# "body": ["Lo uso a diario...", ...],
|
||||
# },
|
||||
# "n": 2000, # filas leidas por la query (antes de filtrar None/vacios)
|
||||
# }
|
||||
|
||||
# Postgres: db_path es el DSN.
|
||||
res_pg = extract_text_sample(
|
||||
"postgresql://user:pass@localhost:5433/trends", "comentarios", ["texto"],
|
||||
backend="postgres", sample=500,
|
||||
)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando necesites valores CRUDOS de texto de una o varias columnas para analisis
|
||||
NLP/texto (distribucion de longitudes, conteo de tokens, ejemplos representativos,
|
||||
deteccion de idioma) pero NO quieras cargar la tabla entera en memoria. Es el
|
||||
muestreador de texto del grupo `eda`: una sola llamada con push-down `LIMIT`
|
||||
devuelve listas de strings por columna, limpias de None y vacios, listas para
|
||||
alimentar un capitulo de texto del AutomaticEDA o cualquier rutina de tokenizado.
|
||||
Usala junto a `profile_table` / `build_eda_render_ctx` cuando el perfil agregado
|
||||
no basta y hace falta el texto real.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: lee de la base de datos a traves de `query_fn` (closure sobre
|
||||
`duckdb_query_readonly` / `pg_query`). No abre conexiones fuera de esos wrappers
|
||||
del registry. Estilo dict-no-throw del grupo `eda`: NUNCA lanza; ante cualquier
|
||||
fallo devuelve `{status:'error', error:str, columns:{}, n:0}`.
|
||||
- **`error_type` en el frontmatter es `error_go_core` por convencion del registry**
|
||||
(toda funcion impura debe declararlo y el indexer lo exige), pero el codigo NO
|
||||
lanza esa excepcion: degrada al dict de error. Es metadata, no comportamiento.
|
||||
- **Backend desconocido**: con un `backend` que no sea `duckdb` ni `postgres`
|
||||
devuelve `{status:'error', error:'backend desconocido: <valor>', columns:{},
|
||||
n:0}` sin tocar la base.
|
||||
- **Las listas NO incluyen None ni cadenas vacias**: cada celda no nula se pasa
|
||||
por `str(...)` y se descarta si queda `""`. Por eso `len(columns[col])` puede ser
|
||||
menor que `n` (que cuenta las filas leidas). Si necesitas alineacion por fila
|
||||
(una entrada por fila aunque sea None), usa `build_eda_render_ctx` (raw_numeric),
|
||||
no esta funcion.
|
||||
- **`LIMIT sample` sin `ORDER BY`**: con tablas grandes obtienes el primer tramo
|
||||
por orden fisico del backend, no un muestreo uniforme ni reproducible. Sube
|
||||
`sample` para mas cobertura, o pre-ordena/aleatoriza la tabla si necesitas
|
||||
representatividad.
|
||||
- **DuckDB en sandbox por defecto**: `duckdb_query_readonly` abre la conexion con
|
||||
`enable_external_access=False`, asi que la query solo puede leer la propia base
|
||||
(no `read_csv`/`httpfs`/`ATTACH` a paths externos). Lee tablas ya existentes en
|
||||
el archivo DuckDB sin problema.
|
||||
- **No loguear los datos crudos**: las listas de `columns` pueden contener texto
|
||||
sensible (reviews, comentarios, PII). En trazas usa solo conteos (`n`,
|
||||
`len(columns[col])`) y nombres de columna, no el dict completo.
|
||||
@@ -0,0 +1,112 @@
|
||||
"""extract_text_sample — muestrea columnas de texto de una tabla sin cargarla en RAM.
|
||||
|
||||
Funcion impura (lee de la base de datos) del grupo de capacidad `eda`. Dado un
|
||||
``db_path`` + ``table`` (DuckDB o PostgreSQL) y una lista de ``columns`` de texto,
|
||||
trae una MUESTRA de esas columnas con push-down SQL (``LIMIT sample``), nunca la
|
||||
tabla entera. La usan los capitulos de texto/NLP del AutomaticEDA que necesitan
|
||||
valores crudos de texto (longitudes, tokens, ejemplos) sin materializar millones
|
||||
de filas en memoria.
|
||||
|
||||
El lector read-only ``query_fn(sql) -> dict`` se construye igual que en
|
||||
``build_eda_render_ctx`` / ``profile_table``: un closure sobre el wrapper del
|
||||
registry (``duckdb_query_readonly`` / ``pg_query``), importado perezosamente
|
||||
dentro de la funcion para no crear ciclos al cargar el ``__init__`` del paquete
|
||||
``datascience``. Nunca abre conexiones fuera de esos wrappers.
|
||||
|
||||
Estilo dict-no-throw del grupo `eda`: la funcion NUNCA lanza. Captura cualquier
|
||||
excepcion (query, conversion) y devuelve ``{"status":"error", "error":str(e),
|
||||
"columns":{}, "n":0}``. Si la query subyacente devuelve ``status != "ok"``, se
|
||||
propaga como error con el mensaje del wrapper.
|
||||
|
||||
Por columna, la lista de strings solo contiene valores NO nulos y NO vacios:
|
||||
cada celda no-None se convierte con ``str(...)`` y se descarta si queda ``""``.
|
||||
La clave ``n`` reporta el numero de FILAS leidas por la query (antes de filtrar
|
||||
los None/vacios), util para saber cuanto se muestreo realmente.
|
||||
"""
|
||||
|
||||
|
||||
def extract_text_sample(db_path, table, columns, backend="duckdb", sample=2000):
    """Sample text columns of a DuckDB/Postgres table with a pushed-down LIMIT.

    Args:
        db_path: path to the DuckDB file, or the PostgreSQL DSN when
            ``backend="postgres"``. It is passed to the reader unvalidated.
        table: table name. Emitted as a double-quoted identifier; embedded
            double quotes are doubled so the name cannot break out of the
            quoting.
        columns: list/tuple of column names to sample. Only non-empty ``str``
            entries are kept, duplicates are dropped (first occurrence wins)
            and each name is quoted like ``table``. If nothing remains the
            result is ``{"status": "ok", "columns": {}, "n": 0}`` and no
            query is issued.
        backend: ``"duckdb"`` (default) or ``"postgres"``; selects
            ``infra.duckdb_query_readonly`` or ``infra.pg_query``. Any other
            value yields an error dict.
        sample: maximum number of rows to read (``LIMIT`` clause), coerced
            with ``int()``. There is no ``ORDER BY``, so the rows returned
            are whatever the backend yields first, not a uniform sample.

    Returns:
        A dict; the function never raises:
            {"status": "ok" | "error",
             "columns": {col_name: [str, ...]},  # non-None, non-"" values only
             "n": int,       # rows returned by the query, before filtering
             "error": str}   # present only when status == "error"
    """

    def _error(message):
        # Uniform error payload shared by every failure path.
        return {"status": "error", "error": str(message), "columns": {}, "n": 0}

    def _quote(identifier):
        # Standard SQL identifier quoting: wrap in double quotes and double
        # any embedded quote.
        return '"' + str(identifier).replace('"', '""') + '"'

    try:
        # 1) Read-only reader for the requested backend. The imports are
        #    function-scoped because the module docs say a module-level
        #    import of `infra` would create an import cycle.
        if backend == "duckdb":
            from infra import duckdb_query_readonly as reader
        elif backend == "postgres":
            from infra import pg_query as reader
        else:
            return _error(f"backend desconocido: {backend}")

        # 2) Valid columns: non-empty strings, de-duplicated in order so a
        #    repeated name does not append every value twice.
        cols = []
        if isinstance(columns, (list, tuple)):
            cols = list(
                dict.fromkeys(c for c in columns if isinstance(c, str) and c != "")
            )
        if not cols:
            return {"status": "ok", "columns": {}, "n": 0}

        # 3) Push-down: a single SELECT with LIMIT.
        cols_sql = ", ".join(_quote(c) for c in cols)
        sql = f"SELECT {cols_sql} FROM {_quote(table)} LIMIT {int(sample)}"
        q = reader(db_path, sql)
        if not isinstance(q, dict):
            return _error("query sin resultado")
        if q.get("status") != "ok":
            # Fall back to a generic message when the reader gives none.
            return _error(q.get("error") or "query sin resultado")

        # 4) Collect per-column strings, skipping None and empty values.
        #    Rows are expected to be dicts keyed by column name.
        rows = q.get("rows") or []
        out = {c: [] for c in cols}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for c in cols:
                value = row.get(c)
                if value is None:
                    continue
                text = str(value)
                if text:
                    out[c].append(text)

        return {"status": "ok", "columns": out, "n": len(rows)}
    except Exception as exc:  # noqa: BLE001 - dict-no-throw contract of the eda group
        return _error(exc)
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Tests para extract_text_sample.
|
||||
|
||||
Self-contained: crea un DuckDB temporal pequeño con una columna de texto (algunas
|
||||
filas con NULL) y una numerica, y verifica que la muestra de texto trae solo los
|
||||
valores no nulos, que el backend desconocido y la lista de columnas vacia se
|
||||
manejan dict-no-throw, y que sample acota el numero de filas leidas.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..")) # python/functions
|
||||
if _FUNCTIONS not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS)
|
||||
|
||||
import duckdb # noqa: E402
|
||||
|
||||
from datascience.extract_text_sample import extract_text_sample # noqa: E402
|
||||
|
||||
_TABLE = "t"
|
||||
# 6 filas: txt VARCHAR con dos NULL, other INT siempre presente.
|
||||
_ROWS = [
|
||||
("alpha", 1),
|
||||
("beta", 2),
|
||||
(None, 3),
|
||||
("gamma", 4),
|
||||
(None, 5),
|
||||
("delta", 6),
|
||||
]
|
||||
_TXT_NON_NULL = {"alpha", "beta", "gamma", "delta"}
|
||||
|
||||
|
||||
def _make_db(tmp_path):
    """Create a temporary DuckDB file holding the test table and return its path."""
    target = os.path.join(str(tmp_path), "text_sample.duckdb")
    create_sql = f'CREATE TABLE "{_TABLE}" (txt VARCHAR, other INTEGER)'
    insert_sql = f'INSERT INTO "{_TABLE}" VALUES (?, ?)'
    connection = duckdb.connect(target)
    try:
        connection.execute(create_sql)
        connection.executemany(insert_sql, _ROWS)
    finally:
        # Always release the file so the function under test can open it.
        connection.close()
    return target
|
||||
|
||||
|
||||
def test_extract_basic(tmp_path):
    """A plain extraction returns only the non-null strings of the requested column."""
    database = _make_db(tmp_path)
    result = extract_text_sample(database, _TABLE, ["txt"])
    assert result["status"] == "ok"
    # n counts the rows read by the query (6), before None values are dropped.
    assert result["n"] == len(_ROWS)
    sampled = result["columns"]
    # Only the non-null strings survive; the two NULL rows are filtered out.
    assert "txt" in sampled
    texts = sampled["txt"]
    assert set(texts) == _TXT_NON_NULL
    assert len(texts) == len(_TXT_NON_NULL)
    # "other" was not requested, so it must be absent.
    assert "other" not in sampled
|
||||
|
||||
|
||||
def test_backend_desconocido(tmp_path):
    """An unsupported backend yields the error dict instead of raising."""
    database = _make_db(tmp_path)
    result = extract_text_sample(database, _TABLE, ["txt"], backend="mysql")
    status, message = result["status"], result["error"]
    assert status == "error"
    assert "backend desconocido" in message
    assert result["columns"] == {}
    assert result["n"] == 0
|
||||
|
||||
|
||||
def test_columns_vacio(tmp_path):
    """An empty column list is a successful no-op."""
    database = _make_db(tmp_path)
    no_columns = []
    result = extract_text_sample(database, _TABLE, no_columns)
    assert result["status"] == "ok"
    assert result["columns"] == {}
    assert result["n"] == 0
|
||||
|
||||
|
||||
def test_sample_limit(tmp_path):
    """The sample argument caps the number of rows read by the query."""
    database = _make_db(tmp_path)
    row_cap = 2
    result = extract_text_sample(database, _TABLE, ["txt"], sample=row_cap)
    assert result["status"] == "ok"
    # sample=2 means the query reads at most 2 rows.
    assert result["n"] == 2
    sampled_texts = result["columns"]["txt"]
    assert len(sampled_texts) <= 2
|
||||
@@ -18,6 +18,7 @@ dependencies = [
|
||||
"google-cloud-bigquery-storage>=2.27",
|
||||
"google-cloud-storage>=3.10.1",
|
||||
"httpx",
|
||||
"langdetect>=1.0.9",
|
||||
"matplotlib>=3.10.9",
|
||||
"opencv-contrib-python-headless>=4.13.0.92",
|
||||
"openpyxl>=3.1.5",
|
||||
@@ -40,6 +41,7 @@ dependencies = [
|
||||
"seaborn>=0.13.2",
|
||||
"shapely>=2.1.2",
|
||||
"statsmodels>=0.14.6",
|
||||
"textstat>=0.7.13",
|
||||
"trimesh>=4.12.2",
|
||||
"xlrd>=2.0.2",
|
||||
]
|
||||
|
||||
Generated
+96
@@ -899,6 +899,7 @@ dependencies = [
|
||||
{ name = "google-cloud-bigquery-storage" },
|
||||
{ name = "google-cloud-storage" },
|
||||
{ name = "httpx" },
|
||||
{ name = "langdetect" },
|
||||
{ name = "matplotlib" },
|
||||
{ name = "opencv-contrib-python-headless" },
|
||||
{ name = "openpyxl" },
|
||||
@@ -906,9 +907,11 @@ dependencies = [
|
||||
{ name = "polars" },
|
||||
{ name = "pymeshlab" },
|
||||
{ name = "pymssql" },
|
||||
{ name = "pymupdf" },
|
||||
{ name = "pypdf" },
|
||||
{ name = "pyproj" },
|
||||
{ name = "python-docx" },
|
||||
{ name = "python-pptx" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "qrcode", extra = ["pil"] },
|
||||
{ name = "rapidfuzz" },
|
||||
@@ -919,6 +922,7 @@ dependencies = [
|
||||
{ name = "seaborn" },
|
||||
{ name = "shapely" },
|
||||
{ name = "statsmodels" },
|
||||
{ name = "textstat" },
|
||||
{ name = "trimesh" },
|
||||
{ name = "xlrd" },
|
||||
]
|
||||
@@ -959,6 +963,7 @@ requires-dist = [
|
||||
{ name = "jupyter-collaboration", marker = "extra == 'jupyter'", specifier = ">=2.0" },
|
||||
{ name = "jupyter-mcp-server", marker = "extra == 'jupyter'" },
|
||||
{ name = "jupyterlab", marker = "extra == 'jupyter'", specifier = ">=4.0" },
|
||||
{ name = "langdetect", specifier = ">=1.0.9" },
|
||||
{ name = "matplotlib", specifier = ">=3.10.9" },
|
||||
{ name = "opencv-contrib-python-headless", specifier = ">=4.13.0.92" },
|
||||
{ name = "openpyxl", specifier = ">=3.1.5" },
|
||||
@@ -966,9 +971,11 @@ requires-dist = [
|
||||
{ name = "polars", specifier = ">=1.40.1" },
|
||||
{ name = "pymeshlab", specifier = ">=2025.7.post1" },
|
||||
{ name = "pymssql", specifier = ">=2.3.13" },
|
||||
{ name = "pymupdf", specifier = ">=1.28.0" },
|
||||
{ name = "pypdf", specifier = ">=6.10.0" },
|
||||
{ name = "pyproj", specifier = ">=3.7.2" },
|
||||
{ name = "python-docx", specifier = ">=1.2.0" },
|
||||
{ name = "python-pptx", specifier = ">=1.0.2" },
|
||||
{ name = "pyyaml", specifier = ">=6.0.3" },
|
||||
{ name = "qrcode", extras = ["pil"], specifier = ">=8.2" },
|
||||
{ name = "rapidfuzz", specifier = ">=3.14.5" },
|
||||
@@ -979,6 +986,7 @@ requires-dist = [
|
||||
{ name = "seaborn", specifier = ">=0.13.2" },
|
||||
{ name = "shapely", specifier = ">=2.1.2" },
|
||||
{ name = "statsmodels", specifier = ">=0.14.6" },
|
||||
{ name = "textstat", specifier = ">=0.7.13" },
|
||||
{ name = "trimesh", specifier = ">=4.12.2" },
|
||||
{ name = "xlrd", specifier = ">=2.0.2" },
|
||||
]
|
||||
@@ -2198,6 +2206,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/91/53255615acd2a1eaca307ede3c90eb550bae9c94581f8c00081b6b1c8f44/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:1f1489f769582498610e015a8ef2d36f28f505ab3096d0e16b4858a9ec214f57", size = 75987, upload-time = "2026-03-09T13:15:39.65Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "langdetect"
|
||||
version = "1.0.9"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "six" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz", hash = "sha256:cbc1fef89f8d062739774bd51eda3da3274006b3661d199c2655f6b3f6d605a0", size = 981474, upload-time = "2021-05-07T07:54:13.562Z" }
|
||||
|
||||
[[package]]
|
||||
name = "lark"
|
||||
version = "1.3.1"
|
||||
@@ -2699,6 +2716,21 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nltk"
|
||||
version = "3.9.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "joblib" },
|
||||
{ name = "regex" },
|
||||
{ name = "tqdm" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/74/a1/b3b4adf15585a5bc4c357adde150c01ebeeb642173ded4d871e89468767c/nltk-3.9.4.tar.gz", hash = "sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0", size = 2946864, upload-time = "2026-03-24T06:13:40.641Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/91/04e965f8e717ba0ab4bdca5c112deeab11c9e750d94c4d4602f050295d39/nltk-3.9.4-py3-none-any.whl", hash = "sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f", size = 1552087, upload-time = "2026-03-24T06:13:38.47Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "notebook-shim"
|
||||
version = "0.2.4"
|
||||
@@ -3750,6 +3782,23 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/25/50/4be9bd9cf4b43208a7175117a533ece200cfe4131a39f9909bdc7560ddeb/pymssql-2.3.13-cp314-cp314-win_amd64.whl", hash = "sha256:7d7037d2b5b907acc7906d0479924db2935a70c720450c41339146a4ada2b93d", size = 2049139, upload-time = "2026-02-14T05:00:23.951Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pymupdf"
|
||||
version = "1.28.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8e/e9/6d6c5d6c0a3551bffd47681a6240caf941727f195b45593cf20ab36f018f/pymupdf-1.28.0.tar.gz", hash = "sha256:e53f3567403a92da15caa9e7ae0164327fff48817e9f40175367fb9de524258d", size = 87637751, upload-time = "2026-06-29T09:08:47.547Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c8/b7/88043e38cc7529de070f0c9bd267fa258035cca0b4ad5260536b994594a7/pymupdf-1.28.0-cp310-abi3-macosx_10_15_x86_64.whl", hash = "sha256:892b89ba88e8f98b53133b62877a9dc9b5e7dc6a4aeb837b612db56a8d2e03ac", size = 24597385, upload-time = "2026-06-29T09:03:30.608Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/33/f4/23775bbda0781b61fc398cc75079a2b0e64696d8fcf93271748883e9627e/pymupdf-1.28.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d692dcf44d3566ae96bc6f6346c6ad432274a29ba617bf7a9fe18009e24adb4", size = 23828292, upload-time = "2026-06-29T09:03:46.129Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/f5/bf75fc7a415722f8b33662054f82d88520c0cbfd4c36d0e08aeaec605e49/pymupdf-1.28.0-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:47a5c29ed4eb0744de9c4e37bb49b1259b18d4d75fcc8a7c130f7c9fa15956f6", size = 25045507, upload-time = "2026-06-29T09:04:03.86Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/69/5d12c9f1f2d76f28383d6110a069c79fbfced5a4f97bb1ee6e8354f52bb7/pymupdf-1.28.0-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44f0973f5e5edbaec95bc34b64e71d1959d4ee90b1328de1b4f4f5b4fa78673f", size = 25716599, upload-time = "2026-06-29T09:04:19.367Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4d/b4/ec0e017bc42857cc86bd651441dbc41cc18be48d4698ecd27aac491e0c9a/pymupdf-1.28.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4d61ec323a706e153a12e262e51febfb43eeaa20977785ace135d18d48bcdc83", size = 25940489, upload-time = "2026-06-29T09:04:36.624Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/06/86/f831fef09013f33b3c9c09fb3923f2ff53e1e437f6ace14b8ae46392f558/pymupdf-1.28.0-cp310-abi3-win32.whl", hash = "sha256:caea2b3b67347fd79e5d15ed7929b0e886aac594ea228073b6d39de0078189da", size = 18489703, upload-time = "2026-06-29T20:50:30.599Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2e/5d/1a03f53eb0449900469335fcfc742ca28e3ba159b7d650e0921d50b8b308/pymupdf-1.28.0-cp310-abi3-win_amd64.whl", hash = "sha256:e01e90fd86abfeb37ceb921eddb951f988a11d45ff6ce6b7664f2039849068ec", size = 19773102, upload-time = "2026-06-29T09:04:49.773Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/f6/1e52ce243ca792254f6223b4017c5667194c146ce9b88baf37bc5eb3d1c9/pymupdf-1.28.0-cp313-abi3-pyemscripten_2025_0_wasm32.whl", hash = "sha256:74c6d00ba2a9aad3a635db73b07c15db462b480741d831a34a75a56535ebc22b", size = 18357011, upload-time = "2026-06-29T20:50:50.353Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/b1/46b5b3d8ef3cc71114667cf10c4d8b33f39af97253af32e9a0986775b638/pymupdf-1.28.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:b3e1399c7a64c6914239116a369efcdaac4cfb9e838bde2656d7accc4a85c72d", size = 25753599, upload-time = "2026-06-29T09:05:09.398Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyogrio"
|
||||
version = "0.12.1"
|
||||
@@ -3811,6 +3860,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/55/f2/7ebe366f633f30a6ad105f650f44f24f98cb1335c4157d21ae47138b3482/pypdf-6.10.0-py3-none-any.whl", hash = "sha256:90005e959e1596c6e6c84c8b0ad383285b3e17011751cedd17f2ce8fcdfc86de", size = 334459, upload-time = "2026-04-10T09:34:54.966Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyphen"
|
||||
version = "0.17.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/69/56/e4d7e1bd70d997713649c5ce530b2d15a5fc2245a74ca820fc2d51d89d4d/pyphen-0.17.2.tar.gz", hash = "sha256:f60647a9c9b30ec6c59910097af82bc5dd2d36576b918e44148d8b07ef3b4aa3", size = 2079470, upload-time = "2025-01-20T13:18:36.296Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/1f/c2142d2edf833a90728e5cdeb10bdbdc094dde8dbac078cee0cf33f5e11b/pyphen-0.17.2-py3-none-any.whl", hash = "sha256:3a07fb017cb2341e1d9ff31b8634efb1ae4dc4b130468c7c39dd3d32e7c3affd", size = 2079358, upload-time = "2025-01-20T13:18:29.629Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyproj"
|
||||
version = "3.7.2"
|
||||
@@ -3935,6 +3993,21 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/fd/0318007beb234790993d3ec5afd051d1dbceb733e81e3afe2b981ece3f37/python_multipart-0.0.30-py3-none-any.whl", hash = "sha256:830964def8c90607ac5daa00514e3987815865713ade8d20febc9177ac0c3c5b", size = 29730, upload-time = "2026-05-31T19:24:53.814Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "python-pptx"
|
||||
version = "1.0.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "lxml" },
|
||||
{ name = "pillow" },
|
||||
{ name = "typing-extensions" },
|
||||
{ name = "xlsxwriter" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "311"
|
||||
@@ -4936,6 +5009,20 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textstat"
|
||||
version = "0.7.13"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "nltk" },
|
||||
{ name = "pyphen" },
|
||||
{ name = "setuptools" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8c/0f/b673fcec5ad6e976b2e8368ef3651fe0fea3348a1191bacfcd41a17ddec6/textstat-0.7.13.tar.gz", hash = "sha256:a88d1da76287cd27ca4ce7bcba1ebaf2890544a5f0bb6a5758fa84cef3bceccb", size = 138932, upload-time = "2026-02-18T21:07:39.525Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ca/31/0eb4cc5bb021b4ceaaa602c59ba16ce99256b9dd30981bef3f3a53d8555f/textstat-0.7.13-py3-none-any.whl", hash = "sha256:04b1ec995d1e8b2e628759497e6b23204a9ec91dcd652447d8cbba9478f25471", size = 177050, upload-time = "2026-02-18T21:07:38.163Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "threadpoolctl"
|
||||
version = "3.6.0"
|
||||
@@ -5312,6 +5399,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/62/c8d562e7766786ba6587d09c5a8ba9f718ed3fa8af7f4553e8f91c36f302/xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9", size = 96555, upload-time = "2025-06-14T08:46:37.766Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xlsxwriter"
|
||||
version = "3.2.9"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xxhash"
|
||||
version = "3.7.0"
|
||||
|
||||
Reference in New Issue
Block a user