merge: capitulo AutomaticEDA calidad (verificado met)
This commit is contained in:
@@ -0,0 +1,266 @@
|
|||||||
|
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||||
|
|
||||||
|
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||||
|
chapter answers, in Spanish and as tables, the three things the user asked for:
|
||||||
|
|
||||||
|
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
||||||
|
their weights (completeness, validity, consistency) before any number, plus a
|
||||||
|
table-level summary (global score and aggregates).
|
||||||
|
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||||
|
its breakdown into completeness / validity / consistency.
|
||||||
|
3. **Problemas en español** — a second table listing, per column, the readable
|
||||||
|
issues in Spanish (kept separate from the type ``flags``).
|
||||||
|
|
||||||
|
The breakdown and the issues are NOT recomputed here: they come from the registry
|
||||||
|
function ``column_quality_score`` (group ``eda``), which already derives
|
||||||
|
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
||||||
|
This chapter is render-only — it consumes that function and lays the result out
|
||||||
|
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
||||||
|
header) and wrap long cells so nothing is ever cut.
|
||||||
|
|
||||||
|
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from .. import model
|
||||||
|
|
||||||
|
# Reuse the registry's pure quality function (group ``eda``). Import defensively:
|
||||||
|
# if the package cannot be imported for any reason the chapter degrades to the
|
||||||
|
# per-column ``quality_score`` already present in the profile instead of failing.
|
||||||
|
try: # pragma: no cover - import wiring
|
||||||
|
from ...column_quality_score import column_quality_score as _column_quality_score
|
||||||
|
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||||
|
_column_quality_score = None
|
||||||
|
|
||||||
|
# Chapter identity: version string, id and human-readable title. All three are
# passed to ``model.Chapter`` by ``build_calidad``.
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"

# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
# consistency 0.2. Kept here only to render the human explanation; the actual
# numbers always come from the function so the two never drift in computation.
# NOTE(review): the weights are duplicated as text below — if the registry
# function changes its weights, this paragraph must be updated by hand.
_CRITERIA_INTRO = (
    "La calidad de cada columna es un score de 0 a 100 que combina tres "
    "criterios, cada uno con un peso:\n\n"
    "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
    "ni vacíos). Una columna con muchos nulos baja de score.\n"
    "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
    "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
    "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
    "columnas constantes o identificadores de cardinalidad muy alta).\n\n"
    "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
    "Los problemas detectados por columna se listan en español más abajo."
)

# Cap (in characters) for the joined issues cell so a single row never grows
# taller than a page; the remainder is summarized as "(+N más)" instead of being
# silently dropped. The budget counts the issue texts plus their "; "
# separators; the "(+N más)" suffix itself is not counted, and the first issue
# is always kept whole even when it alone exceeds the cap.
_ISSUES_MAXLEN = 160
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_score(value) -> str:
    """Render a 0-100 quality score as ``NN / 100``.

    ``None`` and NaN yield the ``—`` placeholder. A value that cannot be
    converted to ``float`` is returned as its plain ``str()``. At most one
    decimal is shown and a trailing ``.0`` is dropped, so ``72.5`` becomes
    ``72.5 / 100`` and ``60`` becomes ``60 / 100``.
    """
    placeholder = "—"
    if value is None:
        return placeholder
    try:
        number = float(value)
    except (TypeError, ValueError):
        return str(value)
    # NaN is the only float that does not compare equal to itself.
    if not (number == number):
        return placeholder
    rendered = format(number, ".1f")
    # One-decimal formatting always contains a ".", so stripping zeros can
    # never eat into the integer part ("100.0" -> "100." -> "100").
    rendered = rendered.rstrip("0").rstrip(".")
    return rendered + " / 100"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_unit_pct(value) -> str:
    """Format a 0-1 fraction as a whole-number percentage (``0.95`` -> ``95%``).

    ``None`` and NaN yield the ``—`` placeholder, matching ``_fmt_score``;
    previously NaN leaked into the report as ``nan%``. A value that cannot be
    converted to ``float`` is returned as its plain ``str()``.
    """
    if value is None:
        return "—"
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num != num:  # NaN: show the placeholder instead of "nan%".
        return "—"
    return f"{num * 100:.0f}%"
|
||||||
|
|
||||||
|
|
||||||
|
def _quality_of(col: dict) -> dict:
    """Resolve the quality dict for one column without ever raising.

    When the registry function ``column_quality_score`` was imported, it is
    called with the column and its result is returned as long as it is a dict.
    If the function is unavailable, raises, or returns something that is not a
    dict, the fallback dict is built from the ``quality_score`` already stored
    in the column: only ``score`` is filled, the three criteria are ``None``
    and ``issues`` is an empty list. A non-dict ``col`` is treated as ``{}``.
    """
    column = col if isinstance(col, dict) else {}
    scorer = _column_quality_score
    if scorer is not None:
        try:
            computed = scorer(column)
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            computed = None
        if isinstance(computed, dict):
            return computed
    # Fallback: only the final score is available pre-computed in the profile.
    fallback = {"score": column.get("quality_score")}
    fallback.update(completeness=None, validity=None, consistency=None,
                    issues=[])
    return fallback
|
||||||
|
|
||||||
|
|
||||||
|
def _join_issues(issues) -> str:
    """Collapse a list of Spanish issue strings into one ``"; "``-joined cell.

    Anything that is not a non-empty list/tuple yields ``""``. Each item goes
    through ``model._safe_str`` and is stripped; blank results are dropped.
    Issues are then added in order while the running length (texts plus
    ``"; "`` separators) stays within ``_ISSUES_MAXLEN``. The first issue is
    always kept whole. As soon as the next one would exceed the cap, the rest
    is replaced by a ``(+N más)`` marker, where N counts the hidden issues.
    """
    if not isinstance(issues, (list, tuple)) or not issues:
        return ""
    cleaned = []
    for raw in issues:
        text = model._safe_str(raw).strip()
        if text:
            cleaned.append(text)
    if not cleaned:
        return ""
    kept = []
    budget_used = 0
    for position, text in enumerate(cleaned):
        # Every issue after the first also pays for its "; " separator.
        cost = len(text) + 2 if kept else len(text)
        if kept and budget_used + cost > _ISSUES_MAXLEN:
            hidden = len(cleaned) - position
            kept.append(f"(+{hidden} más)")
            break
        kept.append(text)
        budget_used += cost
    return "; ".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def _columns_with_quality(profile: dict):
    """Yield ``(col, quality_dict)`` for every column dict in the profile.

    Reads ``profile["columns"]``; a missing or falsy value yields nothing.
    Entries that are not dicts are skipped. A ``columns`` value that is not
    iterable at all (for example a number) also yields nothing instead of
    raising ``TypeError``, so a malformed profile cannot abort the chapter.
    """
    cols = profile.get("columns") or []
    try:
        entries = iter(cols)
    except TypeError:
        # Malformed profile: "columns" is not iterable, so nothing to score.
        return
    for c in entries:
        if isinstance(c, dict):
            yield c, _quality_of(c)
|
||||||
|
|
||||||
|
|
||||||
|
def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable`` with the global score and aggregates.

    Args:
        profile: the table profile dict; ``quality_score``, ``duplicate_pct``,
            ``null_cell_pct``, ``constant_cols`` and ``all_null_cols`` are read
            from it when present.
        evaluated: list of ``(col, quality_dict)`` pairs.

    Returns:
        ``model.KVTable`` titled "Resumen de calidad". It always contains the
        global score, the number of evaluated columns and the number of
        columns with issues. The per-criterion means and the extra profile
        signals are added only when data for them exists.

    The per-criterion means consider only real numbers. NaN values are skipped
    so a single NaN cannot turn the whole mean into NaN.
    """
    rows = []
    score = profile.get("quality_score")
    rows.append(("Calidad global", _fmt_score(score)))
    rows.append(("Columnas evaluadas", str(len(evaluated))))

    # One mean per criterion, in the same order as the intro explains them.
    for label, key in (
        ("Completitud media", "completeness"),
        ("Validez media", "validity"),
        ("Consistencia media", "consistency"),
    ):
        values = [
            v for v in (q.get(key) for _, q in evaluated)
            if isinstance(v, (int, float)) and v == v  # v == v drops NaN
        ]
        if values:
            rows.append((label, _fmt_unit_pct(sum(values) / len(values))))

    n_problem = sum(1 for _, q in evaluated if q.get("issues"))
    rows.append(("Columnas con problemas", str(n_problem)))

    # Extra table-wide quality signals already in the profile, when present.
    dup_pct = profile.get("duplicate_pct")
    if dup_pct is not None:
        rows.append(("Filas duplicadas", _fmt_unit_pct_or_pct(dup_pct)))
    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(("Celdas nulas (global)", _fmt_unit_pct_or_pct(null_cell_pct)))
    constant_cols = profile.get("constant_cols")
    if isinstance(constant_cols, (list, tuple)) and constant_cols:
        rows.append(("Columnas constantes", str(len(constant_cols))))
    all_null_cols = profile.get("all_null_cols")
    if isinstance(all_null_cols, (list, tuple)) and all_null_cols:
        rows.append(("Columnas 100% nulas", str(len(all_null_cols))))

    return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_unit_pct_or_pct(value) -> str:
    """Format a number that may be a 0-1 fraction or a 0-100 percentage.

    Values up to and including ``1.0`` are read as fractions and multiplied by
    100; larger values are taken as already being percentages. The result
    keeps at most one decimal, with a trailing ``.0`` removed. NaN yields
    ``—``; a value that cannot be converted to ``float`` is rendered through
    ``model._safe_str``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return model._safe_str(value)
    if number != number:  # NaN
        return "—"
    if number > 1.0:
        percent = number
    else:
        percent = number * 100
    rendered = format(percent, ".1f").rstrip("0").rstrip(".")
    return rendered + "%"
|
||||||
|
|
||||||
|
|
||||||
|
def _scores_block(evaluated: list):
    """Build the per-column scores ``DataTable``, worst column first.

    Args:
        evaluated: list of ``(col, quality_dict)`` pairs.

    Returns:
        ``model.DataTable`` with one row per column (name, total score and the
        completeness / validity / consistency breakdown), or ``None`` when
        there are no rows.

    Rows are sorted by ascending score. Columns without a usable numeric score
    (missing, non-numeric or NaN) are placed last. NaN is treated as unscored
    because comparing against NaN is always false, which would otherwise leave
    the sort order undefined.
    """
    header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]

    def _sort_key(pair):
        score = pair[1].get("score")
        if isinstance(score, (int, float)) and score == score:
            return score
        return 101.0  # above any valid 0-100 score -> sorted last

    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(evaluated, key=_sort_key)
    rows = [
        [
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
            _fmt_unit_pct(q.get("validity")),
            _fmt_unit_pct(q.get("consistency")),
        ]
        for col, q in ordered
    ]
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
                           note="0 = peor, 100 = mejor; ordenado de peor a mejor")
|
||||||
|
|
||||||
|
|
||||||
|
def _issues_block(evaluated: list):
    """Build the per-column issues table, or a ``Note`` when nothing is wrong.

    Each ``(col, quality_dict)`` pair contributes one row with the column name
    and its issues joined by ``_join_issues``; columns whose joined text is
    empty are left out. With no rows at all, a ``model.Note`` stating that no
    problems were detected is returned instead of a ``model.DataTable``.
    """
    header = ["Columna", "Problemas detectados (español)"]
    joined_per_column = (
        (col, _join_issues(q.get("issues"))) for col, q in evaluated
    )
    rows = [
        [col.get("name") or "(col)", text]
        for col, text in joined_per_column
        if text
    ]
    if rows:
        return model.DataTable(header=header, rows=rows,
                               title="Problemas de calidad por columna")
    return model.Note(
        "No se detectaron problemas de calidad en las columnas evaluadas.")
|
||||||
|
|
||||||
|
|
||||||
|
def build_calidad(profile: dict, ctx: dict):
    """Assemble the data-quality ``Chapter`` for a table profile.

    A falsy or non-dict ``profile`` is treated as an empty profile. When the
    profile yields no column dicts the chapter does not apply and ``None`` is
    returned. Otherwise the chapter contains, in order: the criteria heading
    and explanation, the table-level summary, the per-column scores section
    and the detected-problems section.

    ``ctx`` is accepted to satisfy the chapter contract; this builder does not
    read anything from it.
    """
    if not profile or not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}

    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        # No columns to score -> chapter does not apply.
        return None

    blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
        model.Markdown(text=_CRITERIA_INTRO),
        _summary_block(profile, evaluated),
        model.Heading(text="Scores por columna", level=2),
    ]
    scores_table = _scores_block(evaluated)
    if scores_table is not None:
        blocks += [scores_table]
    blocks += [
        model.Heading(text="Problemas detectados", level=2),
        _issues_block(evaluated),
    ]

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||||
@@ -0,0 +1,194 @@
|
|||||||
|
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
|
||||||
|
|
||||||
|
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||||
|
and deterministic. Verifies that the chapter explains the quality criteria, shows
|
||||||
|
per-column scores with the completeness/validity/consistency breakdown, lists the
|
||||||
|
issues in Spanish (separate from the type flags), returns None when it does not
|
||||||
|
apply, and that a wide profile with long names renders to PDF and PPTX without
|
||||||
|
cutting any cell text (long content wraps, it is never truncated).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
from datascience.automatic_eda.chapters.calidad import (
|
||||||
|
build_calidad,
|
||||||
|
CHAPTER_VERSION,
|
||||||
|
)
|
||||||
|
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
||||||
|
|
||||||
|
|
||||||
|
def _profile() -> dict:
    """Return a fresh small profile: three problem columns plus a clean one.

    ``edad`` has nulls and outliers, ``nombre`` is almost fully unique,
    ``flag_const`` carries the ``constant`` flag, and ``limpia`` has neither
    nulls nor outliers.
    """
    columns = [
        dict(name="edad", inferred_type="integer", null_pct=0.2,
             numeric=dict(outlier_pct=0.15, min=0, max=99),
             quality_score=60),
        dict(name="nombre", inferred_type="text", null_pct=0.0,
             unique_pct=0.98, quality_score=80),
        dict(name="flag_const", inferred_type="text", null_pct=0.0,
             flags=["constant"], quality_score=50),
        dict(name="limpia", inferred_type="float", null_pct=0.0,
             numeric=dict(outlier_pct=0.0), quality_score=100),
    ]
    profile = dict(
        table="demo",
        quality_score=72.5,
        duplicate_pct=0.04,
        null_cell_pct=0.11,
        constant_cols=["flag_const"],
        all_null_cols=[],
    )
    profile["columns"] = columns
    return profile
|
||||||
|
|
||||||
|
|
||||||
|
def _tables(chapter):
    """Return the chapter blocks whose ``kind`` is ``"data_table"``, in order."""
    found = []
    for block in chapter.blocks:
        # Blocks without a ``kind`` attribute are simply not tables.
        if getattr(block, "kind", None) == "data_table":
            found.append(block)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def _scores_table(chapter):
    """Return the first data table whose title contains "Scores", else None."""
    candidates = (
        table for table in _tables(chapter) if "Scores" in (table.title or "")
    )
    return next(candidates, None)
|
||||||
|
|
||||||
|
|
||||||
|
def _issues_table(chapter):
    """Return the first data table whose title contains "Problemas", else None."""
    candidates = (
        table for table in _tables(chapter) if "Problemas" in (table.title or "")
    )
    return next(candidates, None)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Golden
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_golden_chapter_estructura_y_version():
    """The chapter has the expected id and version and the core block kinds."""
    chapter = build_calidad(_profile(), {})
    assert chapter is not None
    assert chapter.id == "calidad"
    assert chapter.version == CHAPTER_VERSION
    present_kinds = [block.kind for block in chapter.blocks]
    # Criteria markdown, summary kv table and at least one data table.
    for required in ("markdown", "kv_table", "data_table"):
        assert required in present_kinds
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_intro_explica_criterios_y_pesos():
    """The intro markdown names the three criteria and their weights."""
    chapter = build_calidad(_profile(), {})
    markdown_blocks = [b for b in chapter.blocks if b.kind == "markdown"]
    intro = markdown_blocks[0].text
    criteria = ("Completitud", "Validez", "Consistencia")
    weights = ("50%", "30%", "20%")
    for needle in criteria + weights:
        assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_scores_incluyen_desglose_por_criterio():
    """The scores table has the breakdown header and one row per column."""
    table = _scores_table(build_calidad(_profile(), {}))
    assert table is not None
    expected_header = ["Columna", "Calidad", "Completitud",
                       "Validez", "Consistencia"]
    assert table.header == expected_header
    first_cells = [row[0] for row in table.rows]
    # 4 columns scored, none dropped.
    assert len(first_cells) == 4
    assert set(first_cells) == {"edad", "nombre", "flag_const", "limpia"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_issues_en_espanol_separados_de_flags():
    """The issues table lists readable Spanish problems, not raw type flags.

    All rows are flattened into one string, which is checked for the Spanish
    wording of each expected problem. The final check matches the raw flag
    ``constant`` as a whole word, so the Spanish ``constante`` does not count.
    The previous form (``"constant" not in flat or "columna constante" in
    flat``) could never fail because its second half had just been asserted.
    """
    ch = build_calidad(_profile(), {})
    issues = _issues_table(ch)
    assert issues is not None
    flat = " | ".join(" ".join(r) for r in issues.rows)
    assert "nulos" in flat  # completeness issue (ES)
    assert "outliers" in flat  # validity issue (ES)
    assert "columna constante" in flat
    assert "posible id de alta cardinalidad" in flat
    # The raw type flag string must NOT leak as a "problem".
    assert re.search(r"\bconstant\b", flat) is None
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Edges
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_edge_none_vacio_sin_columnas_devuelve_none():
    """Profiles with nothing to score make the builder return ``None``."""
    not_applicable = (
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
        ("not a dict", {}),
    )
    for profile, ctx in not_applicable:
        assert build_calidad(profile, ctx) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_perfil_limpio_sin_problemas_usa_nota():
    """A clean profile produces a note instead of an issues table."""
    clean_columns = [
        {"name": name, "inferred_type": "float", "null_pct": 0.0,
         "numeric": {"outlier_pct": 0.0}}
        for name in ("a", "b")
    ]
    chapter = build_calidad(
        {"quality_score": 100, "columns": clean_columns}, {})
    assert chapter is not None
    # No issues table at all ...
    assert _issues_table(chapter) is None
    # ... and the first note carries the "nothing detected" message.
    notes = [block for block in chapter.blocks if block.kind == "note"]
    assert notes and "No se detectaron problemas" in notes[0].text
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Anti-cut: a wide profile with long names renders without truncation
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _wide_profile(ncols: int = 22) -> dict:
    """Return a profile with long column names, wide enough to paginate.

    The first two columns are a near-unique text column and a text column
    flagged ``constant``. They are followed by ``ncols - 2`` float metric
    columns whose ``null_pct`` cycles through three values.
    """
    leading = [
        {"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
        {"name": "columna_constante_sin_ninguna_variacion_de_valor",
         "inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
    ]
    metrics = [
        {
            "name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
            "inferred_type": "float",
            "null_pct": 0.1 + (k % 3) * 0.05,
            "numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
        }
        for k in range(ncols - 2)
    ]
    return {"table": "ancha", "quality_score": 70.0,
            "columns": leading + metrics}
|
||||||
|
|
||||||
|
|
||||||
|
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    """Render the calidad chapter of a wide profile and check nothing is cut.

    Builds the full document for a 22-column profile, keeps only the
    ``calidad`` chapter and renders it to PDF and PPTX in a temporary
    directory. It then asserts that both files exist, that the PDF reports at
    least two pages, that the PDF text has no truncation marker, and that one
    long column name can be found intact in both outputs once all whitespace
    (introduced by wrapping) is removed.
    """
    prof = _wide_profile(22)
    full = build_document(prof, {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in full)
    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    chapters = [c for c in full if c.id == "calidad"]
    long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "q.pdf")
        pptx = os.path.join(d, "q.pptx")
        rp = render_pdf(chapters, pdf, {"title": "EDA"})
        # NOTE(review): the PPTX render result is not inspected below; only the
        # generated file is.
        rx = render_pptx(chapters, pptx, {"title": "EDA"})
        assert os.path.exists(pdf) and os.path.exists(pptx)
        # The wide table forces pagination across several pages/slides.
        assert (rp or {}).get("n_pages", 0) >= 2

        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
        # and there is no truncation marker.
        pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
        assert "…" not in pdf_txt and "..." not in pdf_txt
        norm = re.sub(r"\s+", "", pdf_txt)
        assert long_name in norm, "el nombre largo se cortó en el PDF"

        # PPTX: long name present in some cell, untruncated. Text is collected
        # from every text frame and every table cell of every slide.
        allt = []
        for s in Presentation(pptx).slides:
            for sh in s.shapes:
                if sh.has_text_frame:
                    allt.append(sh.text_frame.text)
                if sh.has_table:
                    for row in sh.table.rows:
                        for c in row.cells:
                            allt.append(c.text)
        joined = re.sub(r"\s+", "", "\n".join(allt))
        assert long_name in joined, "el nombre largo se cortó en el PPTX"
|
||||||
Reference in New Issue
Block a user