From 00cd5274bc0a6bffffcbdeeb3c15ee775e526cbe Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 15:29:33 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20cap=C3=ADtulo=20GEOSPATIAL=20del?= =?UTF-8?q?=20AutomaticEDA=20(scatter=20geogr=C3=A1fico=20+=20zona/pa?= =?UTF-8?q?=C3=ADs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Capítulo nuevo chapters/geospatial.py (CHAPTER_VERSION 1.0.0). Cuando el dataset tiene un par de coordenadas, dibuja un scatter geográfico en proyección equirectangular (la escala respeta la latitud para no estirar la longitud) y analiza la extensión: bounding box, centroide, span, conteo por zona/país, hemisferios y una interpretación. Cuando NO hay coordenadas, build_geospatial devuelve None y el capítulo se omite. Sigue el contrato de capítulos (firma build_{id}(profile, ctx) -> Chapter|None, lectura defensiva, nunca lanza) y el patrón de modelos/num_distr: delega el cálculo a las primitivas puras del registry (detect_latlon_columns, analyze_geo_extent, build_geo_scatter) y solo dibuja la figura matplotlib de forma perezosa. Las coordenadas crudas llegan por ctx['geo_points'] o ctx['raw_numeric'] (como modelos lee raw_numeric); sin ellas, degrada con un bounding box aproximado de numeric.min/max y una nota honesta. Anti-cortes: usa DataTable/KVTable/Figure/Markdown del modelo, que el paginador parte sin cortar. Test self-contained con golden + 6 edges + anti-cut (nombres largos + 2100 puntos en varias regiones renderizan a PDF y PPTX sin truncar). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../automatic_eda/chapters/geospatial.py | 477 ++++++++++++++++++ .../automatic_eda/chapters/geospatial_test.py | 245 +++++++++ 2 files changed, 722 insertions(+) create mode 100644 python/functions/datascience/automatic_eda/chapters/geospatial.py create mode 100644 python/functions/datascience/automatic_eda/chapters/geospatial_test.py diff --git a/python/functions/datascience/automatic_eda/chapters/geospatial.py b/python/functions/datascience/automatic_eda/chapters/geospatial.py new file mode 100644 index 00000000..d60830cd --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/geospatial.py @@ -0,0 +1,477 @@ +"""Geospatial chapter (GEOSPATIAL) for AutomaticEDA. + +When the dataset carries a coordinate pair (latitude/longitude), this chapter +draws the points on a **geographic scatter** in an equirectangular projection +(scaled so degrees of longitude are not stretched at the data's latitude) and +analyses the **zone / country** the points fall in: bounding box, centroid, +geographic span, and a per-region count. When there is **no** coordinate pair the +chapter returns ``None`` — exactly the user requirement. + +Detection and the heavy lifting are delegated to pure ``eda``-group registry +functions, never reimplemented here: + +- ``detect_latlon_columns`` — finds the (lat, lon) column pair by name + value + range from the ``profile['columns']`` metadata. +- ``analyze_geo_extent`` — bbox, centroid, haversine span, per-region counts and + hemisphere from the raw coordinate arrays. +- ``build_geo_scatter`` — deterministically down-sampled points + bbox + the + aspect ratio for the equirectangular projection. This chapter only draws the + matplotlib figure from that prepared data (same split as ``num_distr`` does + with ``build_boxplot_stats``). 
+ +The raw coordinate arrays are **not** in a standard TableProfile (it stores only +per-column aggregates), so — exactly like ``modelos`` reads ``raw_numeric`` from +``ctx`` — this chapter looks for the coordinates in ``ctx`` (or ``profile``) and +degrades honestly when they are absent: it still detects the columns and shows an +approximate bounding box derived from the per-column ``numeric.min/max``, with a +note that the raw points are needed for the map. + +ctx keys this chapter consumes (all optional): + geo_points : dict — ``{"lats": [...], "lons": [...]}`` raw coordinate arrays. + Used directly when present (forward-compatible with a calculation phase + that samples them from the table). + raw_numeric : dict — ``{col: [values]}`` raw numeric columns; when present + and ``geo_points`` is not, the detected lat/lon columns are read from it. + run_geo_llm : bool — when True, call ``ask_llm`` for a one-line narrative of + where the points concentrate (otherwise a derived note is used). + geo_llm_model : str — model id for the optional live LLM call. + +Contract: build_{id}(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +Reads everything defensively (``.get``) and never raises. +""" + +from __future__ import annotations + +import math + +from .. import model + +# Pure registry functions (group ``eda``) delegated to. Imported defensively so +# the chapter stays importable (degrading gracefully) if one is unavailable. +try: + from datascience.detect_latlon_columns import detect_latlon_columns +except Exception: # noqa: BLE001 — keep the chapter importable no matter what. 
+ detect_latlon_columns = None # type: ignore[assignment] +try: + from datascience.analyze_geo_extent import analyze_geo_extent +except Exception: # noqa: BLE001 + analyze_geo_extent = None # type: ignore[assignment] +try: + from datascience.build_geo_scatter import build_geo_scatter +except Exception: # noqa: BLE001 + build_geo_scatter = None # type: ignore[assignment] + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "geospatial" +CHAPTER_TITLE = "Análisis geoespacial" + + +# --------------------------------------------------------------------------- # +# Formatting helpers (mirror the other chapters' defensive style). +# --------------------------------------------------------------------------- # +def _fmt_num(value, decimals: int = 4) -> str: + if value is None: + return "—" + if isinstance(value, bool): + return "sí" if value else "no" + if isinstance(value, int): + return f"{value:,}".replace(",", ".") + if isinstance(value, float): + if value != value: # NaN + return "NaN" + if value in (float("inf"), float("-inf")): + return str(value) + text = f"{value:.{decimals}f}".rstrip("0").rstrip(".") + return text if text else "0" + return model._safe_str(value) + + +def _fmt_coord(value, decimals: int = 4) -> str: + """Format a coordinate degree value, defensively.""" + try: + return f"{float(value):.{decimals}f}°" + except (TypeError, ValueError): + return model._safe_str(value) + + +def _fmt_km(value) -> str: + if value is None: + return "—" + try: + v = float(value) + except (TypeError, ValueError): + return model._safe_str(value) + if v >= 100: + return f"{v:,.0f} km".replace(",", ".") + return f"{v:.1f} km" + + +def _is_dict(v) -> bool: + return isinstance(v, dict) + + +def _clean_floats(seq) -> list: + """Map each item to a float, or to None when missing/NaN (length is kept).""" + out = [] + if not isinstance(seq, (list, tuple)): + return out + for v in seq: + try: + f = float(v) + except (TypeError, ValueError): + out.append(None) + continue + out.append(f if f == f 
else None) # NaN -> None + return out + + +# --------------------------------------------------------------------------- # +# Resolve the (lat, lon) columns and the raw coordinate arrays. +# --------------------------------------------------------------------------- # +def _detect_columns(profile: dict) -> dict: + """Detect the lat/lon column pair from the profile metadata, or {}.""" + cols = profile.get("columns") + if not isinstance(cols, list) or not cols or detect_latlon_columns is None: + return {} + try: + det = detect_latlon_columns(cols) + except Exception: # noqa: BLE001 — never break the chapter. + return {} + return det if _is_dict(det) else {} + + +def _resolve_coords(profile: dict, ctx: dict, detected: dict): + """Return (lats, lons, source_label). + + Order: ctx/profile['geo_points'] (explicit arrays) → ctx/profile + ['raw_numeric'] keyed by the detected lat/lon column names → (None, None). + """ + gp = ctx.get("geo_points") or profile.get("geo_points") + if _is_dict(gp): + lats = gp.get("lats") + if lats is None: + lats = gp.get("lat") + lons = gp.get("lons") + if lons is None: + lons = gp.get("lon") + if lats and lons: + return list(lats), list(lons), "geo_points" + + lat_col = (detected or {}).get("lat_col") + lon_col = (detected or {}).get("lon_col") + if lat_col and lon_col: + raw = ctx.get("raw_numeric") or profile.get("raw_numeric") + if _is_dict(raw): + lats = raw.get(lat_col) + lons = raw.get(lon_col) + if lats and lons: + return list(lats), list(lons), "raw_numeric" + return None, None, "none" + + +def _column_by_name(profile: dict, name): + if not name: + return None + for col in profile.get("columns") or []: + if isinstance(col, dict) and col.get("name") == name: + return col + return None + + +def _bbox_from_profile(profile: dict, detected: dict): + """Approximate bbox from the per-column numeric.min/max (no raw points).""" + lat_c = _column_by_name(profile, (detected or {}).get("lat_col")) + lon_c = _column_by_name(profile, (detected or 
{}).get("lon_col")) + lat_n = lat_c.get("numeric") if _is_dict(lat_c) else None + lon_n = lon_c.get("numeric") if _is_dict(lon_c) else None + if not _is_dict(lat_n) or not _is_dict(lon_n): + return None + try: + return { + "lat_min": float(lat_n.get("min")), + "lat_max": float(lat_n.get("max")), + "lon_min": float(lon_n.get("min")), + "lon_max": float(lon_n.get("max")), + } + except (TypeError, ValueError): + return None + + +# --------------------------------------------------------------------------- # +# Figure builder (lazy: matplotlib only imported when the renderer draws it). +# --------------------------------------------------------------------------- # +def _make_geo_scatter(scatter: dict, lat_col: str, lon_col: str): + """Return a zero-arg callable drawing the geographic scatter, or None.""" + points = scatter.get("points") or [] + if not points: + return None + bbox = scatter.get("bbox") if _is_dict(scatter.get("bbox")) else {} + aspect = scatter.get("aspect") or 1.0 + pad = scatter.get("pad") if _is_dict(scatter.get("pad")) else {} + n_total = scatter.get("n_total") + n_shown = scatter.get("n_shown") + + def _draw(): + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + xs = [p[0] for p in points if isinstance(p, (list, tuple)) and len(p) >= 2] + ys = [p[1] for p in points if isinstance(p, (list, tuple)) and len(p) >= 2] + + fig, ax = plt.subplots(figsize=(6.6, 5.0)) + # More points -> smaller markers + lower alpha so dense clouds read as + # density without saturating the page with ink (Tufte). + n = max(len(xs), 1) + size = 18 if n <= 200 else (8 if n <= 1000 else 4) + alpha = 0.75 if n <= 200 else (0.5 if n <= 1000 else 0.35) + ax.scatter(xs, ys, s=size, c="#2a6f97", alpha=alpha, linewidths=0, + zorder=3) + + # Bounding box rectangle for orientation. 
+ if bbox: + try: + lo_x, hi_x = float(bbox["lon_min"]), float(bbox["lon_max"]) + lo_y, hi_y = float(bbox["lat_min"]), float(bbox["lat_max"]) + ax.plot([lo_x, hi_x, hi_x, lo_x, lo_x], + [lo_y, lo_y, hi_y, hi_y, lo_y], + color="#e15759", linewidth=1.0, linestyle="--", + alpha=0.8, zorder=4, label="Bounding box") + px = float(pad.get("lon", 0.0) or 0.0) + py = float(pad.get("lat", 0.0) or 0.0) + ax.set_xlim(lo_x - px, hi_x + px) + ax.set_ylim(lo_y - py, hi_y + py) + except (TypeError, ValueError, KeyError): + pass + + # Equirectangular: scale Y/X so longitude is not stretched at this + # latitude (integridad de proyección, Tufte). aspect = 1/cos(lat). + try: + ax.set_aspect(float(aspect)) + except (TypeError, ValueError): + pass + + ax.set_xlabel(f"Longitud ({lon_col})", fontsize=8) + ax.set_ylabel(f"Latitud ({lat_col})", fontsize=8) + ax.tick_params(labelsize=7) + ax.grid(color="#e6e6e6", linewidth=0.5, zorder=0) + title = "Distribución geográfica de las coordenadas" + if n_shown is not None and n_total is not None and n_shown < n_total: + title += f"\n(mostrando {n_shown:,} de {n_total:,} puntos)".replace(",", ".") + ax.set_title(title, fontsize=10) + ax.legend(loc="best", fontsize=7, frameon=True, framealpha=0.9) + fig.tight_layout() + return fig + + return _draw + + +# --------------------------------------------------------------------------- # +# Section builders. +# --------------------------------------------------------------------------- # +def _intro_block(detected: dict, lat_col: str, lon_col: str) -> list: + conf = (detected or {}).get("confidence") + reason = model._safe_str((detected or {}).get("reason")) + conf_txt = "" + if conf is not None: + try: + conf_txt = f" (confianza {float(conf) * 100:.0f}%)" + except (TypeError, ValueError): + conf_txt = "" + text = ( + "Este dataset contiene **coordenadas geográficas**: se identificó el par " + f"**latitud = «{lat_col}»** y **longitud = «{lon_col}»**{conf_txt}. 
La " + "detección combina el nombre de la columna y el rango de sus valores " + "(latitud en [−90, 90], longitud en [−180, 180])." + ) + if reason: + text += f"\n\n*Criterio de detección:* {reason}." + return [model.Heading(text=CHAPTER_TITLE, level=1), + model.Markdown(text=text)] + + +def _extent_blocks(extent: dict) -> list: + """KVTable with bbox/centroid/span + DataTable with the per-region counts.""" + if not _is_dict(extent) or not extent.get("n_points"): + return [] + blocks = [] + bbox = extent.get("bbox") if _is_dict(extent.get("bbox")) else {} + centroid = extent.get("centroid") if _is_dict(extent.get("centroid")) else {} + hemi = extent.get("hemisphere") if _is_dict(extent.get("hemisphere")) else {} + + rows = [("Puntos con coordenadas", _fmt_num(extent.get("n_points")))] + if bbox: + rows.append(("Latitud (mín. / máx.)", + f"{_fmt_coord(bbox.get('lat_min'))} a " + f"{_fmt_coord(bbox.get('lat_max'))}")) + rows.append(("Longitud (mín. / máx.)", + f"{_fmt_coord(bbox.get('lon_min'))} a " + f"{_fmt_coord(bbox.get('lon_max'))}")) + if centroid: + rows.append(("Centroide", + f"{_fmt_coord(centroid.get('lat'))}, " + f"{_fmt_coord(centroid.get('lon'))}")) + if extent.get("span_km") is not None: + rows.append(("Extensión (diagonal)", _fmt_km(extent.get("span_km")))) + if hemi: + n, s = hemi.get("north"), hemi.get("south") + e, w = hemi.get("east"), hemi.get("west") + rows.append(("Hemisferios", + f"N {_fmt_num(n)} / S {_fmt_num(s)} · " + f"E {_fmt_num(e)} / O {_fmt_num(w)}")) + blocks.append(model.KVTable(rows=rows, title="Extensión geográfica")) + + by_region = extent.get("by_region") + if isinstance(by_region, list) and by_region: + total = sum(r.get("count", 0) for r in by_region if _is_dict(r)) or 0 + rrows = [] + for r in by_region: + if not _is_dict(r): + continue + cnt = r.get("count", 0) + pct = (cnt / total) if total else None + pct_txt = f"{pct * 100:.1f}%" if pct is not None else "—" + rrows.append([model._safe_str(r.get("region")), _fmt_num(cnt), + 
pct_txt]) + if rrows: + blocks.append(model.DataTable( + header=["Zona / país", "Puntos", "% del total"], rows=rrows, + title="Distribución por zona", + note="Asignación aproximada por bounding box de cada región " + "(no es reverse-geocoding exacto de fronteras).")) + return blocks + + +def _narrative_block(profile: dict, ctx: dict, extent: dict) -> list: + """A one-line narrative of where the points concentrate. + + Uses the derived ``note`` from analyze_geo_extent by default; optionally + calls an LLM (ctx['run_geo_llm']) for a richer one-liner. + """ + note = model._safe_str((extent or {}).get("note")) + if ctx.get("run_geo_llm"): + by_region = (extent or {}).get("by_region") or [] + bbox = (extent or {}).get("bbox") or {} + try: + from core.ask_llm import ask_llm + prompt = ( + "Eres un analista de datos. En UNA frase en español, describe " + "dónde se concentran geográficamente estos puntos. Sé concreto " + "y no inventes precisión que los datos no tienen.\n" + f"Conteo por zona: {by_region}\nBounding box: {bbox}." + ) + out = ask_llm(prompt, + model=ctx.get("geo_llm_model", + "claude-haiku-4-5-20251001"), + echo=False) + if out and isinstance(out, str) and out.strip(): + note = out.strip() + except Exception: # noqa: BLE001 — degrade to the derived note. + pass + if not note: + return [] + return [model.Markdown(text=f"**Interpretación.** {note}")] + + +def _no_points_block(profile: dict, detected: dict) -> list: + """Degrade honestly when the raw coordinate arrays are not available.""" + blocks = [] + bbox = _bbox_from_profile(profile, detected) + if bbox: + rows = [ + ("Latitud (mín. / máx.)", + f"{_fmt_coord(bbox.get('lat_min'))} a " + f"{_fmt_coord(bbox.get('lat_max'))}"), + ("Longitud (mín. 
/ máx.)", + f"{_fmt_coord(bbox.get('lon_min'))} a " + f"{_fmt_coord(bbox.get('lon_max'))}"), + ] + blocks.append(model.KVTable( + rows=rows, title="Extensión geográfica (aproximada)")) + blocks.append(model.Note( + "No se incluyeron las coordenadas crudas en el contexto, por lo que el " + "mapa y el análisis por zona no se han dibujado. El bounding box " + "mostrado se deriva de los mínimos y máximos por columna. Para el " + "scatter geográfico completo, pasa los arrays en " + "ctx['geo_points'] = {'lats': [...], 'lons': [...]} o las columnas en " + "ctx['raw_numeric'].")) + return blocks + + +# --------------------------------------------------------------------------- # +# Entry point. +# --------------------------------------------------------------------------- # +def build_geospatial(profile: dict, ctx: dict): + """Build the GEOSPATIAL Chapter, or None if the dataset has no coordinates. + + Args: + profile: the ``eda`` group TableProfile dict. + ctx: presentation context; may carry ``geo_points``/``raw_numeric`` with + the raw coordinate arrays and the ``run_geo_llm`` flag. + + Returns: + A ``model.Chapter`` with the geographic scatter + zone/country analysis, + or ``None`` when no latitude/longitude column pair is detected. + """ + profile = profile or {} + ctx = ctx or {} + if not isinstance(profile, dict): + return None + + detected = _detect_columns(profile) + lats, lons, source = _resolve_coords(profile, ctx, detected) + + has_detection = bool((detected or {}).get("lat_col") and + (detected or {}).get("lon_col")) + has_points = bool(lats and lons) + if not has_detection and not has_points: + return None # chapter does not apply: no coordinates in this dataset. + + # Labels for axes / intro. When only raw arrays were given (no detection), + # fall back to generic names. 
+ lat_col = (detected or {}).get("lat_col") or "lat" + lon_col = (detected or {}).get("lon_col") or "lon" + + blocks = _intro_block(detected, lat_col, lon_col) + + if has_points: + clean_lats = _clean_floats(lats) + clean_lons = _clean_floats(lons) + + # Zone / country analysis. + extent = {} + if analyze_geo_extent is not None: + try: + extent = analyze_geo_extent(clean_lats, clean_lons) or {} + except Exception: # noqa: BLE001 + extent = {} + + # The geographic scatter figure (its own page/slide). + scatter = {} + if build_geo_scatter is not None: + try: + scatter = build_geo_scatter(clean_lats, clean_lons) or {} + except Exception: # noqa: BLE001 + scatter = {} + maker = _make_geo_scatter(scatter, lat_col, lon_col) if scatter else None + if maker is not None: + blocks.append(model.Figure( + make=maker, + caption="Cada punto es una observación situada por sus " + "coordenadas; el recuadro rojo es el bounding box. La " + "escala respeta la latitud (proyección equirectangular).")) + else: + blocks.append(model.Note( + "No se pudo construir el scatter geográfico a partir de las " + "coordenadas proporcionadas.")) + + blocks += _extent_blocks(extent) + blocks += _narrative_block(profile, ctx, extent) + else: + # Columns detected but no raw points available — degrade honestly. + blocks += _no_points_block(profile, detected) + + if not blocks: + return None + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/geospatial_test.py b/python/functions/datascience/automatic_eda/chapters/geospatial_test.py new file mode 100644 index 00000000..434eae4e --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/geospatial_test.py @@ -0,0 +1,245 @@ +"""Tests for the GEOSPATIAL chapter — DoD: golden + edges + anti-cut. + +Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast +and deterministic. 
The raw coordinate arrays are passed through ``ctx`` exactly +as the chapter's contract documents (``ctx['geo_points']`` / ``ctx['raw_numeric']``). + +Verifies that the chapter detects the lat/lon pair, draws the geographic scatter +figure, analyses the zone/country (bounding box + per-region counts), returns +None when there are no coordinates, degrades honestly when the raw points are +absent, and that a profile with long column names + many points + several +regions renders to PDF and PPTX without cutting any text (long content wraps, it +is never truncated). +""" + +import os +import re +import tempfile + +from pypdf import PdfReader +from pptx import Presentation + +from datascience.automatic_eda.chapters.geospatial import ( + build_geospatial, + CHAPTER_VERSION, +) +from datascience.automatic_eda import build_document, render_pdf, render_pptx + + +# --------------------------------------------------------------------------- # +# Synthetic data helpers +# --------------------------------------------------------------------------- # +def _grid(lat0: float, lon0: float, n: int, spread: float = 1.0): + """A small deterministic cloud of n points around (lat0, lon0).""" + lats, lons = [], [] + for i in range(n): + # deterministic pseudo-spread, no randomness. 
+ f = (i % 11) / 11.0 - 0.5 + g = (i % 7) / 7.0 - 0.5 + lats.append(lat0 + f * spread) + lons.append(lon0 + g * spread) + return lats, lons + + +def _profile_with_coords(lat_name="lat", lon_name="lon", lats=None, lons=None): + """A profile carrying a lat/lon column pair with valid ranges.""" + lats = lats if lats is not None else [40.4, 41.0, 39.8, 40.1] + lons = lons if lons is not None else [-3.7, -3.6, -4.0, -3.9] + return { + "table": "lugares", + "columns": [ + {"name": lat_name, "inferred_type": "numeric", + "numeric": {"min": min(lats), "max": max(lats), + "mean": sum(lats) / len(lats)}}, + {"name": lon_name, "inferred_type": "numeric", + "numeric": {"min": min(lons), "max": max(lons), + "mean": sum(lons) / len(lons)}}, + {"name": "valor", "inferred_type": "numeric", + "numeric": {"min": 0, "max": 100, "mean": 50}}, + ], + } + + +def _ctx_points(lats, lons): + return {"geo_points": {"lats": lats, "lons": lons}} + + +def _kinds(chapter): + return [getattr(b, "kind", None) for b in chapter.blocks] + + +def _tables(chapter): + return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"] + + +def _figures(chapter): + return [b for b in chapter.blocks if getattr(b, "kind", None) == "figure"] + + +# --------------------------------------------------------------------------- # +# Golden +# --------------------------------------------------------------------------- # +def test_golden_estructura_y_version(): + lats, lons = [40.4, 41.0, 39.8, 40.1], [-3.7, -3.6, -4.0, -3.9] + ch = build_geospatial(_profile_with_coords(lats=lats, lons=lons), + _ctx_points(lats, lons)) + assert ch is not None + assert ch.id == "geospatial" + assert ch.version == CHAPTER_VERSION + kinds = _kinds(ch) + # intro heading + markdown + scatter figure + extent kv + per-region table. 
+ assert "heading" in kinds + assert "markdown" in kinds + assert "figure" in kinds, "falta el scatter geográfico" + assert "kv_table" in kinds, "falta la tabla de extensión" + + +def test_golden_detecta_columnas_y_nombra_ejes(): + lats, lons = _grid(40.4, -3.7, 30, spread=0.8) + prof = _profile_with_coords("latitude", "longitude", lats, lons) + ch = build_geospatial(prof, _ctx_points(lats, lons)) + intro = [b for b in ch.blocks if b.kind == "markdown"][0].text + assert "latitude" in intro and "longitude" in intro + + +def test_golden_figura_es_perezosa_y_dibujable(): + lats, lons = _grid(40.4, -3.7, 50, spread=0.6) + ch = build_geospatial(_profile_with_coords(lats=lats, lons=lons), + _ctx_points(lats, lons)) + fig_block = _figures(ch)[0] + assert fig_block.make is not None and fig_block.fig is None # lazy + fig = fig_block.make() # must draw without raising + assert fig is not None + import matplotlib.pyplot as plt + plt.close(fig) + + +def test_golden_analisis_por_zona_espana(): + lats, lons = _grid(40.4, -3.7, 40, spread=0.5) # Madrid area + ch = build_geospatial(_profile_with_coords(lats=lats, lons=lons), + _ctx_points(lats, lons)) + tables = _tables(ch) + region_tbl = [t for t in tables if "zona" in (t.title or "").lower()] + assert region_tbl, "falta la tabla por zona/país" + flat = " ".join(" ".join(str(c) for c in r) for r in region_tbl[0].rows) + # Spain-area points must resolve to a Spain/European region, not empty. 
+ assert region_tbl[0].rows + assert any(c for c in (region_tbl[0].rows[0])) + + +def test_golden_raw_numeric_source(): + """Coordinates can also come from ctx['raw_numeric'] keyed by detected cols.""" + lats, lons = _grid(48.85, 2.35, 25, spread=0.4) # Paris area + prof = _profile_with_coords("lat", "lon", lats, lons) + ctx = {"raw_numeric": {"lat": lats, "lon": lons}} + ch = build_geospatial(prof, ctx) + assert ch is not None + assert _figures(ch), "el scatter debe construirse desde raw_numeric" + + +# --------------------------------------------------------------------------- # +# Edges +# --------------------------------------------------------------------------- # +def test_edge_sin_coordenadas_devuelve_none(): + prof = { + "table": "ventas", + "columns": [ + {"name": "precio", "inferred_type": "numeric", + "numeric": {"min": 0, "max": 1000}}, + {"name": "categoria", "inferred_type": "text"}, + ], + } + assert build_geospatial(prof, {}) is None + + +def test_edge_none_y_vacio_no_rompen(): + assert build_geospatial(None, None) is None + assert build_geospatial({}, {}) is None + assert build_geospatial({"columns": []}, {}) is None + assert build_geospatial("not a dict", {}) is None + + +def test_edge_nombre_lat_pero_rango_invalido_no_aplica(): + """A column named 'lat' whose values are out of [-90,90] is NOT a coordinate.""" + prof = { + "table": "x", + "columns": [ + {"name": "lat", "inferred_type": "numeric", + "numeric": {"min": 1000, "max": 9999}}, + {"name": "lon", "inferred_type": "numeric", + "numeric": {"min": 1000, "max": 9999}}, + ], + } + assert build_geospatial(prof, {}) is None + + +def test_edge_columnas_detectadas_sin_puntos_degrada(): + """Detected lat/lon but no raw arrays -> honest note + approx bbox, no crash.""" + prof = _profile_with_coords(lats=[40.0, 41.0], lons=[-3.0, -4.0]) + ch = build_geospatial(prof, {}) # no geo_points / raw_numeric + assert ch is not None + assert not _figures(ch), "sin puntos no debe dibujarse el scatter" + notes = 
[b for b in ch.blocks if b.kind == "note"] + assert notes and "coordenadas crudas" in notes[0].text + + +def test_edge_coordenadas_con_nan_se_filtran(): + lats = [40.4, float("nan"), 41.0, None, 39.8] + lons = [-3.7, -3.6, float("nan"), -3.9, -4.0] + ch = build_geospatial(_profile_with_coords(lats=[39.8, 41.0], + lons=[-4.0, -3.6]), + _ctx_points(lats, lons)) + assert ch is not None # must not raise on NaN/None + + +# --------------------------------------------------------------------------- # +# Anti-cut: long names + many points + several regions render without truncation +# --------------------------------------------------------------------------- # +def _multiregion_points(per: int = 700): + """Points spread across Spain, France and the USA to fill the region table.""" + lats, lons = [], [] + for (la, lo) in ((40.4, -3.7), (48.85, 2.35), (39.0, -98.0)): + gl, gn = _grid(la, lo, per, spread=2.0) + lats += gl + lons += gn + return lats, lons + + +def test_anticut_pdf_y_pptx_no_truncan(): + lat_name = "latitud_geografica_del_punto_de_observacion_registrado" + lon_name = "longitud_geografica_del_punto_de_observacion_registrado" + lats, lons = _multiregion_points(700) + prof = _profile_with_coords(lat_name, lon_name, lats, lons) + ctx = {"geo_points": {"lats": lats, "lons": lons}} + + full = build_document(prof, ctx) + assert any(c.id == "geospatial" for c in full) + chapters = [c for c in full if c.id == "geospatial"] + + with tempfile.TemporaryDirectory() as d: + pdf = os.path.join(d, "g.pdf") + pptx = os.path.join(d, "g.pptx") + rp = render_pdf(chapters, pdf, {"title": "EDA"}) + rx = render_pptx(chapters, pptx, {"title": "EDA"}) + assert os.path.exists(pdf) and os.path.exists(pptx) + assert (rp or {}).get("n_pages", 0) >= 1 + + # PDF: the long lat column name survives whole (wraps, not cut) and there + # is no truncation marker in this chapter. + pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages) + assert "…" not in pdf_txt and "..." 
not in pdf_txt + norm = re.sub(r"\s+", "", pdf_txt) + assert lat_name in norm, "el nombre largo de la columna se cortó en el PDF" + + # PPTX: long name present in some shape/cell, untruncated. + allt = [] + for s in Presentation(pptx).slides: + for sh in s.shapes: + if sh.has_text_frame: + allt.append(sh.text_frame.text) + if sh.has_table: + for row in sh.table.rows: + for c in row.cells: + allt.append(c.text) + joined = re.sub(r"\s+", "", "\n".join(allt)) + assert lat_name in joined, "el nombre largo de la columna se cortó en el PPTX"