feat: extracción masiva de footprint_aurgi (41 funciones + 4 tipos + stack Docker geo)

Extrae al registry funciones del proyecto interno footprint_aurgi:
- core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb
- geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket
- geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout
- valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n
- datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull
- datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column
- datascience viz (2): plot_kde_2d, plot_heatmap_log
- infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest
- pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone
- types geo (4): LonLat, BBox, IsochroneRequest, Centro

Incluye:
- apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose)
- 131/132 tests pasan (1 skip esperado: el test de RuntimeError de osm2pgsql se omite cuando osm2pgsql está en el PATH)
- Issue tracker dev/issues/0052-footprint-aurgi-extraction.md
- Atribución uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi
- Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines)

También incluye trabajo previo que estaba sin commitear: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 23:35:22 +02:00
parent f73ea072bd
commit faac610745
193 changed files with 13146 additions and 3 deletions
@@ -0,0 +1,45 @@
"""Tests para add_header_logo."""
from __future__ import annotations
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import numpy as np
import pytest
def test_figura_nueva_con_imagen_zeros_no_lanza_excepcion():
    """Adding an all-zeros logo image to a fresh figure must not raise."""
    import sys

    # Put the ``parents[2]`` ancestor of this file on sys.path so that the
    # ``infra`` package can be imported (presumably it lives there).
    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.add_header_logo import add_header_logo
    import matplotlib.pyplot as plt

    fig, _ax = plt.subplots(figsize=(11.69, 8.27))
    try:
        image = np.zeros((50, 200, 3), dtype=np.uint8)
        # The call itself is the check: any exception fails the test.
        add_header_logo(fig, image)
    finally:
        # Close the figure even when the call raises, so a failure here does
        # not leave an open figure behind for the tests that run afterwards.
        plt.close(fig)
def test_axes_de_logo_tiene_axis_off():
    """The logo adds exactly one axes to the figure, with its axis turned off."""
    import sys

    # Put the ``parents[2]`` ancestor of this file on sys.path so that the
    # ``infra`` package can be imported (presumably it lives there).
    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.add_header_logo import add_header_logo
    import matplotlib.pyplot as plt

    fig, _ax = plt.subplots(figsize=(11.69, 8.27))
    try:
        initial_axes_count = len(fig.axes)
        image = np.zeros((10, 10, 3), dtype=np.uint8)

        add_header_logo(fig, image, x=0.88, y=0.905, width=0.08, height=0.08)

        # A new axes should have been added for the logo.
        assert len(fig.axes) == initial_axes_count + 1

        # The most recently added axes is taken to be the logo axes;
        # axis("off") is expected to leave ``axison`` falsy.
        logo_ax = fig.axes[-1]
        assert not logo_ax.axison
    finally:
        # Close the figure even when an assertion fails, so the failure does
        # not leave an open figure behind for the tests that run afterwards.
        plt.close(fig)
@@ -0,0 +1,62 @@
"""Tests para compress_pdf_ghostscript."""
from __future__ import annotations
import shutil
import tempfile
from pathlib import Path
import pytest
def _make_simple_pdf(path: Path) -> None:
    """Write a small single-page PDF to *path*.

    fpdf2 is used when it can be imported. Otherwise a minimal PDF is built by
    hand, with the cross-reference table and ``startxref`` value computed from
    the real byte offsets so the file is structurally valid.

    Args:
        path: Destination file; it is created or overwritten.
    """
    # Only the import is guarded: an ImportError raised later (while actually
    # rendering) should surface instead of silently switching to the fallback.
    try:
        from fpdf import FPDF
    except ImportError:
        FPDF = None

    if FPDF is not None:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Helvetica", size=12)
        pdf.cell(200, 10, text="Test PDF for ghostscript compression", ln=True)
        pdf.output(str(path))
        return

    # Fallback: assemble a minimal PDF manually.
    objects = (
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n",
    )

    content = bytearray(b"%PDF-1.4\n")
    offsets = []
    for obj in objects:
        # Record where each object starts before appending it.
        offsets.append(len(content))
        content += obj

    xref_offset = len(content)
    # Each xref entry is exactly 20 bytes: 10-digit offset, 5-digit
    # generation, type flag, then a two-byte end-of-line (" \n").
    content += b"xref\n0 %d\n" % (len(objects) + 1)
    content += b"0000000000 65535 f \n"
    for offset in offsets:
        content += b"%010d 00000 n \n" % offset
    content += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % (len(objects) + 1)
    content += b"startxref\n%d\n%%%%EOF\n" % xref_offset

    path.write_bytes(bytes(content))
def test_crea_pdf_temporal_y_comprime_retorna_bool_sin_excepcion():
    """Compressing a freshly created temporary PDF returns a bool and does not raise."""
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.compress_pdf_ghostscript import compress_pdf_ghostscript

    with tempfile.TemporaryDirectory() as workdir:
        target = Path(workdir).joinpath("test.pdf")
        _make_simple_pdf(target)
        assert target.exists()

        outcome = compress_pdf_ghostscript(target)

        assert isinstance(outcome, bool)
        # Whether or not compression actually happened, the file must survive.
        assert target.exists()
def test_retorna_False_cuando_gs_no_esta_disponible(monkeypatch):
    """With ``shutil.which`` stubbed to find nothing, the function returns False."""
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.compress_pdf_ghostscript import compress_pdf_ghostscript

    def _nothing_found(x):
        # Stand-in for shutil.which that reports every executable as missing.
        return None

    monkeypatch.setattr("shutil.which", _nothing_found)

    with tempfile.TemporaryDirectory() as workdir:
        target = Path(workdir).joinpath("test.pdf")
        _make_simple_pdf(target)
        outcome = compress_pdf_ghostscript(target)

    assert outcome is False
@@ -0,0 +1,38 @@
"""Tests para osm2pgsql_ingest."""
from __future__ import annotations
import shutil
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
def test_lanza_FileNotFoundError_con_path_inexistente():
    """A path that does not exist makes osm2pgsql_ingest raise FileNotFoundError."""
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.osm2pgsql_ingest import osm2pgsql_ingest

    missing_pbf = "/tmp/non_existent_file_that_does_not_exist.osm.pbf"
    with pytest.raises(FileNotFoundError):
        osm2pgsql_ingest(missing_pbf)
def test_lanza_RuntimeError_si_osm2pgsql_no_esta_en_PATH():
    """Without osm2pgsql on PATH, ingesting an existing file raises RuntimeError."""
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.osm2pgsql_ingest import osm2pgsql_ingest

    with tempfile.TemporaryDirectory() as workdir:
        dummy_pbf = Path(workdir).joinpath("fake.osm.pbf")
        # The file has to exist, otherwise FileNotFoundError would fire first.
        dummy_pbf.write_bytes(b"PBF")

        # When the real binary is installed (e.g. on CI) the error path
        # cannot be exercised, so the test is skipped instead.
        binary_found = shutil.which("osm2pgsql") is not None
        if binary_found:
            pytest.skip("osm2pgsql is available in PATH; skipping RuntimeError test")

        with pytest.raises(RuntimeError, match="osm2pgsql"):
            osm2pgsql_ingest(dummy_pbf)
@@ -0,0 +1,53 @@
"""Tests para render_table_page_pdfpages."""
from __future__ import annotations
import tempfile
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import pytest
def test_50_filas_con_max_rows_28_genera_2_paginas_en_pdf_no_vacio():
    """50 rows with max_rows=28 must produce a non-empty PDF with two pages.

    The page count is always checked through ``PdfPages.get_pagecount()``, so
    the test verifies pagination even when pypdf is not installed. When pypdf
    is available the written file is additionally re-read as a cross-check.
    """
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.render_table_page_pdfpages import render_table_page_pdfpages
    from matplotlib.backends.backend_pdf import PdfPages

    # pypdf is optional. Only the import is guarded so that an ImportError
    # raised while actually reading the PDF is not mistaken for "not installed".
    try:
        from pypdf import PdfReader
    except ImportError:
        PdfReader = None

    rows = [[str(i), f"valor_{i}", f"extra_{i}"] for i in range(50)]
    col_labels = ["ID", "Valor", "Extra"]

    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = Path(tmpdir) / "test_table.pdf"
        with PdfPages(str(pdf_path)) as pdf:
            render_table_page_pdfpages(pdf, "Test Tabla", rows, col_labels, max_rows=28)
            # Must be queried before the PdfPages object is closed.
            assert pdf.get_pagecount() == 2

        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        if PdfReader is not None:
            # Cross-check the page count on the file as written to disk.
            reader = PdfReader(str(pdf_path))
            assert len(reader.pages) == 2
def test_0_filas_genera_1_pagina_vacia_sin_excepcion():
    """Rendering a table with no rows still writes a non-empty PDF and does not raise."""
    import sys

    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
    from infra.render_table_page_pdfpages import render_table_page_pdfpages
    from matplotlib.backends.backend_pdf import PdfPages

    no_rows = []
    headers = ["Col1", "Col2"]

    with tempfile.TemporaryDirectory() as workdir:
        target = Path(workdir).joinpath("empty_table.pdf")
        with PdfPages(str(target)) as pages:
            render_table_page_pdfpages(pages, "Vacío", no_rows, headers)

        # Checked while the temporary directory still exists.
        assert target.exists()
        assert target.stat().st_size > 0