feat: extracción masiva footprint_aurgi (41 funcs + 4 types + stack Docker geo)

Extrae al registry funciones del proyecto interno footprint_aurgi:
- core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb
- geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket
- geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout
- valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n
- datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull
- datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column
- datascience viz (2): plot_kde_2d, plot_heatmap_log
- infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest
- pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone
- types geo (4): LonLat, BBox, IsochroneRequest, Centro

Incluye:
- apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose)
- 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH)
- Issue tracker dev/issues/0052-footprint-aurgi-extraction.md
- Atribución uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi
- Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines)

También commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 23:35:22 +02:00
parent f73ea072bd
commit faac610745
193 changed files with 13146 additions and 3 deletions
@@ -0,0 +1,22 @@
"""Tests para add_basemap_osm."""
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from geo.add_basemap_osm import add_basemap_osm
def test_no_lanza_excepcion_con_Axes_valido():
    """add_basemap_osm completes without raising on an Axes with fixed limits."""
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    try:
        ax.set_xlim(-430000, -350000)
        ax.set_ylim(4500000, 4600000)
        # Must not raise regardless of network availability
        add_basemap_osm(ax, zoom=5)
    finally:
        # Close the figure even when the call above raises, so a failing run
        # does not leave an open figure behind for the rest of the session.
        plt.close(fig)
@@ -0,0 +1,23 @@
"""Tests para add_basemap_with_timeout."""
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from geo.add_basemap_with_timeout import add_basemap_with_timeout
def test_timeout_muy_corto_retorna_False_sin_colgar():
    """A 0.001 s timeout makes add_basemap_with_timeout return False."""
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    try:
        ax.set_xlim(-430000, -350000)
        ax.set_ylim(4500000, 4600000)
        # 0.001 s timeout — should fail/timeout fast and return False
        result = add_basemap_with_timeout(ax, zoom=9, timeout_s=0.001)
    finally:
        # Release the figure even if the call raises instead of returning.
        plt.close(fig)
    assert result is False, f"expected False with 0.001s timeout, got {result}"
@@ -0,0 +1,25 @@
"""Tests para distance_bucket."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.distance_bucket import distance_bucket
def test_bucket_0_5():
    """A distance of 3.0 is labelled "0-5"."""
    bucket = distance_bucket(3.0)
    assert bucket == "0-5"
def test_bucket_5_10():
    """A distance of 7.0 is labelled "5-10"."""
    bucket = distance_bucket(7.0)
    assert bucket == "5-10"
def test_bucket_borde_exacto():
    """The exact edge value 10.0 still belongs to the "5-10" bucket."""
    # The upper edge is inclusive: 10 <= 10 keeps the value in "5-10".
    edge_bucket = distance_bucket(10.0)
    assert edge_bucket == "5-10"
def test_bucket_160_mas():
    """A distance of 200.0 is labelled "160+"."""
    bucket = distance_bucket(200.0)
    assert bucket == "160+"
@@ -0,0 +1,19 @@
"""Tests para extent_with_padding."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.extent_with_padding import extent_with_padding
def test_bbox_cuadrado_con_10_pct():
    """Padding the (0, 0, 10, 10) bounds by 0.1 moves every edge by 1.0."""
    square = (0.0, 0.0, 10.0, 10.0)
    expected = (-1.0, 11.0, -1.0, 11.0)
    assert extent_with_padding(square, 0.1) == expected
def test_pad_ratio_cero_no_cambia():
    """With a zero pad ratio the values are unchanged, only reordered.

    The expected tuple is (bounds[0], bounds[2], bounds[1], bounds[3]).
    """
    bounds = (2.0, 3.0, 8.0, 9.0)
    expected = (2.0, 8.0, 3.0, 9.0)
    assert extent_with_padding(bounds, 0.0) == expected
@@ -0,0 +1,18 @@
"""Tests para haversine_km."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.haversine_km import haversine_km
def test_madrid_barcelona_aproximado():
    """The Madrid-Barcelona distance is within 2 of the 504 km reference."""
    madrid = (-3.7038, 40.4168)
    barcelona = (2.1686, 41.3874)
    d = haversine_km(*madrid, *barcelona)
    deviation = abs(d - 504.0)
    assert deviation < 2.0, f"Esperado ~504 km, got {d:.1f}"
def test_misma_coordenada_es_cero():
    """Two identical points are exactly 0.0 apart."""
    origin = (0.0, 0.0)
    d = haversine_km(*origin, *origin)
    assert d == 0.0, f"Misma coordenada debe ser 0, got {d}"
@@ -0,0 +1,61 @@
"""Tests para load_boundary_gdf."""
import json
import sys
import tempfile
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from geo.load_boundary_gdf import load_boundary_gdf
def _write_geojson(data: dict) -> Path:
    """Dump *data* as JSON into a new temporary ``.geojson`` file.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed; removing it is the caller's responsibility.

    Args:
        data: JSON-serializable mapping to write.

    Returns:
        Path of the file that was written.
    """
    # The context manager closes the handle even when ``json.dump`` raises,
    # which the previous explicit ``close()`` call did not guarantee.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".geojson", delete=False, encoding="utf-8"
    ) as handle:
        json.dump(data, handle)
    return Path(handle.name)
def test_retorna_GeoDataFrame_con_CRS_EPSG4326():
    """A one-polygon GeoJSON loads as a single-row GeoDataFrame in EPSG:4326."""
    ring = [
        [-3.7, 40.4],
        [-3.6, 40.4],
        [-3.6, 40.5],
        [-3.7, 40.5],
        [-3.7, 40.4],
    ]
    feature = {
        "type": "Feature",
        "geometry": {"type": "Polygon", "coordinates": [ring]},
        "properties": {"name": "test"},
    }
    geojson = {"type": "FeatureCollection", "features": [feature]}

    path = _write_geojson(geojson)
    try:
        gdf = load_boundary_gdf(path, crs="EPSG:4326")
        import geopandas as gpd  # type: ignore

        assert isinstance(gdf, gpd.GeoDataFrame), "result should be a GeoDataFrame"
        assert gdf.crs is not None, "CRS should be set"
        assert gdf.crs.to_epsg() == 4326, f"expected EPSG:4326, got {gdf.crs}"
        assert len(gdf) == 1, f"expected 1 feature, got {len(gdf)}"
    finally:
        # The temporary file is removed whether or not the assertions pass.
        path.unlink(missing_ok=True)
def test_archivo_inexistente_lanza_FileNotFoundError():
    """Loading a path that does not exist raises FileNotFoundError."""
    import pytest

    missing = "/tmp/this_file_does_not_exist_xyz.geojson"
    with pytest.raises(FileNotFoundError):
        load_boundary_gdf(missing)
@@ -0,0 +1,59 @@
"""Tests para load_geojson_polygons."""
import json
import sys
import tempfile
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from geo.load_geojson_polygons import load_geojson_polygons
def _write_geojson(data: dict) -> Path:
    """Dump *data* as JSON into a new temporary ``.geojson`` file.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed; removing it is the caller's responsibility.

    Args:
        data: JSON-serializable mapping to write.

    Returns:
        Path of the file that was written.
    """
    # The context manager closes the handle even when ``json.dump`` raises,
    # which the previous explicit ``close()`` call did not guarantee.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".geojson", delete=False, encoding="utf-8"
    ) as handle:
        json.dump(data, handle)
    return Path(handle.name)
def test_polygon_simple_produce_1_poligono_con_1_anillo():
    """A single Polygon feature yields one polygon with one ring of tuples."""
    ring = [
        [-3.7, 40.4],
        [-3.6, 40.4],
        [-3.6, 40.5],
        [-3.7, 40.5],
        [-3.7, 40.4],
    ]
    feature = {
        "type": "Feature",
        "geometry": {"type": "Polygon", "coordinates": [ring]},
        "properties": {},
    }
    geojson = {"type": "FeatureCollection", "features": [feature]}

    path = _write_geojson(geojson)
    try:
        result = load_geojson_polygons(path)
        assert len(result) == 1, f"expected 1 polygon, got {len(result)}"
        polygon = result[0]
        assert len(polygon) == 1, "expected 1 ring"
        loaded_ring = polygon[0]
        assert len(loaded_ring) >= 4, "ring should have >= 4 points"
        assert isinstance(loaded_ring[0], tuple), "points should be tuples"
    finally:
        # The temporary file is removed whether or not the assertions pass.
        path.unlink(missing_ok=True)
def test_archivo_inexistente_lanza_FileNotFoundError():
    """Loading a path that does not exist raises FileNotFoundError."""
    import pytest

    missing = "/tmp/this_file_does_not_exist_xyz.geojson"
    with pytest.raises(FileNotFoundError):
        load_geojson_polygons(missing)
@@ -0,0 +1,29 @@
"""Tests para point_in_polygon."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.point_in_polygon import point_in_polygon
# Outer ring: axis-aligned square with corners (0, 0) and (4, 4).
OUTER = [(0.0, 0.0), (4.0, 0.0), (4.0, 4.0), (0.0, 4.0)]
# Second ring, passed after OUTER and treated as a hole: square (1, 1)-(3, 3).
HOLE = [(1.0, 1.0), (3.0, 1.0), (3.0, 3.0), (1.0, 3.0)]
def test_punto_en_exterior():
    """A point inside the outer ring but outside the hole is contained."""
    rings = [OUTER, HOLE]
    inside = point_in_polygon(0.5, 0.5, rings)
    assert inside is True
def test_punto_en_hole():
    """A point that falls inside the hole is not contained."""
    rings = [OUTER, HOLE]
    inside = point_in_polygon(2.0, 2.0, rings)
    assert inside is False
def test_punto_fuera():
    """A point outside the outer ring is not contained."""
    rings = [OUTER, HOLE]
    inside = point_in_polygon(10.0, 10.0, rings)
    assert inside is False
def test_poligono_vacio():
    """A polygon with no rings contains no point."""
    no_rings = []
    inside = point_in_polygon(0.5, 0.5, no_rings)
    assert inside is False
@@ -0,0 +1,25 @@
"""Tests para point_in_polygons_bbox."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.point_in_polygons_bbox import point_in_polygons_bbox
from python.functions.geo.polygon_bbox import polygon_bbox
# Two single-ring polygons: unit squares at (0, 0)-(1, 1) and (5, 5)-(6, 6).
P1 = [[(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]]
P2 = [[(5.0, 5.0), (6.0, 5.0), (6.0, 6.0), (5.0, 6.0)]]
# Bounding boxes computed once at import, in the same order as [P1, P2].
BBOXES = [polygon_bbox(P1), polygon_bbox(P2)]
def test_punto_en_primer_poligono():
    """A point inside P1 is reported as contained."""
    polygons = [P1, P2]
    found = point_in_polygons_bbox(0.5, 0.5, polygons, BBOXES)
    assert found is True
def test_punto_en_segundo_poligono():
    """A point inside P2 is reported as contained."""
    polygons = [P1, P2]
    found = point_in_polygons_bbox(5.5, 5.5, polygons, BBOXES)
    assert found is True
def test_punto_fuera_de_todos():
    """A point outside both polygons is reported as not contained."""
    polygons = [P1, P2]
    found = point_in_polygons_bbox(10.0, 10.0, polygons, BBOXES)
    assert found is False
@@ -0,0 +1,22 @@
"""Tests para point_in_ring."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.point_in_ring import point_in_ring
# Ring fixture: unit square with corners (0, 0) and (1, 1); the first vertex
# is not repeated at the end.
SQUARE = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
def test_punto_dentro_cuadrado():
    """The centre of the unit square lies inside the ring."""
    inside = point_in_ring(0.5, 0.5, SQUARE)
    assert inside is True
def test_punto_fuera_cuadrado():
    """The point (2, 2) lies outside the unit square ring."""
    inside = point_in_ring(2.0, 2.0, SQUARE)
    assert inside is False
def test_ring_menor_3_vertices():
    """A two-vertex ring is reported as not containing the point."""
    two_vertices = [(0.0, 0.0), (1.0, 1.0)]
    inside = point_in_ring(0.0, 0.0, two_vertices)
    assert inside is False
@@ -0,0 +1,19 @@
"""Tests para polygon_bbox."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.geo.polygon_bbox import polygon_bbox
def test_cuadrado_unitario():
    """The bbox of a unit-square polygon is (0, 0, 1, 1)."""
    unit_square = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
    bbox = polygon_bbox([unit_square])
    assert bbox == (0.0, 0.0, 1.0, 1.0)
def test_poligono_con_hole():
    """For a polygon with a hole the expected bbox equals the outer ring's extent."""
    rings = [
        [(0.0, 0.0), (5.0, 0.0), (5.0, 5.0), (0.0, 5.0)],  # outer ring
        [(1.0, 1.0), (3.0, 1.0), (3.0, 3.0), (1.0, 3.0)],  # hole
    ]
    bbox = polygon_bbox(rings)
    assert bbox == (0.0, 0.0, 5.0, 5.0)
@@ -0,0 +1,36 @@
"""Tests para valhalla_isochrone."""
from __future__ import annotations
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import httpx
import pytest
from valhalla_isochrone import valhalla_isochrone
def _valhalla_alive(url: str = "http://localhost:8002") -> bool:
    """Report whether the service at *url* answers its ``/status`` endpoint.

    Args:
        url: Base URL of the service to probe.

    Returns:
        True when the request completes with a status code below 500;
        False when the request raises or the status code is 500 or above.
    """
    try:
        response = httpx.get(f"{url}/status", timeout=2.0)
    except Exception:
        # Any failure to get a response counts as "not alive".
        return False
    return response.status_code < 500
# Probe the service once, when this module is imported.
VALHALLA_OK = _valhalla_alive()
# Marker that skips the decorated tests when the probe above failed.
skip_if_no_valhalla = pytest.mark.skipif(
not VALHALLA_OK, reason="Valhalla no activo en :8002"
)
@skip_if_no_valhalla
def test_isócrona_10_min_madrid_contiene_features():
    """A 10-minute isochrone for Madrid contains features."""
    madrid = {"lat": 40.4168, "lon": -3.7038}
    gj = valhalla_isochrone(**madrid, minutes=10)
    assert gj is not None, "Esperaba GeoJSON, obtuvo None"
    assert "features" in gj, "GeoJSON no contiene 'features'"
    features = gj["features"]
    assert len(features) > 0, "features está vacío"
@@ -0,0 +1,43 @@
"""Tests para valhalla_isochrones_async."""
from __future__ import annotations
import asyncio
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import httpx
import pytest
from valhalla_isochrones_async import valhalla_isochrones_async
def _valhalla_alive(url: str = "http://localhost:8002") -> bool:
    """Report whether the service at *url* answers its ``/status`` endpoint.

    Args:
        url: Base URL of the service to probe.

    Returns:
        True when the request completes with a status code below 500;
        False when the request raises or the status code is 500 or above.
    """
    try:
        response = httpx.get(f"{url}/status", timeout=2.0)
    except Exception:
        # Any failure to get a response counts as "not alive".
        return False
    return response.status_code < 500
# Probe the service once, when this module is imported.
VALHALLA_OK = _valhalla_alive()
# Marker that skips the decorated tests when the probe above failed.
skip_if_no_valhalla = pytest.mark.skipif(
not VALHALLA_OK, reason="Valhalla no activo en :8002"
)
@skip_if_no_valhalla
def test_3_puntos_madrid_retornan_lista_de_3():
    """Three Madrid points return a list of three results."""
    named_coords = (
        ("sol", 40.4168, -3.7038),
        ("retiro", 40.4530, -3.6883),
        ("atocha", 40.4005, -3.7057),
    )
    pts = [
        {"lat": lat, "lon": lon, "minutes": 10, "id": ident}
        for ident, lat, lon in named_coords
    ]

    results = asyncio.run(valhalla_isochrones_async(pts))

    assert len(results) == 3, f"Esperaba 3 resultados, obtuvo {len(results)}"
    for i, gj in enumerate(results):
        assert gj is not None, f"Resultado {i} es None"
        assert "features" in gj, f"Resultado {i} no contiene 'features'"
@@ -0,0 +1,46 @@
"""Tests para valhalla_matrix_1_to_n."""
from __future__ import annotations
import math
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import httpx
import pytest
from valhalla_matrix_1_to_n import valhalla_matrix_1_to_n
def _valhalla_alive(url: str = "http://localhost:8002") -> bool:
    """Report whether the service at *url* answers its ``/status`` endpoint.

    Args:
        url: Base URL of the service to probe.

    Returns:
        True when the request completes with a status code below 500;
        False when the request raises or the status code is 500 or above.
    """
    try:
        response = httpx.get(f"{url}/status", timeout=2.0)
    except Exception:
        # Any failure to get a response counts as "not alive".
        return False
    return response.status_code < 500
# Probe the service once, when this module is imported.
VALHALLA_OK = _valhalla_alive()
# Marker that skips the decorated tests when the probe above failed.
skip_if_no_valhalla = pytest.mark.skipif(
not VALHALLA_OK, reason="Valhalla no activo en :8002"
)
@skip_if_no_valhalla
def test_matrix_1_origen_2_destinos_retorna_2_dicts_con_meters_mayor_0():
    """A 1-origin, 2-destination matrix returns 2 dicts with meters > 0."""
    madrid = (40.4168, -3.7038)
    barcelona = (41.3874, 2.1686)
    sevilla = (37.3886, -5.9823)

    origins = [madrid]
    destinations = [barcelona, sevilla]
    # Pair the single origin (index 0) with every destination index.
    pairs = [(0, dest_idx) for dest_idx in range(len(destinations))]

    results = valhalla_matrix_1_to_n(origins, destinations, pairs)

    assert len(results) == 2, f"Esperaba 2 resultados, obtuvo {len(results)}"
    for i, r in enumerate(results):
        assert r["error"] == 0, f"Par {i} tiene error={r['error']}"
        assert r["meters"] > 0, f"Par {i} tiene meters={r['meters']}"
        assert not math.isnan(r["seconds"]), f"Par {i} tiene seconds=NaN"
@@ -0,0 +1,41 @@
"""Tests para valhalla_route."""
from __future__ import annotations
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import httpx
import pytest
from valhalla_route import valhalla_route
def _valhalla_alive(url: str = "http://localhost:8002") -> bool:
    """Report whether the service at *url* answers its ``/status`` endpoint.

    Args:
        url: Base URL of the service to probe.

    Returns:
        True when the request completes with a status code below 500;
        False when the request raises or the status code is 500 or above.
    """
    try:
        response = httpx.get(f"{url}/status", timeout=2.0)
    except Exception:
        # Any failure to get a response counts as "not alive".
        return False
    return response.status_code < 500
# Probe the service once, when this module is imported.
VALHALLA_OK = _valhalla_alive()
# Marker that skips the decorated tests when the probe above failed.
skip_if_no_valhalla = pytest.mark.skipif(
not VALHALLA_OK, reason="Valhalla no activo en :8002"
)
@skip_if_no_valhalla
def test_ruta_madrid_barcelona_supera_500_km():
    """The Madrid-Barcelona route is longer than 500 km."""
    madrid = {"lat": 40.4168, "lon": -3.7038}
    barcelona = {"lat": 41.3874, "lon": 2.1686}

    result = valhalla_route(locations=[madrid, barcelona])

    assert result is not None, "Esperaba respuesta, obtuvo None"
    summary = result["trip"]["summary"]
    assert summary["length"] > 500, f"Distancia {summary['length']} km < 500 km"