feat: extracción masiva footprint_aurgi (41 funcs + 4 types + stack Docker geo)

Extrae al registry funciones del proyecto interno footprint_aurgi:
- core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb
- geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket
- geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout
- valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n
- datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull
- datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column
- datascience viz (2): plot_kde_2d, plot_heatmap_log
- infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest
- pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone
- types geo (4): LonLat, BBox, IsochroneRequest, Centro

Incluye:
- apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose)
- 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH)
- Issue tracker dev/issues/0052-footprint-aurgi-extraction.md
- Atribución uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi
- Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines)

También commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 23:35:22 +02:00
parent f73ea072bd
commit faac610745
193 changed files with 13146 additions and 3 deletions
@@ -0,0 +1,62 @@
"""Tests para compute_centers_reachability_pipeline.
Usa 2 orígenes y 2 centros reales en España con el stack Valhalla activo.
"""
from __future__ import annotations
import asyncio
import math
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.pipelines.compute_centers_reachability_pipeline import (
compute_centers_reachability_pipeline,
)
def test_compute_centers_reachability_pipeline():
    """Check the 2x2 matrix and the 2 isochrones returned by the pipeline.

    The pipeline is pointed at ``http://localhost:8002``; this test needs
    whatever routing service the pipeline talks to (Valhalla, per the module
    docstring) to be reachable there.
    """
    # Coordinate pairs exactly as the pipeline receives them.
    # NOTE(review): values look latitude-first; confirm the order the pipeline expects.
    madrid = (40.4168, -3.7038)
    sevilla = (37.3891, -5.9845)
    barcelona = (41.3851, 2.1734)
    bilbao = (43.2627, -2.9253)

    pending = compute_centers_reachability_pipeline(
        origins=[madrid, sevilla],
        centers=[barcelona, bilbao],
        isochrone_minutes=15,
        base_url="http://localhost:8002",
        concurrency=4,
    )
    result = asyncio.run(pending)

    assert isinstance(result, dict)
    for top_level_key in ("matrix", "isochrones"):
        assert top_level_key in result

    # Matrix: 2 origins x 2 centers = 4 entries.
    matrix = result["matrix"]
    assert len(matrix) == 4, f"Esperadas 4 entradas en matrix, got {len(matrix)}"
    for cell in matrix:
        assert all(index_key in cell for index_key in ("i", "j"))
        assert all(value_key in cell for value_key in ("meters", "seconds", "error"))
        # A pair reported without error must carry a positive distance.
        if cell["error"] == 0:
            assert cell["meters"] > 0, "meters debe ser > 0 cuando no hay error"

    # Isochrones: one entry per center.
    isochrones = result["isochrones"]
    assert len(isochrones) == 2, f"Esperadas 2 isócronas, got {len(isochrones)}"

    # At least one of them has to be a dict (a GeoJSON-like payload).
    dict_count = sum(1 for iso in isochrones if isinstance(iso, dict))
    assert dict_count >= 1, "Al menos una isócrona debe ser un dict GeoJSON"
@@ -0,0 +1,86 @@
"""Tests para count_points_per_zone_pipeline.
Usa un cuadrado sintético como zona y puntos aleatorios entre lat[40,41] lon[-4,-3].
"""
from __future__ import annotations
import json
import os
import random
import sys
import tempfile
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.pipelines.count_points_per_zone_pipeline import (
count_points_per_zone_pipeline,
)
def _make_square_geojson(min_lon: float, min_lat: float, max_lon: float, max_lat: float) -> dict:
    """Build a GeoJSON FeatureCollection with one rectangular Polygon.

    The single ring is closed: it starts and ends at ``[min_lon, min_lat]``
    and visits the corners in the order (min, min) -> (max, min) ->
    (max, max) -> (min, max). Positions are ``[lon, lat]`` lists.
    """
    corners = (
        (min_lon, min_lat),
        (max_lon, min_lat),
        (max_lon, max_lat),
        (min_lon, max_lat),
    )
    ring = [[lon, lat] for lon, lat in corners]
    # Close the ring by repeating the first position as a fresh list.
    ring.append(list(ring[0]))

    geometry = {"type": "Polygon", "coordinates": [ring]}
    feature = {"type": "Feature", "geometry": geometry, "properties": {}}
    return {"type": "FeatureCollection", "features": [feature]}
def test_count_points_per_zone_pipeline():
    """Count 100 seeded random points against one square zone.

    Points are drawn in lon [-4, -3] x lat [40, 41]; the zone is the inner
    square lon [-3.8, -3.2] x lat [40.2, 40.8]. The expected count is computed
    locally with a plain bounds check and compared with the pipeline output.
    """
    random.seed(42)

    # Points are (lon, lat), i.e. GeoJSON order. Longitude is drawn first so the
    # random sequence is consumed in the same order on every run.
    points = []
    for _ in range(100):
        lon = random.uniform(-4.0, -3.0)
        lat = random.uniform(40.0, 41.0)
        points.append((lon, lat))

    # Inner square covering about 36% of the sampling area.
    zone_min_lon, zone_max_lon = -3.8, -3.2
    zone_min_lat, zone_max_lat = 40.2, 40.8

    # Reference count used by the assertions below.
    inside = [
        (lon, lat)
        for lon, lat in points
        if zone_min_lon <= lon <= zone_max_lon and zone_min_lat <= lat <= zone_max_lat
    ]
    expected_inside = len(inside)

    with tempfile.TemporaryDirectory() as tmpdir:
        zone_path = os.path.join(tmpdir, "zone_centro.geojson")
        zone_geojson = _make_square_geojson(
            zone_min_lon, zone_min_lat, zone_max_lon, zone_max_lat
        )
        with open(zone_path, "w") as f:
            json.dump(zone_geojson, f)

        zones = [{"label": "Centro", "geojson_path": zone_path}]
        result = count_points_per_zone_pipeline(points=points, zones=zones)

        assert isinstance(result, dict)
        assert set(result.keys()) == {"counts", "total_points", "total_assigned", "unassigned"}
        assert result["total_points"] == 100

        centro_count = result["counts"]["Centro"]
        assert centro_count > 0, "Debe haber puntos en la zona"
        assert centro_count == expected_inside, (
            f"Esperados {expected_inside} puntos, got {result['counts']['Centro']}"
        )
        assert result["total_assigned"] == expected_inside
        assert result["unassigned"] == 100 - expected_inside
@@ -0,0 +1,93 @@
"""Tests para extract_graph_from_text pipeline.
Usa stubs para GLiNER2 para validar el flujo completo sin descargar modelos.
"""
from __future__ import annotations
import os
import sys
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.pipelines.extract_graph_from_text import extract_graph_from_text
class _Schema:
    """Minimal fluent schema stub: every builder call hands back the same object."""

    def entities(self, labels) -> "_Schema":
        """Ignore *labels* and return this schema so calls can be chained."""
        return self

    def relations(self, labels) -> "_Schema":
        """Ignore *labels* and return this schema so calls can be chained."""
        return self
class _StubModel:
    """Model stub that returns one fixed, known graph for any input text."""

    def create_schema(self):
        """Return a fresh ``_Schema`` stub."""
        return _Schema()

    def extract(self, text, schema=None, threshold=0.3, include_confidence=False):
        """Return the canned extraction result; all arguments are ignored."""
        entities = {
            "person": ["Carlos Torres"],
            "organization": ["BBVA"],
            "location": ["Bilbao"],
        }
        # Each relation maps to a list of (head, tail) tuples.
        relations = {
            "president_of": [("Carlos Torres", "BBVA")],
            "headquartered_in": [("BBVA", "Bilbao")],
        }
        return {"entities": entities, "relation_extraction": relations}
# Entity labels handed to the pipeline in every test of this module.
ENTITY_LABELS = [
    "person",
    "organization",
    "location",
]

# Relation labels requested; "works_at" has no entry in ALLOWED below.
RELATION_LABELS = [
    "president_of",
    "headquartered_in",
    "works_at",
]

# Per-relation pair of label lists, passed to the pipeline as ``allowed``.
# NOTE(review): presumably (head entity types, tail entity types); confirm against the pipeline.
ALLOWED = {
    "president_of": (
        ["person"],
        ["organization"],
    ),
    "headquartered_in": (
        ["organization"],
        ["location"],
    ),
}
def test_texto_corto_produce_nodos_y_aristas_esperados():
    """A short text run through the stub model yields the expected nodes and edges."""
    result = extract_graph_from_text(
        text="Carlos Torres es presidente de BBVA con sede en Bilbao.",
        entity_labels=ENTITY_LABELS,
        relation_labels=RELATION_LABELS,
        allowed=ALLOWED,
        model=_StubModel(),
        threshold=0.3,
    )

    # Every entity returned by the stub must show up as a node id.
    node_ids = {node["id"] for node in result["nodes"]}
    for expected_id in ("Carlos Torres", "BBVA", "Bilbao"):
        assert expected_id in node_ids

    # Both relations returned by the stub must show up as edge kinds.
    edge_kinds = {edge["kind"] for edge in result["edges"]}
    for expected_kind in ("president_of", "headquartered_in"):
        assert expected_kind in edge_kinds
def test_stats_tiene_todos_los_campos_requeridos():
    """The ``stats`` section of the result carries every required field."""
    result = extract_graph_from_text(
        text="Texto de prueba para el pipeline.",
        entity_labels=ENTITY_LABELS,
        relation_labels=RELATION_LABELS,
        allowed=ALLOWED,
        model=_StubModel(),
    )
    stats = result["stats"]

    required_fields = ("n_chunks", "n_nodes", "n_edges", "n_dropped_typed", "elapsed_s")
    for field in required_fields:
        assert field in stats

    # Basic sanity bounds on the counters.
    assert stats["n_chunks"] >= 1
    assert stats["n_nodes"] >= 0
@@ -0,0 +1,117 @@
"""Tests para generate_isochrones_by_zone_pipeline.
Crea archivos GeoJSON temporales con zonas sintéticas sobre Madrid
y verifica el resultado del pipeline con el stack Valhalla activo.
"""
from __future__ import annotations
import asyncio
import json
import os
import sys
import tempfile
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.pipelines.generate_isochrones_by_zone_pipeline import (
generate_isochrones_by_zone_pipeline,
)
def _make_square_geojson(center_lon: float, center_lat: float, half: float) -> dict:
    """Build a GeoJSON FeatureCollection with one square Polygon around a center.

    The square spans ``center +/- half`` on both axes. Its single ring is
    closed, starts at the (low lon, low lat) corner and stores positions as
    ``[lon, lat]`` lists.
    """
    west = center_lon - half
    east = center_lon + half
    south = center_lat - half
    north = center_lat + half

    ring = [
        [west, south],
        [east, south],
        [east, north],
        [west, north],
        [west, south],  # repeat the first corner to close the ring
    ]
    geometry = {"type": "Polygon", "coordinates": [ring]}
    feature = {"type": "Feature", "geometry": geometry, "properties": {}}
    return {"type": "FeatureCollection", "features": [feature]}
def test_generate_isochrones_by_zone_pipeline():
    """Run the pipeline on two synthetic square zones over Madrid.

    Input: 50 points (25 per zone) all with ``seconds=600`` and 3 centers.
    The pipeline is pointed at ``http://localhost:8002``, so the routing
    service it uses must be reachable there.
    """

    def _diagonal_points(lat0, lon0):
        # 25 points on a diagonal through (lat0, lon0), each 600 s (10 min).
        return [
            {"lat": lat0 + i * 0.001, "lon": lon0 + i * 0.001, "seconds": 600.0}
            for i in range(-12, 13)
        ]

    # (label, file name, center lon, center lat): north near Chamartin,
    # south near Vallecas.
    zone_specs = (
        ("Norte", "zone_norte.geojson", -3.685, 40.47),
        ("Sur", "zone_sur.geojson", -3.666, 40.38),
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        zones = []
        for label, file_name, lon, lat in zone_specs:
            zone_path = os.path.join(tmpdir, file_name)
            with open(zone_path, "w") as f:
                json.dump(_make_square_geojson(lon, lat, 0.05), f)
            zones.append(
                {"label": label, "geojson_path": zone_path, "exclude_geojson_path": None}
            )

        # North points first, then south points.
        points = _diagonal_points(40.47, -3.685) + _diagonal_points(40.38, -3.666)

        # 3 centers: 2 in the north zone, 1 in the south zone.
        centers = [
            {"lat": 40.47, "lon": -3.685, "id": "centro_norte_a"},
            {"lat": 40.465, "lon": -3.680, "id": "centro_norte_b"},
            {"lat": 40.380, "lon": -3.666, "id": "centro_sur_a"},
        ]

        pending = generate_isochrones_by_zone_pipeline(
            zones=zones,
            points=points,
            centers=centers,
            base_url="http://localhost:8002",
            concurrency=4,
        )
        result = asyncio.run(pending)

        assert isinstance(result, dict)
        assert "zones" in result

        zone_results = result["zones"]
        assert len(zone_results) == 2, f"Esperadas 2 zonas, got {len(zone_results)}"

        # Shape check on every zone entry.
        for z in zone_results:
            for field in ("label", "minutes", "n_points", "n_centers", "isochrones"):
                assert field in z
            assert isinstance(z["isochrones"], list)

        # Every point carries 600 s, so a populated zone should report ~10 minutes.
        for z in zone_results:
            if z["n_points"] > 0:
                assert z["minutes"] is not None
                assert 9.5 <= z["minutes"] <= 10.5, f"p75 esperado ~10 min, got {z['minutes']}"

        # At least one zone must come back with isochrones.
        total_isos = 0
        for z in zone_results:
            total_isos += len(z["isochrones"])
        assert total_isos >= 1, "Al menos una isócrona debe generarse"
@@ -0,0 +1,38 @@
"""Tests para setup_geo_stack_docker_pipeline.
El geo stack ya está corriendo en localhost:8002 (Valhalla), por lo que
verify=True retorna flags reales del stack activo.
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
from python.functions.pipelines.setup_geo_stack_docker_pipeline import (
setup_geo_stack_docker_pipeline,
)
def test_setup_geo_stack_docker_pipeline():
    """Check the status flags reported for the geo stack on localhost.

    Called with ``verify=True`` and ``wait_seconds=0`` because the stack is
    expected to be up already.
    """
    flag_names = ("docker_up", "valhalla_ok", "postgis_ok", "martin_ok")

    # NOTE(review): compose_path is relative, so it resolves against the
    # current working directory of the test run.
    result = setup_geo_stack_docker_pipeline(
        compose_path="apps/footprint_geo_stack/docker-compose.yml",
        wait_seconds=0,
        verify=True,
    )

    assert isinstance(result, dict)
    assert set(result.keys()) == set(flag_names)

    # docker_up is allowed to be False (e.g. compose file missing in CI);
    # the requirement is only that every flag is a real bool.
    for key in flag_names:
        assert isinstance(result[key], bool), f"{key} debe ser bool"

    # The routing service must answer on localhost:8002.
    assert result["valhalla_ok"] is True, "Valhalla debe responder en localhost:8002"