feat: extraccion masiva footprint_aurgi (41 funcs + 4 types + stack Docker geo)
Extrae al registry funciones del proyecto interno footprint_aurgi: - core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb - geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket - geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout - valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n - datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull - datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column - datascience viz (2): plot_kde_2d, plot_heatmap_log - infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest - pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone - types geo (4): LonLat, BBox, IsochroneRequest, Centro Incluye: - apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose) - 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH) - Issue tracker dev/issues/0052-footprint-aurgi-extraction.md - Atribucion uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi - Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines) Tambien commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,62 @@
|
||||
"""Tests para compute_centers_reachability_pipeline.
|
||||
|
||||
Usa 2 orígenes y 2 centros reales en España con el stack Valhalla activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.compute_centers_reachability_pipeline import (
|
||||
compute_centers_reachability_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def test_compute_centers_reachability_pipeline():
    """Check the 2x2 matrix and the 2 isochrones returned by the pipeline.

    The pipeline is invoked with ``base_url="http://localhost:8002"``, so a
    Valhalla instance must be reachable there for this test to pass.
    """
    origins = [
        (40.4168, -3.7038),  # Madrid
        (37.3891, -5.9845),  # Sevilla
    ]
    centers = [
        (41.3851, 2.1734),  # Barcelona
        (43.2627, -2.9253),  # Bilbao
    ]

    # The pipeline is a coroutine function; drive it to completion here.
    result = asyncio.run(
        compute_centers_reachability_pipeline(
            origins=origins,
            centers=centers,
            isochrone_minutes=15,
            base_url="http://localhost:8002",
            concurrency=4,
        )
    )

    assert isinstance(result, dict)
    assert "matrix" in result
    assert "isochrones" in result

    # Matrix: 2 origins x 2 centers = 4 entries.
    matrix = result["matrix"]
    assert len(matrix) == 4, f"Esperadas 4 entradas en matrix, got {len(matrix)}"

    for entry in matrix:
        assert "i" in entry and "j" in entry
        assert "meters" in entry and "seconds" in entry and "error" in entry
        # error == 0 is treated as "the pair was resolved"; only then is a
        # positive distance required.
        if entry["error"] == 0:
            assert entry["meters"] > 0, "meters debe ser > 0 cuando no hay error"

    # Isochrones: 2 centers -> 2 entries.
    isochrones = result["isochrones"]
    assert len(isochrones) == 2, f"Esperadas 2 isócronas, got {len(isochrones)}"

    # At least one isochrone must come back as a dict (a GeoJSON object);
    # non-dict entries are tolerated for the rest.
    valid_isos = [iso for iso in isochrones if isinstance(iso, dict)]
    assert len(valid_isos) >= 1, "Al menos una isócrona debe ser un dict GeoJSON"
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Tests para count_points_per_zone_pipeline.
|
||||
|
||||
Usa un cuadrado sintético como zona y puntos aleatorios entre lat[40,41] lon[-4,-3].
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.count_points_per_zone_pipeline import (
|
||||
count_points_per_zone_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def _make_square_geojson(min_lon: float, min_lat: float, max_lon: float, max_lat: float) -> dict:
    """Build a GeoJSON FeatureCollection holding one rectangular Polygon.

    Args:
        min_lon: Western bound (longitude).
        min_lat: Southern bound (latitude).
        max_lon: Eastern bound (longitude).
        max_lat: Northern bound (latitude).

    Returns:
        A ``FeatureCollection`` dict with a single ``Feature`` whose geometry
        is a ``Polygon`` with one closed ring in ``[lon, lat]`` order and
        empty ``properties``.
    """
    # Single exterior ring; the first vertex is repeated at the end so the
    # ring is closed.
    ring = [
        [min_lon, min_lat],
        [max_lon, min_lat],
        [max_lon, max_lat],
        [min_lon, max_lat],
        [min_lon, min_lat],
    ]
    return {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [ring],
                },
                "properties": {},
            }
        ],
    }
|
||||
|
||||
|
||||
def test_count_points_per_zone_pipeline():
    """Count 100 seeded random points against one synthetic square zone.

    Points are drawn in lon [-4, -3] x lat [40, 41]; the zone is the inner
    square lon [-3.8, -3.2] x lat [40.2, 40.8].
    """
    random.seed(42)
    # Points are (lon, lat) tuples - GeoJSON order.
    points = [
        (random.uniform(-4.0, -3.0), random.uniform(40.0, 41.0))
        for _ in range(100)
    ]

    # Inner zone: a centred square covering ~36% of the sampling area, so
    # roughly 36 points are expected inside.
    zone_min_lon, zone_max_lon = -3.8, -3.2
    zone_min_lat, zone_max_lat = 40.2, 40.8

    # Reference count computed independently of the pipeline.
    expected_inside = sum(
        1
        for lon, lat in points
        if zone_min_lon <= lon <= zone_max_lon and zone_min_lat <= lat <= zone_max_lat
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        zone_path = os.path.join(tmpdir, "zone_centro.geojson")
        # Explicit encoding so the file does not depend on the locale default.
        with open(zone_path, "w", encoding="utf-8") as f:
            json.dump(
                _make_square_geojson(zone_min_lon, zone_min_lat, zone_max_lon, zone_max_lat),
                f,
            )

        zones = [{"label": "Centro", "geojson_path": zone_path}]

        # Must run while the temporary directory (and the zone file) exists.
        result = count_points_per_zone_pipeline(points=points, zones=zones)

    assert isinstance(result, dict)
    assert set(result.keys()) == {"counts", "total_points", "total_assigned", "unassigned"}

    assert result["total_points"] == 100
    assert result["counts"]["Centro"] > 0, "Debe haber puntos en la zona"
    assert result["counts"]["Centro"] == expected_inside, (
        f"Esperados {expected_inside} puntos, got {result['counts']['Centro']}"
    )
    assert result["total_assigned"] == expected_inside
    assert result["unassigned"] == 100 - expected_inside
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Tests para extract_graph_from_text pipeline.
|
||||
|
||||
Usa stubs para GLiNER2 para validar el flujo completo sin descargar modelos.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.extract_graph_from_text import extract_graph_from_text
|
||||
|
||||
|
||||
class _Schema:
    """Minimal chainable schema stub.

    Both builder methods ignore their argument and return the same instance,
    so calls can be chained.
    """

    def entities(self, labels):
        """Accept entity labels (ignored) and return this schema."""
        return self

    def relations(self, labels):
        """Accept relation labels (ignored) and return this schema."""
        return self
|
||||
|
||||
|
||||
class _StubModel:
    """Stub model that returns one fixed, known graph for any input text."""

    def create_schema(self):
        """Return a fresh chainable schema stub."""
        return _Schema()

    def extract(self, text, schema=None, threshold=0.3, include_confidence=False):
        """Return the canned extraction result; every argument is ignored.

        Returns:
            A new dict on each call with an ``"entities"`` mapping (label ->
            list of names) and a ``"relation_extraction"`` mapping (relation
            -> list of ``(head, tail)`` tuples).
        """
        return {
            "entities": {
                "person": ["Carlos Torres"],
                "organization": ["BBVA"],
                "location": ["Bilbao"],
            },
            "relation_extraction": {
                "president_of": [("Carlos Torres", "BBVA")],
                "headquartered_in": [("BBVA", "Bilbao")],
            },
        }
|
||||
|
||||
|
||||
# Label sets passed to extract_graph_from_text by the tests in this module.
ENTITY_LABELS = ["person", "organization", "location"]
RELATION_LABELS = ["president_of", "headquartered_in", "works_at"]

# Passed as the pipeline's `allowed` argument. Presumably maps relation ->
# (allowed head entity types, allowed tail entity types) - confirm against
# extract_graph_from_text. Note that "works_at" has no entry here.
ALLOWED = {
    "president_of": (["person"], ["organization"]),
    "headquartered_in": (["organization"], ["location"]),
}
|
||||
|
||||
|
||||
def test_texto_corto_produce_nodos_y_aristas_esperados():
    """A short text yields the expected nodes and edges with the stub model."""
    text = "Carlos Torres es presidente de BBVA con sede en Bilbao."
    result = extract_graph_from_text(
        text=text,
        entity_labels=ENTITY_LABELS,
        relation_labels=RELATION_LABELS,
        allowed=ALLOWED,
        model=_StubModel(),
        threshold=0.3,
    )

    # Every entity returned by the stub must appear as a node id.
    node_ids = {n["id"] for n in result["nodes"]}
    assert "Carlos Torres" in node_ids
    assert "BBVA" in node_ids
    assert "Bilbao" in node_ids

    # Both relations returned by the stub must appear as edge kinds.
    edge_kinds = {e["kind"] for e in result["edges"]}
    assert "president_of" in edge_kinds
    assert "headquartered_in" in edge_kinds
|
||||
|
||||
|
||||
def test_stats_tiene_todos_los_campos_requeridos():
    """The ``stats`` entry of the result carries every required field."""
    text = "Texto de prueba para el pipeline."
    result = extract_graph_from_text(
        text=text,
        entity_labels=ENTITY_LABELS,
        relation_labels=RELATION_LABELS,
        allowed=ALLOWED,
        model=_StubModel(),
    )
    stats = result["stats"]

    # Presence of each expected counter / timing field.
    assert "n_chunks" in stats
    assert "n_nodes" in stats
    assert "n_edges" in stats
    assert "n_dropped_typed" in stats
    assert "elapsed_s" in stats

    # Basic sanity bounds on the counters.
    assert stats["n_chunks"] >= 1
    assert stats["n_nodes"] >= 0
|
||||
@@ -0,0 +1,117 @@
|
||||
"""Tests para generate_isochrones_by_zone_pipeline.
|
||||
|
||||
Crea archivos GeoJSON temporales con zonas sintéticas sobre Madrid
|
||||
y verifica el resultado del pipeline con el stack Valhalla activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.generate_isochrones_by_zone_pipeline import (
|
||||
generate_isochrones_by_zone_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def _make_square_geojson(center_lon: float, center_lat: float, half: float) -> dict:
    """Build a GeoJSON FeatureCollection with one square Polygon.

    Args:
        center_lon: Longitude of the square's centre.
        center_lat: Latitude of the square's centre.
        half: Half of the side length, in the same units as the coordinates;
            it is subtracted from and added to each centre coordinate.

    Returns:
        A ``FeatureCollection`` dict with a single ``Feature`` whose geometry
        is a ``Polygon`` with one closed ring in ``[lon, lat]`` order and
        empty ``properties``.
    """
    min_lon, max_lon = center_lon - half, center_lon + half
    min_lat, max_lat = center_lat - half, center_lat + half
    # Single exterior ring; the first vertex is repeated at the end so the
    # ring is closed.
    ring = [
        [min_lon, min_lat],
        [max_lon, min_lat],
        [max_lon, max_lat],
        [min_lon, max_lat],
        [min_lon, min_lat],
    ]
    return {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [ring],
                },
                "properties": {},
            }
        ],
    }
|
||||
|
||||
|
||||
def test_generate_isochrones_by_zone_pipeline():
    """Run the pipeline on two synthetic square zones over Madrid.

    Uses 50 points (all with ``seconds=600``) and 3 centers. The pipeline is
    invoked with ``base_url="http://localhost:8002"``, so a Valhalla instance
    must be reachable there for this test to pass.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # North zone: square around Chamartin (northern Madrid).
        zone_norte_path = os.path.join(tmpdir, "zone_norte.geojson")
        # Explicit encoding so the file does not depend on the locale default.
        with open(zone_norte_path, "w", encoding="utf-8") as f:
            json.dump(_make_square_geojson(-3.685, 40.47, 0.05), f)

        # South zone: square around Vallecas (southern Madrid).
        zone_sur_path = os.path.join(tmpdir, "zone_sur.geojson")
        with open(zone_sur_path, "w", encoding="utf-8") as f:
            json.dump(_make_square_geojson(-3.666, 40.38, 0.05), f)

        zones = [
            {"label": "Norte", "geojson_path": zone_norte_path, "exclude_geojson_path": None},
            {"label": "Sur", "geojson_path": zone_sur_path, "exclude_geojson_path": None},
        ]

        # 50 points: 25 per zone on a diagonal through each zone centre, all
        # with seconds=600 (10 min).
        points_norte = [
            {"lat": 40.47 + i * 0.001, "lon": -3.685 + i * 0.001, "seconds": 600.0}
            for i in range(-12, 13)
        ]
        points_sur = [
            {"lat": 40.38 + i * 0.001, "lon": -3.666 + i * 0.001, "seconds": 600.0}
            for i in range(-12, 13)
        ]
        points = points_norte + points_sur

        # 3 centers: 2 in the north zone, 1 in the south zone.
        centers = [
            {"lat": 40.47, "lon": -3.685, "id": "centro_norte_a"},
            {"lat": 40.465, "lon": -3.680, "id": "centro_norte_b"},
            {"lat": 40.380, "lon": -3.666, "id": "centro_sur_a"},
        ]

        # Must run while the temporary directory (and zone files) exists.
        result = asyncio.run(
            generate_isochrones_by_zone_pipeline(
                zones=zones,
                points=points,
                centers=centers,
                base_url="http://localhost:8002",
                concurrency=4,
            )
        )

    assert isinstance(result, dict)
    assert "zones" in result
    zone_results = result["zones"]

    assert len(zone_results) == 2, f"Esperadas 2 zonas, got {len(zone_results)}"

    for z in zone_results:
        assert "label" in z
        assert "minutes" in z
        assert "n_points" in z
        assert "n_centers" in z
        assert "isochrones" in z
        assert isinstance(z["isochrones"], list)

    # Every point carries seconds=600 (10 min); the test expects `minutes`
    # (described by the author as the p75 of those times) to be ~10.
    for z in zone_results:
        if z["n_points"] > 0:
            assert z["minutes"] is not None
            assert 9.5 <= z["minutes"] <= 10.5, f"p75 esperado ~10 min, got {z['minutes']}"

    # At least one zone must have produced isochrones (requires Valhalla up).
    total_isos = sum(len(z["isochrones"]) for z in zone_results)
    assert total_isos >= 1, "Al menos una isócrona debe generarse"
|
||||
@@ -0,0 +1,38 @@
|
||||
"""Tests para setup_geo_stack_docker_pipeline.
|
||||
|
||||
El geo stack ya está corriendo en localhost:8002 (Valhalla), por lo que
|
||||
verify=True retorna flags reales del stack activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.setup_geo_stack_docker_pipeline import (
|
||||
setup_geo_stack_docker_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def test_setup_geo_stack_docker_pipeline():
    """Verify the geo stack on localhost, assuming docker is already up.

    The final assertion requires Valhalla to answer on localhost:8002.
    """
    # Called with verify=True and wait_seconds=0: the stack is expected to be
    # running already, so there is nothing to wait for.
    # NOTE(review): the compose path is relative, so it resolves against the
    # current working directory - confirm the test is run from the repo root.
    result = setup_geo_stack_docker_pipeline(
        compose_path="apps/footprint_geo_stack/docker-compose.yml",
        wait_seconds=0,
        verify=True,
    )

    assert isinstance(result, dict)
    assert set(result.keys()) == {"docker_up", "valhalla_ok", "postgis_ok", "martin_ok"}

    # docker_up may be False when the compose file is absent (e.g. in CI);
    # what matters here is that every flag is a real bool.
    for key in ("docker_up", "valhalla_ok", "postgis_ok", "martin_ok"):
        assert isinstance(result[key], bool), f"{key} debe ser bool"

    # Valhalla is expected to be live on localhost:8002.
    assert result["valhalla_ok"] is True, "Valhalla debe responder en localhost:8002"
|
||||
Reference in New Issue
Block a user