feat: extraccion masiva footprint_aurgi (41 funcs + 4 types + stack Docker geo)
Extrae al registry funciones del proyecto interno footprint_aurgi: - core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb - geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket - geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout - valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n - datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull - datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column - datascience viz (2): plot_kde_2d, plot_heatmap_log - infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest - pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone - types geo (4): LonLat, BBox, IsochroneRequest, Centro Incluye: - apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose) - 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH) - Issue tracker dev/issues/0052-footprint-aurgi-extraction.md - Atribucion uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi - Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines) Tambien commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,66 @@
|
||||
---
|
||||
id: compute_centers_reachability_pipeline_py_pipelines
|
||||
name: compute_centers_reachability_pipeline
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "async def compute_centers_reachability_pipeline(origins, centers, isochrone_minutes, base_url, concurrency) -> dict"
|
||||
description: "Calcula la accesibilidad de centros de servicio: matriz tiempo/distancia clientes→centros e isócronas por centro usando Valhalla."
|
||||
tags: [pipeline, geo, footprint, valhalla, isochrone, matrix, reachability]
|
||||
uses_functions: ["valhalla_matrix_1_to_n_py_geo", "valhalla_isochrones_async_py_geo"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
example: |
|
||||
import asyncio
|
||||
result = asyncio.run(compute_centers_reachability_pipeline(
|
||||
origins=[(40.4168, -3.7038), (37.3891, -5.9845)],
|
||||
centers=[(41.3851, 2.1734), (43.2627, -2.9253)],
|
||||
isochrone_minutes=15,
|
||||
))
|
||||
# result["matrix"] tiene 4 entries (2 orígenes × 2 centros)
|
||||
# result["isochrones"] tiene 2 GeoJSON dicts
|
||||
tested: true
|
||||
tests: ["test_compute_centers_reachability_pipeline"]
|
||||
test_file_path: "python/functions/pipelines/tests/test_compute_centers_reachability_pipeline.py"
|
||||
file_path: "python/functions/pipelines/compute_centers_reachability_pipeline.py"
|
||||
params:
|
||||
- {name: origins, desc: "Lista de (lat, lon) de los clientes u orígenes. Coordenadas WGS84."}
|
||||
- {name: centers, desc: "Lista de (lat, lon) de los centros de servicio. Coordenadas WGS84."}
|
||||
- {name: isochrone_minutes, desc: "Minutos de tiempo de viaje para las isócronas de cada centro. Default 15."}
|
||||
- {name: base_url, desc: "URL base del servidor Valhalla. Default http://localhost:8002."}
|
||||
- {name: concurrency, desc: "Número máximo de requests async simultáneos para isócronas. Default 6."}
|
||||
output: "Dict con 'matrix' (list[dict] con i,j,meters,seconds,error por par) e 'isochrones' (list[dict|None] con GeoJSON por centro)."
|
||||
source_repo: "internal:footprint_aurgi"
|
||||
source_license: "internal-aurgi"
|
||||
source_file: "internal:composed"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
|
||||
result = asyncio.run(compute_centers_reachability_pipeline(
|
||||
origins=[(40.4168, -3.7038), (37.3891, -5.9845)], # Madrid, Sevilla
|
||||
centers=[(41.3851, 2.1734), (43.2627, -2.9253)], # Barcelona, Bilbao
|
||||
isochrone_minutes=15,
|
||||
base_url="http://localhost:8002",
|
||||
))
|
||||
|
||||
for entry in result["matrix"]:
|
||||
print(f"origen[{entry['i']}] → centro[{entry['j']}]: {entry['meters']:.0f}m / {entry['seconds']:.0f}s")
|
||||
|
||||
print(f"Isócronas calculadas: {sum(1 for iso in result['isochrones'] if iso is not None)}")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Función async — requiere `asyncio.run(...)` o `await` dentro de un contexto async.
|
||||
La matriz se calcula con threads (valhalla_matrix_1_to_n usa ThreadPoolExecutor).
|
||||
Las isócronas se calculan con httpx async (valhalla_isochrones_async).
|
||||
Orden de la matrix: [(0,0),(0,1),...,(0,N-1),(1,0),...,(M-1,N-1)] donde M=orígenes, N=centros.
|
||||
@@ -0,0 +1,82 @@
|
||||
"""Pipeline: calcula matriz de tiempo/distancia y isócronas para centros de servicio."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
|
||||
_FUNCTIONS_DIR = os.path.join(os.path.dirname(__file__), "..")
|
||||
if _FUNCTIONS_DIR not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS_DIR)
|
||||
|
||||
from geo.valhalla_matrix_1_to_n import valhalla_matrix_1_to_n
|
||||
from geo.valhalla_isochrones_async import valhalla_isochrones_async
|
||||
|
||||
|
||||
async def compute_centers_reachability_pipeline(
|
||||
origins: list[tuple[float, float]],
|
||||
centers: list[tuple[float, float]],
|
||||
isochrone_minutes: int = 15,
|
||||
base_url: str = "http://localhost:8002",
|
||||
concurrency: int = 6,
|
||||
) -> dict:
|
||||
"""Calcula la accesibilidad de centros de servicio desde orígenes clientes.
|
||||
|
||||
Compone valhalla_matrix_1_to_n para obtener tiempos/distancias de todos
|
||||
los pares (origin, center) y valhalla_isochrones_async para generar una
|
||||
isócrona por cada centro.
|
||||
|
||||
Args:
|
||||
origins: Lista de (lat, lon) de los clientes o puntos de origen.
|
||||
centers: Lista de (lat, lon) de los centros de servicio.
|
||||
isochrone_minutes: Minutos de isócrona a calcular para cada centro.
|
||||
base_url: URL base del servidor Valhalla.
|
||||
concurrency: Número máximo de requests async simultáneos para isócronas.
|
||||
|
||||
Returns:
|
||||
Dict con claves:
|
||||
"matrix" (list[dict]): Un dict por par (origin_i, center_j) con
|
||||
{i, j, meters, seconds, error}. Orden: todos los centros
|
||||
para origin[0], luego origin[1], etc.
|
||||
"isochrones" (list[dict|None]): Una isócrona GeoJSON por cada center,
|
||||
en el mismo orden. None si Valhalla falló para ese centro.
|
||||
"""
|
||||
n_origins = len(origins)
|
||||
n_centers = len(centers)
|
||||
|
||||
# Build all pairs: (origin_idx, center_idx)
|
||||
pairs = [(i, j) for i in range(n_origins) for j in range(n_centers)]
|
||||
|
||||
# 1. Matrix: origins as sources, centers as destinations
|
||||
raw_matrix = valhalla_matrix_1_to_n(
|
||||
origins=origins,
|
||||
destinations=centers,
|
||||
pairs=pairs,
|
||||
base_url=base_url,
|
||||
concurrency=concurrency,
|
||||
)
|
||||
|
||||
matrix = [
|
||||
{
|
||||
"i": pairs[k][0],
|
||||
"j": pairs[k][1],
|
||||
"meters": raw_matrix[k]["meters"],
|
||||
"seconds": raw_matrix[k]["seconds"],
|
||||
"error": raw_matrix[k]["error"],
|
||||
}
|
||||
for k in range(len(pairs))
|
||||
]
|
||||
|
||||
# 2. Isochrones: one per center
|
||||
iso_requests = [
|
||||
{"lat": lat, "lon": lon, "minutes": isochrone_minutes, "id": str(idx)}
|
||||
for idx, (lat, lon) in enumerate(centers)
|
||||
]
|
||||
isochrones = await valhalla_isochrones_async(
|
||||
requests=iso_requests,
|
||||
base_url=base_url,
|
||||
concurrency=concurrency,
|
||||
)
|
||||
|
||||
return {"matrix": matrix, "isochrones": list(isochrones)}
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
id: count_points_per_zone_pipeline_py_pipelines
|
||||
name: count_points_per_zone_pipeline
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def count_points_per_zone_pipeline(points: list[tuple[float, float]], zones: list[dict]) -> dict"
|
||||
description: "Cuenta cuántos puntos (lon, lat) caen dentro de cada zona geográfica definida por GeoJSON."
|
||||
tags: [pipeline, geo, footprint, geojson, spatial, count, zone]
|
||||
uses_functions: ["load_geojson_polygons_py_geo", "polygon_bbox_py_geo", "point_in_polygons_bbox_py_geo"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
example: |
|
||||
result = count_points_per_zone_pipeline(
|
||||
points=[(-3.70, 40.41), (-3.65, 40.45)],
|
||||
zones=[{"label": "Madrid centro", "geojson_path": "zones/madrid_centro.geojson"}],
|
||||
)
|
||||
# result["counts"] == {"Madrid centro": 2}, result["unassigned"] == 0
|
||||
tested: true
|
||||
tests: ["test_count_points_per_zone_pipeline"]
|
||||
test_file_path: "python/functions/pipelines/tests/test_count_points_per_zone_pipeline.py"
|
||||
file_path: "python/functions/pipelines/count_points_per_zone_pipeline.py"
|
||||
params:
|
||||
- {name: points, desc: "Lista de (lon, lat) de los puntos a clasificar por zona. Coordenadas WGS84."}
|
||||
- {name: zones, desc: "Lista de dicts con label y geojson_path. Define las zonas geográficas a evaluar."}
|
||||
output: "Dict con counts (label→int), total_points, total_assigned y unassigned."
|
||||
source_repo: "internal:footprint_aurgi"
|
||||
source_license: "internal-aurgi"
|
||||
source_file: "internal:composed"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
result = count_points_per_zone_pipeline(
|
||||
points=[(-3.70, 40.41), (-3.65, 40.45), (-4.50, 39.80)],
|
||||
zones=[
|
||||
{"label": "Madrid centro", "geojson_path": "zones/madrid_centro.geojson"},
|
||||
{"label": "Área metropolitana", "geojson_path": "zones/madrid_metro.geojson"},
|
||||
],
|
||||
)
|
||||
|
||||
print(result["counts"])
|
||||
# {"Madrid centro": 2, "Área metropolitana": 2}
|
||||
print(f"Sin zona: {result['unassigned']}") # 1 (el punto en Toledo)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Los puntos se representan como (lon, lat) — orden GeoJSON estándar (longitud primero).
|
||||
Un punto puede contarse en múltiples zonas si los polígonos se solapan: `total_assigned` puede superar `total_points`.
|
||||
`unassigned` cuenta los puntos que no cayeron en ninguna de las zonas especificadas.
|
||||
La carga de polígonos se hace una vez por zona; el prefiltraje por bbox reduce el coste de ray-casting.
|
||||
@@ -0,0 +1,66 @@
|
||||
"""Pipeline: cuenta cuántos puntos caen dentro de cada zona geográfica GeoJSON."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
_FUNCTIONS_DIR = os.path.join(os.path.dirname(__file__), "..")
|
||||
if _FUNCTIONS_DIR not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS_DIR)
|
||||
|
||||
from geo.load_geojson_polygons import load_geojson_polygons
|
||||
from geo.polygon_bbox import polygon_bbox
|
||||
from geo.point_in_polygons_bbox import point_in_polygons_bbox
|
||||
|
||||
|
||||
def count_points_per_zone_pipeline(
|
||||
points: list[tuple[float, float]],
|
||||
zones: list[dict],
|
||||
) -> dict:
|
||||
"""Cuenta cuántos puntos caen dentro de cada zona geográfica.
|
||||
|
||||
Para cada zona carga los polígonos GeoJSON, precalcula los bboxes
|
||||
y aplica prefiltraje+ray-casting para contar puntos dentro.
|
||||
Un mismo punto puede contarse en varias zonas si los polígonos se solapan.
|
||||
|
||||
Args:
|
||||
points: Lista de (lon, lat) de los puntos a clasificar.
|
||||
zones: Lista de dicts con:
|
||||
- "label" (str): nombre de la zona.
|
||||
- "geojson_path" (str): ruta al GeoJSON de la zona.
|
||||
|
||||
Returns:
|
||||
Dict con claves:
|
||||
"counts" (dict[str, int]): {label: n_puntos_en_zona}.
|
||||
"total_points" (int): total de puntos de entrada.
|
||||
"total_assigned" (int): suma de conteos (puede superar total_points si hay solapamiento).
|
||||
"unassigned" (int): puntos que no caen en ninguna zona.
|
||||
"""
|
||||
counts: dict[str, int] = {}
|
||||
# Track which points were assigned to at least one zone
|
||||
assigned_flags = [False] * len(points)
|
||||
|
||||
for zone in zones:
|
||||
label = zone["label"]
|
||||
polygons = load_geojson_polygons(zone["geojson_path"])
|
||||
bboxes = [polygon_bbox(p) for p in polygons]
|
||||
|
||||
zone_count = 0
|
||||
for idx, (lon, lat) in enumerate(points):
|
||||
if point_in_polygons_bbox(float(lon), float(lat), polygons, bboxes):
|
||||
zone_count += 1
|
||||
assigned_flags[idx] = True
|
||||
|
||||
counts[label] = zone_count
|
||||
|
||||
total_points = len(points)
|
||||
total_assigned = sum(counts.values())
|
||||
unassigned = sum(1 for f in assigned_flags if not f)
|
||||
|
||||
return {
|
||||
"counts": counts,
|
||||
"total_points": total_points,
|
||||
"total_assigned": total_assigned,
|
||||
"unassigned": unassigned,
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
---
|
||||
name: extract_graph_from_text
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_graph_from_text(text: str, entity_labels: list[str], relation_labels: list | dict, allowed: dict, model: Any, threshold: float = 0.3, max_chars_per_chunk: int = 1500, overlap_sentences: int = 2) -> dict"
|
||||
description: "Pipeline E2E: texto -> grafo de entidades y relaciones. Orquesta chunking, extraccion con GLiNER2 por chunk, agregacion, filtrado tipado y resolucion de alias. Refactorizacion del playground del analisis gliner_glirel_tuning."
|
||||
tags: [pipeline, graph, ner, relation-extraction, gliner2, nlp, e2e, knowledge-graph, datascience, python]
|
||||
uses_functions:
|
||||
- chunk_with_overlap_py_core
|
||||
- extract_graph_gliner2_py_datascience
|
||||
- aggregate_extraction_results_py_core
|
||||
- filter_relations_by_entity_types_py_core
|
||||
- merge_entity_aliases_py_core
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [time, typing.Any]
|
||||
params:
|
||||
- name: text
|
||||
desc: "Texto de entrada de cualquier longitud. Se auto-chunkea si supera max_chars_per_chunk. Recomendado: pre-limpiar con clean_pdf_text si viene de un PDF."
|
||||
- name: entity_labels
|
||||
desc: "Tipos de entidad para GLiNER2. E.g. ['person', 'organization', 'location']. Usar snake_case (mejor recall segun notebook 08)."
|
||||
- name: relation_labels
|
||||
desc: "Tipos de relacion. Lista de strings o dict {label: description}. E.g. ['works_at', 'ceo_of'] o {'ceo_of': 'person is CEO of organization'}."
|
||||
- name: allowed
|
||||
desc: "Reglas de filtrado tipado {rel_type: (head_types, tail_types)}. Pasar {} para desactivar el filtrado. E.g. {'ceo_of': (['person'], ['organization'])}."
|
||||
- name: model
|
||||
desc: "Instancia GLiNER2 cargada con gliner2_load_model. Inyectada por el caller para permitir cache entre llamadas."
|
||||
- name: threshold
|
||||
desc: "Umbral de confianza GLiNER2 (0-1). 0.3 validado empiricamente. Menor = mas recall, mas ruido."
|
||||
- name: max_chars_per_chunk
|
||||
desc: "Maximo de caracteres por chunk antes de dividir. 1500 es el valor optimo para GLiNER2-large."
|
||||
- name: overlap_sentences
|
||||
desc: "Frases de overlap entre chunks consecutivos. 2 evita perder entidades en los bordes de chunk."
|
||||
output: "Dict con 'nodes' (lista de {id, type, count}), 'edges' (lista de {from, to, kind}) y 'stats' ({n_chunks, n_nodes, n_edges, n_dropped_typed, elapsed_s}). Listo para serializar a JSON y visualizar con Sigma/D3."
|
||||
tested: true
|
||||
tests:
|
||||
- "texto corto produce nodos y aristas esperados con stub model"
|
||||
- "stats tiene todos los campos requeridos"
|
||||
test_file_path: "python/functions/pipelines/tests/test_extract_graph_from_text.py"
|
||||
file_path: "python/functions/pipelines/extract_graph_from_text.py"
|
||||
notes: |
|
||||
Refactorizacion directa del playground/server.py del analisis
|
||||
projects/osint_graph/analysis/gliner_glirel_tuning.
|
||||
Todas las recetas validadas empiricamente en los notebooks 04, 06 y 08:
|
||||
- threshold=0.3 (notebook 04)
|
||||
- overlap_sentences=2 (notebook 06)
|
||||
- filtrado tipado (notebook 08)
|
||||
- coreference normalize+substring (playground/server.py)
|
||||
|
||||
Para PDFs: usar extract_pdf_text + clean_pdf_text antes de llamar a este pipeline.
|
||||
Para OpenIE sin vocabulario fijo: usar extract_triples_spacy_es como alternativa.
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.gliner2_load_model import gliner2_load_model
|
||||
from pipelines.extract_graph_from_text import extract_graph_from_text
|
||||
|
||||
model = gliner2_load_model(device="auto")
|
||||
|
||||
ENTITY_LABELS = ["person", "organization", "location"]
|
||||
RELATION_LABELS = ["works_at", "ceo_of", "headquartered_in", "president_of"]
|
||||
ALLOWED = {
|
||||
"ceo_of": (["person"], ["organization"]),
|
||||
"president_of": (["person"], ["organization"]),
|
||||
"works_at": (["person"], ["organization"]),
|
||||
"headquartered_in": (["organization"], ["location"]),
|
||||
}
|
||||
|
||||
text = """Carlos Torres Blanco es el presidente de BBVA.
|
||||
BBVA tiene su sede corporativa en Bilbao, aunque opera en mas de 30 paises."""
|
||||
|
||||
graph = extract_graph_from_text(
|
||||
text=text,
|
||||
entity_labels=ENTITY_LABELS,
|
||||
relation_labels=RELATION_LABELS,
|
||||
allowed=ALLOWED,
|
||||
model=model,
|
||||
)
|
||||
# graph["nodes"] -> [{"id": "Carlos Torres Blanco", "type": "person", "count": 1}, ...]
|
||||
# graph["edges"] -> [{"from": "Carlos Torres Blanco", "to": "BBVA", "kind": "president_of"}]
|
||||
# graph["stats"] -> {"n_chunks": 1, "n_nodes": 3, "n_edges": 2, ...}
|
||||
```
|
||||
@@ -0,0 +1,147 @@
|
||||
"""Pipeline E2E: text -> entities + relations + graph nodes/edges.
|
||||
|
||||
Compone las funciones del registry:
|
||||
- chunk_with_overlap (si len(text) > max_chars_per_chunk)
|
||||
- extract_graph_gliner2 (por chunk)
|
||||
- aggregate_extraction_results
|
||||
- filter_relations_by_entity_types
|
||||
- merge_entity_aliases
|
||||
|
||||
Es el flujo completo del playground (server.py del analisis gliner_glirel_tuning)
|
||||
refactorizado como funcion componible.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
if _ROOT not in sys.path:
|
||||
sys.path.insert(0, _ROOT)
|
||||
|
||||
from python.functions.core.chunk_with_overlap import chunk_with_overlap
|
||||
from python.functions.core.aggregate_extraction_results import aggregate_extraction_results
|
||||
from python.functions.core.filter_relations_by_entity_types import filter_relations_by_entity_types
|
||||
from python.functions.core.merge_entity_aliases import merge_entity_aliases
|
||||
from python.functions.datascience.extract_graph_gliner2 import extract_graph_gliner2
|
||||
|
||||
|
||||
def extract_graph_from_text(
|
||||
text: str,
|
||||
entity_labels: list[str],
|
||||
relation_labels: list | dict,
|
||||
allowed: dict,
|
||||
model: Any,
|
||||
threshold: float = 0.3,
|
||||
max_chars_per_chunk: int = 1500,
|
||||
overlap_sentences: int = 2,
|
||||
) -> dict:
|
||||
"""Full pipeline: text -> graph (nodes + edges).
|
||||
|
||||
Orchestrates chunking, per-chunk extraction, aggregation, typed filtering
|
||||
and alias resolution. Returns a graph ready for visualization.
|
||||
|
||||
Args:
|
||||
text: Input text (any length). Auto-chunked if > max_chars_per_chunk.
|
||||
entity_labels: E.g. ["person", "organization", "location"].
|
||||
relation_labels: E.g. ["works_at", "ceo_of", "located_in"] or dict
|
||||
with descriptions per label.
|
||||
allowed: Typed filter rules {rel_type: (head_types, tail_types)}.
|
||||
Pass {} to skip typed filtering.
|
||||
model: GLiNER2 model instance from gliner2_load_model.
|
||||
threshold: Confidence threshold (0.3 validated empirically).
|
||||
max_chars_per_chunk: Max chars per chunk before splitting.
|
||||
overlap_sentences: Sentence overlap between consecutive chunks.
|
||||
|
||||
Returns:
|
||||
{
|
||||
"nodes": [{"id": str, "type": str, "count": int}, ...],
|
||||
"edges": [{"from": str, "to": str, "kind": str}, ...],
|
||||
"stats": {
|
||||
"n_chunks": int,
|
||||
"n_nodes": int,
|
||||
"n_edges": int,
|
||||
"n_dropped_typed": int,
|
||||
"elapsed_s": float
|
||||
}
|
||||
}
|
||||
"""
|
||||
t0 = time.time()
|
||||
|
||||
# 1. Chunking
|
||||
if len(text) <= max_chars_per_chunk:
|
||||
chunks = [text]
|
||||
else:
|
||||
chunks = [
|
||||
c["text"]
|
||||
for c in chunk_with_overlap(
|
||||
text,
|
||||
max_chars=max_chars_per_chunk,
|
||||
overlap_sentences=overlap_sentences,
|
||||
)
|
||||
]
|
||||
|
||||
# 2. Extraccion por chunk
|
||||
results = [
|
||||
extract_graph_gliner2(
|
||||
chunk,
|
||||
entity_labels=entity_labels,
|
||||
relation_labels=relation_labels,
|
||||
model=model,
|
||||
threshold=threshold,
|
||||
)
|
||||
for chunk in chunks
|
||||
]
|
||||
|
||||
# 3. Agregacion
|
||||
agg = aggregate_extraction_results(results)
|
||||
|
||||
# 4. name_to_type para el filtrado tipado
|
||||
name_to_type = {key[1]: data["type"] for key, data in agg["entities"].items()}
|
||||
|
||||
# 5. Convertir Counter a dict {rel_type: [(h, t), ...]}
|
||||
raw_relations: dict[str, list] = {}
|
||||
for (h, rt, t), _count in agg["relations"].items():
|
||||
raw_relations.setdefault(rt, []).append((h, t))
|
||||
|
||||
# 6. Filtrado tipado
|
||||
keep, drop = filter_relations_by_entity_types(raw_relations, name_to_type, allowed)
|
||||
|
||||
# 7. Coreference / alias
|
||||
original_names = [data["name"] for data in agg["entities"].values()]
|
||||
alias = merge_entity_aliases(original_names)
|
||||
|
||||
# 8. Construir nodos con alias aplicado
|
||||
nodes_dict: dict[str, dict] = {}
|
||||
for (typ, _key), data in agg["entities"].items():
|
||||
canon = alias.get(data["name"], data["name"])
|
||||
if canon not in nodes_dict:
|
||||
nodes_dict[canon] = {"type": typ, "count": data["count"]}
|
||||
else:
|
||||
nodes_dict[canon]["count"] += data["count"]
|
||||
|
||||
# 9. Construir aristas deduplicadas con alias aplicado
|
||||
edges_set: set[tuple[str, str, str]] = set()
|
||||
for e in keep:
|
||||
h_canon = alias.get(e["from"], e["from"])
|
||||
t_canon = alias.get(e["to"], e["to"])
|
||||
if h_canon == t_canon:
|
||||
continue
|
||||
edges_set.add((h_canon, e["kind"], t_canon))
|
||||
|
||||
elapsed = round(time.time() - t0, 2)
|
||||
|
||||
return {
|
||||
"nodes": [{"id": n, "type": info["type"], "count": info["count"]} for n, info in nodes_dict.items()],
|
||||
"edges": [{"from": h, "to": t, "kind": k} for h, k, t in edges_set],
|
||||
"stats": {
|
||||
"n_chunks": len(chunks),
|
||||
"n_nodes": len(nodes_dict),
|
||||
"n_edges": len(edges_set),
|
||||
"n_dropped_typed": len(drop),
|
||||
"elapsed_s": elapsed,
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
---
|
||||
id: generate_isochrones_by_zone_pipeline_py_pipelines
|
||||
name: generate_isochrones_by_zone_pipeline
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "async def generate_isochrones_by_zone_pipeline(zones, points, centers, base_url, concurrency) -> dict"
|
||||
description: "Genera isócronas Valhalla por zona geográfica usando el p75 de tiempos de viaje de los puntos en cada zona."
|
||||
tags: [pipeline, geo, footprint, valhalla, isochrone, zone, p75, geojson]
|
||||
uses_functions: ["load_geojson_polygons_py_geo", "polygon_bbox_py_geo", "point_in_polygons_bbox_py_geo", "summary_stats_py_datascience", "valhalla_isochrones_async_py_geo"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
example: |
|
||||
import asyncio
|
||||
result = asyncio.run(generate_isochrones_by_zone_pipeline(
|
||||
zones=[{"label": "M-30", "geojson_path": "zones/m30.geojson", "exclude_geojson_path": None}],
|
||||
points=[{"lat": 40.41, "lon": -3.70, "seconds": 600.0}],
|
||||
centers=[{"lat": 40.42, "lon": -3.69, "id": "centro_a"}],
|
||||
))
|
||||
# result["zones"][0]["minutes"] == 10.0, isochrones lista con GeoJSON
|
||||
tested: true
|
||||
tests: ["test_generate_isochrones_by_zone_pipeline"]
|
||||
test_file_path: "python/functions/pipelines/tests/test_generate_isochrones_by_zone_pipeline.py"
|
||||
file_path: "python/functions/pipelines/generate_isochrones_by_zone_pipeline.py"
|
||||
params:
|
||||
- {name: zones, desc: "Lista de dicts con label, geojson_path y exclude_geojson_path (opcional). Define las zonas geográficas."}
|
||||
- {name: points, desc: "Lista de dicts con lat, lon y seconds (tiempo de viaje real medido). Se calcula p75 por zona."}
|
||||
- {name: centers, desc: "Lista de dicts con lat, lon e id. Se filtra por zona para generar isócronas."}
|
||||
- {name: base_url, desc: "URL base del servidor Valhalla. Default http://localhost:8002."}
|
||||
- {name: concurrency, desc: "Número máximo de requests async simultáneos para isócronas. Default 6."}
|
||||
output: "Dict con 'zones': lista de {label, minutes, n_points, n_centers, isochrones} por zona."
|
||||
source_repo: "internal:footprint_aurgi"
|
||||
source_license: "internal-aurgi"
|
||||
source_file: "internal:composed"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
|
||||
result = asyncio.run(generate_isochrones_by_zone_pipeline(
|
||||
zones=[
|
||||
{"label": "M-30", "geojson_path": "zones/m30.geojson", "exclude_geojson_path": None},
|
||||
{"label": "M-40", "geojson_path": "zones/m40.geojson", "exclude_geojson_path": "zones/m30.geojson"},
|
||||
],
|
||||
points=[
|
||||
{"lat": 40.41, "lon": -3.70, "seconds": 600.0},
|
||||
{"lat": 40.38, "lon": -3.72, "seconds": 900.0},
|
||||
],
|
||||
centers=[
|
||||
{"lat": 40.42, "lon": -3.69, "id": "centro_a"},
|
||||
{"lat": 40.35, "lon": -3.75, "id": "centro_b"},
|
||||
],
|
||||
))
|
||||
|
||||
for z in result["zones"]:
|
||||
print(f"{z['label']}: {z['n_points']} pts, p75={z['minutes']:.1f}min, {z['n_centers']} centros")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Función async — requiere `asyncio.run(...)` o `await`.
|
||||
El p75 se calcula sobre `seconds` de los puntos en zona; luego se divide entre 60 para obtener minutos.
|
||||
Si una zona no tiene puntos con seconds válidos, `minutes=None` y no se generan isócronas.
|
||||
El polígono de exclusión permite hacer anillos (ej: M-40 excluyendo M-30).
|
||||
Los centros se asignan a la primera zona que los contiene — no se deduplicam entre zonas.
|
||||
@@ -0,0 +1,130 @@
|
||||
"""Pipeline: genera isócronas por zona geográfica usando p75 de tiempos de los puntos en cada zona."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import math
|
||||
import sys
|
||||
import os
|
||||
|
||||
_FUNCTIONS_DIR = os.path.join(os.path.dirname(__file__), "..")
|
||||
if _FUNCTIONS_DIR not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS_DIR)
|
||||
|
||||
from geo.load_geojson_polygons import load_geojson_polygons
|
||||
from geo.polygon_bbox import polygon_bbox
|
||||
from geo.point_in_polygons_bbox import point_in_polygons_bbox
|
||||
from datascience.summary_stats import summary_stats
|
||||
from geo.valhalla_isochrones_async import valhalla_isochrones_async
|
||||
|
||||
|
||||
async def generate_isochrones_by_zone_pipeline(
|
||||
zones: list[dict],
|
||||
points: list[dict],
|
||||
centers: list[dict],
|
||||
base_url: str = "http://localhost:8002",
|
||||
concurrency: int = 6,
|
||||
) -> dict:
|
||||
"""Genera isócronas por zona usando el p75 de tiempos de viaje de los puntos en cada zona.
|
||||
|
||||
Para cada zona:
|
||||
1. Carga los polígonos GeoJSON de la zona (y de exclusión si se indica).
|
||||
2. Filtra los points que caen dentro de la zona (no en exclusión).
|
||||
3. Calcula el p75(seconds/60) → minutos de isócrona representativa.
|
||||
4. Filtra los centers que caen en la zona.
|
||||
5. Genera una isócrona Valhalla por cada center con esos minutos.
|
||||
|
||||
Args:
|
||||
zones: Lista de dicts con:
|
||||
- "label" (str): nombre de la zona.
|
||||
- "geojson_path" (str): ruta al GeoJSON de la zona.
|
||||
- "exclude_geojson_path" (str | None): ruta GeoJSON de exclusión (opcional).
|
||||
points: Lista de dicts con "lat" (float), "lon" (float) y "seconds" (float | None).
|
||||
Los puntos sin seconds o con seconds=None se omiten del cálculo de p75.
|
||||
centers: Lista de dicts con "lat" (float), "lon" (float) e "id" (str).
|
||||
base_url: URL base del servidor Valhalla.
|
||||
concurrency: Número máximo de requests async simultáneos para isócronas.
|
||||
|
||||
Returns:
|
||||
Dict con clave "zones": lista de dicts con:
|
||||
- "label" (str): nombre de la zona.
|
||||
- "minutes" (float | None): p75 en minutos usado para las isócronas. None si sin datos.
|
||||
- "n_points" (int): puntos en la zona con seconds válidos.
|
||||
- "n_centers" (int): centros en la zona.
|
||||
- "isochrones" (list[dict | None]): GeoJSON por cada center de la zona.
|
||||
"""
|
||||
zone_results = []
|
||||
|
||||
for zone in zones:
|
||||
label = zone["label"]
|
||||
geojson_path = zone["geojson_path"]
|
||||
exclude_path = zone.get("exclude_geojson_path")
|
||||
|
||||
# 1. Cargar polígonos de la zona
|
||||
polygons = load_geojson_polygons(geojson_path)
|
||||
bboxes = [polygon_bbox(p) for p in polygons]
|
||||
|
||||
# Cargar polígonos de exclusión si se indica
|
||||
exclude_polygons: list = []
|
||||
exclude_bboxes: list = []
|
||||
if exclude_path:
|
||||
exclude_polygons = load_geojson_polygons(exclude_path)
|
||||
exclude_bboxes = [polygon_bbox(p) for p in exclude_polygons]
|
||||
|
||||
# 2. Filtrar points en zona (y no en exclusión)
|
||||
zone_seconds: list[float] = []
|
||||
for pt in points:
|
||||
lon = float(pt["lon"])
|
||||
lat = float(pt["lat"])
|
||||
secs = pt.get("seconds")
|
||||
if secs is None or (isinstance(secs, float) and math.isnan(secs)):
|
||||
continue
|
||||
if not point_in_polygons_bbox(lon, lat, polygons, bboxes):
|
||||
continue
|
||||
if exclude_polygons and point_in_polygons_bbox(lon, lat, exclude_polygons, exclude_bboxes):
|
||||
continue
|
||||
zone_seconds.append(float(secs))
|
||||
|
||||
# 3. Calcular p75 en minutos
|
||||
minutes: float | None = None
|
||||
if zone_seconds:
|
||||
stats = summary_stats(zone_seconds)
|
||||
p75_secs = stats.get("p75", math.nan)
|
||||
if not math.isnan(p75_secs):
|
||||
minutes = p75_secs / 60.0
|
||||
|
||||
# 4. Filtrar centers en zona
|
||||
zone_centers = [
|
||||
c for c in centers
|
||||
if point_in_polygons_bbox(float(c["lon"]), float(c["lat"]), polygons, bboxes)
|
||||
]
|
||||
|
||||
# 5. Generar isócronas para cada center de la zona
|
||||
isochrones: list = []
|
||||
if zone_centers and minutes is not None and minutes > 0:
|
||||
iso_requests = [
|
||||
{
|
||||
"lat": float(c["lat"]),
|
||||
"lon": float(c["lon"]),
|
||||
"minutes": max(1, round(minutes)),
|
||||
"id": c.get("id", str(idx)),
|
||||
}
|
||||
for idx, c in enumerate(zone_centers)
|
||||
]
|
||||
isochrones = await valhalla_isochrones_async(
|
||||
requests=iso_requests,
|
||||
base_url=base_url,
|
||||
concurrency=concurrency,
|
||||
)
|
||||
|
||||
zone_results.append(
|
||||
{
|
||||
"label": label,
|
||||
"minutes": minutes,
|
||||
"n_points": len(zone_seconds),
|
||||
"n_centers": len(zone_centers),
|
||||
"isochrones": list(isochrones),
|
||||
}
|
||||
)
|
||||
|
||||
return {"zones": zone_results}
|
||||
@@ -0,0 +1,56 @@
|
||||
---
|
||||
id: setup_geo_stack_docker_pipeline_py_pipelines
|
||||
name: setup_geo_stack_docker_pipeline
|
||||
kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def setup_geo_stack_docker_pipeline(compose_path: str, wait_seconds: int, verify: bool) -> dict"
|
||||
description: "Levanta el geo stack Docker (Valhalla + PostGIS + Martin) via docker compose up -d y verifica que los tres servicios responden."
|
||||
tags: [pipeline, geo, footprint, docker, valhalla, postgis, martin]
|
||||
uses_functions: ["valhalla_route_py_geo"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
example: |
|
||||
result = setup_geo_stack_docker_pipeline(
|
||||
compose_path="apps/footprint_geo_stack/docker-compose.yml",
|
||||
wait_seconds=60,
|
||||
verify=True,
|
||||
)
|
||||
# {"docker_up": True, "valhalla_ok": True, "postgis_ok": True, "martin_ok": True}
|
||||
tested: true
|
||||
tests: ["test_setup_geo_stack_docker_pipeline"]
|
||||
test_file_path: "python/functions/pipelines/tests/test_setup_geo_stack_docker_pipeline.py"
|
||||
file_path: "python/functions/pipelines/setup_geo_stack_docker_pipeline.py"
|
||||
params:
|
||||
- {name: compose_path, desc: "Ruta al docker-compose.yml del geo stack (Valhalla + PostGIS + Martin)."}
|
||||
- {name: wait_seconds, desc: "Segundos a esperar tras docker compose up -d antes de verificar servicios. Default 60."}
|
||||
- {name: verify, desc: "Si True, verifica los tres servicios via HTTP/docker exec. Default True."}
|
||||
output: "Dict con docker_up, valhalla_ok, postgis_ok y martin_ok (bool cada uno)."
|
||||
source_repo: "internal:footprint_aurgi"
|
||||
source_license: "internal-aurgi"
|
||||
source_file: "internal:composed"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
result = setup_geo_stack_docker_pipeline(
|
||||
compose_path="apps/footprint_geo_stack/docker-compose.yml",
|
||||
wait_seconds=60,
|
||||
verify=True,
|
||||
)
|
||||
print(result)
|
||||
# {"docker_up": True, "valhalla_ok": True, "postgis_ok": True, "martin_ok": True}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Verifica Valhalla via GET /status, PostGIS via `docker exec footprint_postgis pg_isready -U postgres`,
|
||||
y Martin via GET /health en http://localhost:3000/health.
|
||||
Si `verify=False` solo retorna `docker_up` y el resto en False.
|
||||
El nombre del contenedor PostGIS (`footprint_postgis`) debe coincidir con el definido en el compose.
|
||||
@@ -0,0 +1,98 @@
|
||||
"""Pipeline: levanta el geo stack Docker (Valhalla + PostGIS + Martin) y verifica servicios."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from urllib import request as urllib_request
|
||||
from urllib.error import URLError
|
||||
|
||||
|
||||
def setup_geo_stack_docker_pipeline(
|
||||
compose_path: str = "apps/footprint_geo_stack/docker-compose.yml",
|
||||
wait_seconds: int = 60,
|
||||
verify: bool = True,
|
||||
) -> dict:
|
||||
"""Levanta el geo stack via docker compose y verifica que los servicios responden.
|
||||
|
||||
Ejecuta `docker compose up -d` sobre el compose_path dado, espera wait_seconds
|
||||
y luego verifica (si verify=True) que Valhalla, PostGIS y Martin están operativos.
|
||||
|
||||
Args:
|
||||
compose_path: Ruta al docker-compose.yml del geo stack.
|
||||
wait_seconds: Segundos a esperar tras `docker compose up -d` antes de verificar.
|
||||
verify: Si True, verifica los tres servicios via HTTP/docker exec.
|
||||
|
||||
Returns:
|
||||
Dict con claves:
|
||||
"docker_up" (bool): True si docker compose arrancó sin error.
|
||||
"valhalla_ok" (bool): True si Valhalla responde a /status.
|
||||
"postgis_ok" (bool): True si pg_isready retorna OK via docker exec.
|
||||
"martin_ok" (bool): True si Martin responde a /health.
|
||||
"""
|
||||
result = {
|
||||
"docker_up": False,
|
||||
"valhalla_ok": False,
|
||||
"postgis_ok": False,
|
||||
"martin_ok": False,
|
||||
}
|
||||
|
||||
# Step 1: docker compose up -d
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
["docker", "compose", "-f", compose_path, "up", "-d"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
)
|
||||
result["docker_up"] = proc.returncode == 0
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
||||
return result
|
||||
|
||||
if not result["docker_up"]:
|
||||
return result
|
||||
|
||||
if not verify:
|
||||
return result
|
||||
|
||||
# Step 2: wait for services to be ready
|
||||
if wait_seconds > 0:
|
||||
time.sleep(wait_seconds)
|
||||
|
||||
# Step 3: verify Valhalla via POST /route (lightweight status check via /status)
|
||||
try:
|
||||
req = urllib_request.Request(
|
||||
"http://localhost:8002/status",
|
||||
method="GET",
|
||||
)
|
||||
with urllib_request.urlopen(req, timeout=10) as resp:
|
||||
data = json.loads(resp.read().decode())
|
||||
result["valhalla_ok"] = isinstance(data, dict)
|
||||
except (URLError, OSError, json.JSONDecodeError, Exception):
|
||||
result["valhalla_ok"] = False
|
||||
|
||||
# Step 4: verify PostGIS via pg_isready inside docker exec
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
["docker", "exec", "footprint_postgis", "pg_isready", "-U", "postgres"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=15,
|
||||
)
|
||||
result["postgis_ok"] = proc.returncode == 0
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
||||
result["postgis_ok"] = False
|
||||
|
||||
# Step 5: verify Martin via /health
|
||||
try:
|
||||
req = urllib_request.Request(
|
||||
"http://localhost:3000/health",
|
||||
method="GET",
|
||||
)
|
||||
with urllib_request.urlopen(req, timeout=10) as resp:
|
||||
result["martin_ok"] = resp.status == 200
|
||||
except (URLError, OSError, Exception):
|
||||
result["martin_ok"] = False
|
||||
|
||||
return result
|
||||
@@ -0,0 +1,62 @@
|
||||
"""Tests para compute_centers_reachability_pipeline.
|
||||
|
||||
Usa 2 orígenes y 2 centros reales en España con el stack Valhalla activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.compute_centers_reachability_pipeline import (
|
||||
compute_centers_reachability_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def test_compute_centers_reachability_pipeline():
|
||||
"""Matrix 2×2 y 2 isócronas con Valhalla en localhost:8002."""
|
||||
origins = [
|
||||
(40.4168, -3.7038), # Madrid
|
||||
(37.3891, -5.9845), # Sevilla
|
||||
]
|
||||
centers = [
|
||||
(41.3851, 2.1734), # Barcelona
|
||||
(43.2627, -2.9253), # Bilbao
|
||||
]
|
||||
|
||||
result = asyncio.run(
|
||||
compute_centers_reachability_pipeline(
|
||||
origins=origins,
|
||||
centers=centers,
|
||||
isochrone_minutes=15,
|
||||
base_url="http://localhost:8002",
|
||||
concurrency=4,
|
||||
)
|
||||
)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert "matrix" in result
|
||||
assert "isochrones" in result
|
||||
|
||||
# Matrix: 2 orígenes × 2 centros = 4 entradas
|
||||
matrix = result["matrix"]
|
||||
assert len(matrix) == 4, f"Esperadas 4 entradas en matrix, got {len(matrix)}"
|
||||
|
||||
for entry in matrix:
|
||||
assert "i" in entry and "j" in entry
|
||||
assert "meters" in entry and "seconds" in entry and "error" in entry
|
||||
# Si Valhalla resolvió el par, meters > 0
|
||||
if entry["error"] == 0:
|
||||
assert entry["meters"] > 0, "meters debe ser > 0 cuando no hay error"
|
||||
|
||||
# Isochrones: 2 centros → 2 entradas
|
||||
isochrones = result["isochrones"]
|
||||
assert len(isochrones) == 2, f"Esperadas 2 isócronas, got {len(isochrones)}"
|
||||
|
||||
# Al menos una isócrona debe ser un dict GeoJSON válido
|
||||
valid_isos = [iso for iso in isochrones if isinstance(iso, dict)]
|
||||
assert len(valid_isos) >= 1, "Al menos una isócrona debe ser un dict GeoJSON"
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Tests para count_points_per_zone_pipeline.
|
||||
|
||||
Usa un cuadrado sintético como zona y puntos aleatorios entre lat[40,41] lon[-4,-3].
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.count_points_per_zone_pipeline import (
|
||||
count_points_per_zone_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def _make_square_geojson(min_lon: float, min_lat: float, max_lon: float, max_lat: float) -> dict:
|
||||
"""Crea un GeoJSON Polygon cuadrado con las coordenadas dadas."""
|
||||
return {
|
||||
"type": "FeatureCollection",
|
||||
"features": [
|
||||
{
|
||||
"type": "Feature",
|
||||
"geometry": {
|
||||
"type": "Polygon",
|
||||
"coordinates": [
|
||||
[
|
||||
[min_lon, min_lat],
|
||||
[max_lon, min_lat],
|
||||
[max_lon, max_lat],
|
||||
[min_lon, max_lat],
|
||||
[min_lon, min_lat],
|
||||
]
|
||||
],
|
||||
},
|
||||
"properties": {},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_count_points_per_zone_pipeline():
|
||||
"""100 puntos aleatorios en [40,41]x[-4,-3]. Zona = cuadrado interior [40.2,40.8]x[-3.8,-3.2]."""
|
||||
random.seed(42)
|
||||
# Puntos (lon, lat) — orden GeoJSON
|
||||
points = [
|
||||
(random.uniform(-4.0, -3.0), random.uniform(40.0, 41.0))
|
||||
for _ in range(100)
|
||||
]
|
||||
|
||||
# Zona interior: cuadrado centrado que cubre ~36% del área total → espera ~36 puntos
|
||||
zone_min_lon, zone_max_lon = -3.8, -3.2
|
||||
zone_min_lat, zone_max_lat = 40.2, 40.8
|
||||
|
||||
# Cuántos puntos deben caer (referencia para assert)
|
||||
expected_inside = sum(
|
||||
1 for lon, lat in points
|
||||
if zone_min_lon <= lon <= zone_max_lon and zone_min_lat <= lat <= zone_max_lat
|
||||
)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
zone_path = os.path.join(tmpdir, "zone_centro.geojson")
|
||||
with open(zone_path, "w") as f:
|
||||
json.dump(
|
||||
_make_square_geojson(zone_min_lon, zone_min_lat, zone_max_lon, zone_max_lat),
|
||||
f,
|
||||
)
|
||||
|
||||
zones = [{"label": "Centro", "geojson_path": zone_path}]
|
||||
|
||||
result = count_points_per_zone_pipeline(points=points, zones=zones)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert set(result.keys()) == {"counts", "total_points", "total_assigned", "unassigned"}
|
||||
|
||||
assert result["total_points"] == 100
|
||||
assert result["counts"]["Centro"] > 0, "Debe haber puntos en la zona"
|
||||
assert result["counts"]["Centro"] == expected_inside, (
|
||||
f"Esperados {expected_inside} puntos, got {result['counts']['Centro']}"
|
||||
)
|
||||
assert result["total_assigned"] == expected_inside
|
||||
assert result["unassigned"] == 100 - expected_inside
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Tests para extract_graph_from_text pipeline.
|
||||
|
||||
Usa stubs para GLiNER2 para validar el flujo completo sin descargar modelos.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.extract_graph_from_text import extract_graph_from_text
|
||||
|
||||
|
||||
class _Schema:
|
||||
def entities(self, labels):
|
||||
return self
|
||||
|
||||
def relations(self, labels):
|
||||
return self
|
||||
|
||||
|
||||
class _StubModel:
|
||||
"""Stub que retorna un grafo conocido para cualquier texto."""
|
||||
|
||||
def create_schema(self):
|
||||
return _Schema()
|
||||
|
||||
def extract(self, text, schema=None, threshold=0.3, include_confidence=False):
|
||||
return {
|
||||
"entities": {
|
||||
"person": ["Carlos Torres"],
|
||||
"organization": ["BBVA"],
|
||||
"location": ["Bilbao"],
|
||||
},
|
||||
"relation_extraction": {
|
||||
"president_of": [("Carlos Torres", "BBVA")],
|
||||
"headquartered_in": [("BBVA", "Bilbao")],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
ENTITY_LABELS = ["person", "organization", "location"]
|
||||
RELATION_LABELS = ["president_of", "headquartered_in", "works_at"]
|
||||
ALLOWED = {
|
||||
"president_of": (["person"], ["organization"]),
|
||||
"headquartered_in": (["organization"], ["location"]),
|
||||
}
|
||||
|
||||
|
||||
def test_texto_corto_produce_nodos_y_aristas_esperados():
|
||||
"""texto corto produce nodos y aristas esperados con stub model"""
|
||||
text = "Carlos Torres es presidente de BBVA con sede en Bilbao."
|
||||
result = extract_graph_from_text(
|
||||
text=text,
|
||||
entity_labels=ENTITY_LABELS,
|
||||
relation_labels=RELATION_LABELS,
|
||||
allowed=ALLOWED,
|
||||
model=_StubModel(),
|
||||
threshold=0.3,
|
||||
)
|
||||
|
||||
node_ids = {n["id"] for n in result["nodes"]}
|
||||
assert "Carlos Torres" in node_ids
|
||||
assert "BBVA" in node_ids
|
||||
assert "Bilbao" in node_ids
|
||||
|
||||
edge_kinds = {e["kind"] for e in result["edges"]}
|
||||
assert "president_of" in edge_kinds
|
||||
assert "headquartered_in" in edge_kinds
|
||||
|
||||
|
||||
def test_stats_tiene_todos_los_campos_requeridos():
|
||||
"""stats tiene todos los campos requeridos"""
|
||||
text = "Texto de prueba para el pipeline."
|
||||
result = extract_graph_from_text(
|
||||
text=text,
|
||||
entity_labels=ENTITY_LABELS,
|
||||
relation_labels=RELATION_LABELS,
|
||||
allowed=ALLOWED,
|
||||
model=_StubModel(),
|
||||
)
|
||||
stats = result["stats"]
|
||||
assert "n_chunks" in stats
|
||||
assert "n_nodes" in stats
|
||||
assert "n_edges" in stats
|
||||
assert "n_dropped_typed" in stats
|
||||
assert "elapsed_s" in stats
|
||||
assert stats["n_chunks"] >= 1
|
||||
assert stats["n_nodes"] >= 0
|
||||
@@ -0,0 +1,117 @@
|
||||
"""Tests para generate_isochrones_by_zone_pipeline.
|
||||
|
||||
Crea archivos GeoJSON temporales con zonas sintéticas sobre Madrid
|
||||
y verifica el resultado del pipeline con el stack Valhalla activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.generate_isochrones_by_zone_pipeline import (
|
||||
generate_isochrones_by_zone_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def _make_square_geojson(center_lon: float, center_lat: float, half: float) -> dict:
|
||||
"""Crea un GeoJSON Polygon cuadrado alrededor de (center_lon, center_lat)."""
|
||||
lo, hi_lon = center_lon - half, center_lon + half
|
||||
la, hi_lat = center_lat - half, center_lat + half
|
||||
return {
|
||||
"type": "FeatureCollection",
|
||||
"features": [
|
||||
{
|
||||
"type": "Feature",
|
||||
"geometry": {
|
||||
"type": "Polygon",
|
||||
"coordinates": [
|
||||
[
|
||||
[lo, la],
|
||||
[hi_lon, la],
|
||||
[hi_lon, hi_lat],
|
||||
[lo, hi_lat],
|
||||
[lo, la],
|
||||
]
|
||||
],
|
||||
},
|
||||
"properties": {},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_generate_isochrones_by_zone_pipeline():
|
||||
"""Dos zonas cuadradas sintéticas, 50 puntos con seconds=600, 3 centros en Madrid."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# Zona norte: cuadrado alrededor de Chamartín (Madrid norte)
|
||||
zone_norte_path = os.path.join(tmpdir, "zone_norte.geojson")
|
||||
with open(zone_norte_path, "w") as f:
|
||||
json.dump(_make_square_geojson(-3.685, 40.47, 0.05), f)
|
||||
|
||||
# Zona sur: cuadrado alrededor de Vallecas (Madrid sur)
|
||||
zone_sur_path = os.path.join(tmpdir, "zone_sur.geojson")
|
||||
with open(zone_sur_path, "w") as f:
|
||||
json.dump(_make_square_geojson(-3.666, 40.38, 0.05), f)
|
||||
|
||||
zones = [
|
||||
{"label": "Norte", "geojson_path": zone_norte_path, "exclude_geojson_path": None},
|
||||
{"label": "Sur", "geojson_path": zone_sur_path, "exclude_geojson_path": None},
|
||||
]
|
||||
|
||||
# 50 puntos: 25 en zona norte, 25 en zona sur, todos con seconds=600 (10 min)
|
||||
points_norte = [
|
||||
{"lat": 40.47 + i * 0.001, "lon": -3.685 + i * 0.001, "seconds": 600.0}
|
||||
for i in range(-12, 13)
|
||||
]
|
||||
points_sur = [
|
||||
{"lat": 40.38 + i * 0.001, "lon": -3.666 + i * 0.001, "seconds": 600.0}
|
||||
for i in range(-12, 13)
|
||||
]
|
||||
points = points_norte + points_sur
|
||||
|
||||
# 3 centros: 2 en norte, 1 en sur
|
||||
centers = [
|
||||
{"lat": 40.47, "lon": -3.685, "id": "centro_norte_a"},
|
||||
{"lat": 40.465, "lon": -3.680, "id": "centro_norte_b"},
|
||||
{"lat": 40.380, "lon": -3.666, "id": "centro_sur_a"},
|
||||
]
|
||||
|
||||
result = asyncio.run(
|
||||
generate_isochrones_by_zone_pipeline(
|
||||
zones=zones,
|
||||
points=points,
|
||||
centers=centers,
|
||||
base_url="http://localhost:8002",
|
||||
concurrency=4,
|
||||
)
|
||||
)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert "zones" in result
|
||||
zone_results = result["zones"]
|
||||
|
||||
assert len(zone_results) == 2, f"Esperadas 2 zonas, got {len(zone_results)}"
|
||||
|
||||
for z in zone_results:
|
||||
assert "label" in z
|
||||
assert "minutes" in z
|
||||
assert "n_points" in z
|
||||
assert "n_centers" in z
|
||||
assert "isochrones" in z
|
||||
assert isinstance(z["isochrones"], list)
|
||||
|
||||
# p75 de 600s = 10 min → minutes ≈ 10
|
||||
for z in zone_results:
|
||||
if z["n_points"] > 0:
|
||||
assert z["minutes"] is not None
|
||||
assert 9.5 <= z["minutes"] <= 10.5, f"p75 esperado ~10 min, got {z['minutes']}"
|
||||
|
||||
# Al menos una zona debe tener isócronas (Valhalla activo)
|
||||
total_isos = sum(len(z["isochrones"]) for z in zone_results)
|
||||
assert total_isos >= 1, "Al menos una isócrona debe generarse"
|
||||
@@ -0,0 +1,38 @@
|
||||
"""Tests para setup_geo_stack_docker_pipeline.
|
||||
|
||||
El geo stack ya está corriendo en localhost:8002 (Valhalla), por lo que
|
||||
verify=True retorna flags reales del stack activo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
|
||||
|
||||
from python.functions.pipelines.setup_geo_stack_docker_pipeline import (
|
||||
setup_geo_stack_docker_pipeline,
|
||||
)
|
||||
|
||||
|
||||
def test_setup_geo_stack_docker_pipeline():
|
||||
"""Verifica el geo stack activo en localhost (docker ya arrancado)."""
|
||||
# Llamamos con verify=True pero sin relanzar docker compose
|
||||
# (pasamos wait_seconds=0 para no esperar, el stack ya está up)
|
||||
result = setup_geo_stack_docker_pipeline(
|
||||
compose_path="apps/footprint_geo_stack/docker-compose.yml",
|
||||
wait_seconds=0,
|
||||
verify=True,
|
||||
)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert set(result.keys()) == {"docker_up", "valhalla_ok", "postgis_ok", "martin_ok"}
|
||||
|
||||
# docker_up puede ser False si el compose no existe en CI, pero verify sí corre
|
||||
# Lo importante: los flags son bool
|
||||
for key in ("docker_up", "valhalla_ok", "postgis_ok", "martin_ok"):
|
||||
assert isinstance(result[key], bool), f"{key} debe ser bool"
|
||||
|
||||
# Valhalla está activo en localhost:8002
|
||||
assert result["valhalla_ok"] is True, "Valhalla debe responder en localhost:8002"
|
||||
Reference in New Issue
Block a user