feat: extraccion masiva footprint_aurgi (41 funcs + 4 types + stack Docker geo)

Extrae al registry funciones del proyecto interno footprint_aurgi:
- core (6): slugify_ascii, normalize_for_join, cp_provincia_es, infer_provincia_from_cp, safe_read_csv_fallback, csv_to_parquet_duckdb
- geo puras (7): haversine_km, point_in_ring, point_in_polygon, point_in_polygons_bbox, polygon_bbox, extent_with_padding, distance_bucket
- geo I/O (4): load_geojson_polygons, load_boundary_gdf, add_basemap_osm, add_basemap_with_timeout
- valhalla client (4): valhalla_route, valhalla_isochrone, valhalla_isochrones_async, valhalla_matrix_1_to_n
- datascience stats (7): trimmed_mean, geometric_mean, detect_distribution_type, best_central_tendency, summary_stats, kde_density_levels, alpha_shape_concave_hull
- datascience fuzzy (3): fuzzy_merge_adaptive (rapidfuzz), words_to_dataset, remove_words_from_column
- datascience viz (2): plot_kde_2d, plot_heatmap_log
- infra (4): compress_pdf_ghostscript, render_table_page_pdfpages, add_header_logo, osm2pgsql_ingest
- pipelines (4): setup_geo_stack_docker, compute_centers_reachability, generate_isochrones_by_zone, count_points_per_zone
- types geo (4): LonLat, BBox, IsochroneRequest, Centro

Incluye:
- apps/footprint_geo_stack/ (PostGIS + Martin + Valhalla via docker-compose)
- 131/132 tests pasan (1 skip esperado: osm2pgsql en PATH)
- Issue tracker dev/issues/0052-footprint-aurgi-extraction.md
- Atribucion uniforme: source_repo internal:footprint_aurgi, source_license internal-aurgi
- Build con 9 agentes en paralelo (8 wave 1 + 1 wave 2 pipelines)

Tambien commitea trabajo previo no commiteado: aggregate_extraction_results, chunk_with_overlap, clean_pdf_text, merge_entity_aliases, extract_graph_gliner2, extract_relations_mrebel, extract_triples_spacy_es, gliner2/mrebel/marianmt/rebel/spacy_es load_model, parse_rebel_output, translate_es_to_en, issue 0050/0051.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 23:35:22 +02:00
parent f73ea072bd
commit faac610745
193 changed files with 13146 additions and 3 deletions
@@ -0,0 +1,65 @@
"""Tests para aggregate_extraction_results."""
from __future__ import annotations
import os
import sys
from collections import Counter
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from core.aggregate_extraction_results import aggregate_extraction_results
def test_lista_vacia_retorna_entities_y_relations_vacios():
"""lista vacia retorna entities vacio y relations vacio"""
result = aggregate_extraction_results([])
assert result["entities"] == {}
assert result["relations"] == Counter()
def test_resultado_unico_se_agrega_correctamente():
"""resultado unico se agrega correctamente"""
r = [
{
"entities": {"person": ["Pablo Isla"], "organization": ["Inditex"]},
"relation_extraction": {"ceo_of": [("Pablo Isla", "Inditex")]},
}
]
result = aggregate_extraction_results(r)
assert ("person", "pablo isla") in result["entities"]
assert ("organization", "inditex") in result["entities"]
assert result["entities"][("person", "pablo isla")]["count"] == 1
assert result["relations"][("Pablo Isla", "ceo_of", "Inditex")] == 1
def test_dos_resultados_con_solapamiento_acumulan_counts():
"""dos resultados con solapamiento acumulan counts"""
r = [
{
"entities": {"person": ["Pablo Isla"], "organization": ["Inditex"]},
"relation_extraction": {"ceo_of": [("Pablo Isla", "Inditex")]},
},
{
"entities": {"person": ["Pablo Isla"], "organization": ["Inditex"]},
"relation_extraction": {"ceo_of": [("Pablo Isla", "Inditex")]},
},
]
result = aggregate_extraction_results(r)
assert result["entities"][("person", "pablo isla")]["count"] == 2
assert result["relations"][("Pablo Isla", "ceo_of", "Inditex")] == 2
def test_entidades_deduplicen_case_insensitive():
"""entidades se deduplicien case-insensitive"""
r = [
{"entities": {"person": ["Pablo Isla"]}, "relation_extraction": {}},
{"entities": {"person": ["pablo isla"]}, "relation_extraction": {}},
]
result = aggregate_extraction_results(r)
# Ambas van a la misma key (person, pablo isla)
assert ("person", "pablo isla") in result["entities"]
assert result["entities"][("person", "pablo isla")]["count"] == 2
# Solo una key para pablo isla
pablo_keys = [k for k in result["entities"] if k[1] == "pablo isla"]
assert len(pablo_keys) == 1
@@ -0,0 +1,72 @@
"""Tests para chunk_with_overlap."""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from core.chunk_with_overlap import chunk_with_overlap
def test_texto_vacio_retorna_lista_vacia():
"""texto vacio retorna lista vacia"""
assert chunk_with_overlap("") == []
assert chunk_with_overlap(" ") == []
def test_una_frase_menor_que_max_chars_produce_1_chunk():
"""una frase menor que max_chars produce 1 chunk"""
text = "Esta es una frase corta."
chunks = chunk_with_overlap(text, max_chars=500, overlap_sentences=0)
assert len(chunks) == 1
assert chunks[0]["text"] == text
def test_multiples_frases_producen_N_chunks_con_overlap():
"""multiples frases producen N chunks con overlap"""
# 3 frases de ~30 chars c/u, max_chars=60 -> al menos 2 chunks
text = "Primera frase larga aqui. Segunda frase larga aqui. Tercera frase larga aqui."
chunks = chunk_with_overlap(text, max_chars=55, overlap_sentences=1)
assert len(chunks) >= 2
# Cada chunk tiene texto no vacio
for c in chunks:
assert c["text"].strip()
assert len(c["sentences"]) > 0
def test_frase_mas_larga_que_max_chars_no_bucle_infinito():
"""frase mas larga que max_chars se incluye sin bucle infinito"""
long_sentence = "A" * 2000 + "."
chunks = chunk_with_overlap(long_sentence, max_chars=100, overlap_sentences=0)
# Debe terminar (no bucle infinito) y producir exactamente 1 chunk
assert len(chunks) == 1
assert chunks[0]["text"] == long_sentence.strip()
def test_overlap_0_no_duplica_frases():
"""overlap=0 no duplica frases entre chunks"""
text = "Primera frase aqui completa. Segunda frase aqui completa. Tercera frase aqui completa."
chunks = chunk_with_overlap(text, max_chars=50, overlap_sentences=0)
# Recolectar todas las frases de todos los chunks
all_sents = [s for c in chunks for s in c["sentences"]]
# Con overlap=0 ninguna frase debe aparecer dos veces
assert len(all_sents) == len(set(all_sents))
def test_overlap_2_el_chunk_N_mas_1_empieza_con_ultimas_2_frases_del_N():
"""overlap=2 el chunk N+1 empieza con las 2 ultimas frases del chunk N"""
# 5 frases cortas, max_chars=80 para forzar al menos 2 chunks
text = (
"Frase uno aqui. "
"Frase dos aqui. "
"Frase tres aqui. "
"Frase cuatro aqui. "
"Frase cinco aqui."
)
chunks = chunk_with_overlap(text, max_chars=80, overlap_sentences=2)
if len(chunks) >= 2:
prev_tail = chunks[0]["sentences"][-2:]
next_head = chunks[1]["sentences"][:2]
assert prev_tail == next_head
@@ -0,0 +1,49 @@
"""Tests para clean_pdf_text."""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from core.clean_pdf_text import clean_pdf_text
def test_string_vacio_retorna_vacio():
"""string vacio retorna vacio"""
assert clean_pdf_text("") == ""
def test_marca_de_pagina_1_20_se_elimina():
"""marca de pagina 1/20 se elimina"""
result = clean_pdf_text("1/20\nfoo bar")
assert "1/20" not in result
assert "foo bar" in result
def test_dehyphenation_exa_newline_mple():
"""dehyphenation exa-newline-mple -> example"""
result = clean_pdf_text("exa-\nmple")
assert result == "example"
def test_espacios_duplicados_se_colapsan():
"""espacios duplicados se colapsan"""
result = clean_pdf_text("ab cd")
assert result == "ab cd"
def test_salto_de_linea_en_mitad_de_oracion_se_une_con_espacio():
"""salto de linea en mitad de oracion se une con espacio"""
result = clean_pdf_text("Pablo Isla es el\npresidente de Inditex")
assert result == "Pablo Isla es el presidente de Inditex"
def test_salto_de_linea_tras_punto_se_preserva():
"""salto de linea tras punto se preserva"""
result = clean_pdf_text("Primera oracion.\nSegunda oracion.")
# El salto tras punto debe quedar (no se une con espacio)
assert "\n" in result
assert "Primera oracion." in result
assert "Segunda oracion." in result
@@ -0,0 +1,44 @@
"""Tests para cp_provincia_es."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cp_provincia_es import cp_provincia_es
def test_cp_completo_retorna_provincia():
"""cp completo retorna provincia"""
assert cp_provincia_es("28001") == "Madrid"
def test_prefijo_2_digitos_retorna_provincia():
"""prefijo 2 digitos retorna provincia"""
assert cp_provincia_es("28") == "Madrid"
def test_primer_prefijo_01_retorna_alava():
"""primer prefijo 01 retorna Alava"""
assert cp_provincia_es("01") == "Álava"
def test_cp_desconocido_retorna_none():
"""cp desconocido retorna None"""
assert cp_provincia_es("99") is None
def test_cp_entero_completo():
assert cp_provincia_es(28001) == "Madrid"
def test_cp_ceuta():
assert cp_provincia_es("51001") == "Ceuta"
def test_cp_melilla():
assert cp_provincia_es("52") == "Melilla"
def test_cp_barcelona():
assert cp_provincia_es("08") == "Barcelona"
@@ -0,0 +1,54 @@
"""Tests para csv_to_parquet_duckdb."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
def test_convierte_csv_a_parquet_y_duckdb_puede_leerlo():
"""convierte csv a parquet y duckdb puede leerlo"""
import sys
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from core.csv_to_parquet_duckdb import csv_to_parquet_duckdb
import duckdb
with tempfile.TemporaryDirectory() as tmpdir:
csv_path = Path(tmpdir) / "test.csv"
parquet_path = Path(tmpdir) / "test.parquet"
csv_path.write_text("nombre,lat,lon\nMadrid,40.4,-3.7\nBarcelona,41.3,2.1\n")
result = csv_to_parquet_duckdb(csv_path, parquet_path)
assert result is True
assert parquet_path.exists()
assert parquet_path.stat().st_size > 0
# Verify duckdb can read it back
con = duckdb.connect()
df = con.execute(f"SELECT * FROM read_parquet('{parquet_path}')").df()
con.close()
assert df.shape == (2, 3)
assert set(df.columns) == {"nombre", "lat", "lon"}
def test_overwrite_False_no_sobreescribe_parquet_existente():
"""overwrite=False no sobreescribe parquet existente"""
import sys
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from core.csv_to_parquet_duckdb import csv_to_parquet_duckdb
with tempfile.TemporaryDirectory() as tmpdir:
csv_path = Path(tmpdir) / "test.csv"
parquet_path = Path(tmpdir) / "test.parquet"
csv_path.write_text("a,b\n1,2\n")
# Create existing parquet with known content
parquet_path.write_bytes(b"existing content")
original_size = parquet_path.stat().st_size
result = csv_to_parquet_duckdb(csv_path, parquet_path, overwrite=False)
assert result is False
# File must remain unchanged
assert parquet_path.stat().st_size == original_size
@@ -0,0 +1,60 @@
"""Tests para filter_relations_by_entity_types."""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from core.filter_relations_by_entity_types import filter_relations_by_entity_types
NAME_TO_TYPE = {
"carlos torres": "person",
"bbva": "organization",
"madrid": "location",
"santander": "organization",
"ana": "person",
}
ALLOWED = {
"president_of": (["person"], ["organization"]),
"located_in": (["organization", "person"], ["location"]),
}
def test_pares_validos_se_incluyen_en_kept():
"""pares validos se incluyen en kept"""
relations = {"president_of": [("Carlos Torres", "BBVA")]}
kept, dropped = filter_relations_by_entity_types(relations, NAME_TO_TYPE, ALLOWED)
assert len(kept) == 1
assert kept[0]["from"] == "Carlos Torres"
assert kept[0]["to"] == "BBVA"
assert len(dropped) == 0
def test_pares_con_tipos_incompatibles_van_a_dropped():
"""pares con tipos incompatibles van a dropped"""
# Madrid es location, no person -> no puede presidir nada
relations = {"president_of": [("Madrid", "Santander")]}
kept, dropped = filter_relations_by_entity_types(relations, NAME_TO_TYPE, ALLOWED)
assert len(kept) == 0
assert len(dropped) == 1
assert dropped[0]["head_type"] == "location"
def test_rel_type_no_en_allowed_se_acepta_siempre():
"""rel_type no en allowed se acepta siempre"""
relations = {"unknown_rel": [("Carlos Torres", "Madrid")]}
kept, dropped = filter_relations_by_entity_types(relations, NAME_TO_TYPE, ALLOWED)
assert len(kept) == 1
assert len(dropped) == 0
def test_entidad_no_encontrada_en_name_to_type_va_a_dropped():
"""entidad no encontrada en name_to_type va a dropped"""
# "Desconocido" no esta en name_to_type -> head_type es None -> dropped
relations = {"president_of": [("Desconocido", "BBVA")]}
kept, dropped = filter_relations_by_entity_types(relations, NAME_TO_TYPE, ALLOWED)
assert len(dropped) == 1
assert dropped[0]["head_type"] is None
@@ -0,0 +1,78 @@
"""Tests para infer_provincia_from_cp."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from infer_provincia_from_cp import infer_provincia_from_cp
def test_inferencia_con_cp_dominante_madrid():
"""inferencia con cp dominante madrid"""
rows = [
{"codigo_postal": "28001", "provincia": "Madrid"},
{"codigo_postal": "28010", "provincia": "Madrid"},
]
result = infer_provincia_from_cp(rows)
assert result == ["Madrid", "Madrid"]
def test_fila_con_cp_fuera_de_top2_usa_dominante():
"""fila con cp fuera de top2 usa dominante"""
# Madrid tiene 3 prefijos distintos: 28 (x4), 29 (x1), 41 (x1).
# top-2 son: 28 y 29 (o 41 dependiendo del orden, pero 41 queda fuera).
# Para que 41 quede fuera del top-2 necesitamos mas de 2 prefijos distintos.
rows = [
{"codigo_postal": "28001", "provincia": "Madrid"},
{"codigo_postal": "28002", "provincia": "Madrid"},
{"codigo_postal": "28003", "provincia": "Madrid"},
{"codigo_postal": "28004", "provincia": "Madrid"},
{"codigo_postal": "29001", "provincia": "Madrid"},
{"codigo_postal": "29002", "provincia": "Madrid"},
{"codigo_postal": "41001", "provincia": "Madrid"}, # outlier: fuera de top-2
]
result = infer_provincia_from_cp(rows)
# top-2 de Madrid: "28" (4 ocurrencias) y "29" (2 ocurrencias).
# "41" no esta en top-2, asi que usa el dominante (28 -> Madrid)
assert result[6] == "Madrid"
def test_fila_sin_provincia_retorna_none():
"""fila sin provincia retorna None"""
rows = [
{"codigo_postal": "28001", "provincia": None},
]
result = infer_provincia_from_cp(rows)
assert result == [None]
def test_fila_sin_cp_retorna_none():
rows = [
{"codigo_postal": None, "provincia": "Madrid"},
]
result = infer_provincia_from_cp(rows)
assert result == [None]
def test_columnas_custom():
rows = [
{"cp": "28001", "prov": "Madrid"},
{"cp": "28010", "prov": "Madrid"},
]
result = infer_provincia_from_cp(rows, cp_col="cp", prov_col="prov")
assert result == ["Madrid", "Madrid"]
def test_multiples_provincias():
rows = [
{"codigo_postal": "28001", "provincia": "Madrid"},
{"codigo_postal": "08001", "provincia": "Barcelona"},
{"codigo_postal": "41001", "provincia": "Sevilla"},
]
result = infer_provincia_from_cp(rows)
assert result == ["Madrid", "Barcelona", "Sevilla"]
def test_lista_vacia():
assert infer_provincia_from_cp([]) == []
@@ -0,0 +1,58 @@
"""Tests para merge_entity_aliases."""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from core.merge_entity_aliases import merge_entity_aliases
def test_duplicados_case_insensitive_se_mapean_al_mismo_canonical():
"""duplicados case-insensitive se mapean al mismo canonical"""
result = merge_entity_aliases(["BBVA", "bbva", "Bbva"])
# Todos deben apuntar al mismo canonical (el mas largo / mayor)
vals = set(result.values())
assert len(vals) == 1
# El canonical debe ser la forma de mayor longitud/orden: "BBVA" (mayusculas, misma longitud)
canon = vals.pop()
assert canon.lower() == "bbva"
def test_nombre_corto_se_absorbe_en_nombre_largo_que_lo_contiene():
"""nombre corto se absorbe en nombre largo que lo contiene"""
# El substring merge funciona cuando la forma corta APARECE LITERALMENTE
# en la forma larga (normalizada). Ejemplo: "bilbao" esta en "banco bilbao vizcaya argentaria"
names = ["Bilbao", "Banco Bilbao Vizcaya Argentaria"]
result = merge_entity_aliases(names)
# "bilbao" (6 chars) aparece como palabra en la forma larga normalizada
assert result["Bilbao"] == "Banco Bilbao Vizcaya Argentaria"
assert result["Banco Bilbao Vizcaya Argentaria"] == "Banco Bilbao Vizcaya Argentaria"
def test_siglas_cortas_menos_de_4_chars_no_absorben_falsamente():
"""siglas cortas menos de 4 chars no absorben falsamente"""
# "US" es 2 chars normalizados -> no debe absorber a "USA" ni a "BBUSA"
names = ["US", "USA", "Standard Chartered"]
result = merge_entity_aliases(names)
# "US" (2 chars) no debe poder absorber nada
assert result["USA"] in ("USA", "Standard Chartered") or result["USA"] == "USA"
# "US" puede quedarse como identidad o ser absorbido por algo que lo contenga
# Lo importante: NO absorbe a nombres que no lo contienen como palabra completa
assert result["Standard Chartered"] == "Standard Chartered"
def test_nombres_totalmente_disjuntos_se_mapean_a_si_mismos():
"""nombres totalmente disjuntos se mapean a si mismos"""
names = ["Inditex", "Santander", "Telefonica"]
result = merge_entity_aliases(names)
assert result["Inditex"] == "Inditex"
assert result["Santander"] == "Santander"
assert result["Telefonica"] == "Telefonica"
def test_lista_vacia_retorna_dict_vacio():
"""lista vacia retorna dict vacio"""
assert merge_entity_aliases([]) == {}
@@ -0,0 +1,42 @@
"""Tests para normalize_for_join."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from normalize_for_join import normalize_for_join
def test_normalize_con_puntuacion_y_diacriticos_y_none():
"""normalize con puntuacion y diacriticos y None"""
result = normalize_for_join(["Calle Mayor, 14", "ávila", None])
assert result == ["CALLE MAYOR 14", "AVILA", ""]
def test_normalize_lista_vacia():
assert normalize_for_join([]) == []
def test_normalize_upper():
assert normalize_for_join(["madrid"]) == ["MADRID"]
def test_normalize_elimina_simbolos():
assert normalize_for_join(["José García S.L."]) == ["JOSE GARCIA SL"]
def test_normalize_colapsa_espacios():
assert normalize_for_join([" hola mundo "]) == ["HOLA MUNDO"]
def test_normalize_nan_as_empty():
# NaN de float (float('nan'))
result = normalize_for_join([float("nan")])
assert result == [""]
def test_normalize_entero():
# Enteros se convierten a string
result = normalize_for_join([28001])
assert result == ["28001"]
@@ -0,0 +1,40 @@
"""Tests para safe_read_csv_fallback."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
def test_lee_csv_utf_8_correctamente():
"""lee csv utf-8 correctamente"""
import sys
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from core.safe_read_csv_fallback import safe_read_csv_fallback
with tempfile.TemporaryDirectory() as tmpdir:
csv_path = Path(tmpdir) / "test_utf8.csv"
csv_path.write_text("nombre,valor\nAña,42\nBéta,99\n", encoding="utf-8")
df = safe_read_csv_fallback(csv_path)
assert df.shape == (2, 2)
assert list(df.columns) == ["nombre", "valor"]
assert df["nombre"].tolist() == ["Aña", "Béta"]
def test_lee_csv_latin_1_con_fallback():
"""lee csv latin-1 con fallback"""
import sys
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from core.safe_read_csv_fallback import safe_read_csv_fallback
with tempfile.TemporaryDirectory() as tmpdir:
csv_path = Path(tmpdir) / "test_latin1.csv"
# Write latin-1 encoded CSV (ñ, é are 0xF1, 0xE9 in latin-1)
csv_path.write_bytes("nombre,valor\nMad\xf1id,10\nC\xe9ntro,20\n".encode("latin-1"))
df = safe_read_csv_fallback(csv_path)
assert df.shape == (2, 2)
assert "Mad" in df["nombre"].iloc[0]
assert df["valor"].tolist() == [10, 20]
@@ -0,0 +1,44 @@
"""Tests para slugify_ascii."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from slugify_ascii import slugify_ascii
def test_slugify_texto_con_puntuacion():
"""slugify texto con puntuacion"""
assert slugify_ascii("Calle Mayor, 14") == "calle-mayor-14"
def test_slugify_diacriticos():
"""slugify diacriticos"""
assert slugify_ascii("Ávila") == "avila"
def test_slugify_cadena_vacia_retorna_default():
"""slugify cadena vacia retorna default"""
assert slugify_ascii("") == "centro"
def test_slugify_trunca_a_max_len():
"""slugify trunca a max_len"""
assert slugify_ascii("a" * 100, max_len=10) == "aaaaaaaaaa"
def test_slugify_none_retorna_default():
assert slugify_ascii(None) == "centro"
def test_slugify_default_custom():
assert slugify_ascii("---", default="sin-nombre") == "sin-nombre"
def test_slugify_solo_diacriticos_y_puntuacion():
assert slugify_ascii("ñoño") == "nono"
def test_slugify_numeros():
assert slugify_ascii("28001 Madrid") == "28001-madrid"