cd658cc703
Tres funciones puras nuevas del dominio datascience (tags eda + geospatial) que
sostienen el capítulo GEOSPATIAL del AutomaticEDA, delegadas a fn-constructor:
- detect_latlon_columns: identifica el par (lat, lon) por nombre de columna +
rango de valores ([-90,90] / [-180,180]) desde profile['columns']. Devuelve
{lat_col, lon_col, confidence, reason}. 9 tests.
- analyze_geo_extent: bbox, centroide, span haversine, conteo por zona/país
(lookup offline con bounding boxes embebidos, KISS sin geopandas) y
hemisferios. 7 tests.
- build_geo_scatter: prepara los puntos del scatter en orden [lon, lat] con
downsampling determinista por paso fijo + aspect equirectangular 1/cos(lat)
clampado. 6 tests.
Registradas en datascience/__init__.py. Todas pure, params_schema completo,
.md autosuficiente (Ejemplo + Cuando usarla + Gotchas).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
127 lines
4.5 KiB
Python
127 lines
4.5 KiB
Python
"""Tests para analyze_geo_extent."""
|
|
|
|
import math
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from analyze_geo_extent import analyze_geo_extent, _haversine_km
|
|
|
|
# Key set the golden test compares (by set equality) against the keys of the
# dict returned by analyze_geo_extent for a non-empty input.
_EXPECTED_KEYS = {
    "n_points",
    "bbox",
    "centroid",
    "span_km",
    "by_region",
    "hemisphere",
    "note",
}
|
|
|
|
|
|
def test_nube_en_espana():
    """Golden case: a small cloud of points around Madrid.

    Checks the result keys, point count, top region, centroid, bounding box,
    span, hemisphere counts and the note for four nearby coordinates.
    """
    # Four points scattered around Madrid (lat ~40, lon ~-3.7).
    lats = [40.4, 40.0, 41.0, 39.5]
    lons = [-3.7, -3.5, -4.0, -3.2]
    total = len(lats)

    result = analyze_geo_extent(lats, lons)

    assert set(result) == _EXPECTED_KEYS
    assert result["n_points"] == total

    # Every point is expected to be attributed to Spain, so the first
    # by_region entry carries the whole count.
    top_region = result["by_region"][0]
    assert top_region["region"] == "España"
    assert top_region["count"] == total

    # The centroid is expected to be the arithmetic mean of each axis.
    centroid = result["centroid"]
    assert math.isclose(centroid["lat"], sum(lats) / total, rel_tol=1e-9)
    assert math.isclose(centroid["lon"], sum(lons) / total, rel_tol=1e-9)

    # The bounding box is expected to hold the exact input extremes.
    bbox = result["bbox"]
    for corner, expected in (
        ("lat_min", 39.5),
        ("lat_max", 41.0),
        ("lon_min", -4.0),
        ("lon_max", -3.2),
    ):
        assert bbox[corner] == expected

    # The points are spread out, so the span must be strictly positive.
    assert result["span_km"] > 0.0

    # All latitudes are positive (north) and all longitudes negative (west).
    hemisphere = result["hemisphere"]
    for side, expected in (
        ("north", 4),
        ("south", 0),
        ("east", 0),
        ("west", 4),
    ):
        assert hemisphere[side] == expected

    assert "España" in result["note"]
|
|
|
|
|
|
def test_dos_paises_distintos():
    """Golden case: points in Spain and France give two by_region entries."""
    # Two points near Madrid (Spain) and one at Paris (France).
    lats = [40.4, 40.0, 48.8]
    lons = [-3.7, -3.5, 2.3]

    result = analyze_geo_extent(lats, lons)

    assert result["n_points"] == 3

    ranking = result["by_region"]
    counts_by_region = {row["region"]: row["count"] for row in ranking}
    assert counts_by_region == {"España": 2, "Francia": 1}

    # Entries are expected in descending count order: Spain (2) comes first.
    leader = ranking[0]
    assert leader["region"] == "España"
    assert leader["count"] == 2

    # All three latitudes are positive; only Paris has a positive longitude.
    hemisphere = result["hemisphere"]
    assert hemisphere["north"] == 3
    assert hemisphere["east"] == 1
    assert hemisphere["west"] == 2
|
|
|
|
|
|
def test_listas_vacias():
    """Edge case: empty lists yield the empty result shape without raising."""
    empty = analyze_geo_extent([], [])

    assert empty["n_points"] == 0

    # With no points there is no box and no centroid to report.
    assert empty["bbox"] is None
    assert empty["centroid"] is None
    assert empty["span_km"] == 0.0

    assert empty["by_region"] == []
    zero_counts = {"north": 0, "south": 0, "east": 0, "west": 0}
    assert empty["hemisphere"] == zero_counts
    assert empty["note"] == "sin coordenadas validas"
|
|
|
|
|
|
def test_pares_invalidos_filtrados():
    """Edge case: None, NaN and out-of-range pairs are dropped without raising.

    The original data put both out-of-range values (91.0 and -200.0) in the
    latitude list even though its comment described -200 as a longitude, so
    out-of-range longitudes were never exercised. The -200.0 now sits in the
    longitude list, paired with a valid latitude.
    """
    nan = float("nan")
    # index:  0     1     2    3     4       5
    lats = [40.4, None, nan, 91.0, 40.2, 40.0]
    lons = [-3.7, -3.5, -3.0, 2.0, -200.0, -3.5]
    # Expected survivors are indices 0 and 5 only:
    #   1 -> latitude is None
    #   2 -> latitude is NaN
    #   3 -> latitude 91 is above the +90 limit
    #   4 -> longitude -200 is below the -180 limit
    res = analyze_geo_extent(lats, lons)

    assert res["n_points"] == 2
    assert res["by_region"][0]["region"] == "España"
    assert res["by_region"][0]["count"] == 2
|
|
|
|
|
|
def test_longitudes_desbalanceadas():
    """Edge case: unequal list lengths pair up to the shorter one, no raise."""
    four_lats = [40.4, 40.0, 41.0, 39.5]
    two_lons = [-3.7, -3.5]

    result = analyze_geo_extent(four_lats, two_lons)

    # Only the first two latitudes have a longitude to pair with.
    assert result["n_points"] == 2

    # The latitude bounds therefore come from 40.4 and 40.0 alone.
    bbox = result["bbox"]
    assert bbox["lat_min"] == 40.0
    assert bbox["lat_max"] == 40.4
|
|
|
|
|
|
def test_span_km_haversine_par_conocido():
    """Edge case: span_km matches the haversine length of the bbox diagonal."""
    # Two points on the equator one degree of longitude apart, as (lat, lon).
    # The bounding-box diagonal joins exactly these two points.
    origin = (0.0, 0.0)
    one_deg_east = (0.0, 1.0)

    result = analyze_geo_extent(
        [origin[0], one_deg_east[0]],
        [origin[1], one_deg_east[1]],
    )
    span = result["span_km"]

    # Must agree with the module's own haversine helper...
    reference = _haversine_km(*origin, *one_deg_east)
    assert math.isclose(span, reference, rel_tol=1e-9)

    # ...and with the known figure of about 111.19 km per degree of
    # longitude at the equator.
    assert math.isclose(span, 111.19, abs_tol=0.5)
|
|
|
|
|
|
def test_no_lanza_con_entradas_raras():
    """Edge case: None or non-list arguments give zero points, no raise."""
    for odd_lats, odd_lons in ((None, None), ("foo", "bar")):
        outcome = analyze_geo_extent(odd_lats, odd_lons)
        assert outcome["n_points"] == 0

    # The first pair ("x", None) is not usable, so only (40.0, -3.5) counts.
    mixed = analyze_geo_extent(["x", 40.0], [None, -3.5])
    assert mixed["n_points"] == 1
|