cd658cc703
Tres funciones puras nuevas del dominio datascience (tags eda + geospatial) que
sostienen el capítulo GEOSPATIAL del AutomaticEDA, delegadas a fn-constructor:
- detect_latlon_columns: identifica el par (lat, lon) por nombre de columna +
rango de valores ([-90,90] / [-180,180]) desde profile['columns']. Devuelve
{lat_col, lon_col, confidence, reason}. 9 tests.
- analyze_geo_extent: bbox, centroide, span haversine, conteo por zona/país
(lookup offline con bounding boxes embebidos, KISS sin geopandas) y
hemisferios. 7 tests.
- build_geo_scatter: prepara los puntos del scatter en orden [lon, lat] con
downsampling determinista por paso fijo + aspect equirectangular 1/cos(lat)
clampado. 6 tests.
Registradas en datascience/__init__.py. Todas pure, params_schema completo,
.md autosuficiente (Ejemplo + Cuando usarla + Gotchas).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
210 lines
8.1 KiB
Python
210 lines
8.1 KiB
Python
"""analyze_geo_extent — geographic extent of a cloud of coordinates (EDA `geospatial`).
|
|
|
|
Pure function: no I/O, no network, deterministic. Given two parallel lists of
|
|
latitudes and longitudes it derives the bounding box, centroid, diagonal span
|
|
(haversine), per-region counts and hemisphere split of the points, and assigns
|
|
each point to a country/region via an OFFLINE lookup against a table of
|
|
rectangular bounding boxes embedded as a constant (`_REGION_BBOXES`).
|
|
|
|
It never reads files, never hits the network and depends only on `math`. The
|
|
country boxes are deliberately coarse rectangles (a KISS approximation, NOT a
|
|
reverse-geocoder). Reading is defensive throughout and the function NEVER
|
|
raises: invalid pairs (None / NaN / out of range) are silently discarded and an
|
|
empty cloud yields a zeroed result the caller can skip.
|
|
"""
|
|
|
|
import math
|
|
|
|
# Mean radius of the Earth in kilometres; the haversine helper multiplies the
# central angle by this value to obtain a distance.
_EARTH_RADIUS_KM: float = 6371.0

# Maximum number of individual regions reported in `by_region`. Every region
# ranked below this cut-off is merged into one trailing "Otros" entry.
_TOP_REGIONS: int = 8
|
|
|
|
# Offline region lookup table. Each row is
# (name, lat_min, lat_max, lon_min, lon_max), all bounds inclusive, in degrees.
#
# ORDER MATTERS: a point is assigned to the FIRST row whose rectangle contains
# it. Specific countries therefore come first and the broad continental
# fallbacks last, so a country box always wins over its continent.
#
# The boxes are coarse rectangles approximating each region's mainland extent,
# so neighbouring boxes overlap. Where they do, the narrower/more-western
# country is listed first so that it keeps its own territory instead of being
# swallowed by the larger neighbour: Portugal before Spain, Ireland before the
# United Kingdom, India before China, Chile before Argentina, the contiguous
# US before Canada.
#
# Known trade-offs of the rectangle approximation (accepted, not bugs):
# - Ireland's box reaches lon -5.9, so most of Northern Ireland is labelled
#   "Irlanda". The alternative (UK first) mislabels Dublin and Cork instead.
# - India's box covers the Himalayan rim and southern Tibet. The alternative
#   (China first) mislabels Delhi, Kolkata and most of northern India.
_REGION_BBOXES = (
    # --- countries (specific) ---
    ("Portugal", 36.9, 42.2, -9.6, -6.2),
    ("España", 36.0, 43.8, -9.4, 3.4),
    ("Francia", 41.3, 51.1, -5.2, 9.6),
    # Ireland must precede the UK: the UK box (lon -8.6..1.8) contains most of
    # the island of Ireland.
    ("Irlanda", 51.4, 55.4, -10.6, -5.9),
    ("Reino Unido", 49.9, 58.7, -8.6, 1.8),
    ("Países Bajos", 50.7, 53.6, 3.3, 7.2),
    ("Bélgica", 49.5, 51.5, 2.5, 6.4),
    ("Suiza", 45.8, 47.8, 5.9, 10.5),
    ("Alemania", 47.3, 55.1, 5.9, 15.0),
    ("Italia", 36.6, 47.1, 6.6, 18.5),
    ("Marruecos", 27.7, 35.9, -13.2, -1.0),
    ("Egipto", 22.0, 31.7, 25.0, 35.0),
    ("Sudáfrica", -34.8, -22.1, 16.5, 32.9),
    # India must precede China: the China box (lat 18..53.6, lon 73.5..135.1)
    # contains everything in India north of 18N and east of 73.5E.
    ("India", 6.7, 35.5, 68.1, 97.4),
    ("China", 18.0, 53.6, 73.5, 135.1),
    ("Japón", 24.0, 45.6, 122.9, 145.9),
    ("Australia", -43.7, -10.0, 112.9, 153.7),
    ("México", 14.5, 32.7, -118.4, -86.7),
    ("Estados Unidos", 24.4, 49.4, -125.0, -66.9),
    ("Canadá", 41.7, 83.1, -141.0, -52.6),
    ("Chile", -55.9, -17.5, -75.6, -66.4),
    ("Argentina", -55.1, -21.8, -73.6, -53.6),
    ("Brasil", -33.8, 5.3, -74.0, -34.8),
    ("Rusia", 41.2, 77.0, 19.6, 180.0),
    # --- continental fallbacks (broad) ---
    ("Europa", 34.0, 72.0, -25.0, 45.0),
    ("África", -35.0, 37.5, -18.0, 52.0),
    ("Asia", 5.0, 78.0, 26.0, 180.0),
    ("América del Norte", 7.0, 84.0, -168.0, -52.0),
    ("América del Sur", -56.0, 13.0, -82.0, -34.0),
    ("Oceanía", -50.0, 0.0, 110.0, 180.0),
)
|
|
|
|
|
|
def _coord(value, limit):
    """Coerce a raw coordinate to a float in [-limit, limit], or None.

    Never raises: anything that cannot be read as a finite, in-range number is
    reported as missing.

    Args:
        value: Raw candidate of any type (number, numeric string, None, ...).
        limit: Inclusive absolute bound; the result satisfies
            -limit <= result <= limit.

    Returns:
        The coordinate as a float, or None when `value` is None, a bool,
        not convertible by `float()`, NaN, infinite, or outside the bound.
    """
    # bool is a subclass of int but is never a real coordinate, so True/False
    # are treated as missing instead of becoming 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        f = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError/ValueError: not a number or not a numeric string.
        # OverflowError: an int too large to fit a float (e.g. 10**400); it
        # would be out of range anyway, so reject it instead of crashing.
        return None
    # NaN is the only value that is not equal to itself; +/-inf fail the
    # range comparisons.
    if f != f or f < -limit or f > limit:
        return None
    return f
|
|
|
|
|
|
def _haversine_km(lat1, lon1, lat2, lon2):
    """Return the haversine great-circle distance in km between two points.

    Args:
        lat1, lon1: First point, in degrees.
        lat2, lon2: Second point, in degrees.

    Returns:
        Distance in kilometres on a sphere of radius `_EARTH_RADIUS_KM`.
    """
    phi_start = math.radians(lat1)
    phi_end = math.radians(lat2)
    sin_half_dlat = math.sin(math.radians(lat2 - lat1) / 2.0)
    sin_half_dlon = math.sin(math.radians(lon2 - lon1) / 2.0)

    # Haversine term: squared half-chord length between the two points.
    chord_sq = sin_half_dlat ** 2 + math.cos(phi_start) * math.cos(phi_end) * sin_half_dlon ** 2

    # Cap at 1.0 so floating-point drift cannot push asin() out of its domain.
    root = math.sqrt(chord_sq)
    capped = root if root < 1.0 else 1.0
    return 2.0 * _EARTH_RADIUS_KM * math.asin(capped)
|
|
|
|
|
|
def _region_of(lat, lon):
    """Look up the region label for a point in `_REGION_BBOXES`.

    The table is scanned in order and the first rectangle whose inclusive
    bounds contain the point wins.

    Args:
        lat: Latitude in degrees.
        lon: Longitude in degrees.

    Returns:
        The name of the first matching box, or "Océano/Otros" when no box
        contains the point.
    """
    containing = (
        label
        for label, south, north, west, east in _REGION_BBOXES
        if south <= lat <= north and west <= lon <= east
    )
    # next() consumes only as far as the first hit, like an early return.
    return next(containing, "Océano/Otros")
|
|
|
|
|
|
def _empty_result():
    """Build the zeroed summary returned when no valid coordinate pair exists.

    Returns:
        A fresh dict (new containers on every call) with n_points 0, no bbox,
        no centroid, span_km 0.0, an empty by_region list, all hemisphere
        counters at 0 and the note "sin coordenadas validas".
    """
    result = dict(n_points=0, bbox=None, centroid=None, span_km=0.0)
    result["by_region"] = []
    result["hemisphere"] = dict.fromkeys(("north", "south", "east", "west"), 0)
    result["note"] = "sin coordenadas validas"
    return result
|
|
|
|
|
|
def analyze_geo_extent(lats: list, lons: list) -> dict:
    """Describe where a cloud of latitude/longitude pairs sits on the globe.

    `lats[i]` is paired with `lons[i]`; when the inputs differ in length only
    the common prefix is used. A pair is dropped when either value is rejected
    by `_coord` (latitude bound 90, longitude bound 180). The surviving points
    yield a bounding box, an arithmetic-mean centroid, the haversine length of
    the bbox diagonal, hemisphere counts and per-region counts from the offline
    `_REGION_BBOXES` lookup.

    Args:
        lats: Latitudes in degrees; must be a list or tuple.
        lons: Longitudes in degrees; must be a list or tuple.

    Returns:
        Dict with keys:
          n_points   - number of valid pairs.
          bbox       - {lat_min, lat_max, lon_min, lon_max}.
          centroid   - {lat, lon}, plain means of the valid values.
          span_km    - haversine distance from (lat_min, lon_min) to
                       (lat_max, lon_max).
          by_region  - [{region, count}] sorted by count descending then name,
                       limited to `_TOP_REGIONS` entries plus an "Otros" entry
                       holding the remainder, if any.
          hemisphere - {north, south, east, west}; points on the equator or
                       prime meridian count as north / east.
          note       - Spanish phrase naming the most frequent region.
        When either argument is not a list/tuple, or no pair is valid, the
        result of `_empty_result()` is returned instead.
    """
    sequence_types = (list, tuple)
    if not (isinstance(lats, sequence_types) and isinstance(lons, sequence_types)):
        return _empty_result()

    # zip truncates to the shorter input, which is how unequal lengths are
    # handled. Both members of every pair are always coerced.
    coerced = (
        (_coord(raw_lat, 90.0), _coord(raw_lon, 180.0))
        for raw_lat, raw_lon in zip(lats, lons)
    )
    points = [pair for pair in coerced if pair[0] is not None and pair[1] is not None]
    if not points:
        return _empty_result()

    total = len(points)
    latitudes = [lat for lat, _ in points]
    longitudes = [lon for _, lon in points]

    south_edge = min(latitudes)
    north_edge = max(latitudes)
    west_edge = min(longitudes)
    east_edge = max(longitudes)
    bbox = {
        "lat_min": south_edge,
        "lat_max": north_edge,
        "lon_min": west_edge,
        "lon_max": east_edge,
    }

    centroid = {"lat": sum(latitudes) / total, "lon": sum(longitudes) / total}

    # Bounding-box diagonal, from the south-west corner to the north-east one.
    diagonal_km = _haversine_km(south_edge, west_edge, north_edge, east_edge)

    # A coordinate of exactly 0 is counted on the north / east side.
    northern = sum(1 for lat in latitudes if lat >= 0.0)
    eastern = sum(1 for lon in longitudes if lon >= 0.0)
    hemisphere = {
        "north": northern,
        "south": total - northern,
        "east": eastern,
        "west": total - eastern,
    }

    # Tally points per region through the offline bounding-box lookup.
    tally = {}
    for lat, lon in points:
        label = _region_of(lat, lon)
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1

    # Largest count first; ties are resolved alphabetically so the output is
    # deterministic.
    ranked = list(tally.items())
    ranked.sort(key=lambda item: (-item[1], item[0]))

    head = ranked[:_TOP_REGIONS]
    tail = ranked[_TOP_REGIONS:]
    by_region = [{"region": label, "count": hits} for label, hits in head]
    overflow = sum(hits for _, hits in tail)
    if overflow > 0:
        by_region.append({"region": "Otros", "count": overflow})

    leader, leader_hits = ranked[0]
    template = "los puntos se concentran en {region} ({count} de {n})"
    note = template.format(region=leader, count=leader_hits, n=total)

    return {
        "n_points": total,
        "bbox": bbox,
        "centroid": centroid,
        "span_km": diagonal_km,
        "by_region": by_region,
        "hemisphere": hemisphere,
        "note": note,
    }
|