cd658cc703
Tres funciones puras nuevas del dominio datascience (tags eda + geospatial) que
sostienen el capítulo GEOSPATIAL del AutomaticEDA, delegadas a fn-constructor:
- detect_latlon_columns: identifica el par (lat, lon) por nombre de columna +
rango de valores ([-90,90] / [-180,180]) desde profile['columns']. Devuelve
{lat_col, lon_col, confidence, reason}. 9 tests.
- analyze_geo_extent: bbox, centroide, span haversine, conteo por zona/país
(lookup offline con bounding boxes embebidos, KISS sin geopandas) y
hemisferios. 7 tests.
- build_geo_scatter: prepara los puntos del scatter en orden [lon, lat] con
downsampling determinista por paso fijo + aspect equirectangular 1/cos(lat)
clampado. 6 tests.
Registradas en datascience/__init__.py. Todas pure, params_schema completo,
.md autosuficiente (Ejemplo + Cuando usarla + Gotchas).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
154 lines
6.0 KiB
Python
154 lines
6.0 KiB
Python
"""build_geo_scatter — prepare points for a geographic scatter (EDA `geospatial`).
|
|
|
|
Pure function: no I/O, deterministic. Takes two parallel lists of latitudes and
|
|
longitudes and returns the data a caller needs to draw a geographic scatter in an
|
|
equirectangular projection: cleaned points in [lon, lat] order, a bounding box, a
|
|
projection aspect ratio and a suggested axis padding.
|
|
|
|
It NEVER draws anything (no matplotlib) — the chapter that consumes this output is
|
|
responsible for the rendering. Reading is defensive throughout and the function
|
|
NEVER raises: malformed pairs (None, NaN, infinity or out-of-range coordinates)
|
|
are silently dropped and an empty/valid result is always returned.
|
|
|
|
To keep the rendered PDF/PPTX light on phones, when the number of valid pairs
|
|
exceeds `max_points` the points are down-sampled DETERMINISTICALLY by a fixed
|
|
step (`pairs[::step]`), never randomly, so the result is reproducible.
|
|
"""
|
|
|
|
import math
|
|
|
|
# Minimum axis padding (in degrees) so a single point or a zero-range cloud is
|
|
# never drawn glued to the axis border (it would collapse to a line).
|
|
_MIN_PAD = 0.01
|
|
|
|
# Aspect ratio clamp. 1/cos(lat) blows up near the poles; clamp keeps the render
|
|
# sane (Tufte: do not let the projection stretch the cloud out of proportion).
|
|
_ASPECT_MIN = 0.3
|
|
_ASPECT_MAX = 5.0
|
|
|
|
|
|
def _coord(value):
|
|
"""Coerce to a finite float defensively; return None for invalid coordinates.
|
|
|
|
bool is a subclass of int, but a real latitude/longitude is never a bool, so
|
|
True/False are treated as missing instead of coercing to 1.0/0.0. NaN and
|
|
+/-infinity are never valid coordinates either.
|
|
"""
|
|
if value is None or isinstance(value, bool):
|
|
return None
|
|
try:
|
|
coord = float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
if math.isnan(coord) or math.isinf(coord):
|
|
return None
|
|
return coord
|
|
|
|
|
|
def build_geo_scatter(lats: list, lons: list, max_points: int = 2000) -> dict:
|
|
"""Prepare the data for a geographic scatter in equirectangular projection.
|
|
|
|
Pairs `lats` and `lons` by index, drops invalid pairs, optionally
|
|
down-samples deterministically, and derives the geometry (bbox, aspect, pad)
|
|
a caller needs to draw the cloud. No raw rendering is performed.
|
|
|
|
Args:
|
|
lats: List (or tuple) of latitudes in degrees. Paired by index with
|
|
`lons`. A value that is None, NaN, infinite, bool or outside
|
|
[-90, 90] discards that pair. Read defensively.
|
|
lons: List (or tuple) of longitudes in degrees, parallel to `lats`. A
|
|
value outside [-180, 180] (or None/NaN/inf/bool) discards that pair.
|
|
max_points: Cap on the number of points returned. When the number of
|
|
valid pairs exceeds this cap, the points are down-sampled by a fixed
|
|
step `ceil(n_total / max_points)` taking `pairs[::step]` — DETERMINISTIC,
|
|
not random, so the output is reproducible. A non-positive or non-int
|
|
value disables down-sampling.
|
|
|
|
Returns:
|
|
Dict ready for a caller's ax.scatter:
|
|
{points: [[lon, lat], ...] (x=lon, y=lat order), n_total: valid pairs
|
|
before down-sampling, n_shown: points returned, downsampled: bool,
|
|
bbox: {lat_min, lat_max, lon_min, lon_max} or None, aspect: 1/cos(centroid
|
|
lat) clamped to [0.3, 5.0], pad: {lon, lat} ~5% of each range with a small
|
|
floor}. When there are no valid pairs returns points=[], n_total=0,
|
|
n_shown=0, downsampled=False, bbox=None, aspect=1.0, pad={lon:0.0, lat:0.0}.
|
|
"""
|
|
pairs = [] # each item is (lon, lat) — already in [x, y] order
|
|
if isinstance(lats, (list, tuple)) and isinstance(lons, (list, tuple)):
|
|
n = min(len(lats), len(lons))
|
|
for i in range(n):
|
|
lat = _coord(lats[i])
|
|
lon = _coord(lons[i])
|
|
if lat is None or lon is None:
|
|
continue
|
|
if lat < -90.0 or lat > 90.0:
|
|
continue
|
|
if lon < -180.0 or lon > 180.0:
|
|
continue
|
|
pairs.append((lon, lat))
|
|
|
|
n_total = len(pairs)
|
|
if n_total == 0:
|
|
return {
|
|
"points": [],
|
|
"n_total": 0,
|
|
"n_shown": 0,
|
|
"downsampled": False,
|
|
"bbox": None,
|
|
"aspect": 1.0,
|
|
"pad": {"lon": 0.0, "lat": 0.0},
|
|
}
|
|
|
|
# Deterministic down-sampling by a fixed step. Reproducible: same input ->
|
|
# same output, no randomness.
|
|
if (
|
|
isinstance(max_points, int)
|
|
and not isinstance(max_points, bool)
|
|
and max_points > 0
|
|
and n_total > max_points
|
|
):
|
|
step = math.ceil(n_total / max_points)
|
|
sampled = pairs[::step]
|
|
else:
|
|
sampled = pairs
|
|
|
|
points = [[lon, lat] for (lon, lat) in sampled]
|
|
n_shown = len(points)
|
|
downsampled = n_shown < n_total
|
|
|
|
lons_s = [p[0] for p in sampled]
|
|
lats_s = [p[1] for p in sampled]
|
|
lon_min, lon_max = min(lons_s), max(lons_s)
|
|
lat_min, lat_max = min(lats_s), max(lats_s)
|
|
bbox = {
|
|
"lat_min": lat_min,
|
|
"lat_max": lat_max,
|
|
"lon_min": lon_min,
|
|
"lon_max": lon_max,
|
|
}
|
|
|
|
# Aspect for an equirectangular projection: stretch the x axis by 1/cos(lat)
|
|
# at the cloud centroid so a degree of longitude reads at its real width.
|
|
centroid_lat = sum(lats_s) / len(lats_s)
|
|
cos_lat = math.cos(math.radians(centroid_lat))
|
|
if cos_lat < 1e-12: # centroid at (or numerically at) a pole
|
|
aspect = _ASPECT_MAX
|
|
else:
|
|
aspect = 1.0 / cos_lat
|
|
aspect = max(_ASPECT_MIN, min(_ASPECT_MAX, aspect))
|
|
|
|
# Padding ~5% of each range, with a small floor so a zero-range cloud (single
|
|
# point / all identical) still gets a non-zero margin.
|
|
pad_lon = max(0.05 * (lon_max - lon_min), _MIN_PAD)
|
|
pad_lat = max(0.05 * (lat_max - lat_min), _MIN_PAD)
|
|
|
|
return {
|
|
"points": points,
|
|
"n_total": n_total,
|
|
"n_shown": n_shown,
|
|
"downsampled": downsampled,
|
|
"bbox": bbox,
|
|
"aspect": aspect,
|
|
"pad": {"lon": pad_lon, "lat": pad_lat},
|
|
}
|