cd658cc703
Tres funciones puras nuevas del dominio datascience (tags eda + geospatial) que
sostienen el capítulo GEOSPATIAL del AutomaticEDA, delegadas a fn-constructor:
- detect_latlon_columns: identifica el par (lat, lon) por nombre de columna +
rango de valores ([-90,90] / [-180,180]) desde profile['columns']. Devuelve
{lat_col, lon_col, confidence, reason}. 9 tests.
- analyze_geo_extent: bbox, centroide, span haversine, conteo por zona/país
(lookup offline con bounding boxes embebidos, KISS sin geopandas) y
hemisferios. 7 tests.
- build_geo_scatter: prepara los puntos del scatter en orden [lon, lat] con
downsampling determinista por paso fijo + aspect equirectangular 1/cos(lat)
clampado. 6 tests.
Registradas en datascience/__init__.py. Todas pure, params_schema completo,
.md autosuficiente (Ejemplo + Cuando usarla + Gotchas).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
199 lines
8.2 KiB
Python
199 lines
8.2 KiB
Python
"""detect_latlon_columns — detect a (latitude, longitude) column pair in an EDA profile.
|
|
|
|
Pure function: no I/O, deterministic. Takes the `columns` list of a TableProfile
|
|
(group `eda`) and decides whether two of its columns form a geographic coordinate
|
|
pair (latitude + longitude), combining a name heuristic with a value-range check.
|
|
|
|
The detection is intentionally conservative: a name hint alone is never enough. A
|
|
column is only accepted as latitude/longitude if its numeric range fits inside the
|
|
valid coordinate bounds ([-90, 90] for latitude, [-180, 180] for longitude). When
|
|
the `numeric` sub-block is absent the optional `samples` argument is used instead.
|
|
|
|
Reading is fully defensive (.get throughout) and the function NEVER raises: any
|
|
malformed input (None, non-list, non-dict entries, missing keys) simply yields a
|
|
no-pair result {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": ...}.
|
|
"""
|
|
|
|
import re
|
|
|
|
# Runs of underscores, hyphens and whitespace are removed while normalizing a
# column name, so "y_coord", "y-coord" and "y coord" all end up as "ycoord".
_SEP_RE = re.compile(r"[\s_\-]+")

# Score contributed by the column name alone: an explicit coordinate word
# (lat / lon / lng ...) is a strong hint, a generic axis name (x / y) a weak one.
_STRONG = 0.6
_WEAK = 0.3

# Score added on top of the name score for every accepted candidate; a column
# is only accepted after its value range has been validated.
_RANGE_BONUS = 0.4
|
|
|
|
|
|
def _normalize(name):
    """Return a comparison token for a column name.

    The name is stripped, lowercased, and every run of separator characters
    (underscore, hyphen, whitespace) is removed. Anything that is not a string
    yields the empty string.
    """
    if isinstance(name, str):
        lowered = name.strip().lower()
        return _SEP_RE.sub("", lowered)
    return ""
|
|
|
|
|
|
def _num(value):
    """Coerce ``value`` to ``float`` without ever raising on conversion.

    Returns:
        The converted float, or ``None`` when ``value`` is ``None``, a ``bool``,
        or something ``float()`` cannot convert (wrong type, unparsable string,
        or an integer too large to be represented as a float).

    Note:
        Values that ``float()`` accepts as non-finite (for example the strings
        "nan" or "inf") are returned as floats, not mapped to ``None``.
    """
    # bool is a subclass of int; a coordinate value is never a real bool, so
    # True/False count as missing instead of silently becoming 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is raised for integers outside the float range
        # (e.g. 10**400); treat it like any other non-coercible value.
        return None
|
|
|
|
|
|
def _lat_name_strength(nn):
    """Score a normalized column name as a latitude hint.

    Returns ``_STRONG`` when the name contains the substring "lat" (which covers
    "lat", "latitude", "latitud"), ``_WEAK`` when it is exactly one of the
    generic y-axis names, and 0.0 otherwise (including for an empty name).

    The strong test is a plain substring match, so any name containing "lat"
    scores as strong; the caller's range validation is what filters those out.
    """
    if nn and "lat" in nn:
        return _STRONG
    # Generic axis names are only a weak hint.
    if nn in ("y", "ycoord", "ycoordinate", "ycoordinates"):
        return _WEAK
    return 0.0
|
|
|
|
|
|
def _lon_name_strength(nn):
    """Score a normalized column name as a longitude hint.

    Returns ``_STRONG`` when the name contains the substring "lon" (which covers
    "lon", "long", "longitude", "longitud") or "lng", ``_WEAK`` when it is
    exactly one of the generic x-axis names, and 0.0 otherwise (including for an
    empty name).

    The strong test is a plain substring match, so any name containing "lon" or
    "lng" scores as strong; the caller's range validation filters those out.
    """
    if nn and ("lon" in nn or "lng" in nn):
        return _STRONG
    # Generic axis names are only a weak hint.
    if nn in ("x", "xcoord", "xcoordinate", "xcoordinates"):
        return _WEAK
    return 0.0
|
|
|
|
|
|
def _col_range(col, sample_values):
    """Return ``(min, max)`` as floats for a column, or ``(None, None)``.

    The ``numeric`` sub-block of ``col`` is preferred: when both its ``min`` and
    ``max`` coerce to real (non-NaN) numbers they are returned directly.
    Otherwise the range is derived from ``sample_values``.

    Args:
        col: Column-profile dict; any other type simply skips the numeric block.
        sample_values: Optional list/tuple of raw values for the column.

    Returns:
        ``(min, max)`` when the range is derivable, else ``(None, None)``. With
        samples, ``None`` and NaN entries are treated as missing and ignored; a
        single value that does not coerce to a number disqualifies the column,
        and so does a sample list with no usable value at all.
    """
    if isinstance(col, dict):
        numeric = col.get("numeric")
        if isinstance(numeric, dict):
            mn = _num(numeric.get("min"))
            mx = _num(numeric.get("max"))
            # `x == x` is False only for NaN: a NaN extreme carries no range
            # information, so the block counts as incomplete.
            if mn is not None and mx is not None and mn == mn and mx == mx:
                return mn, mx

    # Fall back to samples when the numeric block is missing or incomplete.
    if isinstance(sample_values, (list, tuple)):
        usable = []
        for raw in sample_values:
            if raw is None:
                continue
            number = _num(raw)
            if number is None:
                # Any non-numeric sample means the column cannot be trusted
                # as numeric.
                return None, None
            if number != number:
                # NaN is a missing-value marker. Feeding it to min()/max()
                # would make the result depend on where it sits in the list.
                continue
            usable.append(number)
        if usable:
            return min(usable), max(usable)
    return None, None
|
|
|
|
|
|
def _no_pair(reason):
    """Build the "no coordinate pair detected" result carrying ``reason``."""
    return dict(lat_col=None, lon_col=None, confidence=0.0, reason=reason)
|
|
|
|
|
|
def detect_latlon_columns(columns: list, samples: dict | None = None) -> dict:
    """Find a (latitude, longitude) column pair among the columns of a profile.

    A column becomes a candidate only when two conditions hold together: its
    normalized name hints at a coordinate (strongly, e.g. lat/lon/lng, or
    weakly, e.g. x/y) and its numeric range fits the bounds of that axis:
    [-90, 90] for latitude, [-180, 180] for longitude. Columns whose range
    cannot be determined are skipped. A pair is reported only when a latitude
    candidate and a longitude candidate with different column names exist.

    Args:
        columns: List (or tuple) of column-profile dicts. Entries that are not
            dicts, or whose ``name`` is not a non-empty string, are ignored.
            The range is taken from ``numeric.min`` / ``numeric.max`` when
            those are usable.
        samples: Optional ``{column_name: [values, ...]}`` mapping, looked up by
            the raw column name, used for the range of columns without a usable
            ``numeric`` block. Anything that is not a dict means "no samples".

    Returns:
        A dict with the keys ``lat_col``, ``lon_col``, ``confidence`` and
        ``reason``. On success the two column names are set and ``confidence``
        is the mean of both candidates' scores (name strength plus range
        bonus) clamped to [0, 1]; with the module's constants that is 1.0 for
        two strong names and 0.7 for a weak x/y pair. On failure both columns
        are ``None``, ``confidence`` is 0.0 and ``reason`` says what was missing.
    """
    if not isinstance(columns, (list, tuple)) or not columns:
        return _no_pair("sin columnas que inspeccionar")

    sample_map = {}
    if isinstance(samples, dict):
        sample_map = samples

    # Each entry is (column_name, score). Only range-validated columns get in,
    # so the score always includes the range bonus.
    lat_candidates = []
    lon_candidates = []
    for entry in columns:
        name = entry.get("name") if isinstance(entry, dict) else None
        if not isinstance(name, str) or not name:
            continue

        token = _normalize(name)
        lat_hint = _lat_name_strength(token)
        lon_hint = _lon_name_strength(token)
        if not (lat_hint or lon_hint):
            continue  # the name carries no coordinate hint at all

        low, high = _col_range(entry, sample_map.get(name))
        if low is None or high is None:
            continue  # no range to validate, so it cannot be a coordinate

        if lat_hint > 0.0 and -90.0 <= low and high <= 90.0:
            lat_candidates.append((name, lat_hint + _RANGE_BONUS))
        if lon_hint > 0.0 and -180.0 <= low and high <= 180.0:
            lon_candidates.append((name, lon_hint + _RANGE_BONUS))

    if not lat_candidates:
        if not lon_candidates:
            return _no_pair("ninguna columna sugiere latitud ni longitud por nombre+rango")
        return _no_pair("no se encontro columna de latitud valida (nombre+rango en [-90,90])")
    if not lon_candidates:
        return _no_pair("no se encontro columna de longitud valida (nombre+rango en [-180,180])")

    # Enumerate every pair of differently named columns, latitude-major, in
    # input order. max() keeps the first of several equal maxima, so ties are
    # resolved by input order and the result stays deterministic.
    pairings = [
        ((lat_score + lon_score) / 2.0, lat_key, lon_key, lat_score, lon_score)
        for lat_key, lat_score in lat_candidates
        for lon_key, lon_score in lon_candidates
        if lat_key != lon_key  # one column cannot serve as both axes
    ]
    if not pairings:
        return _no_pair("solo una columna sirve para ambos ejes; no hay par lat/lon distinto")

    combined, lat_name, lon_name, lat_c, lon_c = max(pairings, key=lambda item: item[0])

    lat_label = "fuerte" if lat_c >= 0.9 else "debil"
    lon_label = "fuerte" if lon_c >= 0.9 else "debil"
    reason = (
        f"par lat='{lat_name}' (nombre {lat_label}) / lon='{lon_name}' "
        f"(nombre {lon_label}) con rango valido"
    )

    return {
        "lat_col": lat_name,
        "lon_col": lon_name,
        "confidence": max(0.0, min(1.0, combined)),
        "reason": reason,
    }
|