Files
fn_registry/python/functions/datascience/detect_latlon_columns.py
T
egutierrez cd658cc703 feat(eda): primitivas geoespaciales del grupo eda (detección lat/lon + extensión + scatter)
Tres funciones puras nuevas del dominio datascience (tags eda + geospatial) que
sostienen el capítulo GEOSPATIAL del AutomaticEDA, delegadas a fn-constructor:

- detect_latlon_columns: identifica el par (lat, lon) por nombre de columna +
  rango de valores ([-90,90] / [-180,180]) desde profile['columns']. Devuelve
  {lat_col, lon_col, confidence, reason}. 9 tests.
- analyze_geo_extent: bbox, centroide, span haversine, conteo por zona/país
  (lookup offline con bounding boxes embebidos, KISS sin geopandas) y
  hemisferios. 7 tests.
- build_geo_scatter: prepara los puntos del scatter en orden [lon, lat] con
  downsampling determinista por paso fijo + aspect equirectangular 1/cos(lat)
  clampado. 6 tests.

Registradas en datascience/__init__.py. Todas pure, params_schema completo,
.md autosuficiente (Ejemplo + Cuando usarla + Gotchas).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 15:29:33 +02:00

199 lines
8.2 KiB
Python

"""detect_latlon_columns — detect a (latitude, longitude) column pair in an EDA profile.
Pure function: no I/O, deterministic. Takes the `columns` list of a TableProfile
(group `eda`) and decides whether two of its columns form a geographic coordinate
pair (latitude + longitude), combining a name heuristic with a value-range check.
The detection is intentionally conservative: a name hint alone is never enough. A
column is only accepted as latitude/longitude if its numeric range fits inside the
valid coordinate bounds ([-90, 90] for latitude, [-180, 180] for longitude). When
the `numeric` sub-block is absent the optional `samples` argument is used instead.
Reading is fully defensive (.get throughout) and the function NEVER raises: any
malformed input (None, non-list, non-dict entries, missing keys) simply yields a
no-pair result {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": ...}.
"""
import re
# Separator runs (whitespace, underscore, hyphen) that a column name may use.
# Removing them makes "y_coord", "y-coord" and "y coord" normalize to the same
# token, so the name heuristics only have to know one spelling.
_SEP_RE = re.compile(r"[\s_\-]+")
# Name-match strengths. A strong match is an unambiguous coordinate name; a weak
# match is a generic axis name (x / y). Either one only becomes a candidate once
# the column's value range has been validated as well.
_STRONG = 0.6
_WEAK = 0.3
# Added to the name strength once the mandatory range validation passes, so a
# validated strong name scores 1.0 and a validated weak name scores 0.7.
_RANGE_BONUS = 0.4
def _normalize(name):
    """Return `name` trimmed, lowercased and with all separator runs removed.

    Separators are whitespace, underscores and hyphens (see `_SEP_RE`). Anything
    that is not a string normalizes to the empty string.
    """
    if isinstance(name, str):
        lowered = name.strip().lower()
        return _SEP_RE.sub("", lowered)
    return ""
def _num(value):
"""Coerce to float defensively; return None for None/bool/non-numeric."""
# bool is a subclass of int; a coordinate value is never a real bool, so treat
# True/False as missing instead of silently coercing to 1.0/0.0.
if value is None or isinstance(value, bool):
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def _lat_name_strength(nn):
    """Score a normalized column name as a latitude candidate.

    Returns `_STRONG` when the name contains the "lat" stem (this is a substring
    test, so "lat", "latitude" and "latitud" all match), `_WEAK` for a generic
    y-axis name, and 0.0 when there is no hint or the name is empty.
    """
    if not nn:
        return 0.0
    # Weak generic axis names: only useful when paired with an x/lon partner.
    weak_aliases = ("y", "ycoord", "ycoordinate", "ycoordinates")
    if "lat" in nn:
        strength = _STRONG
    elif nn in weak_aliases:
        strength = _WEAK
    else:
        strength = 0.0
    return strength
def _lon_name_strength(nn):
    """Score a normalized column name as a longitude candidate.

    Returns `_STRONG` when the name contains the "lon" stem ("lon", "long",
    "longitude", "longitud") or the separate "lng" spelling, `_WEAK` for a
    generic x-axis name, and 0.0 when there is no hint or the name is empty.
    """
    if not nn:
        return 0.0
    # Weak generic axis names: only useful when paired with a y/lat partner.
    weak_aliases = ("x", "xcoord", "xcoordinate", "xcoordinates")
    has_strong_stem = "lon" in nn or "lng" in nn
    if has_strong_stem:
        strength = _STRONG
    elif nn in weak_aliases:
        strength = _WEAK
    else:
        strength = 0.0
    return strength
def _col_range(col, sample_values):
    """Return the (min, max) of a column as floats, or (None, None) if unknown.

    The `numeric` sub-block of the column profile is consulted first: its
    `min`/`max` are used when both coerce to real (non-NaN) numbers. Otherwise
    the optional sample list is used. None and NaN samples are treated as
    missing values; any other sample that does not coerce to a number means the
    column cannot be trusted as numeric.

    Args:
        col: Column profile; anything that is not a dict is ignored.
        sample_values: Optional list/tuple of raw values for the column.

    Returns:
        (min, max) as floats, or (None, None) when no range can be derived.
    """
    if isinstance(col, dict):
        numeric = col.get("numeric")
        if isinstance(numeric, dict):
            low = _num(numeric.get("min"))
            high = _num(numeric.get("max"))
            # `x == x` is False only for NaN. A NaN extreme carries no range
            # information, so the block counts as incomplete and we fall back
            # to the samples below.
            if (
                low is not None
                and high is not None
                and low == low
                and high == high
            ):
                return low, high

    # Fall back to samples when the numeric block is missing or incomplete.
    if not isinstance(sample_values, (list, tuple)):
        return None, None

    observed = []
    for raw in sample_values:
        if raw is None:
            continue
        number = _num(raw)
        if number is None:
            # Any non-numeric sample means we cannot trust the column as numeric.
            return None, None
        if number != number:
            # NaN is a missing-value marker. Keeping it would make min()/max()
            # depend on where the NaN sits in the list.
            continue
        observed.append(number)

    if not observed:
        return None, None
    return min(observed), max(observed)
def _no_pair(reason):
"""Canonical empty result: no coordinate pair detected."""
return {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": reason}
def detect_latlon_columns(columns: list, samples: dict | None = None) -> dict:
    """Detect a (latitude, longitude) column pair from an eda TableProfile.

    A column becomes a candidate only when its name hints at a coordinate
    (latitude/longitude/lat/lon/lng, or the weak x/y axis names) AND its value
    range fits the coordinate bounds: [-90, 90] for latitude, [-180, 180] for
    longitude. A name hint whose range cannot be validated is discarded. Both
    sides must be found, on distinct columns, for a pair to be returned.

    Args:
        columns: List (or tuple) of ColumnProfile dicts. Each entry is read
            defensively with .get; entries that are not dicts or lack a
            non-empty string `name` are skipped. `numeric.min` / `numeric.max`
            drive the range check when present.
        samples: Optional {column_name: [values...]} used for the range check
            when a column lacks a usable `numeric` block. Anything that is not
            a dict is ignored.

    Returns:
        Always a dict {"lat_col", "lon_col", "confidence", "reason"}. On success
        the two columns are distinct and confidence is the average of the two
        per-side scores clamped to [0, 1] (1.0 for two strong names, 0.7 for a
        weak x/y pair). On failure both columns are None and confidence is 0.0.
    """
    if not isinstance(columns, (list, tuple)) or not columns:
        return _no_pair("sin columnas que inspeccionar")

    sample_lookup = samples if isinstance(samples, dict) else {}

    # Each hit is (column_name, score). The score already contains the range
    # bonus, because a column is only recorded after its range was validated.
    lat_hits = []
    lon_hits = []
    for entry in columns:
        if not isinstance(entry, dict):
            continue
        col_name = entry.get("name")
        if not (isinstance(col_name, str) and col_name):
            continue

        key = _normalize(col_name)
        lat_score = _lat_name_strength(key)
        lon_score = _lon_name_strength(key)
        if lat_score == 0.0 and lon_score == 0.0:
            continue  # the name gives no coordinate hint

        low, high = _col_range(entry, sample_lookup.get(col_name))
        if low is None or high is None:
            continue  # range cannot be validated -> not a coordinate

        if lat_score > 0.0 and low >= -90.0 and high <= 90.0:
            lat_hits.append((col_name, lat_score + _RANGE_BONUS))
        if lon_score > 0.0 and low >= -180.0 and high <= 180.0:
            lon_hits.append((col_name, lon_score + _RANGE_BONUS))

    if not lat_hits:
        if not lon_hits:
            return _no_pair("ninguna columna sugiere latitud ni longitud por nombre+rango")
        return _no_pair("no se encontro columna de latitud valida (nombre+rango en [-90,90])")
    if not lon_hits:
        return _no_pair("no se encontro columna de longitud valida (nombre+rango en [-180,180])")

    # Enumerate every distinct (lat, lon) pairing in input order. max() returns
    # the first maximal item, so ties resolve deterministically by input order.
    pairings = [
        ((lat_c + lon_c) / 2.0, lat_name, lon_name, lat_c, lon_c)
        for lat_name, lat_c in lat_hits
        for lon_name, lon_c in lon_hits
        if lat_name != lon_name  # a column cannot be both axes of the same pair
    ]
    if not pairings:
        return _no_pair("solo una columna sirve para ambos ejes; no hay par lat/lon distinto")

    combined, lat_name, lon_name, lat_c, lon_c = max(pairings, key=lambda p: p[0])

    def _label(score):
        # A side counts as "strong" when its validated score reaches 0.9.
        return "fuerte" if score >= 0.9 else "debil"

    lat_label = _label(lat_c)
    lon_label = _label(lon_c)
    return {
        "lat_col": lat_name,
        "lon_col": lon_name,
        "confidence": max(0.0, min(1.0, combined)),
        "reason": (
            f"par lat='{lat_name}' (nombre {lat_label}) / lon='{lon_name}' "
            f"(nombre {lon_label}) con rango valido"
        ),
    }