"""detect_latlon_columns — detect a (latitude, longitude) column pair in an EDA profile. Pure function: no I/O, deterministic. Takes the `columns` list of a TableProfile (group `eda`) and decides whether two of its columns form a geographic coordinate pair (latitude + longitude), combining a name heuristic with a value-range check. The detection is intentionally conservative: a name hint alone is never enough. A column is only accepted as latitude/longitude if its numeric range fits inside the valid coordinate bounds ([-90, 90] for latitude, [-180, 180] for longitude). When the `numeric` sub-block is absent the optional `samples` argument is used instead. Reading is fully defensive (.get throughout) and the function NEVER raises: any malformed input (None, non-list, non-dict entries, missing keys) simply yields a no-pair result {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": ...}. """ import re # Collapse the separators a column name may use (snake_case, kebab-case, spaces) # so that "y_coord", "y-coord" and "y coord" all normalize to the same token. _SEP_RE = re.compile(r"[\s_\-]+") # Name-match strengths: a strong, unambiguous coordinate name vs a weak generic # axis name (x / y) that only counts when the range also fits and a partner exists. _STRONG = 0.6 _WEAK = 0.3 _RANGE_BONUS = 0.4 # added once the mandatory range validation passes def _normalize(name): """Lowercase a column name and strip separator chars (_, -, whitespace).""" if not isinstance(name, str): return "" return _SEP_RE.sub("", name.strip().lower()) def _num(value): """Coerce to float defensively; return None for None/bool/non-numeric.""" # bool is a subclass of int; a coordinate value is never a real bool, so treat # True/False as missing instead of silently coercing to 1.0/0.0. if value is None or isinstance(value, bool): return None try: return float(value) except (TypeError, ValueError): return None def _lat_name_strength(nn): """Strength of a normalized name as a latitude candidate (0=no match).""" if not nn: return 0.0 # "lat", "latitude", "latitud" all contain the "lat" stem. if "lat" in nn: return _STRONG # Weak generic axis name: only useful when paired with an x/lon partner. if nn in ("y", "ycoord", "ycoordinate", "ycoordinates"): return _WEAK return 0.0 def _lon_name_strength(nn): """Strength of a normalized name as a longitude candidate (0=no match).""" if not nn: return 0.0 # "lon", "long", "longitude", "longitud" share the "lon" stem; "lng" is separate. if "lon" in nn or "lng" in nn: return _STRONG if nn in ("x", "xcoord", "xcoordinate", "xcoordinates"): return _WEAK return 0.0 def _col_range(col, sample_values): """Return (min, max) floats for a column, or (None, None) if not numeric. Prefers the `numeric` sub-block min/max (the output of describe_numeric); falls back to the provided sample list. A column is only treated as numeric when both extremes are derivable: from the numeric block, or from samples whose every non-null value coerces to a number. """ if isinstance(col, dict): numeric = col.get("numeric") if isinstance(numeric, dict): mn = _num(numeric.get("min")) mx = _num(numeric.get("max")) if mn is not None and mx is not None: return mn, mx # Fall back to samples when the numeric block is missing or incomplete. if isinstance(sample_values, (list, tuple)): non_null = [v for v in sample_values if v is not None] if non_null: coerced = [_num(v) for v in non_null] # Any non-numeric sample means we cannot trust the column as numeric. if all(c is not None for c in coerced): return min(coerced), max(coerced) return None, None def _no_pair(reason): """Canonical empty result: no coordinate pair detected.""" return {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": reason} def detect_latlon_columns(columns: list, samples: dict | None = None) -> dict: """Detect a (latitude, longitude) column pair from an eda TableProfile. Combines a name heuristic (latitude/longitude/lat/lon/lng + weak x/y) with a mandatory range validation: the chosen latitude must sit in [-90, 90] and the longitude in [-180, 180]. A name hint whose range does not fit is discarded. Both sides are required for success; if only one is found, no pair is returned. Args: columns: List of ColumnProfile dicts (the `columns` of a TableProfile). Each dict is read defensively with .get; only `name` is required. `numeric.min` / `numeric.max` (and optionally `inferred_type`) are used for the range check when present. samples: Optional {column_name: [values...]} used to validate the range when a column lacks `numeric.min`/`numeric.max`. If None/omitted, only the `numeric` sub-block is consulted. Returns: Always a dict {"lat_col": str|None, "lon_col": str|None, "confidence": float, "reason": str}. On success lat_col and lon_col name the detected pair (distinct columns) and confidence is in [0, 1]: a pair validated by a strong name on both sides scores ~1.0, a weak x/y pair ~0.7. On failure both columns are None and confidence is 0.0. """ if not isinstance(columns, (list, tuple)) or len(columns) == 0: return _no_pair("sin columnas que inspeccionar") sample_map = samples if isinstance(samples, dict) else {} # (column_name, confidence) for each side. Confidence already includes the # range bonus because membership in the list implies the range was validated. lat_candidates = [] lon_candidates = [] for col in columns: if not isinstance(col, dict): continue name = col.get("name") if not isinstance(name, str) or not name: continue nn = _normalize(name) lat_strength = _lat_name_strength(nn) lon_strength = _lon_name_strength(nn) if lat_strength == 0.0 and lon_strength == 0.0: continue # name gives no coordinate hint; skip. mn, mx = _col_range(col, sample_map.get(name)) is_numeric = mn is not None and mx is not None if not is_numeric: continue # range cannot be validated -> not a coordinate. if lat_strength > 0.0 and mn >= -90.0 and mx <= 90.0: lat_candidates.append((name, lat_strength + _RANGE_BONUS)) if lon_strength > 0.0 and mn >= -180.0 and mx <= 180.0: lon_candidates.append((name, lon_strength + _RANGE_BONUS)) if not lat_candidates and not lon_candidates: return _no_pair("ninguna columna sugiere latitud ni longitud por nombre+rango") if not lat_candidates: return _no_pair("no se encontro columna de latitud valida (nombre+rango en [-90,90])") if not lon_candidates: return _no_pair("no se encontro columna de longitud valida (nombre+rango en [-180,180])") # Pick the distinct pair with the highest combined confidence. First match wins # on ties to keep the result deterministic by input order. best = None # (combined, lat_name, lon_name, lat_c, lon_c) for lat_name, lat_c in lat_candidates: for lon_name, lon_c in lon_candidates: if lat_name == lon_name: continue # a column cannot be both axes of the same pair. combined = (lat_c + lon_c) / 2.0 if best is None or combined > best[0]: best = (combined, lat_name, lon_name, lat_c, lon_c) if best is None: return _no_pair("solo una columna sirve para ambos ejes; no hay par lat/lon distinto") combined, lat_name, lon_name, lat_c, lon_c = best confidence = max(0.0, min(1.0, combined)) lat_label = "fuerte" if lat_c >= 0.9 else "debil" lon_label = "fuerte" if lon_c >= 0.9 else "debil" reason = ( f"par lat='{lat_name}' (nombre {lat_label}) / lon='{lon_name}' " f"(nombre {lon_label}) con rango valido" ) return { "lat_col": lat_name, "lon_col": lon_name, "confidence": confidence, "reason": reason, }