# fn_registry/python/functions/datascience/detect_latlon_columns.py

"""detect_latlon_columns — detect a (latitude, longitude) column pair in an EDA profile.

Pure function: no I/O, deterministic. Takes the `columns` list of a TableProfile
(group `eda`) and decides whether two of its columns form a geographic coordinate
pair (latitude + longitude), combining a name heuristic with a value-range check.

The detection is intentionally conservative: a name hint alone is never enough. A
column is only accepted as latitude/longitude if its numeric range fits inside the
valid coordinate bounds ([-90, 90] for latitude, [-180, 180] for longitude). When
the `numeric` sub-block is absent the optional `samples` argument is used instead.

Reading is fully defensive (.get throughout) and the function NEVER raises: any
malformed input (None, non-list, non-dict entries, missing keys) simply yields a
no-pair result {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": ...}.
"""

import math
import re

# Collapse the separators a column name may use (snake_case, kebab-case, spaces)
# so that "y_coord", "y-coord" and "y coord" all normalize to the same token.
# The pattern matches one or more consecutive whitespace, underscore or hyphen
# characters; _normalize() replaces every such run with the empty string.
_SEP_RE = re.compile(r"[\s_\-]+")

# Name-match strengths: a strong, unambiguous coordinate name vs a weak generic
# axis name (x / y) that only counts when the range also fits and a partner exists.
# A candidate's per-side confidence is its name strength plus _RANGE_BONUS, i.e.
# 0.6 + 0.4 = 1.0 for a strong name and 0.3 + 0.4 = 0.7 for a weak one.
# detect_latlon_columns() labels a side "strong" when that sum is >= 0.9, so the
# values below must keep strong sums at or above 0.9 and weak sums below it.
_STRONG = 0.6
_WEAK = 0.3
_RANGE_BONUS = 0.4  # added once the mandatory range validation passes


def _normalize(name):
    """Lowercase a column name and strip separator chars (_, -, whitespace)."""
    if not isinstance(name, str):
        return ""
    return _SEP_RE.sub("", name.strip().lower())


def _num(value):
    """Coerce to float defensively; return None for None/bool/non-numeric."""
    # bool is a subclass of int; a coordinate value is never a real bool, so treat
    # True/False as missing instead of silently coercing to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _lat_name_strength(nn):
    """Strength of a normalized name as a latitude candidate (0=no match)."""
    if not nn:
        return 0.0
    # "lat", "latitude", "latitud" all contain the "lat" stem.
    if "lat" in nn:
        return _STRONG
    # Weak generic axis name: only useful when paired with an x/lon partner.
    if nn in ("y", "ycoord", "ycoordinate", "ycoordinates"):
        return _WEAK
    return 0.0


def _lon_name_strength(nn):
    """Strength of a normalized name as a longitude candidate (0=no match)."""
    if not nn:
        return 0.0
    # "lon", "long", "longitude", "longitud" share the "lon" stem; "lng" is separate.
    if "lon" in nn or "lng" in nn:
        return _STRONG
    if nn in ("x", "xcoord", "xcoordinate", "xcoordinates"):
        return _WEAK
    return 0.0


def _col_range(col, sample_values):
    """Return (min, max) floats for a column, or (None, None) if not numeric.

    Prefers the `numeric` sub-block min/max (the output of describe_numeric); falls
    back to the provided sample list. A column is only treated as numeric when both
    extremes are derivable: from the numeric block, or from samples whose every
    non-null value coerces to a number.
    """
    if isinstance(col, dict):
        numeric = col.get("numeric")
        if isinstance(numeric, dict):
            mn = _num(numeric.get("min"))
            mx = _num(numeric.get("max"))
            if mn is not None and mx is not None:
                return mn, mx
    # Fall back to samples when the numeric block is missing or incomplete.
    if isinstance(sample_values, (list, tuple)):
        non_null = [v for v in sample_values if v is not None]
        if non_null:
            coerced = [_num(v) for v in non_null]
            # Any non-numeric sample means we cannot trust the column as numeric.
            if all(c is not None for c in coerced):
                return min(coerced), max(coerced)
    return None, None


def _no_pair(reason):
    """Canonical empty result: no coordinate pair detected."""
    return {"lat_col": None, "lon_col": None, "confidence": 0.0, "reason": reason}


def detect_latlon_columns(columns: list, samples: dict | None = None) -> dict:
    """Detect a (latitude, longitude) column pair from an eda TableProfile.

    Every column whose name hints at a coordinate (latitude/lat,
    longitude/lon/lng, or the weak generic x/y axis names) is range-checked:
    a latitude candidate must lie within [-90, 90] and a longitude candidate
    within [-180, 180]. A name hint without a verifiable, fitting range is
    discarded. A result is only reported when two *distinct* columns cover
    both sides.

    Args:
        columns: List (or tuple) of ColumnProfile dicts. Non-dict entries and
            entries without a non-empty string ``name`` are skipped. The range
            is taken from ``numeric.min`` / ``numeric.max`` when available.
        samples: Optional ``{column_name: [values...]}`` used for the range
            check when a column has no usable ``numeric`` block. Anything that
            is not a dict is ignored.

    Returns:
        A dict ``{"lat_col", "lon_col", "confidence", "reason"}``. On success
        the two columns are named and ``confidence`` is the mean of the two
        per-side scores clamped to [0, 1]: 1.0 for two strong names, 0.85 for
        one strong and one weak, 0.7 for a weak x/y pair. On failure both
        columns are ``None``, ``confidence`` is 0.0 and ``reason`` explains
        which side was missing.
    """
    if not isinstance(columns, (list, tuple)) or not columns:
        return _no_pair("sin columnas que inspeccionar")

    sample_lookup = samples if isinstance(samples, dict) else {}

    # Each hit is (column_name, side_score). A column only becomes a hit after
    # its range has been validated, so the range bonus is already included.
    lat_hits = []
    lon_hits = []

    for entry in columns:
        col_name = entry.get("name") if isinstance(entry, dict) else None
        if not isinstance(col_name, str) or not col_name:
            continue

        key = _normalize(col_name)
        lat_score = _lat_name_strength(key)
        lon_score = _lon_name_strength(key)
        if lat_score == 0.0 and lon_score == 0.0:
            continue  # the name carries no coordinate hint

        low, high = _col_range(entry, sample_lookup.get(col_name))
        if low is None or high is None:
            continue  # no verifiable range -> cannot be accepted

        if lat_score > 0.0 and -90.0 <= low and high <= 90.0:
            lat_hits.append((col_name, lat_score + _RANGE_BONUS))
        if lon_score > 0.0 and -180.0 <= low and high <= 180.0:
            lon_hits.append((col_name, lon_score + _RANGE_BONUS))

    if not lat_hits:
        if not lon_hits:
            return _no_pair("ninguna columna sugiere latitud ni longitud por nombre+rango")
        return _no_pair("no se encontro columna de latitud valida (nombre+rango en [-90,90])")
    if not lon_hits:
        return _no_pair("no se encontro columna de longitud valida (nombre+rango en [-180,180])")

    # Enumerate every pairing of two different columns, latitude-major, in input
    # order. max() keeps the first item among equals, so ties are resolved by
    # input order and the result stays deterministic.
    pairings = [
        ((lat_score + lon_score) / 2.0, lat_name, lon_name, lat_score, lon_score)
        for lat_name, lat_score in lat_hits
        for lon_name, lon_score in lon_hits
        if lat_name != lon_name  # one column cannot serve as both axes
    ]
    if not pairings:
        return _no_pair("solo una columna sirve para ambos ejes; no hay par lat/lon distinto")

    mean_score, lat_name, lon_name, lat_score, lon_score = max(
        pairings, key=lambda pairing: pairing[0]
    )

    # A side score of at least 0.9 can only come from a strong name hint.
    lat_kind = "fuerte" if lat_score >= 0.9 else "debil"
    lon_kind = "fuerte" if lon_score >= 0.9 else "debil"

    return {
        "lat_col": lat_name,
        "lon_col": lon_name,
        "confidence": max(0.0, min(1.0, mean_score)),
        "reason": (
            f"par lat='{lat_name}' (nombre {lat_kind}) / lon='{lon_name}' "
            f"(nombre {lon_kind}) con rango valido"
        ),
    }