# fn_registry/python/functions/datascience/fuzzy_merge_adaptive.py

"""Adaptive fuzzy merge with multiple thresholds using rapidfuzz."""

from __future__ import annotations

from typing import Iterable


def fuzzy_merge_adaptive(
    left: list[dict],
    right: list[dict],
    left_key: str,
    right_key: str,
    thresholds: list[int] | None = None,
    how: str = "left",
) -> list[dict]:
    """Fuzzy-join two lists of dicts using a ladder of score thresholds.

    For every row in ``left`` the best candidate among the ``right`` key
    values is found with ``rapidfuzz.fuzz.token_sort_ratio``. The match is
    accepted if its score reaches at least one threshold, and
    ``threshold_used`` records the highest threshold it reaches. Keys are
    compared as strings (``str(value)``); rows whose key is missing or
    ``None`` never take part in matching.

    Args:
        left: Rows on the left side of the join.
        right: Rows on the right side of the join.
        left_key: Key in the ``left`` rows used for matching.
        right_key: Key in the ``right`` rows used for matching.
        thresholds: Minimum scores to try. They are evaluated from highest
            to lowest regardless of the order given. Defaults to
            ``[90, 80, 70, 60, 50]``. An empty list means nothing matches.
        how: ``"inner"`` drops left rows without a match. Any other value
            behaves like ``"left"``: every left row is kept and the right
            columns are ``None`` when there is no match.

    Returns:
        One dict per emitted left row containing the left fields, the right
        fields, and ``fuzzy_match`` (matched right key string or ``None``),
        ``match_score`` (scorer result, ``0`` when unmatched) and
        ``threshold_used`` (``None`` when unmatched). A field name that
        occurs on both sides (in any row) is emitted as ``<name>_left`` and
        ``<name>_right``. Every output row carries every right column seen
        in ``right``; columns the matched row lacks are ``None``.

    Raises:
        ImportError: If ``rapidfuzz`` is not installed and there is at least
            one right-side candidate to match against.
    """
    if thresholds is None:
        thresholds = [90, 80, 70, 60, 50]
    # Strictest first, whatever order the caller used, so the first threshold
    # a score satisfies is also the highest one it satisfies.
    ordered_thresholds = sorted(thresholds, reverse=True)

    # Index right rows by their stringified key. When several rows share the
    # same key string, the last one is the row that gets merged.
    right_index: dict[str, dict] = {}
    for right_row in right:
        key_value = right_row.get(right_key)
        if key_value is not None:
            right_index[str(key_value)] = right_row

    # Unique candidate strings in first-seen order; duplicates would only be
    # scored again for an identical result.
    candidates = list(right_index)

    if candidates:
        # Only needed when there is something to score against.
        from rapidfuzz import fuzz, process

    def find_best_match(value: object) -> tuple[str | None, float, int | None]:
        """Return (matched string, score, threshold) or (None, 0, None)."""
        if value is None or not candidates:
            return None, 0, None
        result = process.extractOne(
            str(value), candidates, scorer=fuzz.token_sort_ratio
        )
        if not result:
            return None, 0, None
        match_str, score = result[0], result[1]
        for threshold in ordered_thresholds:
            if score >= threshold:
                return match_str, score, threshold
        return None, 0, None

    # Collect column names across ALL rows (not just the first one) so that
    # a name shared by both sides is always suffixed and never overwritten.
    # dict.fromkeys keeps first-seen order, which makes output deterministic.
    left_columns = dict.fromkeys(k for row in left for k in row)
    right_columns = dict.fromkeys(k for row in right for k in row)
    collision_keys = left_columns.keys() & right_columns.keys()

    result_rows = []
    for item in left:
        fuzzy_match, score, threshold_used = find_best_match(item.get(left_key))

        if fuzzy_match is None and how == "inner":
            continue

        row: dict = {
            (f"{k}_left" if k in collision_keys else k): v
            for k, v in item.items()
        }

        # Index keys are always str, so a None (no match) lookup falls back to
        # the empty dict, while an empty-string match is still honoured.
        matched_right = right_index.get(fuzzy_match, {})
        for k in right_columns:
            out_key = f"{k}_right" if k in collision_keys else k
            row[out_key] = matched_right.get(k)

        row["fuzzy_match"] = fuzzy_match
        row["match_score"] = score
        row["threshold_used"] = threshold_used
        result_rows.append(row)

    return result_rows