"""Fuzzy merge adaptativo con multiples thresholds usando rapidfuzz.""" from __future__ import annotations from typing import Iterable def fuzzy_merge_adaptive( left: list[dict], right: list[dict], left_key: str, right_key: str, thresholds: list[int] | None = None, how: str = "left", ) -> list[dict]: """Realiza un fuzzy join adaptativo entre dos listas de dicts. Para cada item en left busca en right el mejor match usando rapidfuzz.fuzz.token_sort_ratio. Prueba thresholds de mayor a menor y asigna threshold_used al mayor threshold cumplido. Si no cumple ninguno, match es None. Args: left: Lista de dicts (lado izquierdo del join). right: Lista de dicts (lado derecho del join). left_key: Clave en los dicts de left usada para matching. right_key: Clave en los dicts de right usada para matching. thresholds: Thresholds a probar en orden descendente. Default [90, 80, 70, 60, 50]. how: Tipo de join. 'left' incluye todos los items de left (con None en campos de right si no hay match). 'inner' incluye solo items con match. Returns: Lista de dicts mergeados con campos de left + campos de right (sufijos _left/_right si colisionan) + fuzzy_match, match_score, threshold_used. """ from rapidfuzz import fuzz, process if thresholds is None: thresholds = [90, 80, 70, 60, 50] right_values = [ str(r[right_key]) for r in right if r.get(right_key) is not None ] def find_best_match(value: str | None) -> tuple[str | None, int, int | None]: if value is None: return None, 0, None result = process.extractOne(str(value), right_values, scorer=fuzz.token_sort_ratio) if not result: return None, 0, None match_str, score = result[0], result[1] for t in thresholds: if score >= t: return match_str, score, t return None, 0, None # Detectar colisiones de claves left_keys = set(left[0].keys()) if left else set() right_keys = set(right[0].keys()) if right else set() collision_keys = left_keys & right_keys # Construir indice de right por right_key right_index: dict[str, dict] = {} for r in right: val = r.get(right_key) if val is not None: right_index[str(val)] = r result_rows = [] for item in left: value = item.get(left_key) fuzzy_match, score, threshold_used = find_best_match(value) if fuzzy_match is None and how == "inner": continue row: dict = {} # Campos de left for k, v in item.items(): if k in collision_keys: row[f"{k}_left"] = v else: row[k] = v # Campos de right matched_right = right_index.get(fuzzy_match) if fuzzy_match else None if matched_right is not None: for k, v in matched_right.items(): if k in collision_keys: row[f"{k}_right"] = v else: row[k] = v else: for k in right_keys: if k in collision_keys: row[f"{k}_right"] = None else: row[k] = None row["fuzzy_match"] = fuzzy_match row["match_score"] = score row["threshold_used"] = threshold_used result_rows.append(row) return result_rows