82 lines
2.5 KiB
Python
82 lines
2.5 KiB
Python
"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
|
|
|
|
import re
|
|
|
|
|
|
# One leading honorific / professional / military title, with an optional
# trailing dot, followed by whitespace. Anchored at the start of the string.
_TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
)

# One trailing legal form or generic corporate word, with an optional
# trailing dot. Anchored at the end of the string.
_LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
)

# Any run of whitespace; used to collapse it to a single space.
_MULTI_SPACE = re.compile(r"\s+")


def _strip_leading_titles(text: str) -> str:
    """Remove every title stacked at the start of ``text`` (e.g. "Prof. Dr.")."""
    while True:
        shorter = _TITLES.sub("", text, count=1)
        if shorter == text:
            return text
        # _TITLES also consumes the whitespace after the title, so the next
        # title (if any) is again at the very start of the string.
        text = shorter


def _normalize_person(name: str) -> str:
    """Normalize a human name: drop titles, undo "Last, First", title-case."""
    result = _strip_leading_titles(name).strip()

    # "Last, First" -> "First Last" (split on the first comma only).
    if "," in result:
        last, first = (part.strip() for part in result.split(",", 1))
        result = f"{first} {last}"

    result = _MULTI_SPACE.sub(" ", result).strip()

    # A title can surface at the start only after the reorder
    # ("Smith, Dr. John" -> "Dr. John Smith"), so strip once more; otherwise
    # the two spellings of the same person would normalize differently.
    result = _strip_leading_titles(result)
    return result.title()


def _normalize_organization(name: str) -> str:
    """Normalize a company name: drop one trailing legal suffix, title-case."""
    collapsed = _MULTI_SPACE.sub(" ", name).strip()

    # Drop the suffix, then the separator that preceded it, so that
    # "Acme, Inc." and "Acme Inc" both become "Acme".
    result = _LEGAL_SUFFIXES.sub("", collapsed, count=1).rstrip(" ,")

    # A name made only of a suffix word ("Group", "Solutions") must not
    # normalize to "" -- that would merge unrelated entities under one key.
    if not result:
        result = collapsed

    return result.title()


def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    Different rules apply depending on the entity type:

    - ip / email: strip and lowercase.
    - domain: strip, lowercase, drop trailing dots and a leading "www.".
    - crypto_wallet: strip only (addresses are case-sensitive).
    - phone: keep only digits and "+" characters.
    - person: remove leading titles, turn "Last, First" into "First Last",
      collapse whitespace, title-case.
    - organization: remove one trailing legal suffix (and the comma before
      it), collapse whitespace, title-case. A name consisting solely of a
      suffix word is kept as is instead of becoming empty.
    - anything else: lowercase, strip, collapse whitespace.

    Args:
        name: entity name to normalize.
        entity_type: entity type (ip, email, domain, crypto_wallet, phone,
            person, organization); matched case-insensitively after
            stripping. Empty or unknown selects the default rule.

    Returns:
        The normalized name.
    """
    name = name.strip()
    et = entity_type.lower().strip()

    if et in ("ip", "email"):
        return name.lower()

    if et == "domain":
        result = name.lower().rstrip(".")
        if result.startswith("www."):
            result = result[4:]
        return result

    if et == "crypto_wallet":
        # Bitcoin addresses are case-sensitive: stripping is all we may do.
        return name

    if et == "phone":
        # Keep only digits and the plus sign.
        return re.sub(r"[^\d+]", "", name)

    if et == "person":
        return _normalize_person(name)

    if et == "organization":
        return _normalize_organization(name)

    # Default: lowercase, strip, collapse whitespace.
    return _MULTI_SPACE.sub(" ", name.lower()).strip()
|