feat: funciones Python datascience, finance, cybersecurity y pipelines
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift, diff_entities/relations, extract_entities/relations_llm, hotness_score, melt, merge_graphs, pivot, build_entity/relation_schema_prompt. Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order, hawkes_intensity + módulo finance.py. Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py. Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
---
|
||||
name: aggregate_by_group
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def aggregate_by_group(rows: list[dict], group_by: list[str], aggs: dict[str, str]) -> list[dict]"
|
||||
description: "GROUP BY + agregaciones sobre datos tabulares. aggs es un dict de columna a funcion (sum, mean, count, min, max, first, last, collect). collect acumula valores en lista. None se ignora en agregaciones numericas."
|
||||
tags: [datascience, tabular, groupby, aggregate, transform, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["collections"]
|
||||
tested: true
|
||||
tests:
|
||||
- "Group by una columna con sum"
|
||||
- "Group by multiples columnas"
|
||||
- "Agregacion mean count min max"
|
||||
- "collect acumula en lista"
|
||||
- "Grupo con una sola fila"
|
||||
- "Campo con None se ignora en agregaciones numericas"
|
||||
test_file_path: "python/functions/datascience/aggregate_by_group_test.py"
|
||||
file_path: "python/functions/datascience/aggregate_by_group.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
rows = [
|
||||
{"dept": "eng", "salary": 100},
|
||||
{"dept": "eng", "salary": 120},
|
||||
{"dept": "sales", "salary": 80},
|
||||
]
|
||||
aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "mean"})
|
||||
# [{"dept": "eng", "salary": 110.0}, {"dept": "sales", "salary": 80.0}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin dependencias externas (solo collections.defaultdict de stdlib).
|
||||
Preserva el orden de primera aparicion de cada grupo.
|
||||
La funcion 'collect' no filtra None — acumula todos los valores incluyendo None.
|
||||
@@ -0,0 +1,71 @@
|
||||
"""GROUP BY + agregaciones sobre datos tabulares list[dict]."""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def aggregate_by_group(
    rows: list[dict],
    group_by: list[str],
    aggs: dict[str, str],
) -> list[dict]:
    """Group rows by one or more columns and aggregate other columns.

    Equivalent to SQL ``GROUP BY`` with aggregate functions, applied to
    tabular data represented as a list of dicts.

    Aggregation semantics:
      * ``collect`` returns every value of the group in a list, None included.
      * ``count`` returns the number of rows in the group, None included.
      * ``first`` / ``last`` return the first / last raw value of the group.
      * ``sum``, ``mean``, ``min``, ``max`` ignore None values and return
        None when the group has no non-None value.

    Args:
        rows: Input rows. A key missing from a row is read as None.
        group_by: Column names that together form the group key.
        aggs: Mapping ``{column: function}`` where function is one of
            sum, mean, count, min, max, first, last, collect.

    Returns:
        One dict per group holding the ``group_by`` columns plus one entry
        per aggregated column. Groups appear in order of first appearance.

    Raises:
        ValueError: If ``aggs`` names an unsupported function. This is
            checked before touching the data, so the outcome does not
            depend on whether a group happens to contain only None values.
    """
    supported = ("collect", "count", "first", "last", "sum", "mean", "min", "max")
    for func in aggs.values():
        if func not in supported:
            raise ValueError(f"Funcion de agregacion no soportada: {func}")

    # A plain dict keeps insertion order, which gives "first appearance"
    # ordering of the groups without a separate key list.
    buckets: dict[tuple, dict[str, list]] = {}
    for row in rows:
        group_key = tuple(row.get(col) for col in group_by)
        bucket = buckets.get(group_key)
        if bucket is None:
            bucket = buckets[group_key] = {col: [] for col in aggs}
        for col, values in bucket.items():
            values.append(row.get(col))

    def _aggregate(vals: list, func: str):
        """Reduce the collected values of one column with ``func``."""
        if func == "collect":
            return vals
        if func == "count":
            return len(vals)
        if func == "first":
            return vals[0] if vals else None
        if func == "last":
            return vals[-1] if vals else None

        # sum, mean, min, max: None values are skipped.
        present = [v for v in vals if v is not None]
        if not present:
            return None
        if func == "sum":
            return sum(present)
        if func == "mean":
            return sum(present) / len(present)
        if func == "min":
            return min(present)
        if func == "max":
            return max(present)
        # Unreachable after the up-front validation; kept as a safety net.
        raise ValueError(f"Funcion de agregacion no soportada: {func}")

    result = []
    for group_key, bucket in buckets.items():
        record: dict = dict(zip(group_by, group_key))
        for col, func in aggs.items():
            record[col] = _aggregate(bucket[col], func)
        result.append(record)

    return result
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Tests para aggregate_by_group."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from aggregate_by_group import aggregate_by_group
|
||||
|
||||
|
||||
def test_group_by_una_columna_con_sum():
    """Group by a single column and sum another one."""
    payroll = [
        {"dept": dept, "salary": salary}
        for dept, salary in (("eng", 100), ("eng", 120), ("sales", 80))
    ]
    grouped = aggregate_by_group(payroll, group_by=["dept"], aggs={"salary": "sum"})
    assert len(grouped) == 2
    eng_row = next(row for row in grouped if row["dept"] == "eng")
    sales_row = next(row for row in grouped if row["dept"] == "sales")
    assert eng_row["salary"] == 220
    assert sales_row["salary"] == 80
|
||||
|
||||
|
||||
def test_group_by_multiples_columnas():
    """Group by several columns at once."""
    payroll = [
        {"dept": dept, "level": level, "salary": salary}
        for dept, level, salary in (
            ("eng", "senior", 150),
            ("eng", "junior", 80),
            ("eng", "senior", 160),
            ("sales", "senior", 120),
        )
    ]
    grouped = aggregate_by_group(
        payroll, group_by=["dept", "level"], aggs={"salary": "sum"}
    )
    assert len(grouped) == 3
    eng_senior_row = next(
        row for row in grouped if row["dept"] == "eng" and row["level"] == "senior"
    )
    assert eng_senior_row["salary"] == 310
|
||||
|
||||
|
||||
def test_agregacion_mean_count_min_max():
    """mean, count, min and max over a single group."""
    samples = [{"cat": "A", "val": value} for value in (10, 20, 30)]
    # Checked in the same order as listed: mean, count, min, max.
    expectations = (("mean", 20.0), ("count", 3), ("min", 10), ("max", 30))
    for func_name, expected in expectations:
        outcome = aggregate_by_group(samples, group_by=["cat"], aggs={"val": func_name})
        assert outcome[0]["val"] == expected
|
||||
|
||||
|
||||
def test_collect_acumula_en_lista():
    """collect gathers the group's values into a list."""
    staff = [
        {"dept": dept, "name": name}
        for dept, name in (("eng", "Alice"), ("eng", "Bob"), ("sales", "Carol"))
    ]
    grouped = aggregate_by_group(staff, group_by=["dept"], aggs={"name": "collect"})
    eng_row = next(row for row in grouped if row["dept"] == "eng")
    assert sorted(eng_row["name"]) == ["Alice", "Bob"]
|
||||
|
||||
|
||||
def test_grupo_con_una_sola_fila():
    """A group made of a single row."""
    only_row = {"dept": "eng", "salary": 100}
    grouped = aggregate_by_group([only_row], group_by=["dept"], aggs={"salary": "sum"})
    assert len(grouped) == 1
    assert grouped[0]["salary"] == 100
|
||||
|
||||
|
||||
def test_campo_con_none_se_ignora_en_agregaciones_numericas():
    """None values are skipped by numeric aggregations."""
    payroll = [{"dept": "eng", "salary": salary} for salary in (100, None, 200)]

    summed = aggregate_by_group(payroll, group_by=["dept"], aggs={"salary": "sum"})
    assert summed[0]["salary"] == 300

    averaged = aggregate_by_group(payroll, group_by=["dept"], aggs={"salary": "mean"})
    assert averaged[0]["salary"] == 150.0
|
||||
@@ -0,0 +1,62 @@
|
||||
---
|
||||
name: build_entity_schema_prompt
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def build_entity_schema_prompt(entity_presets: list[dict]) -> str"
|
||||
description: "Genera la seccion del system prompt que describe los entity types disponibles para extraccion. Formatea los presets del registry en texto legible para el LLM."
|
||||
tags: [prompt, llm, entity, schema, osint, graph, extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "lista con varios presets"
|
||||
- "lista vacia retorna string vacio"
|
||||
- "preset sin metadata_fields"
|
||||
test_file_path: "python/functions/datascience/build_entity_schema_prompt_test.py"
|
||||
file_path: "python/functions/datascience/build_entity_schema_prompt.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from build_entity_schema_prompt import build_entity_schema_prompt
|
||||
|
||||
presets = [
|
||||
{
|
||||
"type_ref": "osint_person_go_cybersecurity",
|
||||
"label": "Person",
|
||||
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
|
||||
},
|
||||
{
|
||||
"type_ref": "osint_organization_go_cybersecurity",
|
||||
"label": "Organization",
|
||||
"metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"],
|
||||
},
|
||||
]
|
||||
|
||||
prompt = build_entity_schema_prompt(presets)
|
||||
# Entity types available for extraction:
|
||||
#
|
||||
# 1. Person (type_ref: osint_person_go_cybersecurity)
|
||||
# Attributes: full_name, alias, nationality, dob, risk_score
|
||||
#
|
||||
# 2. Organization (type_ref: osint_organization_go_cybersecurity)
|
||||
# Attributes: legal_name, country, sector, founded, risk_score
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No requiere dependencias externas.
|
||||
|
||||
El formato de salida es deliberadamente sencillo para maximizar la comprension por el LLM: numero de orden, label humano, type_ref del registry y lista de atributos en una sola linea.
|
||||
|
||||
Si un preset no tiene `metadata_fields` (o tiene lista vacia), se omite la linea de atributos.
|
||||
|
||||
Pensada para componer con `build_relation_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT.
|
||||
@@ -0,0 +1,43 @@
|
||||
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
|
||||
|
||||
|
||||
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the system-prompt section that lists the extractable entity types.

    Each preset becomes a numbered entry showing its label and type_ref,
    followed by an attributes line when the preset has metadata fields.
    Entries are separated by a blank line.

    Args:
        entity_presets: Presets with the keys 'label', 'type_ref' and,
            optionally, 'metadata_fields'. A missing label is rendered as
            "Unknown" and a missing type_ref as an empty string. Example:
                [{"type_ref": "osint_person_go_cybersecurity",
                  "label": "Person",
                  "metadata_fields": ["full_name", "alias"]}]

    Returns:
        The formatted prompt section, or an empty string when
        ``entity_presets`` is empty.
    """
    if not entity_presets:
        return ""

    entries = []
    for position, preset in enumerate(entity_presets, start=1):
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        entry_lines = [f"{position}. {label} (type_ref: {type_ref})"]

        fields = preset.get("metadata_fields", [])
        if fields:
            # The attributes line is omitted for presets without fields.
            entry_lines.append(" Attributes: " + ", ".join(fields))

        entries.append("\n".join(entry_lines))

    # Header, blank line, then entries separated by blank lines; no
    # trailing blank line.
    return "Entity types available for extraction:\n\n" + "\n\n".join(entries)
|
||||
@@ -0,0 +1,41 @@
|
||||
"""Tests para build_entity_schema_prompt."""
|
||||
|
||||
from build_entity_schema_prompt import build_entity_schema_prompt
|
||||
|
||||
|
||||
def test_lista_con_varios_presets():
    """Several presets are rendered with header, labels and attributes."""
    person = {
        "type_ref": "osint_person_go_cybersecurity",
        "label": "Person",
        "metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
    }
    organization = {
        "type_ref": "osint_organization_go_cybersecurity",
        "label": "Organization",
        "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"],
    }
    rendered = build_entity_schema_prompt([person, organization])

    expected_fragments = (
        "Entity types available for extraction:",
        "1. Person (type_ref: osint_person_go_cybersecurity)",
        " Attributes: full_name, alias, nationality, dob, risk_score",
        "2. Organization (type_ref: osint_organization_go_cybersecurity)",
        " Attributes: legal_name, country, sector, founded, risk_score",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
||||
|
||||
|
||||
def test_lista_vacia_retorna_string_vacio():
    """An empty preset list yields an empty string."""
    rendered = build_entity_schema_prompt([])
    assert rendered == ""
|
||||
|
||||
|
||||
def test_preset_sin_metadata_fields():
    """A preset without metadata_fields gets no attributes line."""
    bare_preset = {
        "type_ref": "osint_person_go_cybersecurity",
        "label": "Person",
    }
    rendered = build_entity_schema_prompt([bare_preset])
    assert "1. Person (type_ref: osint_person_go_cybersecurity)" in rendered
    assert "Attributes:" not in rendered
|
||||
@@ -0,0 +1,43 @@
|
||||
---
|
||||
name: build_relation_schema_prompt
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def build_relation_schema_prompt(relation_types: list[str]) -> str"
|
||||
description: "Genera la seccion del system prompt con los tipos de relacion permitidos para extraccion. Formatea la lista de tipos en texto legible para el LLM."
|
||||
tags: [prompt, llm, relation, schema, osint, graph, extraction]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "lista con varios tipos"
|
||||
- "lista vacia retorna string vacio"
|
||||
- "un solo tipo"
|
||||
test_file_path: "python/functions/datascience/build_relation_schema_prompt_test.py"
|
||||
file_path: "python/functions/datascience/build_relation_schema_prompt.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from build_relation_schema_prompt import build_relation_schema_prompt
|
||||
|
||||
types = ["funds", "employs", "communicates_with", "owns"]
|
||||
prompt = build_relation_schema_prompt(types)
|
||||
# Allowed relation types:
|
||||
# funds, employs, communicates_with, owns
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No requiere dependencias externas.
|
||||
|
||||
La salida es una sola linea con todos los tipos separados por coma, precedida por el encabezado. El formato es minimal para no consumir tokens innecesarios del contexto del LLM.
|
||||
|
||||
Pensada para componer con `build_entity_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT.
|
||||
@@ -0,0 +1,22 @@
|
||||
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
|
||||
|
||||
|
||||
def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the system-prompt section that lists the allowed relation types.

    The output is a header line followed by one line with all relation
    types separated by ", ".

    Args:
        relation_types: Allowed relation type names, for example
            ["funds", "employs", "communicates_with"].

    Returns:
        The formatted prompt section, or an empty string when
        ``relation_types`` is empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
|
||||
@@ -0,0 +1,19 @@
|
||||
"""Tests para build_relation_schema_prompt."""
|
||||
|
||||
from build_relation_schema_prompt import build_relation_schema_prompt
|
||||
|
||||
|
||||
def test_lista_normal():
    """A regular list is rendered with the header and every type name."""
    allowed = ["funds", "employs", "communicates_with", "owns", "operates"]
    rendered = build_relation_schema_prompt(allowed)
    assert rendered.startswith("Allowed relation types:")
    for relation in ("funds", "employs", "communicates_with", "owns", "operates"):
        assert relation in rendered
|
||||
|
||||
|
||||
def test_lista_vacia_retorna_string_vacio():
    """An empty relation list yields an empty string."""
    rendered = build_relation_schema_prompt([])
    assert rendered == ""
|
||||
@@ -121,3 +121,72 @@ def linspace(start: float, stop: float, num: int) -> list:
|
||||
return [start]
|
||||
step = (stop - start) / (num - 1)
|
||||
return [start + i * step for i in range(num)]
|
||||
|
||||
|
||||
def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict:
    """Estimate Hawkes-process parameters from the autocorrelation of arrivals.

    Computes the sample autocorrelation function (ACF) of ``arrivals`` and
    fits ``a * exp(-b * lag)`` to it with ``scipy.optimize.curve_fit``.

    Args:
        arrivals: Sequence of arrival counts (one value per time bucket).
        max_lag: Exclusive upper bound of the lags used; lags
            ``1 .. max_lag - 1`` are evaluated. Lags that the series is too
            short to support are dropped, so the ACF never contains NaN
            entries caused by empty slices.

    Returns:
        Dict with keys:
          * ``alpha``: absolute value of the fitted amplitude ``a``.
          * ``beta``: absolute value of the fitted decay rate ``b``.
          * ``branching_ratio``: ``alpha / beta`` (0.0 when beta is 0).
          * ``acf``: list of ACF values starting with 1.0 for lag 0.
        The fallback ``alpha=0.0, beta=1.0, branching_ratio=0.0`` is returned
        when no fit is attempted or possible: fewer than two observations,
        zero variance, fewer than two usable lags, lag-1 autocorrelation
        <= 0.01, or a fit that does not converge.
    """
    import numpy as np
    from scipy.optimize import curve_fit

    no_fit = {'alpha': 0.0, 'beta': 1.0, 'branching_ratio': 0.0}

    arr = np.array(arrivals, dtype=float)
    if arr.size < 2:
        # Mean/variance of an empty series are undefined and a single
        # observation carries no autocorrelation information.
        return {**no_fit, 'acf': [1.0]}

    mean_a = np.mean(arr)
    var_a = np.var(arr)
    if var_a == 0:
        return {**no_fit, 'acf': [1.0]}

    # A lag needs at least one overlapping pair, i.e. lag <= len(arr) - 1.
    lag_stop = min(max_lag, arr.size)
    acf = [1.0] + [
        float(np.mean((arr[lag:] - mean_a) * (arr[:-lag] - mean_a)) / var_a)
        for lag in range(1, lag_stop)
    ]

    lags = np.arange(1, lag_stop)
    acf_vals = np.array(acf[1:])

    # Two parameters need at least two data points; a non-positive or tiny
    # lag-1 autocorrelation is treated as "no self-excitation".
    if acf_vals.size < 2 or acf_vals[0] <= 0.01:
        return {**no_fit, 'acf': acf}

    def exp_decay(x, a, b):
        return a * np.exp(-b * x)

    try:
        popt, _ = curve_fit(exp_decay, lags, acf_vals, p0=[0.5, 0.5], maxfev=5000)
        alpha_est, beta_est = abs(popt[0]), abs(popt[1])
    except RuntimeError:
        # curve_fit signals non-convergence with RuntimeError.
        alpha_est, beta_est = 0.0, 1.0

    branching = alpha_est / beta_est if beta_est > 0 else 0.0
    return {
        'alpha': round(alpha_est, 4),
        'beta': round(beta_est, 4),
        'branching_ratio': round(branching, 4),
        'acf': acf,
    }
|
||||
|
||||
|
||||
def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict:
    """Estimate the Pareto tail exponent alpha by maximum likelihood.

    Uses ``alpha = n / sum(ln(x_i / x_min))`` over the tail ``x_i >= x_min``,
    where ``x_min`` is the ``x_min_percentile`` percentile of the positive
    values. A lower alpha means a heavier tail (more extreme values).

    Args:
        values: Observations; non-positive values are discarded.
        x_min_percentile: Percentile (0-100) of the positive values used as
            the tail threshold ``x_min``.

    Returns:
        Dict with keys ``alpha``, ``x_min`` and ``n_tail``. ``alpha`` is 0.0
        when the estimate is undefined: fewer than 10 positive values, fewer
        than 2 tail values, a non-positive ``x_min``, or a tail whose values
        all equal ``x_min`` (the log-sum is zero, which would otherwise
        divide by zero and yield an infinite alpha).
    """
    import numpy as np

    arr = np.array([v for v in values if v > 0], dtype=float)
    if len(arr) < 10:
        return {'alpha': 0.0, 'x_min': 0.0, 'n_tail': 0}

    x_min = float(np.percentile(arr, x_min_percentile))
    tail = arr[arr >= x_min]

    if len(tail) < 2 or x_min <= 0:
        return {'alpha': 0.0, 'x_min': x_min, 'n_tail': len(tail)}

    log_sum = float(np.sum(np.log(tail / x_min)))
    if log_sum <= 0:
        # Degenerate tail: every value equals x_min, so the MLE is undefined.
        return {'alpha': 0.0, 'x_min': x_min, 'n_tail': len(tail)}

    alpha = len(tail) / log_sum

    return {
        'alpha': round(alpha, 4),
        'x_min': round(x_min, 6),
        'n_tail': len(tail),
    }
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
---
|
||||
name: deduplicate_entities
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def deduplicate_entities(candidates: list[EntityCandidate], name_threshold: float = 0.85, same_type_only: bool = True) -> DeduplicationResult"
|
||||
description: "Agrupa entidades candidatas que refieren a la misma entidad real usando fuzzy matching de nombres (Levenshtein + Jaccard) y Union-Find para clusters transitivos. Retorna entidades mergeadas con mapas de resolucion de IDs y log de merges."
|
||||
tags: [deduplication, entity, fuzzy, levenshtein, jaccard, union-find, knowledge-graph, nlp, fuzzygraph, datascience]
|
||||
uses_functions:
|
||||
- normalize_entity_name_py_core
|
||||
- merge_entity_attributes_py_core
|
||||
uses_types:
|
||||
- entity_candidate_py_datascience
|
||||
- deduplication_result_py_datascience
|
||||
returns: [deduplication_result_py_datascience]
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports:
|
||||
- uuid
|
||||
tested: true
|
||||
tests:
|
||||
- "John Smith y Smith, John se mergean"
|
||||
- "Google y Google LLC se mergean"
|
||||
- "192.168.1.1 y 192.168.1.1 se mergean por matching exacto"
|
||||
- "John Smith (person) y John Smith (organization) NO se mergean"
|
||||
- "Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster"
|
||||
- "Entidades sin duplicados pasan sin modificacion"
|
||||
- "Confidence toma el max del cluster; atributos se fusionan"
|
||||
- "Lista vacia retorna resultado vacio"
|
||||
- "name_to_id contiene todos los nombres originales del cluster"
|
||||
test_file_path: "python/functions/datascience/deduplicate_entities_test.py"
|
||||
file_path: "python/functions/datascience/deduplicate_entities.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.functions.datascience.deduplicate_entities import deduplicate_entities
|
||||
|
||||
candidates = [
|
||||
EntityCandidate(name="John Smith", type_ref="person", confidence=0.9),
|
||||
EntityCandidate(name="Smith, John", type_ref="person", confidence=0.85),
|
||||
EntityCandidate(name="Google", type_ref="organization", confidence=0.95),
|
||||
EntityCandidate(name="Google LLC", type_ref="organization", confidence=0.88),
|
||||
]
|
||||
|
||||
result = deduplicate_entities(candidates, name_threshold=0.85, same_type_only=True)
|
||||
# result.total_before = 4
|
||||
# result.total_after = 2
|
||||
# result.merge_log = [
|
||||
# {"canonical": "John Smith", "merged": ["Smith, John"], "score": 0.91, "reason": "fuzzy_name"},
|
||||
# {"canonical": "Google", "merged": ["Google LLC"], "score": 0.89, "reason": "fuzzy_name"},
|
||||
# ]
|
||||
```
|
||||
|
||||
## Algoritmo
|
||||
|
||||
1. **Normalizar nombres** usando `normalize_entity_name()` sobre cada candidato segun su `type_ref`
|
||||
2. **Comparacion pairwise** dentro del mismo tipo (si `same_type_only=True`):
|
||||
- Para tipos tecnicos (ip, email, domain, crypto_wallet, phone): matching exacto normalizado
|
||||
- Para el resto: `score = max(levenshtein_sim, jaccard_sim)` + bonus por contencion (+0.3) y acronimos (+0.3)
|
||||
3. **Union-Find** para clusters transitivos: si A~B y B~C, entonces {A, B, C} forman un cluster
|
||||
4. **Merge por cluster:**
|
||||
- Nombre canonico: candidato con mayor `confidence`
|
||||
- Atributos: `merge_entity_attributes()` sobre todos los candidatos del cluster
|
||||
- Confidence: `max` del cluster
|
||||
- Source chunks: union de todos los candidatos
|
||||
- `merged_from`: union de todos los nombres originales
|
||||
|
||||
## Heuristicas de similitud de nombres
|
||||
|
||||
| Heuristica | Efecto |
|
||||
|---|---|
|
||||
| Levenshtein | `1 - (edit_distance / max_len)` |
|
||||
| Jaccard sobre tokens | `\|A ∩ B\| / \|A ∪ B\|` |
|
||||
| Score base | `max(lev_sim, jaccard_sim)` |
|
||||
| Contencion (a in b o b in a) | `+0.3` hasta max 1.0 |
|
||||
| Acronimo ("FBI" ~ "Federal Bureau of Investigation") | `+0.3` hasta max 1.0 |
|
||||
| Tipos exactos (ip/email/domain/crypto_wallet/phone) | solo matching exacto, ignora umbral |
|
||||
|
||||
## Complejidad
|
||||
|
||||
- Pairwise: O(N^2) — aceptable para <1000 entidades (tipico por documento)
|
||||
- Union-Find con path compression: O(α(N)) amortizado por operacion
|
||||
- Para escalar a >1000: pre-filtrar por primera letra o n-gram index antes de comparar
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura salvo por los IDs de entidad, que se generan con `uuid.uuid4()` y por tanto no son deterministas entre ejecuciones. Implementa Levenshtein y Jaccard internamente para evitar dependencias adicionales del registry. Las funciones del registry `levenshtein_distance_py_cybersecurity` y `jaccard_similarity_py_cybersecurity` son equivalentes pero requieren imports adicionales — la implementacion inline evita esas dependencias (de stdlib el modulo solo usa `uuid`, `sys` y `os`).
|
||||
|
||||
El `name_to_id` del resultado es el mapa de resolucion principal para la fase de deduplicacion de relaciones: permite resolver cualquier variante de nombre de una entidad a su ID canonico.
|
||||
@@ -0,0 +1,283 @@
|
||||
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.types.datascience.deduplication_result import DeduplicationResult
|
||||
from python.functions.core.normalize_entity_name import normalize_entity_name
|
||||
from python.functions.core.merge_entity_attributes import merge_entity_attributes
|
||||
|
||||
|
||||
# ── Similitud helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
|
||||
"""Distancia de edicion Levenshtein entre dos strings."""
|
||||
if a == b:
|
||||
return 0
|
||||
if not a:
|
||||
return len(b)
|
||||
if not b:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
|
||||
"""Similitud de Jaccard entre dos conjuntos de tokens."""
|
||||
set_a = set(tokens_a)
|
||||
set_b = set(tokens_b)
|
||||
if not set_a and not set_b:
|
||||
return 1.0
|
||||
inter = len(set_a & set_b)
|
||||
union = len(set_a | set_b)
|
||||
return inter / union if union else 0.0
|
||||
|
||||
|
||||
def _name_similarity(a: str, b: str) -> float:
    """Score how similar two normalized names are, in the range 0-1.

    The base score is the larger of the Levenshtein similarity
    (``1 - distance / max_len``) and the Jaccard similarity over
    whitespace-separated tokens. A bonus of 0.3 (capped at 1.0) is added
    when one name contains the other, and again when one name is the
    acronym of the other's tokens.
    """
    if a == b:
        return 1.0

    # Character-level similarity.
    longest = max(len(a), len(b))
    if longest:
        edit_similarity = 1.0 - (_levenshtein(a, b) / longest)
    else:
        edit_similarity = 1.0

    # Token-level similarity.
    words_a = a.split()
    words_b = b.split()
    token_similarity = _jaccard(words_a, words_b)

    score = max(edit_similarity, token_similarity)

    # Containment bonus: one name is a substring of the other.
    one_contains_other = a in b or b in a
    if one_contains_other:
        score = min(1.0, score + 0.3)

    # Acronym bonus, checked in both directions.
    if _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a):
        score = min(1.0, score + 0.3)

    return score
|
||||
|
||||
|
||||
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
|
||||
"""Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
|
||||
if not candidate or not tokens:
|
||||
return False
|
||||
initials = "".join(t[0] for t in tokens if t).upper()
|
||||
return candidate.upper() == initials
|
||||
|
||||
|
||||
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
|
||||
|
||||
|
||||
def _is_exact_type(entity_type: str) -> bool:
|
||||
"""Tipos tecnicos donde solo se acepta matching exacto."""
|
||||
return entity_type.lower() in _EXACT_TYPES
|
||||
|
||||
|
||||
# ── Union-Find ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class _UnionFind:
|
||||
def __init__(self, n: int) -> None:
|
||||
self._parent = list(range(n))
|
||||
self._rank = [0] * n
|
||||
|
||||
def find(self, x: int) -> int:
|
||||
while self._parent[x] != x:
|
||||
self._parent[x] = self._parent[self._parent[x]]
|
||||
x = self._parent[x]
|
||||
return x
|
||||
|
||||
def union(self, x: int, y: int) -> None:
|
||||
rx, ry = self.find(x), self.find(y)
|
||||
if rx == ry:
|
||||
return
|
||||
if self._rank[rx] < self._rank[ry]:
|
||||
rx, ry = ry, rx
|
||||
self._parent[ry] = rx
|
||||
if self._rank[rx] == self._rank[ry]:
|
||||
self._rank[rx] += 1
|
||||
|
||||
|
||||
# ── Implementacion principal ────────────────────────────────────────────────────
|
||||
|
||||
def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Every candidate name is normalized with ``normalize_entity_name``, all
    pairs are compared, and matching pairs are joined in a Union-Find so
    that matches are transitive (A~B and B~C put A, B and C in one cluster).
    One merged ``EntityCandidate`` is produced per cluster.

    For technical types (ip, email, domain, crypto_wallet, phone) only exact
    equality of the normalized names counts as a match; ``name_threshold``
    is ignored for them.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: Minimum similarity score (0-1) for two names to be
            considered the same entity.
        same_type_only: If True, only candidates with the same ``type_ref``
            are compared.

    Returns:
        DeduplicationResult with:
          * ``entities``: one merged entity per cluster.
          * ``entity_id_map``: canonical normalized name -> generated id.
          * ``name_to_id``: every original name of the cluster, plus the
            canonical normalized name -> generated id.
          * ``merge_log``: one entry per cluster with more than one member.
          * ``total_before`` / ``total_after``: candidate and entity counts.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize names (index-aligned with ``candidates``).
    normalized: list[str] = []
    for c in candidates:
        norm = normalize_entity_name(c.name, c.type_ref)
        normalized.append(norm)

    # Step 2: Union-Find over all candidate indices.
    uf = _UnionFind(n)

    # Step 3: pairwise comparison (restricted to equal type_ref when
    # same_type_only is set). Each matching pair is recorded with its score.
    merge_pairs: list[tuple[int, int, float]] = []

    for i in range(n):
        for j in range(i + 1, n):
            if same_type_only and candidates[i].type_ref != candidates[j].type_ref:
                continue

            ni, nj = normalized[i], normalized[j]
            # NOTE(review): only candidate i's type decides exact-vs-fuzzy
            # matching; with same_type_only=False the two types may differ.
            et = candidates[i].type_ref.lower()

            if _is_exact_type(et):
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                # Exact types never fall through to fuzzy matching.
                continue

            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 4: group candidate indices by their Union-Find root.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        root = uf.find(i)
        clusters.setdefault(root, []).append(i)

    # Step 5: merge each cluster into one entity.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    # Matched pairs grouped by final root, used to build the merge log.
    # Roots are looked up after all unions are done, so they are final.
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        root = uf.find(i)
        merged_pairs_by_root.setdefault(root, []).append((i, j, score))

    for root, indices in clusters.items():
        cluster_candidates = [candidates[idx] for idx in indices]

        if len(cluster_candidates) == 1:
            # Singleton: pass through; no merge-log entry is written.
            c = cluster_candidates[0]
            canonical_name = c.name
            canonical_norm = normalized[indices[0]]
            # The attributes object is reused as-is (not copied).
            merged_attrs = c.attributes
            merged_confidence = c.confidence
            merged_chunks = list(c.source_chunk_indices)
            merged_from = list(c.merged_from) if c.merged_from else [c.name]
        else:
            # The candidate with the highest confidence provides the
            # canonical name.
            best = max(cluster_candidates, key=lambda c: c.confidence)
            canonical_name = best.name
            canonical_norm = normalize_entity_name(best.name, best.type_ref)

            merged_attrs = merge_entity_attributes(
                [c.attributes for c in cluster_candidates]
            )
            merged_confidence = max(c.confidence for c in cluster_candidates)

            # Union of source chunk indices, preserving first-seen order.
            merged_chunks: list[int] = []
            seen_chunks: set[int] = set()
            for c in cluster_candidates:
                for idx in c.source_chunk_indices:
                    if idx not in seen_chunks:
                        merged_chunks.append(idx)
                        seen_chunks.add(idx)

            # Union of original names, preserving first-seen order. A
            # candidate without merged_from contributes its own name.
            merged_from: list[str] = []
            seen_names: set[str] = set()
            for c in cluster_candidates:
                names_to_add = c.merged_from if c.merged_from else [c.name]
                for nm in names_to_add:
                    if nm not in seen_names:
                        merged_from.append(nm)
                        seen_names.add(nm)

            # Merge log entry: the score is the highest pair score in the
            # cluster. The reason is always "fuzzy_name", including clusters
            # formed by exact-type matches.
            other_names = [c.name for c in cluster_candidates if c is not best]
            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": other_names,
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        # A fresh random id per entity: ids differ between runs.
        ent_id = str(uuid.uuid4())
        # type_ref / type_label are taken from the first candidate of the
        # cluster, not necessarily the one that provides the canonical name.
        entity = EntityCandidate(
            name=canonical_name,
            name_normalized=canonical_norm,
            type_ref=cluster_candidates[0].type_ref,
            type_label=cluster_candidates[0].type_label,
            attributes=merged_attrs,
            confidence=merged_confidence,
            source_chunk_indices=merged_chunks,
            merged_from=merged_from,
        )
        merged_entities.append(entity)

        # Populate the resolution maps. Keys are plain names, so a later
        # cluster with the same name overwrites an earlier entry.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
|
||||
@@ -0,0 +1,113 @@
|
||||
"""Tests para deduplicate_entities."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.functions.datascience.deduplicate_entities import deduplicate_entities
|
||||
|
||||
|
||||
def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
    """Build an ``EntityCandidate`` fixture for the tests in this module.

    The type label is the capitalized ``type_ref``, every extra keyword
    argument is stored in ``attributes``, and the candidate is attributed to
    chunk index 0.
    """
    candidate_fields = {
        "name": name,
        "type_ref": type_ref,
        "type_label": type_ref.capitalize(),
        "attributes": attrs,
        "confidence": confidence,
        "source_chunk_indices": [0],
    }
    return EntityCandidate(**candidate_fields)
|
||||
|
||||
|
||||
def test_john_smith_y_smith_john_merge():
    """Merge 'John Smith' and 'Smith, John' into a single entity."""
    candidates = [
        _make(name, type_ref="person")
        for name in ("John Smith", "Smith, John")
    ]
    outcome = deduplicate_entities(candidates)
    assert outcome.total_before == 2
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
    assert len(outcome.merge_log) == 1
|
||||
|
||||
|
||||
def test_google_y_google_llc_merge():
    """Merge 'Google' and 'Google LLC' into a single organization."""
    candidates = [
        _make(name, type_ref="organization")
        for name in ("Google", "Google LLC")
    ]
    outcome = deduplicate_entities(candidates)
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
|
||||
|
||||
|
||||
def test_ip_matching_exacto():
    """Two candidates both named 192.168.1.1 merge into one entity."""
    duplicates = [
        _make("192.168.1.1", type_ref="ip", confidence=score)
        for score in (0.8, 0.9)
    ]
    outcome = deduplicate_entities(duplicates)
    assert outcome.total_after == 1
|
||||
|
||||
|
||||
def test_same_name_different_type_no_merge():
    """'John Smith' as person and as organization must stay separate."""
    candidates = [
        _make("John Smith", type_ref=kind)
        for kind in ("person", "organization")
    ]
    outcome = deduplicate_entities(candidates, same_type_only=True)
    assert outcome.total_after == 2
|
||||
|
||||
|
||||
def test_clusters_transitivos():
    """Transitive clusters: A~B and B~C place {A, B, C} in one cluster."""
    # Each name drops one trailing character from the previous one, so
    # neighbours in the chain are very similar.
    chain = ("Alice Johnson", "Alice Johnso", "Alice Johns")
    candidates = [_make(name, type_ref="person") for name in chain]
    outcome = deduplicate_entities(candidates, name_threshold=0.80)
    assert outcome.total_after == 1
|
||||
|
||||
|
||||
def test_sin_duplicados_sin_cambios():
    """Entities without duplicates pass through and nothing is logged as merged."""
    distinct_names = ("Alice Smith", "Bob Jones", "Charlie Brown")
    candidates = [_make(name, type_ref="person") for name in distinct_names]
    outcome = deduplicate_entities(candidates)
    assert outcome.total_before == 3
    assert outcome.total_after == 3
    assert len(outcome.merge_log) == 0
|
||||
|
||||
|
||||
def test_confidence_y_atributos_merge_correctos():
    """The merged entity keeps the cluster's max confidence and both attribute sets."""
    low = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
    high = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")
    outcome = deduplicate_entities([low, high])
    assert outcome.total_after == 1
    merged = outcome.entities[0]
    # confidence = max(0.7, 0.95)
    assert merged.confidence == 0.95
    # attributes contributed by either candidate are present after the merge
    for attribute in ("role", "company"):
        assert attribute in merged.attributes
|
||||
|
||||
|
||||
def test_lista_vacia():
    """An empty input list yields an empty result."""
    outcome = deduplicate_entities([])
    assert outcome.total_before == 0
    assert outcome.total_after == 0
    assert outcome.entities == []
    assert outcome.merge_log == []
|
||||
|
||||
|
||||
def test_name_to_id_resolucion():
    """name_to_id maps every original name of a merged cluster to the cluster's ID.

    The previous assertions (``any(...)`` and ``len(...) >= 2``) also passed
    when one of the original names was missing or pointed at a different ID,
    so they did not verify what this test's name and docstring promise.
    """
    a = _make("John Smith", type_ref="person")
    b = _make("Smith, John", type_ref="person")
    result = deduplicate_entities([a, b])

    # The two candidates merge, so exactly one entity ID exists.
    ids = list(result.entity_id_map.values())
    assert len(ids) == 1
    ent_id = ids[0]

    # Each original (un-normalized) name must resolve to that single ID.
    for original_name in ("John Smith", "Smith, John"):
        assert result.name_to_id.get(original_name) == ent_id

    # No entry in name_to_id may point anywhere else.
    assert len(result.name_to_id) >= 2
    assert set(result.name_to_id.values()) == {ent_id}
|
||||
@@ -0,0 +1,81 @@
|
||||
---
|
||||
name: deduplicate_relations
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def deduplicate_relations(relations: list[RelationCandidate], entity_id_map: dict[str, str]) -> list[RelationCandidate]"
|
||||
description: "Deduplica relaciones candidatas resolviendo from_name/to_name a entity IDs finales via entity_id_map. Descarta self-loops y relaciones sin match. Mergea duplicados (mismo from_id, to_id, relation_type) concatenando descripciones unicas y tomando max confidence."
|
||||
tags: [datascience, extraction, knowledge-graph, nlp, deduplication, fuzzy-match, fuzzygraph]
|
||||
uses_functions:
|
||||
- levenshtein_distance_py_cybersecurity
|
||||
uses_types:
|
||||
- relation_candidate_py_datascience
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [logging, os, sys]
|
||||
tested: true
|
||||
tests:
|
||||
- "dos relaciones identicas se colapsan en una"
|
||||
- "relacion con nombre mergeado se resuelve al id correcto"
|
||||
- "self loop se descarta"
|
||||
- "nombre no mapeado sin fuzzy match se descarta"
|
||||
- "relaciones distintas se mantienen"
|
||||
- "merge descripcion concatena unicas"
|
||||
- "lista vacia retorna lista vacia"
|
||||
- "fuzzy match resuelve nombre cercano"
|
||||
test_file_path: "python/functions/datascience/deduplicate_relations_test.py"
|
||||
file_path: "python/functions/datascience/deduplicate_relations.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from python.types.datascience.relation_candidate import RelationCandidate
|
||||
from python.functions.datascience.deduplicate_relations import deduplicate_relations
|
||||
|
||||
# entity_id_map producido por deduplicate_entities
|
||||
entity_id_map = {
|
||||
"john smith": "entity_001",
|
||||
"smith, john": "entity_001", # alias mergeado
|
||||
"acme corp": "entity_002",
|
||||
}
|
||||
|
||||
relations = [
|
||||
RelationCandidate(from_name="John Smith", to_name="Acme Corp",
|
||||
relation_type="works_at", description="John es CEO",
|
||||
confidence=0.9, source_chunk_index=0),
|
||||
RelationCandidate(from_name="Smith, John", to_name="Acme Corp",
|
||||
relation_type="works_at", description="CEO de Acme",
|
||||
confidence=0.7, source_chunk_index=2),
|
||||
]
|
||||
|
||||
result = deduplicate_relations(relations, entity_id_map)
|
||||
# → 1 RelationCandidate con from_id="entity_001", to_id="entity_002",
|
||||
# confidence=0.9, description="John es CEO; CEO de Acme"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
La funcion no hace I/O de datos ni muta sus argumentos; sus unicos efectos secundarios son la insercion de rutas en `sys.path` (para resolver imports) y el logging, que es
|
||||
de nivel DEBUG/WARNING — en produccion configurar el logger de la aplicacion.
|
||||
|
||||
**Resolucion de nombres:**
|
||||
- Lookup exacto primero (lowercase strip del nombre contra las claves del mapa).
|
||||
- Si no hay match exacto, fuzzy match con Levenshtein (threshold=3 ediciones).
|
||||
- Si sigue sin match, la relacion se descarta con `logger.warning`.
|
||||
|
||||
**Self-loops:** relaciones donde `from_id == to_id` siempre se descartan.
|
||||
|
||||
**Merge:** cuando varias relaciones comparten `(from_id, to_id, relation_type)`:
|
||||
- `confidence`: max del grupo.
|
||||
- `description`: union de descripciones unicas (no duplicadas), separadas por `'; '`.
|
||||
- `from_name` / `to_name` / `source_chunk_index`: del primer candidato del grupo.
|
||||
|
||||
**Integracion con fuzzygraph:**
|
||||
Esta funcion es el paso 4 del pipeline de extraccion. Recibe el output de
|
||||
`extract_relations_llm` (relaciones crudas con nombres de texto) y el
|
||||
`entity_id_map` producido por `deduplicate_entities`. Produce la lista final
|
||||
de relaciones para `ExtractionResult`.
|
||||
@@ -0,0 +1,189 @@
|
||||
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Importar levenshtein_distance desde cybersecurity ---
|
||||
# Soporta dos contextos:
|
||||
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
|
||||
# 2. Ejecutado desde la raiz del registry (fn run)
|
||||
def _levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between ``a`` and ``b``.

    Uses the two-row dynamic-programming formulation: only the previous and
    the current row of the edit matrix are kept, and the shorter string is
    laid out along the row.
    """
    # Put the longer string on the outer loop (ties keep the given order).
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    if not shorter:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        current = [row]
        for col, ch_short in enumerate(shorter, start=1):
            insertion = current[col - 1] + 1
            deletion = previous[col] + 1
            substitution = previous[col - 1] + (0 if ch_long == ch_short else 1)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
|
||||
|
||||
|
||||
try:
    # Put the sibling ``cybersecurity`` directory on sys.path so the shared
    # Levenshtein implementation can be imported when it is available.
    _here = os.path.dirname(os.path.abspath(__file__))
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    from cybersecurity import levenshtein_distance as _lev
except ImportError:
    # The shared module cannot be imported in this context; the local
    # fallback is selected below.
    _lev = None  # type: ignore

# Name used by the rest of this module: the imported implementation when the
# import succeeded, otherwise the local ``_levenshtein_distance``.
levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
|
||||
|
||||
|
||||
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
    """Resolve ``name`` to an entity ID through the closest key of the map.

    Every key of ``entity_id_map`` is compared with ``name`` by Levenshtein
    distance and the ID of the closest key is returned when that distance is
    at most ``threshold``. On ties the first key in iteration order wins.

    Two kinds of spurious matches that an absolute threshold would otherwise
    accept are rejected:

    * an empty ``name`` (it is within ``threshold`` edits of every key of
      length <= ``threshold``);
    * a key whose distance equals the length of the longer string, i.e. the
      cheapest edit script keeps no character at all (for example ``"ibm"``
      versus ``"hp"``).

    Args:
        name: Name to resolve; callers pass it lowercased and stripped.
        entity_id_map: Mapping from normalized name to entity ID.
        threshold: Maximum edit distance accepted as a match (default 3).

    Returns:
        The entity ID of the best match, or ``''`` when nothing matches.
    """
    if not name:
        return ""

    name_len = len(name)
    best_id = ""
    best_dist = threshold + 1
    for key, entity_id in entity_id_map.items():
        # The edit distance is never smaller than the length difference, so a
        # key that cannot beat the current best is skipped before paying for
        # the quadratic distance computation.
        if abs(len(key) - name_len) >= best_dist:
            continue
        dist = levenshtein_distance(name, key)
        # Distance == longer length means no character survives the edit:
        # the strings have no usable overlap, so this is not a match.
        if dist >= max(name_len, len(key)):
            continue
        if dist < best_dist:
            best_dist = dist
            best_id = entity_id
    return best_id if best_dist <= threshold else ""
|
||||
|
||||
|
||||
def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Resolve relation endpoints to entity IDs and collapse duplicate relations.

    Steps:
        1. Each endpoint name is lowercased and stripped, then looked up in
           ``entity_id_map``. When the exact lookup yields nothing,
           ``_fuzzy_resolve`` is tried. A relation with an unresolvable
           endpoint is dropped with a warning.
        2. Relations whose two endpoints resolve to the same ID (self-loops)
           are dropped with a debug message.
        3. The survivors are grouped by ``(from_id, to_id, relation_type)``.
           A group with several members becomes a single relation whose
           confidence is the group maximum and whose description joins the
           distinct non-empty descriptions with ``'; '`` in first-seen order.
           Names and ``source_chunk_index`` come from the group's first member.

    Args:
        relations: RelationCandidate objects carrying the original
            ``from_name`` / ``to_name``.
        entity_id_map: Mapping from name to entity ID. Names are looked up
            lowercased and stripped, so keys are expected in that form.

    Returns:
        New RelationCandidate objects with ``from_id`` and ``to_id`` filled
        in, ordered by the first appearance of each group.

    NOTE(review): an alias merged upstream resolves exactly only if it is a
    key of ``entity_id_map``; otherwise it depends on the fuzzy fallback.
    Confirm that the caller's map contains the aliases it needs.
    """
    # Import the type lazily: works from datascience/ and from the registry root.
    try:
        types_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..", "..", "..", "python", "types", "datascience",
        )
        if types_dir not in sys.path:
            sys.path.insert(0, types_dir)
        from relation_candidate import RelationCandidate
    except ImportError:
        from python.types.datascience.relation_candidate import RelationCandidate  # type: ignore

    def _endpoint_id(raw_name: str) -> str:
        """Return the entity ID for an endpoint name, or a falsy value if unresolved."""
        lookup_key = raw_name.lower().strip()
        return entity_id_map.get(lookup_key, "") or _fuzzy_resolve(lookup_key, entity_id_map)

    # --- Phase 1: resolve endpoints and drop unusable relations ---
    with_ids: list = []
    for candidate in relations:
        source_id = _endpoint_id(candidate.from_name)
        if not source_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                candidate.from_name,
            )
            continue

        target_id = _endpoint_id(candidate.to_name)
        if not target_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                candidate.to_name,
            )
            continue

        if source_id == target_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                candidate.from_name,
                candidate.to_name,
                candidate.relation_type,
            )
            continue

        with_ids.append(
            RelationCandidate(
                from_name=candidate.from_name,
                to_name=candidate.to_name,
                from_id=source_id,
                to_id=target_id,
                relation_type=candidate.relation_type,
                description=candidate.description,
                confidence=candidate.confidence,
                source_chunk_index=candidate.source_chunk_index,
            )
        )

    # --- Phase 2: bucket by (from_id, to_id, relation_type) ---
    buckets: dict[tuple, list] = {}
    for candidate in with_ids:
        bucket_key = (candidate.from_id, candidate.to_id, candidate.relation_type)
        buckets.setdefault(bucket_key, []).append(candidate)

    # --- Phase 3: emit one relation per bucket ---
    deduplicated: list = []
    for (source_id, target_id, kind), members in buckets.items():
        head = members[0]
        if len(members) == 1:
            deduplicated.append(head)
            continue

        # dict.fromkeys keeps the first occurrence of each description, in order.
        distinct_descriptions = list(
            dict.fromkeys(m.description for m in members if m.description)
        )
        deduplicated.append(
            RelationCandidate(
                from_name=head.from_name,
                to_name=head.to_name,
                from_id=source_id,
                to_id=target_id,
                relation_type=kind,
                description="; ".join(distinct_descriptions),
                confidence=max(m.confidence for m in members),
                source_chunk_index=head.source_chunk_index,
            )
        )

    return deduplicated
|
||||
@@ -0,0 +1,120 @@
|
||||
"""Tests para deduplicate_relations."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Permitir importar RelationCandidate desde python/types/datascience/
|
||||
_here = os.path.dirname(os.path.abspath(__file__))
|
||||
_types_path = os.path.join(_here, "..", "..", "..", "python", "types", "datascience")
|
||||
if _types_path not in sys.path:
|
||||
sys.path.insert(0, _types_path)
|
||||
|
||||
from relation_candidate import RelationCandidate
|
||||
from deduplicate_relations import deduplicate_relations
|
||||
|
||||
|
||||
def _make_rel(
    from_name: str,
    to_name: str,
    relation_type: str = "works_at",
    description: str = "",
    confidence: float = 0.8,
    source_chunk_index: int = 0,
) -> RelationCandidate:
    """Build a ``RelationCandidate`` fixture; only the two endpoint names are required."""
    relation_fields = {
        "from_name": from_name,
        "to_name": to_name,
        "relation_type": relation_type,
        "description": description,
        "confidence": confidence,
        "source_chunk_index": source_chunk_index,
    }
    return RelationCandidate(**relation_fields)
|
||||
|
||||
|
||||
# Typical entity_id_map shared by the tests below: keys are lowercase,
# normalized names and values are entity IDs.
_ENTITY_MAP: dict[str, str] = {
    "john smith": "entity_001",
    "acme corp": "entity_002",
    "jane doe": "entity_003",
    "google": "entity_004",
}
|
||||
|
||||
|
||||
def test_dos_relaciones_identicas_se_colapsan_en_una():
    """Two relations with the same (from, to, type) collapse into one."""
    duplicated = [
        _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=score)
        for score in (0.9, 0.7)
    ]
    merged = deduplicate_relations(duplicated, _ENTITY_MAP)
    assert len(merged) == 1
    only = merged[0]
    assert only.from_id == "entity_001"
    assert only.to_id == "entity_002"
    assert only.confidence == 0.9  # group maximum
|
||||
|
||||
|
||||
def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto():
    """A relation that uses a merged alias resolves to the alias's entity ID."""
    # "smith, john" is registered as an alias of entity_001
    alias_map = dict(_ENTITY_MAP)
    alias_map["smith, john"] = "entity_001"
    resolved = deduplicate_relations([_make_rel("Smith, John", "Acme Corp")], alias_map)
    assert len(resolved) == 1
    relation = resolved[0]
    assert relation.from_id == "entity_001"
    assert relation.to_id == "entity_002"
|
||||
|
||||
|
||||
def test_self_loop_se_descarta():
    """A relation whose endpoints resolve to the same ID is discarded."""
    loop = _make_rel("John Smith", "John Smith", relation_type="knows")
    remaining = deduplicate_relations([loop], _ENTITY_MAP)
    assert len(remaining) == 0
|
||||
|
||||
|
||||
def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta():
    """A relation with an unmapped name and no fuzzy match is discarded."""
    orphan = _make_rel("Unknown Entity XYZ", "Acme Corp")
    remaining = deduplicate_relations([orphan], _ENTITY_MAP)
    assert len(remaining) == 0
|
||||
|
||||
|
||||
def test_relaciones_distintas_se_mantienen():
    """Relations that differ in (from, to, type) are all kept."""
    triples = (
        ("John Smith", "Acme Corp", "works_at"),
        ("Jane Doe", "Acme Corp", "works_at"),
        ("John Smith", "Google", "invested_in"),
    )
    rels = [
        _make_rel(source, target, relation_type=kind)
        for source, target, kind in triples
    ]
    kept = deduplicate_relations(rels, _ENTITY_MAP)
    assert len(kept) == 3
|
||||
|
||||
|
||||
def test_merge_descripcion_concatena_unicas():
    """Merging keeps each distinct description once and the highest confidence."""
    samples = (
        ("John es CEO", 0.9),
        ("Acme fue fundada por John", 0.7),
        ("John es CEO", 0.6),
    )
    rels = [
        _make_rel("John Smith", "Acme Corp", description=text, confidence=score)
        for text, score in samples
    ]
    merged = deduplicate_relations(rels, _ENTITY_MAP)
    assert len(merged) == 1
    combined = merged[0].description
    assert "John es CEO" in combined
    assert "Acme fue fundada por John" in combined
    # The repeated description ("John es CEO") must not appear twice
    assert combined.count("John es CEO") == 1
    assert merged[0].confidence == 0.9
|
||||
|
||||
|
||||
def test_lista_vacia_retorna_lista_vacia():
    """An empty list of relations produces an empty list."""
    no_relations: list = []
    assert deduplicate_relations(no_relations, _ENTITY_MAP) == []
|
||||
|
||||
|
||||
def test_fuzzy_match_resuelve_nombre_cercano():
    """A name with a small typo is resolved by the fuzzy fallback."""
    # "john smit" is one edit away from "john smith"
    typo = _make_rel("John Smit", "Acme Corp")
    resolved = deduplicate_relations([typo], _ENTITY_MAP)
    assert len(resolved) == 1
    assert resolved[0].from_id == "entity_001"
|
||||
@@ -0,0 +1,56 @@
|
||||
---
|
||||
name: detect_drift
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def detect_drift(history: list[dict], current: dict, fields: list[str], threshold: float = 2.0) -> list[dict]"
|
||||
description: "Detecta drift estadistico comparando metricas de la ejecucion actual contra el historial usando z-score. Si |z| > threshold, el campo ha drifteado. Util para monitorizar executions en operations.db."
|
||||
tags: [drift, statistics, z-score, monitoring, executions, operations, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [math]
|
||||
tested: true
|
||||
tests:
|
||||
- "campo con drift claro (z > threshold)"
|
||||
- "campo estable (z < threshold)"
|
||||
- "historial con un solo punto → std=0, no puede calcular → drifted=False con nota"
|
||||
- "historial vacio → todos drifted=False"
|
||||
- "threshold custom"
|
||||
test_file_path: "python/functions/datascience/detect_drift_test.py"
|
||||
file_path: "python/functions/datascience/detect_drift.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
history = [
|
||||
{"records_out": 100, "duration_ms": 500},
|
||||
{"records_out": 105, "duration_ms": 480},
|
||||
{"records_out": 98, "duration_ms": 510},
|
||||
]
|
||||
current = {"records_out": 50, "duration_ms": 2000}
|
||||
|
||||
results = detect_drift(history, current, ["records_out", "duration_ms"])
|
||||
# [
|
||||
# {"field": "records_out", "current": 50.0, "mean": 101.0, "std": 2.9, "z_score": -17.3, "drifted": True},
|
||||
# {"field": "duration_ms", "current": 2000.0, "mean": 496.7, "std": 12.5, "z_score": 120.5, "drifted": True},
|
||||
# ]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Solo stdlib (`math`).
|
||||
|
||||
El z-score usa desviacion estandar poblacional (dividir por N, no N-1) para ser consistente con historial de cualquier tamanio.
|
||||
|
||||
Casos especiales:
|
||||
- **Historial vacio**: z_score=0.0, drifted=False para todos los campos.
|
||||
- **Un solo punto en historial**: std=0.0, z_score=0.0, drifted=False. No hay suficiente historia para calcular variabilidad.
|
||||
- **Std=0 con N>=2**: todos los valores historicos identicos. z_score=0.0, drifted=False (cualquier desviacion seria tecnicamente infinita, pero se asume que el sistema es muy estable).
|
||||
|
||||
Pensado para el paso ANALIZAR del bucle reactivo: comparar `metrics` de la ejecucion actual con executions historicas de `operations.db`.
|
||||
@@ -0,0 +1,86 @@
|
||||
"""detect_drift — detecta drift estadistico por z-score comparando metricas contra historial."""
|
||||
|
||||
import math
|
||||
|
||||
|
||||
def detect_drift(
    history: list[dict],
    current: dict,
    fields: list[str],
    threshold: float = 2.0,
) -> list[dict]:
    """Flag metrics of the current run that drifted away from their history.

    For every requested field the historical values are summarized by their
    mean and population standard deviation (variance divided by N, not N-1)
    and the current value is scored as ``z = (current - mean) / std``. The
    field is reported as drifted when ``|z| > threshold``.

    Args:
        history: Past metric dicts. An entry that lacks the field, or holds
            ``None`` for it, is ignored for that field.
        current: Metrics of the current run. A field that is missing or
            ``None`` is read as ``0.0`` (``None`` previously raised
            ``TypeError`` even though it was tolerated in ``history``).
        fields: Names of the numeric fields to analyze.
        threshold: Absolute z-score above which a field counts as drifted.

    Returns:
        One dict per field, in the order of ``fields``, with the keys
        ``field``, ``current``, ``mean``, ``std``, ``z_score`` and
        ``drifted``. With fewer than two usable historical values, or when
        all of them are identical (``std == 0``), ``z_score`` is ``0.0`` and
        ``drifted`` is ``False`` because no spread can be estimated; with no
        usable values at all ``mean`` is ``0.0`` as well.
    """
    results = []

    for field in fields:
        values = [
            float(h[field])
            for h in history
            if field in h and h[field] is not None
        ]

        raw_current = current.get(field)
        current_val = 0.0 if raw_current is None else float(raw_current)

        n = len(values)
        mean = sum(values) / n if n else 0.0

        # Defaults cover every "not enough information" case: empty history,
        # a single point, or a history without any spread.
        std = 0.0
        z_score = 0.0
        drifted = False

        if n >= 2:
            variance = sum((v - mean) ** 2 for v in values) / n
            std = math.sqrt(variance)
            if std != 0.0:
                z_score = (current_val - mean) / std
                drifted = abs(z_score) > threshold

        results.append({
            "field": field,
            "current": current_val,
            "mean": mean,
            "std": std,
            "z_score": z_score,
            "drifted": drifted,
        })

    return results
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Tests para detect_drift."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import math
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from detect_drift import detect_drift
|
||||
|
||||
|
||||
def test_campo_con_drift_claro_z_mayor_threshold():
    """A value far below a tight history is reported as drifted with z < -2."""
    history = [{"records_out": count} for count in (100, 105, 98)]
    report = detect_drift(history, {"records_out": 50}, ["records_out"])
    assert len(report) == 1
    entry = report[0]
    assert entry["field"] == "records_out"
    assert entry["current"] == 50.0
    assert entry["drifted"] is True
    assert entry["z_score"] < -2.0  # far away from the mean
|
||||
|
||||
|
||||
def test_campo_estable_z_menor_threshold():
    """A current value inside the historical spread is not flagged."""
    history = [{"val": sample} for sample in (100.0, 102.0, 98.0, 101.0)]
    current = {"val": 100.5}  # within the normal range
    report = detect_drift(history, current, ["val"])
    assert len(report) == 1
    entry = report[0]
    assert entry["drifted"] is False
    assert abs(entry["z_score"]) < 2.0
|
||||
|
||||
|
||||
def test_historial_con_un_solo_punto_std_0_drifted_False_con_nota():
    """A single historical point gives std 0, z-score 0 and no drift."""
    report = detect_drift([{"val": 100.0}], {"val": 999.0}, ["val"])
    assert len(report) == 1
    entry = report[0]
    assert entry["std"] == 0.0
    assert entry["z_score"] == 0.0
    assert entry["drifted"] is False
    assert entry["mean"] == 100.0
|
||||
|
||||
|
||||
def test_historial_vacio_todos_drifted_False():
    """With an empty history no field is flagged and all statistics are zero."""
    metric_names = ["records_out", "duration_ms"]
    current = {"records_out": 50, "duration_ms": 2000}
    report = detect_drift([], current, metric_names)
    assert len(report) == 2
    for entry in report:
        assert entry["drifted"] is False
        assert entry["z_score"] == 0.0
        assert entry["mean"] == 0.0
|
||||
|
||||
|
||||
def test_threshold_custom():
    """The same z-score (~2.12) drifts at threshold 2.0 but not at 3.0."""
    # mean = 100, std ~ 7.07
    history = [{"val": sample} for sample in (100.0, 100.0, 110.0, 90.0)]
    current = {"val": 115.0}  # z ~ 2.12

    # threshold 2.0 -> drifted; threshold 3.0 -> not drifted
    expected_by_threshold = {2.0: True, 3.0: False}
    for limit, expected in expected_by_threshold.items():
        report = detect_drift(history, current, ["val"], threshold=limit)
        assert report[0]["drifted"] is expected
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every test in definition order when the file is executed directly.
    for check in (
        test_campo_con_drift_claro_z_mayor_threshold,
        test_campo_estable_z_menor_threshold,
        test_historial_con_un_solo_punto_std_0_drifted_False_con_nota,
        test_historial_vacio_todos_drifted_False,
        test_threshold_custom,
    ):
        check()
    print("All tests passed.")
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: diff_entities
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def diff_entities(before: list[dict], after: list[dict], key: str = 'id', ignore_fields: list[str] | None = None, compare_fields: list[str] | None = None) -> dict"
|
||||
description: "Compara dos snapshots de entities y devuelve diferencias campo a campo. Detecta añadidas, eliminadas, modificadas e inalteradas. Ignora created_at y updated_at por defecto."
|
||||
tags: [diff, entities, snapshot, operations, comparison, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "entity añadida"
|
||||
- "entity eliminada"
|
||||
- "entity modificada con detalle de campos"
|
||||
- "entities identicas → unchanged"
|
||||
- "ignore_fields funciona"
|
||||
- "compare_fields filtra correctamente"
|
||||
- "lista vacia vs lista con datos"
|
||||
test_file_path: "python/functions/datascience/diff_entities_test.py"
|
||||
file_path: "python/functions/datascience/diff_entities.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
before = [
|
||||
{"id": "1", "name": "Alice", "status": "active", "updated_at": "2024-01-01"},
|
||||
{"id": "2", "name": "Bob", "status": "active", "updated_at": "2024-01-01"},
|
||||
]
|
||||
after = [
|
||||
{"id": "1", "name": "Alice", "status": "inactive", "updated_at": "2024-01-02"},
|
||||
{"id": "3", "name": "Carol", "status": "active", "updated_at": "2024-01-02"},
|
||||
]
|
||||
|
||||
result = diff_entities(before, after)
|
||||
# result["added"] -> [{"id": "3", "name": "Carol", ...}]
|
||||
# result["removed"] -> [{"id": "2", "name": "Bob", ...}]
|
||||
# result["modified"] -> [{"key": "1", "changes": {"status": {"old": "active", "new": "inactive"}}}]
|
||||
# result["unchanged"] -> 0
|
||||
# result["summary"] -> "1 added, 1 removed, 1 modified, 0 unchanged"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. No hace I/O — toma listas de dicts ya cargadas en memoria.
|
||||
|
||||
El campo `key` debe existir en todas las entities; las que no lo tengan se ignoran silenciosamente.
|
||||
|
||||
Si `compare_fields` se da, tiene prioridad sobre `ignore_fields`. Esto permite comparar solo un subconjunto especifico de campos sin preocuparse por los campos temporales.
|
||||
|
||||
El orden de `added`, `removed` y `modified` no esta garantizado (depende del orden de iteracion de sets).
|
||||
@@ -0,0 +1,77 @@
|
||||
"""diff_entities — compara dos snapshots de entities detectando cambios campo a campo."""
|
||||
|
||||
|
||||
def diff_entities(
|
||||
before: list[dict],
|
||||
after: list[dict],
|
||||
key: str = "id",
|
||||
ignore_fields: list[str] | None = None,
|
||||
compare_fields: list[str] | None = None,
|
||||
) -> dict:
|
||||
"""Compara dos snapshots de entities y devuelve diferencias campo a campo.
|
||||
|
||||
Detecta entities añadidas, eliminadas, modificadas e inalteradas.
|
||||
Ignora campos de metadata temporal por defecto (created_at, updated_at).
|
||||
|
||||
Args:
|
||||
before: Lista de entities del snapshot anterior.
|
||||
after: Lista de entities del snapshot posterior.
|
||||
key: Campo que identifica unicamente cada entity. Default "id".
|
||||
ignore_fields: Campos a excluir de la comparacion.
|
||||
Default ["created_at", "updated_at"].
|
||||
compare_fields: Si se da, solo compara estos campos (tiene prioridad
|
||||
sobre ignore_fields).
|
||||
|
||||
Returns:
|
||||
Dict con keys: added, removed, modified, unchanged, summary.
|
||||
modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}.
|
||||
"""
|
||||
if ignore_fields is None:
|
||||
ignore_fields = ["created_at", "updated_at"]
|
||||
|
||||
before_map = {str(e[key]): e for e in before if key in e}
|
||||
after_map = {str(e[key]): e for e in after if key in e}
|
||||
|
||||
before_keys = set(before_map.keys())
|
||||
after_keys = set(after_map.keys())
|
||||
|
||||
added = [after_map[k] for k in after_keys - before_keys]
|
||||
removed = [before_map[k] for k in before_keys - after_keys]
|
||||
|
||||
modified = []
|
||||
unchanged = 0
|
||||
|
||||
for k in before_keys & after_keys:
|
||||
b = before_map[k]
|
||||
a = after_map[k]
|
||||
|
||||
if compare_fields is not None:
|
||||
fields_to_check = compare_fields
|
||||
else:
|
||||
all_fields = set(b.keys()) | set(a.keys())
|
||||
fields_to_check = [f for f in all_fields if f not in ignore_fields and f != key]
|
||||
|
||||
changes = {}
|
||||
for field in fields_to_check:
|
||||
old_val = b.get(field)
|
||||
new_val = a.get(field)
|
||||
if old_val != new_val:
|
||||
changes[field] = {"old": old_val, "new": new_val}
|
||||
|
||||
if changes:
|
||||
modified.append({"key": k, "changes": changes})
|
||||
else:
|
||||
unchanged += 1
|
||||
|
||||
n_added = len(added)
|
||||
n_removed = len(removed)
|
||||
n_modified = len(modified)
|
||||
summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged"
|
||||
|
||||
return {
|
||||
"added": added,
|
||||
"removed": removed,
|
||||
"modified": modified,
|
||||
"unchanged": unchanged,
|
||||
"summary": summary,
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Tests para diff_entities."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from diff_entities import diff_entities
|
||||
|
||||
|
||||
def test_entity_anadida():
    """An entity present only in the later snapshot is reported as added."""
    snapshot_old = [{"id": "1", "name": "Alice"}]
    snapshot_new = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]

    diff = diff_entities(snapshot_old, snapshot_new)

    new_entities = diff["added"]
    assert len(new_entities) == 1
    assert new_entities[0]["id"] == "2"
    assert diff["removed"] == []
    assert diff["modified"] == []
    assert diff["unchanged"] == 1
    assert "1 added" in diff["summary"]
|
||||
|
||||
|
||||
def test_entity_eliminada():
    """An entity missing from the later snapshot is reported as removed."""
    snapshot_old = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
    snapshot_new = [{"id": "1", "name": "Alice"}]

    diff = diff_entities(snapshot_old, snapshot_new)

    gone = diff["removed"]
    assert diff["added"] == []
    assert len(gone) == 1
    assert gone[0]["id"] == "2"
    assert diff["unchanged"] == 1
    assert "1 removed" in diff["summary"]
|
||||
|
||||
|
||||
def test_entity_modificada_con_detalle_de_campos():
    """A changed field shows up in 'modified' with its old and new values."""
    snapshot_old = [{"id": "1", "name": "Alice", "status": "active"}]
    snapshot_new = [{"id": "1", "name": "Alice", "status": "inactive"}]

    diff = diff_entities(snapshot_old, snapshot_new)

    assert diff["added"] == []
    assert diff["removed"] == []
    assert len(diff["modified"]) == 1

    entry = diff["modified"][0]
    field_changes = entry["changes"]
    assert entry["key"] == "1"
    assert "status" in field_changes
    assert field_changes["status"]["old"] == "active"
    assert field_changes["status"]["new"] == "inactive"
    assert diff["unchanged"] == 0
|
||||
|
||||
|
||||
def test_entities_identicas_unchanged():
    """Two identical snapshots produce no differences, only an unchanged count."""
    snapshot_old = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
    snapshot_new = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]

    diff = diff_entities(snapshot_old, snapshot_new)

    for bucket in ("added", "removed", "modified"):
        assert diff[bucket] == []
    assert diff["unchanged"] == 2
    assert "2 unchanged" in diff["summary"]
|
||||
|
||||
|
||||
def test_ignore_fields_funciona():
    """Timestamps are ignored by default and compared when ignore_fields=[]."""
    snapshot_old = [
        {"id": "1", "name": "Alice", "updated_at": "2024-01-01", "created_at": "2023-01-01"}
    ]
    snapshot_new = [
        {"id": "1", "name": "Alice", "updated_at": "2024-06-01", "created_at": "2023-01-01"}
    ]

    # Default behaviour: updated_at is ignored, so the entity counts as unchanged.
    default_diff = diff_entities(snapshot_old, snapshot_new)
    assert default_diff["unchanged"] == 1
    assert default_diff["modified"] == []

    # With nothing ignored the updated_at change must be detected.
    strict_diff = diff_entities(snapshot_old, snapshot_new, ignore_fields=[])
    assert len(strict_diff["modified"]) == 1
    assert "updated_at" in strict_diff["modified"][0]["changes"]
|
||||
|
||||
|
||||
def test_compare_fields_filtra_correctamente():
    """compare_fields restricts the comparison to the listed fields only."""
    snapshot_old = [{"id": "1", "name": "Alice", "status": "active", "score": 10}]
    snapshot_new = [{"id": "1", "name": "Bob", "status": "inactive", "score": 10}]

    # Only score is compared and it did not change -> unchanged.
    score_only = diff_entities(snapshot_old, snapshot_new, compare_fields=["score"])
    assert score_only["unchanged"] == 1
    assert score_only["modified"] == []

    # Only name is compared -> the rename is detected, status is not reported.
    name_only = diff_entities(snapshot_old, snapshot_new, compare_fields=["name"])
    assert len(name_only["modified"]) == 1
    reported = name_only["modified"][0]["changes"]
    assert "name" in reported
    assert "status" not in reported
|
||||
|
||||
|
||||
def test_lista_vacia_vs_lista_con_datos():
    """An empty snapshot against a populated one yields only adds (or only removes)."""
    empty = []
    populated = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]

    # empty -> populated: everything is new.
    forward = diff_entities(empty, populated)
    assert len(forward["added"]) == 2
    assert forward["removed"] == []
    assert forward["unchanged"] == 0

    # populated -> empty: everything disappeared.
    backward = diff_entities(populated, empty)
    assert backward["added"] == []
    assert len(backward["removed"]) == 2
    assert backward["unchanged"] == 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the suite directly, without pytest, in declaration order.
    for _case in (
        test_entity_anadida,
        test_entity_eliminada,
        test_entity_modificada_con_detalle_de_campos,
        test_entities_identicas_unchanged,
        test_ignore_fields_funciona,
        test_compare_fields_filtra_correctamente,
        test_lista_vacia_vs_lista_con_datos,
    ):
        _case()
    print("All tests passed.")
|
||||
@@ -0,0 +1,52 @@
|
||||
---
|
||||
name: diff_relations
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def diff_relations(before: list[dict], after: list[dict], key: tuple[str, str, str] = ('source_id', 'target_id', 'relation_type'), ignore_fields: list[str] | None = None, compare_fields: list[str] | None = None) -> dict"
|
||||
description: "Compara relaciones entre dos snapshots usando key compuesta (source_id, target_id, relation_type). Detecta relaciones añadidas, eliminadas y modificadas con detalle campo a campo."
|
||||
tags: [diff, relations, graph, snapshot, operations, comparison, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "relacion añadida"
|
||||
- "relacion eliminada"
|
||||
- "relacion con metadata modificada (mismo source/target/type, distinto weight)"
|
||||
- "key compuesta funciona correctamente"
|
||||
test_file_path: "python/functions/datascience/diff_relations_test.py"
|
||||
file_path: "python/functions/datascience/diff_relations.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
before = [
|
||||
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
|
||||
{"source_id": "B", "target_id": "C", "relation_type": "owns", "weight": 0.5},
|
||||
]
|
||||
after = [
|
||||
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 2.0},
|
||||
{"source_id": "C", "target_id": "D", "relation_type": "knows", "weight": 1.0},
|
||||
]
|
||||
|
||||
result = diff_relations(before, after)
|
||||
# result["added"] -> [{"source_id": "C", "target_id": "D", ...}]
|
||||
# result["removed"] -> [{"source_id": "B", "target_id": "C", ...}]
|
||||
# result["modified"] -> [{"key": "A|B|knows", "changes": {"weight": {"old": 1.0, "new": 2.0}}}]
|
||||
# result["unchanged"] -> 0
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
La key compuesta se serializa como `source_id|target_id|relation_type`. Si alguno de los campos clave no existe en la relacion, se usa string vacio.
|
||||
|
||||
Misma semantica que `diff_entities_py_datascience` pero adaptada para relaciones donde no hay un ID unico — la identidad se define por los tres campos de la key.
|
||||
|
||||
Complemento natural de `diff_entities_py_datascience` para comparar grafos completos entre ejecuciones de pipelines.
|
||||
@@ -0,0 +1,82 @@
|
||||
"""diff_relations — compara dos snapshots de relaciones con key compuesta."""
|
||||
|
||||
|
||||
def diff_relations(
|
||||
before: list[dict],
|
||||
after: list[dict],
|
||||
key: tuple[str, str, str] = ("source_id", "target_id", "relation_type"),
|
||||
ignore_fields: list[str] | None = None,
|
||||
compare_fields: list[str] | None = None,
|
||||
) -> dict:
|
||||
"""Compara relaciones entre dos snapshots usando key compuesta.
|
||||
|
||||
Las relaciones se identifican por (source_id, target_id, relation_type)
|
||||
porque no tienen un ID unico propio. Detecta relaciones añadidas,
|
||||
eliminadas y modificadas (mismo source/target/type, distinta metadata).
|
||||
|
||||
Args:
|
||||
before: Lista de relaciones del snapshot anterior.
|
||||
after: Lista de relaciones del snapshot posterior.
|
||||
key: Tupla de campos que forman la key compuesta.
|
||||
Default ("source_id", "target_id", "relation_type").
|
||||
ignore_fields: Campos a excluir de la comparacion.
|
||||
Default ["created_at", "updated_at"].
|
||||
compare_fields: Si se da, solo compara estos campos.
|
||||
|
||||
Returns:
|
||||
Dict con keys: added, removed, modified, unchanged, summary.
|
||||
modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}.
|
||||
"""
|
||||
if ignore_fields is None:
|
||||
ignore_fields = ["created_at", "updated_at"]
|
||||
|
||||
def make_key(rel: dict) -> str:
|
||||
return "|".join(str(rel.get(k, "")) for k in key)
|
||||
|
||||
before_map = {make_key(r): r for r in before}
|
||||
after_map = {make_key(r): r for r in after}
|
||||
|
||||
before_keys = set(before_map.keys())
|
||||
after_keys = set(after_map.keys())
|
||||
|
||||
added = [after_map[k] for k in after_keys - before_keys]
|
||||
removed = [before_map[k] for k in before_keys - after_keys]
|
||||
|
||||
modified = []
|
||||
unchanged = 0
|
||||
|
||||
for k in before_keys & after_keys:
|
||||
b = before_map[k]
|
||||
a = after_map[k]
|
||||
|
||||
if compare_fields is not None:
|
||||
fields_to_check = compare_fields
|
||||
else:
|
||||
all_fields = set(b.keys()) | set(a.keys())
|
||||
key_set = set(key)
|
||||
fields_to_check = [f for f in all_fields if f not in ignore_fields and f not in key_set]
|
||||
|
||||
changes = {}
|
||||
for field in fields_to_check:
|
||||
old_val = b.get(field)
|
||||
new_val = a.get(field)
|
||||
if old_val != new_val:
|
||||
changes[field] = {"old": old_val, "new": new_val}
|
||||
|
||||
if changes:
|
||||
modified.append({"key": k, "changes": changes})
|
||||
else:
|
||||
unchanged += 1
|
||||
|
||||
n_added = len(added)
|
||||
n_removed = len(removed)
|
||||
n_modified = len(modified)
|
||||
summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged"
|
||||
|
||||
return {
|
||||
"added": added,
|
||||
"removed": removed,
|
||||
"modified": modified,
|
||||
"unchanged": unchanged,
|
||||
"summary": summary,
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
"""Tests para diff_relations."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from diff_relations import diff_relations
|
||||
|
||||
|
||||
def test_relacion_anadida():
    """A relation present only in the later snapshot is reported as added."""
    knows_ab = {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}
    owns_cd = {"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5}

    diff = diff_relations([dict(knows_ab)], [dict(knows_ab), dict(owns_cd)])

    assert len(diff["added"]) == 1
    assert diff["added"][0]["source_id"] == "C"
    assert diff["removed"] == []
    assert diff["unchanged"] == 1
    assert "1 added" in diff["summary"]
|
||||
|
||||
|
||||
def test_relacion_eliminada():
    """A relation missing from the later snapshot is reported as removed."""
    knows_ab = {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}
    owns_cd = {"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5}

    diff = diff_relations([dict(knows_ab), dict(owns_cd)], [dict(knows_ab)])

    assert diff["added"] == []
    assert len(diff["removed"]) == 1
    assert diff["removed"][0]["source_id"] == "C"
    assert diff["unchanged"] == 1
    assert "1 removed" in diff["summary"]
|
||||
|
||||
|
||||
def test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight():
    """Same composite key with a different weight is reported as modified."""
    identity = {"source_id": "A", "target_id": "B", "relation_type": "knows"}
    snapshot_old = [{**identity, "weight": 1.0}]
    snapshot_new = [{**identity, "weight": 5.0}]

    diff = diff_relations(snapshot_old, snapshot_new)

    assert diff["added"] == []
    assert diff["removed"] == []
    assert len(diff["modified"]) == 1

    entry = diff["modified"][0]
    assert entry["key"] == "A|B|knows"
    assert "weight" in entry["changes"]
    weight_change = entry["changes"]["weight"]
    assert weight_change["old"] == 1.0
    assert weight_change["new"] == 5.0
    assert diff["unchanged"] == 0
|
||||
|
||||
|
||||
def test_key_compuesta_funciona_correctamente():
    """The relation type is part of the identity: same A->B pair, different relations."""

    def ab(relation_type, weight):
        # Helper: an A->B relation of the given type and weight.
        return {
            "source_id": "A",
            "target_id": "B",
            "relation_type": relation_type,
            "weight": weight,
        }

    snapshot_old = [ab("knows", 1.0), ab("owns", 0.5)]
    snapshot_new = [ab("knows", 1.0), ab("trusts", 0.8)]

    diff = diff_relations(snapshot_old, snapshot_new)

    # "owns" was removed, "trusts" was added, "knows" is untouched.
    assert len(diff["added"]) == 1
    assert diff["added"][0]["relation_type"] == "trusts"
    assert len(diff["removed"]) == 1
    assert diff["removed"][0]["relation_type"] == "owns"
    assert diff["unchanged"] == 1
    assert diff["modified"] == []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the suite directly, without pytest, in declaration order.
    for _case in (
        test_relacion_anadida,
        test_relacion_eliminada,
        test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight,
        test_key_compuesta_funciona_correctamente,
    ):
        _case()
    print("All tests passed.")
|
||||
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: estimate_hawkes
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict"
|
||||
description: "Estima parámetros de un proceso Hawkes (alpha, beta, branching_ratio) desde la autocorrelación de arrivals ajustando una exponencial decreciente sobre la ACF."
|
||||
tags: [estimation, hawkes, stochastic-process, microstructure, timeseries]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [numpy, scipy]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/datascience/datascience.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
arrivals = [0, 1, 3, 2, 0, 1, 4, 2, 1, 0] * 10
|
||||
result = estimate_hawkes(arrivals, max_lag=10)
|
||||
# {'alpha': 0.312, 'beta': 0.874, 'branching_ratio': 0.357, 'acf': [...]}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Ajusta la función `a * exp(-b * lag)` sobre los lags 1..max_lag de la ACF usando `curve_fit` de scipy.
|
||||
Si el primer lag de la ACF es <= 0.01 (sin autocorrelación), retorna alpha=0, beta=1, branching_ratio=0.
|
||||
El branching_ratio = alpha/beta; valores cercanos a 1 indican un proceso casi crítico (fuertemente autoexcitado) y valores >= 1 un proceso explosivo (no estacionario).
|
||||
Función pura: requiere numpy y scipy instalados.
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
name: estimate_pareto_alpha
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict"
|
||||
description: "Estima el exponente alpha de una distribución Pareto via MLE. Alpha bajo indica cola más pesada y mayor frecuencia de valores extremos."
|
||||
tags: [estimation, pareto, power-law, heavy-tail, statistics]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [numpy]
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/datascience/datascience.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
# Simular datos con cola pesada
|
||||
values = list(np.random.pareto(2.0, 1000) + 1)
|
||||
result = estimate_pareto_alpha(values, x_min_percentile=90.0)
|
||||
# {'alpha': ~2.0, 'x_min': ..., 'n_tail': 100}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Usa el estimador MLE de Hill: α = n / Σ ln(xᵢ / x_min).
|
||||
x_min se determina como el percentil indicado de los valores positivos.
|
||||
Retorna alpha=0 si hay menos de 10 valores positivos o la cola tiene menos de 2 elementos.
|
||||
Función pura: requiere numpy instalado.
|
||||
@@ -0,0 +1,87 @@
|
||||
---
|
||||
name: extract_entities_llm
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_entities_llm(text: str, entity_schema: list[dict], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list[EntityCandidate]"
|
||||
description: "Extrae entidades de un chunk de texto usando un LLM inyectado. Construye el system prompt con el schema, llama al LLM y valida la respuesta retornando EntityCandidate. JSON invalido o type_ref fuera del schema se descartan con warning."
|
||||
tags: [llm, extraction, entity, nlp, osint, graph, fuzzygraph, datascience, prompt]
|
||||
uses_functions: []
|
||||
uses_types: [entity_candidate_py_datascience]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [warnings, typing.Callable]
|
||||
tested: true
|
||||
tests:
|
||||
- "texto con entidades claras retorna EntityCandidate"
|
||||
- "texto sin entidades retorna lista vacia"
|
||||
- "llm retorna json mal formado retorna lista vacia con warning"
|
||||
- "type_ref invalido en respuesta se descarta con warning"
|
||||
- "confidence se propaga correctamente"
|
||||
- "schema vacio lanza ValueError"
|
||||
test_file_path: "python/functions/datascience/extract_entities_llm_test.py"
|
||||
file_path: "python/functions/datascience/extract_entities_llm.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import json
|
||||
from extract_entities_llm import extract_entities_llm
|
||||
|
||||
# LLM stub para tests — en produccion usar litellm o similar
|
||||
def mock_llm(messages: list[dict]) -> dict:
|
||||
return {
|
||||
"entities": [
|
||||
{
|
||||
"name": "John Smith",
|
||||
"type_ref": "osint_person_go_cybersecurity",
|
||||
"attributes": {"full_name": "John Smith", "nationality": "US"},
|
||||
"confidence": 0.95,
|
||||
},
|
||||
{
|
||||
"name": "evil-corp.com",
|
||||
"type_ref": "osint_domain_go_cybersecurity",
|
||||
"attributes": {"fqdn": "evil-corp.com"},
|
||||
"confidence": 0.88,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
schema = [
|
||||
{
|
||||
"type_ref": "osint_person_go_cybersecurity",
|
||||
"label": "Person",
|
||||
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
|
||||
},
|
||||
{
|
||||
"type_ref": "osint_domain_go_cybersecurity",
|
||||
"label": "Domain",
|
||||
"metadata_fields": ["fqdn", "registrar", "created_date"],
|
||||
},
|
||||
]
|
||||
|
||||
text = "John Smith, a US citizen, was linked to the domain evil-corp.com."
|
||||
candidates = extract_entities_llm(text, schema, mock_llm)
|
||||
# [EntityCandidate(name='John Smith', type_ref='osint_person_go_cybersecurity', confidence=0.95),
|
||||
# EntityCandidate(name='evil-corp.com', type_ref='osint_domain_go_cybersecurity', confidence=0.88)]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe mensajes en formato OpenAI (`[{"role": "system", "content": "..."}, ...]`) y retorna un `dict` con la respuesta ya parseada como JSON. Esto desacopla la funcion de cualquier cliente especifico — puede usarse con OpenAI, Anthropic via litellm, o cualquier mock.
|
||||
|
||||
**Validacion de type_ref:** Solo se aceptan entidades cuyo `type_ref` aparece en el `entity_schema`. Entidades con type_ref desconocido se descartan con `warnings.warn` (no lanzan excepcion) para ser resiliente ante alucinaciones del LLM.
|
||||
|
||||
**Manejo de JSON invalido:** Si `llm_chat_json` lanza una excepcion o retorna un dict sin la clave `entities`, se retorna lista vacia y se emite un warning. El llamador puede decidir si reintentar.
|
||||
|
||||
**Confidence clamping:** El valor de confidence se clampea al rango [0.0, 1.0] automaticamente.
|
||||
|
||||
**Atributos null:** Los atributos con valor `None` se filtran del dict de atributos para mantener el output limpio.
|
||||
|
||||
**source_chunk_indices:** Esta funcion no setea `source_chunk_indices` — ese campo lo llena el pipeline exterior que conoce el indice del chunk actual.
|
||||
|
||||
Esta funcion es el bloque atomico de extraccion. El pipeline completo de grafos la llama por cada chunk del documento y luego deduplica los candidatos resultantes.
|
||||
@@ -0,0 +1,145 @@
|
||||
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
|
||||
|
||||
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
|
||||
"""Construye el system prompt para extraccion de entidades."""
|
||||
lines = [
|
||||
"You are an entity extraction expert. Given text, extract all entities",
|
||||
"matching these types. For each entity, provide: name, type_ref,",
|
||||
"attributes (matching the metadata_fields for that type), and a",
|
||||
"confidence score (0.0-1.0).",
|
||||
"",
|
||||
"Entity types:",
|
||||
]
|
||||
|
||||
for schema_entry in entity_schema:
|
||||
label = schema_entry.get("label", "Unknown")
|
||||
type_ref = schema_entry.get("type_ref", "")
|
||||
metadata_fields = schema_entry.get("metadata_fields", [])
|
||||
lines.append(f"- {label} (type_ref: {type_ref})")
|
||||
if metadata_fields:
|
||||
lines.append(f" fields: {', '.join(metadata_fields)}")
|
||||
|
||||
lines += [
|
||||
"",
|
||||
'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
|
||||
"",
|
||||
"Rules:",
|
||||
"- Only extract entities explicitly mentioned in the text",
|
||||
"- Use the exact type_ref from the schema",
|
||||
"- Leave unknown attributes as null",
|
||||
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
|
||||
f"- {language_instruction}",
|
||||
]
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity schema, calls the LLM and
    validates the response, returning one EntityCandidate per accepted item.
    Malformed LLM output never raises: it is reported with ``warnings.warn``
    and the offending item (or the whole response) is dropped.

    Args:
        text: Text chunk to analyse.
        entity_schema: Entity types to extract. Each entry is a dict with the
            keys 'type_ref', 'label' and optionally 'metadata_fields', e.g.
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: Callable that receives OpenAI-style messages
            ([{"role": "system", "content": "..."}, ...]) and returns the
            LLM answer already parsed into a dict.
        language_instruction: Language instruction appended to the prompt.
            Defaults to "Respond in English."

    Returns:
        List of extracted EntityCandidate. Empty when the LLM call fails,
        when the response is not a dict carrying an 'entities' list, or when
        no valid entity is found.

    Raises:
        ValueError: If entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # Doubles as the set of accepted type_refs.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    messages = [
        {"role": "system", "content": _build_system_prompt(entity_schema, language_instruction)},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        # Boundary with an injected client: any failure degrades to "no
        # entities" so the caller can decide whether to retry.
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # Guard against a non-dict response (None, list, str) and a missing key;
    # both are reported instead of crashing or passing silently.
    raw_entities = response.get("entities") if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        name = item.get("name", "")
        if not name:
            continue

        type_ref = item.get("type_ref", "")
        # The isinstance check also protects the dict lookup from unhashable
        # values (e.g. a hallucinated list), which would raise TypeError.
        if not isinstance(type_ref, str) or type_ref not in type_ref_to_label:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the LLM left as null so the output stays clean.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        # `confidence != confidence` is True only for NaN, which would
        # otherwise slip through min()/max() and be clamped to 1.0.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates
|
||||
@@ -0,0 +1,164 @@
|
||||
"""Tests para extract_entities_llm."""
|
||||
|
||||
import warnings
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from python.functions.datascience.extract_entities_llm import extract_entities_llm
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
|
||||
SCHEMA = [
|
||||
{
|
||||
"type_ref": "osint_person_go_cybersecurity",
|
||||
"label": "Person",
|
||||
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
|
||||
},
|
||||
{
|
||||
"type_ref": "osint_domain_go_cybersecurity",
|
||||
"label": "Domain",
|
||||
"metadata_fields": ["fqdn", "registrar", "created_date"],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def make_llm(response: dict):
    """Build an LLM stub that ignores its messages and always returns `response`."""
    return lambda messages: response
|
||||
|
||||
|
||||
def test_texto_con_entidades_claras_retorna_entity_candidate():
    """Text with clear entities returns one EntityCandidate per entity."""
    person_payload = {
        "name": "John Smith",
        "type_ref": "osint_person_go_cybersecurity",
        "attributes": {"full_name": "John Smith", "nationality": "US"},
        "confidence": 0.95,
    }
    domain_payload = {
        "name": "evil-corp.com",
        "type_ref": "osint_domain_go_cybersecurity",
        "attributes": {"fqdn": "evil-corp.com"},
        "confidence": 0.88,
    }
    llm = make_llm({"entities": [person_payload, domain_payload]})

    extracted = extract_entities_llm(
        "John Smith, US citizen, linked to evil-corp.com.", SCHEMA, llm
    )

    assert len(extracted) == 2
    by_name = {candidate.name: candidate for candidate in extracted}

    person = by_name["John Smith"]
    assert person.type_ref == "osint_person_go_cybersecurity"
    assert person.type_label == "Person"
    assert person.attributes["full_name"] == "John Smith"
    assert person.confidence == 0.95

    domain = by_name["evil-corp.com"]
    assert domain.type_ref == "osint_domain_go_cybersecurity"
    assert domain.type_label == "Domain"
    assert domain.attributes["fqdn"] == "evil-corp.com"
    assert domain.confidence == 0.88
|
||||
|
||||
|
||||
def test_texto_sin_entidades_retorna_lista_vacia():
    """Text without entities returns an empty list."""
    no_entities_llm = make_llm({"entities": []})
    extracted = extract_entities_llm(
        "The sky is blue and the grass is green.", SCHEMA, no_entities_llm
    )
    assert extracted == []
|
||||
|
||||
|
||||
def test_llm_retorna_json_mal_formado_retorna_lista_vacia_con_warning():
    """A failing LLM call yields an empty list and exactly one warning."""

    def failing_llm(messages: list[dict]) -> dict:
        # Simulates a client that could not decode the model output.
        raise ValueError("JSON decode error")

    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter("always")
        extracted = extract_entities_llm("Some text with entities.", SCHEMA, failing_llm)

    assert extracted == []
    assert len(recorded) == 1
    assert "error llamando al LLM" in str(recorded[0].message)
|
||||
|
||||
|
||||
def test_type_ref_invalido_en_respuesta_se_descarta_con_warning():
    """Entities with an unknown type_ref are dropped and a warning names the bad ref.

    Registry label: "type_ref invalido en respuesta se descarta con warning".
    """
    known_entity = {
        "name": "Valid Person",
        "type_ref": "osint_person_go_cybersecurity",
        "attributes": {},
        "confidence": 0.9,
    }
    unknown_entity = {
        "name": "Unknown Thing",
        "type_ref": "nonexistent_type_ref",
        "attributes": {},
        "confidence": 0.8,
    }
    stub = make_llm({"entities": [known_entity, unknown_entity]})

    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter("always")
        extracted = extract_entities_llm("Text with entities.", SCHEMA, stub)

    assert len(extracted) == 1
    assert extracted[0].name == "Valid Person"
    warning_texts = [str(entry.message) for entry in recorded]
    assert any("nonexistent_type_ref" in text for text in warning_texts)
|
||||
|
||||
|
||||
def test_confidence_se_propaga_correctamente():
    """The confidence reported by the LLM is carried over to each entity.

    Registry label: "confidence se propaga correctamente".
    """
    # (name, type_ref, confidence) triples fed to the stub and checked afterwards.
    cases = [
        ("Implied Person", "osint_person_go_cybersecurity", 0.7),
        ("Weakly Implied Domain", "osint_domain_go_cybersecurity", 0.5),
        ("Explicit Entity", "osint_person_go_cybersecurity", 1.0),
    ]
    payload = {
        "entities": [
            {"name": name, "type_ref": type_ref, "attributes": {}, "confidence": confidence}
            for name, type_ref, confidence in cases
        ]
    }

    extracted = extract_entities_llm("Some text.", SCHEMA, make_llm(payload))

    assert len(extracted) == 3
    confidence_by_name = {entity.name: entity.confidence for entity in extracted}
    for name, _type_ref, confidence in cases:
        assert confidence_by_name[name] == confidence
|
||||
|
||||
|
||||
def test_schema_vacio_lanza_value_error():
    """An empty entity schema is rejected with ValueError.

    Registry label: "schema vacio lanza ValueError".
    """
    stub = make_llm({"entities": []})
    expected_message = "entity_schema no puede estar vacio"

    with pytest.raises(ValueError, match=expected_message):
        extract_entities_llm("Some text.", [], stub)
|
||||
@@ -0,0 +1,75 @@
|
||||
---
|
||||
name: extract_relations_llm
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_relations_llm(text: str, entities: list, relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list"
|
||||
description: "Extrae relaciones entre entidades de un chunk de texto usando un LLM inyectado. Valida que from_name y to_name correspondan a entidades existentes, y usa 'related_to' como fallback para tipos de relacion no permitidos."
|
||||
tags: [extraction, relation, llm, knowledge-graph, nlp, datascience, fuzzygraph, graph]
|
||||
uses_functions: []
|
||||
uses_types:
|
||||
- entity_candidate_py_datascience
|
||||
- relation_candidate_py_datascience
|
||||
returns:
|
||||
- relation_candidate_py_datascience
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [logging, sys, os, typing]
|
||||
tested: true
|
||||
tests:
|
||||
- "texto con dos entidades relacionadas"
|
||||
- "texto con entidades pero sin relacion"
|
||||
- "menos de dos entidades retorna lista vacia"
|
||||
- "llm inventa entidad que no existe se descarta"
|
||||
test_file_path: "python/functions/datascience/extract_relations_llm_test.py"
|
||||
file_path: "python/functions/datascience/extract_relations_llm.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from extract_relations_llm import extract_relations_llm
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
|
||||
# Stub de llm_chat_json (en produccion usar llm_completion_retry o similar)
|
||||
def my_llm(messages: list[dict]) -> dict:
|
||||
# Llamar al LLM real aqui
|
||||
return {"relations": [...]}
|
||||
|
||||
entities = [
|
||||
EntityCandidate(name="Acme Corp", type_label="Organization", confidence=0.95),
|
||||
EntityCandidate(name="John Smith", type_label="Person", confidence=0.9),
|
||||
]
|
||||
|
||||
relation_types = ["employs", "funds", "owns", "communicates_with", "related_to"]
|
||||
|
||||
relations = extract_relations_llm(
|
||||
text="Acme Corp employs John Smith as CEO and funds his research.",
|
||||
entities=entities,
|
||||
relation_types=relation_types,
|
||||
llm_chat_json=my_llm,
|
||||
)
|
||||
|
||||
for rel in relations:
|
||||
print(f"{rel.from_name} --[{rel.relation_type}]--> {rel.to_name} ({rel.confidence:.2f})")
|
||||
# Acme Corp --[employs]--> John Smith (0.90)
|
||||
# Acme Corp --[funds]--> John Smith (0.85)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe una lista de mensajes en formato OpenAI (`[{"role": "system", "content": ...}, {"role": "user", "content": ...}]`) y retorna un dict con la clave `"relations"`. Esto desacopla la funcion de cualquier proveedor de LLM concreto.
|
||||
|
||||
**Validacion de entidades:** Solo se aceptan relaciones donde `from_name` y `to_name` aparecen exactamente en los nombres de las entidades proporcionadas. Relaciones con nombres inventados por el LLM se descartan silenciosamente (con debug log).
|
||||
|
||||
**Fallback de tipo:** Si el LLM propone un `relation_type` que no esta en la lista permitida, se reemplaza por `"related_to"`. Si `"related_to"` tampoco esta en la lista, se incluye igualmente como catch-all seguro.
|
||||
|
||||
**Menos de 2 entidades:** La funcion retorna `[]` inmediatamente sin llamar al LLM, ya que no puede haber relaciones con menos de 2 participantes.
|
||||
|
||||
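**Error handling:** Si `llm_chat_json` lanza una excepcion, se captura, se registra un warning en el logger y se retorna `[]`. Si el valor de `"relations"` no es una lista, tambien se registra un warning y se retorna `[]`. Si la respuesta no contiene la clave `"relations"`, se retorna `[]` sin warning.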
|
||||
|
||||
**Confianza:** Los valores de confianza del LLM se clampean al rango `[0.0, 1.0]`. Valores no numericos se convierten a `0.0`.
|
||||
|
||||
Disenado para fuzzygraph — se compone con `extract_entities_llm` (paso anterior) y `deduplicate_relations` (paso siguiente en el pipeline de extraccion).
|
||||
@@ -0,0 +1,141 @@
|
||||
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.types.datascience.relation_candidate import RelationCandidate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between already-extracted entities using an injected LLM.

    The LLM receives the text plus the list of known entities and is asked for
    relations between pairs of them. Relations whose ``from_name`` or
    ``to_name`` does not match a known entity name exactly are discarded.
    Relation types outside ``relation_types`` are replaced by ``"related_to"``.

    Args:
        text: text chunk (the same one the entities were extracted from).
        entities: entities already extracted from the chunk.
        relation_types: allowed relation types, e.g. ``["funds", "employs",
            "communicates_with", "owns", "related_to"]``.
        llm_chat_json: injected callable that receives a list of messages
            (dicts with ``"role"`` and ``"content"``) and returns the LLM's
            JSON response as a dict.
        language_instruction: language instruction appended to the prompt.

    Returns:
        Validated ``RelationCandidate`` objects. Empty when fewer than two
        entities are given, when the LLM call fails, when the response is
        malformed, or when no valid relation is found.
    """
    # A relation needs two participants; skip the LLM call entirely otherwise.
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list rendered for the prompt.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )

    # Allowed relation types rendered for the prompt.
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # Best-effort boundary: any failure of the injected client yields [].
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # The client is typed to return a dict, but an LLM may answer with a bare
    # list, a string or null; treat that like any other malformed response.
    if not isinstance(response, dict):
        logger.warning("extract_relations_llm: LLM response is not a JSON object")
        return []

    raw_relations = response.get("relations", [])
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both endpoints must be known entities. The isinstance guard keeps
        # unhashable values (lists, dicts) from raising on the set lookup.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            # "related_to" is the catch-all even when it is not in relation_types.
            relation_type = "related_to"

        confidence = item.get("confidence", 0.0)
        # NaN compares unequal to itself. Without this check min(1.0, nan)
        # returns 1.0, silently promoting an invalid value to full confidence.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=item.get("description", ""),
                confidence=confidence,
            )
        )

    return results
|
||||
@@ -0,0 +1,140 @@
|
||||
"""Tests para extract_relations_llm."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Rutas para importar desde el registry
|
||||
REGISTRY_ROOT = os.path.join(os.path.dirname(__file__), "..", "..", "..", "")
|
||||
sys.path.insert(0, REGISTRY_ROOT)
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.types.datascience.relation_candidate import RelationCandidate
|
||||
from extract_relations_llm import extract_relations_llm
|
||||
|
||||
|
||||
def _make_entity(name: str, type_label: str = "Entity") -> EntityCandidate:
    """Build an EntityCandidate fixture with a fixed confidence of 0.9."""
    fields = {"name": name, "type_label": type_label, "confidence": 0.9}
    return EntityCandidate(**fields)
|
||||
|
||||
|
||||
def _make_llm(response: dict):
|
||||
"""Crea un stub de llm_chat_json que retorna la respuesta fija."""
|
||||
def llm_chat_json(messages: list[dict]) -> dict:
|
||||
return response
|
||||
return llm_chat_json
|
||||
|
||||
|
||||
def test_texto_con_dos_entidades_relacionadas():
    """A relation between two known entities is returned with its fields intact."""
    known_entities = [
        _make_entity("Acme Corp", "Organization"),
        _make_entity("John Smith", "Person"),
    ]
    employs = {
        "from_name": "Acme Corp",
        "to_name": "John Smith",
        "relation_type": "employs",
        "description": "Acme Corp employs John Smith as CEO",
        "confidence": 0.9,
    }
    stub = _make_llm({"relations": [employs]})

    relations = extract_relations_llm(
        text="Acme Corp employs John Smith as CEO.",
        entities=known_entities,
        relation_types=["employs", "funds", "related_to"],
        llm_chat_json=stub,
    )

    assert len(relations) == 1
    only = relations[0]
    assert only.from_name == "Acme Corp"
    assert only.to_name == "John Smith"
    assert only.relation_type == "employs"
    assert only.confidence == 0.9
    assert "CEO" in only.description
|
||||
|
||||
|
||||
def test_texto_con_entidades_pero_sin_relacion():
    """An empty "relations" list from the LLM yields an empty result."""
    people = [_make_entity(name, "Person") for name in ("Alice", "Bob")]
    stub = _make_llm({"relations": []})

    relations = extract_relations_llm(
        text="Alice and Bob both attended the conference.",
        entities=people,
        relation_types=["funds", "employs"],
        llm_chat_json=stub,
    )

    assert relations == []
|
||||
|
||||
|
||||
def test_menos_de_dos_entidades_retorna_lista_vacia():
    """With a single entity the result is empty even if the stub would return relations."""
    lone_entity = _make_entity("Solo Corp", "Organization")

    # The LLM should never be called; if it were, it would return this relation.
    bogus_relation = {
        "from_name": "Solo Corp",
        "to_name": "Nobody",
        "relation_type": "employs",
        "confidence": 0.9,
    }
    stub = _make_llm({"relations": [bogus_relation]})

    relations = extract_relations_llm(
        text="Solo Corp is a company.",
        entities=[lone_entity],
        relation_types=["employs", "funds"],
        llm_chat_json=stub,
    )

    assert relations == []
|
||||
|
||||
|
||||
def test_llm_inventa_entidad_que_no_existe_se_descarta():
    """Relations that mention entities outside the provided list are discarded."""
    people = [_make_entity("Alice", "Person"), _make_entity("Bob", "Person")]

    # Valid: both Alice and Bob are known entities.
    valid = {
        "from_name": "Alice",
        "to_name": "Bob",
        "relation_type": "funds",
        "description": "Alice funds Bob",
        "confidence": 0.8,
    }
    # Invalid: "Charlie" is not among the entities.
    unknown_target = {
        "from_name": "Alice",
        "to_name": "Charlie",
        "relation_type": "employs",
        "description": "Alice employs Charlie",
        "confidence": 0.7,
    }
    # Invalid: "Unknown Corp" is not among the entities.
    unknown_source = {
        "from_name": "Unknown Corp",
        "to_name": "Bob",
        "relation_type": "related_to",
        "description": "...",
        "confidence": 0.6,
    }
    stub = _make_llm({"relations": [valid, unknown_target, unknown_source]})

    relations = extract_relations_llm(
        text="Alice funds Bob. Alice also employs Charlie from Unknown Corp.",
        entities=people,
        relation_types=["funds", "employs", "related_to"],
        llm_chat_json=stub,
    )

    # Only the first relation survives validation.
    assert len(relations) == 1
    survivor = relations[0]
    assert survivor.from_name == "Alice"
    assert survivor.to_name == "Bob"
    assert survivor.relation_type == "funds"
|
||||
@@ -0,0 +1,72 @@
|
||||
---
|
||||
name: hotness_score
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def hotness_score(active_count: int, updated_at: datetime | None, now: datetime | None = None, half_life_days: float = 7.0) -> float"
|
||||
description: "Calcula un score de hotness combinando frecuencia de acceso y recencia temporal. Util para ranking de resultados, memoria hot/cold y cache eviction."
|
||||
tags: [ranking, decay, recency, frequency, scoring, cache, memory, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [math, datetime]
|
||||
tested: true
|
||||
tests:
|
||||
- "active_count=0, updated_at reciente"
|
||||
- "active_count=100, updated_at reciente (score alto)"
|
||||
- "active_count=100, updated_at hace 30 dias (score bajo)"
|
||||
- "updated_at=None (retorna 0.0)"
|
||||
- "now explicito (determinista para tests)"
|
||||
- "half_life_days custom"
|
||||
test_file_path: "python/functions/datascience/hotness_score_test.py"
|
||||
file_path: "python/functions/datascience/hotness_score.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from datascience.hotness_score import hotness_score
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
# Item reciente con muchos accesos -> score alto
|
||||
score = hotness_score(active_count=150, updated_at=now - timedelta(hours=2), now=now)
|
||||
# score > 0.95
|
||||
|
||||
# Item antiguo aunque muy accedido -> score bajo
|
||||
score = hotness_score(active_count=150, updated_at=now - timedelta(days=30), now=now)
|
||||
# score ~ 0.05
|
||||
|
||||
# Item sin fecha -> siempre 0
|
||||
score = hotness_score(active_count=999, updated_at=None)
|
||||
# score == 0.0
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Formula: `score = sigmoid(log1p(active_count)) * exp(-ln(2)/half_life_days * age_days)`
|
||||
|
||||
**Componente de frecuencia** — `sigmoid(log1p(count))` equivale a `(1 + count) / (2 + count)` y mapea enteros no negativos al rango `[0.5, 1.0)`:
|
||||
- count=0 -> 0.5
|
||||
- count=10 -> ~0.92
|
||||
- count=100 -> ~0.99
|
||||
|
||||
**Componente de recencia** — decaimiento exponencial con vida media configurable:
|
||||
- `half_life_days=7` (default): score se reduce a la mitad cada 7 dias
|
||||
- `half_life_days=1`: decaimiento agresivo (util para feeds en tiempo real)
|
||||
- `half_life_days=365`: decaimiento lento (util para contenido evergreen)
|
||||
|
||||
**Propiedades del score:**
|
||||
- `updated_at=None` -> 0.0 siempre (item sin fecha no tiene hotness)
|
||||
- `active_count=0, reciente` -> ~0.5 (neutro pero fresco)
|
||||
- `active_count alto, reciente` -> ~1.0 (muy caliente)
|
||||
- `active_count alto, antiguo` -> ~0.0 (frio a pesar de popularidad pasada)
|
||||
|
||||
Timestamps sin timezone se interpretan como UTC. Pasar `now` explicitamente garantiza determinismo en tests y reproducibilidad en pipelines batch.
|
||||
|
||||
Fuente conceptual: openviking/retrieve/memory_lifecycle.py (AGPL-3.0). Reimplementado desde cero con formula equivalente.
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Hotness score — combining access frequency and recency decay."""
|
||||
|
||||
import math
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def hotness_score(
|
||||
active_count: int,
|
||||
updated_at: datetime | None,
|
||||
now: datetime | None = None,
|
||||
half_life_days: float = 7.0,
|
||||
) -> float:
|
||||
"""Calcula un score de hotness combinando frecuencia de acceso y recencia.
|
||||
|
||||
Formula: sigmoid(log1p(active_count)) * exp_decay(age_days, half_life_days)
|
||||
|
||||
El componente de frecuencia mapea conteos enteros al rango (0, 1) via sigmoid(log1p).
|
||||
El componente de recencia decae exponencialmente con vida media configurable.
|
||||
|
||||
Args:
|
||||
active_count: Numero de accesos o activaciones. Debe ser >= 0.
|
||||
updated_at: Timestamp de la ultima actualizacion. None retorna 0.0.
|
||||
now: Momento de referencia para calcular la edad. Si es None usa datetime.now(UTC).
|
||||
half_life_days: Dias para que la recencia se reduzca a la mitad. Default 7.
|
||||
|
||||
Returns:
|
||||
float en [0.0, 1.0]. Valores mas cercanos a 1.0 indican mayor hotness.
|
||||
"""
|
||||
if updated_at is None:
|
||||
return 0.0
|
||||
|
||||
# Componente de frecuencia: sigmoid(log1p(count)) mapea 0..inf -> (0.5, 1.0)
|
||||
freq = 1.0 / (1.0 + math.exp(-math.log1p(active_count)))
|
||||
|
||||
# Componente de recencia: decaimiento exponencial
|
||||
if now is None:
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
# Normalizar ambos timestamps a UTC para comparacion segura
|
||||
if updated_at.tzinfo is None:
|
||||
updated_at = updated_at.replace(tzinfo=timezone.utc)
|
||||
if now.tzinfo is None:
|
||||
now = now.replace(tzinfo=timezone.utc)
|
||||
|
||||
age_days = max((now - updated_at).total_seconds() / 86400.0, 0.0)
|
||||
decay_rate = math.log(2) / half_life_days
|
||||
recency = math.exp(-decay_rate * age_days)
|
||||
|
||||
return freq * recency
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Tests para hotness_score."""
|
||||
|
||||
import math
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from hotness_score import hotness_score
|
||||
|
||||
NOW = datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def test_active_count_zero_updated_at_reciente():
    """Zero accesses on a fresh item score about 0.5.

    Registry label: "active_count=0, updated_at reciente".
    """
    one_hour_ago = NOW - timedelta(hours=1)
    score = hotness_score(0, one_hour_ago, now=NOW)
    # freq = sigmoid(log1p(0)) = sigmoid(0) = 0.5; recency is close to 1.0.
    in_band = score > 0.45 and score < 0.55
    assert in_band, f"Expected ~0.5, got {score}"
|
||||
|
||||
|
||||
def test_active_count_alto_updated_at_reciente():
    """Many accesses on a fresh item give a high score.

    Registry label: "active_count=100, updated_at reciente (score alto)".
    """
    score = hotness_score(100, NOW - timedelta(hours=1), now=NOW)
    # freq = sigmoid(log1p(100)) ~ 0.99 and recency ~ 1.0.
    lower_bound = 0.95
    assert score > lower_bound, f"Expected > 0.95, got {score}"
|
||||
|
||||
|
||||
def test_active_count_alto_updated_at_hace_30_dias():
    """Many accesses on a 30-day-old item give a low score.

    Registry label: "active_count=100, updated_at hace 30 dias (score bajo)".
    """
    thirty_days_ago = NOW - timedelta(days=30)
    score = hotness_score(100, thirty_days_ago, now=NOW)
    # recency = exp(-ln2/7 * 30) ~ 0.051, so score ~ 0.99 * 0.051 ~ 0.05.
    upper_bound = 0.1
    assert score < upper_bound, f"Expected < 0.1, got {score}"
|
||||
|
||||
|
||||
def test_updated_at_none_retorna_cero():
    """A missing timestamp always scores 0.0.

    Registry label: "updated_at=None (retorna 0.0)".
    """
    missing_timestamp = None
    score = hotness_score(100, missing_timestamp, now=NOW)
    assert score == 0.0, f"Expected 0.0, got {score}"
|
||||
|
||||
|
||||
def test_now_explicito():
    """An explicit ``now`` makes the score deterministic.

    Registry label: "now explicito (determinista para tests)".
    """
    one_half_life_ago = NOW - timedelta(days=7)
    score = hotness_score(50, one_half_life_ago, now=NOW)
    # After exactly one half-life (7 days) the recency term is 0.5;
    # freq = sigmoid(log1p(50)) ~ 0.981.
    freq = 1.0 / (1.0 + math.exp(-math.log1p(50)))
    expected = freq * 0.5
    deviation = abs(score - expected)
    assert deviation < 1e-9, f"Expected {expected}, got {score}"
|
||||
|
||||
|
||||
def test_half_life_days_custom():
    """A custom half-life changes the decay speed.

    Registry label: "half_life_days custom".
    """
    # With a one-day half-life, a one-day-old item has recency 0.5.
    yesterday = NOW - timedelta(days=1)
    freq = 1.0 / (1.0 + math.exp(-math.log1p(50)))
    expected = freq * 0.5
    score = hotness_score(50, yesterday, now=NOW, half_life_days=1.0)
    deviation = abs(score - expected)
    assert deviation < 1e-6, f"Expected {expected}, got {score}"
|
||||
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: melt
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def melt(rows: list[dict], id_vars: list[str], value_vars: list[str] | None = None, var_name: str = 'variable', value_name: str = 'value') -> list[dict]"
|
||||
description: "Inversa de pivot. Convierte columnas en filas (formato largo). Cada combinacion de id_vars + value_var genera una fila. Si value_vars es None, derrite todas las columnas no-id."
|
||||
tags: [datascience, tabular, melt, unpivot, transform, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests:
|
||||
- "Melt basico"
|
||||
- "Multiples id_vars"
|
||||
- "value_vars None derrite todas las columnas no-id"
|
||||
- "Fila con campo faltante en value_vars"
|
||||
test_file_path: "python/functions/datascience/melt_test.py"
|
||||
file_path: "python/functions/datascience/melt.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
rows = [{"region": "US", "q1": 10, "q2": 20}]
|
||||
melt(rows, id_vars=["region"], value_vars=["q1", "q2"])
|
||||
# [{"region": "US", "variable": "q1", "value": 10},
|
||||
# {"region": "US", "variable": "q2", "value": 20}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin dependencias externas.
|
||||
Si un campo de value_vars no existe en la fila, su valor sera None.
|
||||
El parametro value_vars=None es util cuando se desconoce el schema exacto.
|
||||
@@ -0,0 +1,40 @@
|
||||
"""Melt (unpivot) para datos tabulares list[dict]."""
|
||||
|
||||
|
||||
def melt(
|
||||
rows: list[dict],
|
||||
id_vars: list[str],
|
||||
value_vars: list[str] | None = None,
|
||||
var_name: str = "variable",
|
||||
value_name: str = "value",
|
||||
) -> list[dict]:
|
||||
"""Convierte columnas en filas (formato largo). Inversa de pivot.
|
||||
|
||||
Cada combinacion de id_vars + value_var genera una fila nueva.
|
||||
Si value_vars es None, se usan todas las columnas que no esten en id_vars.
|
||||
|
||||
Args:
|
||||
rows: Lista de dicts en formato ancho.
|
||||
id_vars: Columnas que se mantienen como identificadores en cada fila.
|
||||
value_vars: Columnas a convertir en filas. None = todas las no-id.
|
||||
var_name: Nombre de la columna que contendra los nombres de variables.
|
||||
value_name: Nombre de la columna que contendra los valores.
|
||||
|
||||
Returns:
|
||||
Lista de dicts en formato largo con una fila por combinacion id+variable.
|
||||
"""
|
||||
result = []
|
||||
for row in rows:
|
||||
# Determinar que columnas derretir
|
||||
if value_vars is None:
|
||||
vars_to_melt = [k for k in row if k not in id_vars]
|
||||
else:
|
||||
vars_to_melt = value_vars
|
||||
|
||||
for var in vars_to_melt:
|
||||
new_row: dict = {k: row.get(k) for k in id_vars}
|
||||
new_row[var_name] = var
|
||||
new_row[value_name] = row.get(var)
|
||||
result.append(new_row)
|
||||
|
||||
return result
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Tests para melt."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from melt import melt
|
||||
|
||||
|
||||
def test_melt_basico():
    """Two value columns on one row become two long-format rows.

    Registry label: "Melt basico".
    """
    wide = [{"region": "US", "q1": 10, "q2": 20}]
    long_rows = melt(wide, id_vars=["region"], value_vars=["q1", "q2"])
    assert len(long_rows) == 2
    first, second = long_rows
    assert first == {"region": "US", "variable": "q1", "value": 10}
    assert second == {"region": "US", "variable": "q2", "value": 20}
|
||||
|
||||
|
||||
def test_melt_multiples_id_vars():
    """Every id column is carried onto each melted row.

    Registry label: "Multiples id_vars".
    """
    wide = [{"region": "US", "year": 2023, "q1": 10, "q2": 20}]
    long_rows = melt(wide, id_vars=["region", "year"], value_vars=["q1", "q2"])
    assert len(long_rows) == 2
    first, second = long_rows
    assert first["region"] == "US"
    assert first["year"] == 2023
    assert first["variable"] == "q1"
    assert first["value"] == 10
    assert second["variable"] == "q2"
    assert second["value"] == 20
|
||||
|
||||
|
||||
def test_melt_value_vars_none_derrite_todas_las_columnas_no_id():
    """Omitting value_vars melts every non-id column.

    Registry label: "value_vars None derrite todas las columnas no-id".
    """
    wide = [{"id": 1, "a": 10, "b": 20, "c": 30}]
    long_rows = melt(wide, id_vars=["id"])
    assert len(long_rows) == 3
    melted_names = set()
    melted_values = set()
    for long_row in long_rows:
        melted_names.add(long_row["variable"])
        melted_values.add(long_row["value"])
    assert melted_names == {"a", "b", "c"}
    assert melted_values == {10, 20, 30}
|
||||
|
||||
|
||||
def test_melt_fila_con_campo_faltante_en_value_vars():
    """A value column missing from the row is melted with value None.

    Registry label: "Fila con campo faltante en value_vars".
    """
    wide = [{"region": "US", "q1": 10}]  # "q2" is absent on purpose
    long_rows = melt(wide, id_vars=["region"], value_vars=["q1", "q2"])
    assert len(long_rows) == 2
    q2_rows = (entry for entry in long_rows if entry["variable"] == "q2")
    q2_row = next(q2_rows)
    assert q2_row["value"] is None
|
||||
@@ -0,0 +1,68 @@
|
||||
---
|
||||
name: merge_graphs
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def merge_graphs(graphs: list[dict], entity_key: str = 'name', similarity_threshold: float = 0.85) -> dict"
|
||||
description: "Mergea multiples grafos de conocimiento en uno deduplicando entities por similitud de nombre (Levenshtein normalizado). Relaciones se re-apuntan a las entities canonicas. Atributos se combinan por union."
|
||||
tags: [graph, merge, deduplication, knowledge-graph, levenshtein, similarity, datascience]
|
||||
uses_functions: [levenshtein_distance_py_cybersecurity]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [sys, os]
|
||||
tested: true
|
||||
tests:
|
||||
- "dos grafos con entity duplicada → merge"
|
||||
- "entities similares pero bajo threshold → no merge"
|
||||
- "relaciones re-apuntadas correctamente"
|
||||
- "merge log registra cada merge"
|
||||
- "tres grafos → merge transitivo"
|
||||
- "grafos sin overlap → concatenacion simple"
|
||||
test_file_path: "python/functions/datascience/merge_graphs_test.py"
|
||||
file_path: "python/functions/datascience/merge_graphs.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
g1 = {
|
||||
"entities": [
|
||||
{"id": "1", "name": "Alice Corp", "type": "company"},
|
||||
{"id": "2", "name": "Bob", "type": "person"},
|
||||
],
|
||||
"relations": [
|
||||
{"source_id": "2", "target_id": "1", "relation_type": "works_at"},
|
||||
],
|
||||
}
|
||||
g2 = {
|
||||
"entities": [
|
||||
{"id": "3", "name": "Alice Corp.", "type": "company", "country": "US"},
|
||||
],
|
||||
"relations": [],
|
||||
}
|
||||
|
||||
result = merge_graphs([g1, g2], similarity_threshold=0.85)
|
||||
# result["entities"] -> 2 entities (Alice Corp mergeada, Bob)
|
||||
# result["merge_log"] -> [{"merged": ["3", "1"], "into": "1", "similarity": 0.909}]
|
||||
# "Alice Corp." mergeada en "Alice Corp" porque similitud > 0.85
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura. Reutiliza `levenshtein_distance_py_cybersecurity` para calcular similitud normalizada entre nombres.
|
||||
|
||||
**Algoritmo de merge transitivo**: si A~B y B~C, entonces A, B, C se mergean en uno solo. Se implementa via union-find (path compression simple).
|
||||
|
||||
**Eleccion de canonical**: la entity con mas campos no-null gana. En caso de empate, la primera encontrada en el par.
|
||||
|
||||
**Conflictos de atributos**: si ambas entities tienen un campo con valor, el canonical conserva el suyo (primero gana). Solo se copian campos que el canonical no tiene o tiene null.
|
||||
|
||||
**Deduplicacion de relaciones**: por (source_id, target_id, relation_type). Si dos relaciones son identicas tras re-apuntar los IDs, se conserva la primera encontrada.
|
||||
|
||||
**Complejidad**: O(n^2) en numero de entities por la comparacion de pares. Adecuado para grafos de knowledge tipicos (< 10K entities). Para grafos muy grandes, usar indexado por prefijo antes de comparar.
|
||||
|
||||
**Importacion**: intenta importar `levenshtein_distance` desde el paquete `cybersecurity` del registry. Si no esta disponible, usa una reimplementacion inline equivalente.
|
||||
@@ -0,0 +1,169 @@
|
||||
"""merge_graphs — mergea multiples grafos de conocimiento deduplicando entities por similitud."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Make the sibling "cybersecurity" directory importable, then try to reuse the
# registry's levenshtein_distance from there.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "cybersecurity"))
try:
    from cybersecurity import levenshtein_distance
except ImportError:
    # Fallback: inline implementation used when the package cannot be imported.
    def levenshtein_distance(a: str, b: str) -> int:
        """Return the Levenshtein edit distance between two strings."""
        # Keep the longer string in ``a`` so every DP row spans the shorter one.
        if len(a) < len(b):
            a, b = b, a
        if not b:
            return len(a)
        previous = list(range(len(b) + 1))
        for row_no, char_a in enumerate(a, start=1):
            current = [row_no]
            for col, char_b in enumerate(b):
                insertion = current[col] + 1
                deletion = previous[col + 1] + 1
                substitution = previous[col] + (0 if char_a == char_b else 1)
                current.append(min(insertion, deletion, substitution))
            previous = current
        return previous[-1]
|
||||
|
||||
|
||||
def _name_similarity(a: str, b: str) -> float:
    """Return the normalized, case-insensitive Levenshtein similarity of two names.

    The score is ``1 - distance / max_len``, where both the distance and
    ``max_len`` are measured on the lowercased strings, so the result always
    lies in [0.0, 1.0]. Two empty strings are considered identical (1.0).

    Args:
        a: First name.
        b: Second name.

    Returns:
        Similarity between 0.0 (nothing in common) and 1.0 (equal ignoring case).
    """
    # Lowercase before measuring lengths: str.lower() can change a string's
    # length (e.g. "İ" lowercases to two code points), and dividing the distance
    # of the lowered strings by the raw length could push the score below 0.
    lowered_a = a.lower()
    lowered_b = b.lower()
    max_len = max(len(lowered_a), len(lowered_b))
    if max_len == 0:
        return 1.0
    return 1.0 - levenshtein_distance(lowered_a, lowered_b) / max_len
|
||||
|
||||
|
||||
def _count_non_null_fields(entity: dict) -> int:
    """Return how many of the entity's fields hold a value other than None."""
    populated = [value for value in entity.values() if value is not None]
    return len(populated)
|
||||
|
||||
|
||||
def _merge_two_entities(canonical: dict, other: dict) -> dict:
    """Return the union of two entities' fields; the canonical entity wins conflicts.

    A field of ``other`` is copied only when ``canonical`` lacks it or holds
    None for it. Neither input dict is modified.
    """
    combined = dict(canonical)
    for field, value in other.items():
        # A value already present on the canonical entity is never overwritten.
        if combined.get(field) is None:
            combined[field] = value
    return combined
|
||||
|
||||
|
||||
def merge_graphs(
    graphs: list[dict],
    entity_key: str = "name",
    similarity_threshold: float = 0.85,
) -> dict:
    """Merge several knowledge graphs into one, deduplicating entities by name similarity.

    Algorithm:
        1. Collect the entities and relations of every graph.
        2. Compare every pair of entities by their own ``entity_key`` value and
           union the groups of any pair whose similarity is >= the threshold.
           Each entity is compared by its own name, not by the name of its
           group's representative, so merging is transitive: if A~B and B~C,
           the three end up in one group even when A and C are not similar.
        3. On each union, the representative with more non-None fields becomes
           canonical (the first of the pair on ties) and receives the fields
           it lacks from the other one.
        4. Re-point relations to the canonical IDs.
        5. Drop relations that become identical (same source, target and
           type), keeping the first one seen.
        6. Record every union in ``merge_log``.

    Entities without an "id" are ignored. Entities whose ``entity_key`` value
    is missing, None or empty are kept but never merged, since an absent name
    says nothing about identity. IDs are expected to be unique across graphs:
    entities sharing an ID are treated as a single entity and the last
    occurrence's fields are used.

    Args:
        graphs: List of graphs. Each graph is a dict with the keys
            "entities" (list[dict]) and "relations" (list[dict]).
            Entities must carry "id" and the ``entity_key`` field.
        entity_key: Text field used to compute similarity. Defaults to "name".
        similarity_threshold: Minimum normalized Levenshtein similarity, in
            [0, 1], for two entities to be merged. Defaults to 0.85.

    Returns:
        Dict with the keys:
            "entities": canonical entities, in order of first appearance.
            "relations": re-pointed, deduplicated relations (copies).
            "merge_log": one ``{"merged": [other_id, canonical_id],
                "into": canonical_id, "similarity": float}`` entry per union,
                where similarity is the score (rounded to 4 decimals) of the
                pair that triggered it.
    """
    from itertools import combinations

    # Collect every entity and relation.
    all_entities: list[dict] = []
    all_relations: list[dict] = []
    for graph in graphs:
        all_entities.extend(graph.get("entities", []))
        all_relations.extend(graph.get("relations", []))

    identified = [entity for entity in all_entities if "id" in entity]
    entity_ids = [entity["id"] for entity in identified]

    # Union-find state: id_map points each ID at its parent (roots point at
    # themselves); entity_by_id holds the merged data of each root.
    id_map: dict[str, str] = {eid: eid for eid in entity_ids}
    entity_by_id: dict[str, dict] = {entity["id"]: entity for entity in identified}
    merge_log: list[dict] = []

    def find_canonical(eid: str) -> str:
        """Return the root ID of eid's group; unknown IDs are their own root."""
        root = eid
        while id_map.get(root, root) != root:
            root = id_map[root]
        # Path compression: point every ID on the walked chain at the root.
        while eid != root:
            parent = id_map[eid]
            id_map[eid] = root
            eid = parent
        return root

    # Snapshot each entity's own name, skipping entities with nothing to compare.
    candidates: list[tuple[str, str]] = []
    for entity in identified:
        raw_name = entity.get(entity_key)
        name = "" if raw_name is None else str(raw_name)
        if name:
            candidates.append((entity["id"], name))

    # Compare every pair (O(n^2)); combinations() yields pairs in input order.
    for (id_a, name_a), (id_b, name_b) in combinations(candidates, 2):
        root_a = find_canonical(id_a)
        root_b = find_canonical(id_b)
        if root_a == root_b:
            continue  # already in the same group

        sim = _name_similarity(name_a, name_b)
        if sim < similarity_threshold:
            continue

        # The representative with more non-None fields becomes canonical.
        if _count_non_null_fields(entity_by_id[root_a]) >= _count_non_null_fields(
            entity_by_id[root_b]
        ):
            canonical_id, other_id = root_a, root_b
        else:
            canonical_id, other_id = root_b, root_a

        entity_by_id[canonical_id] = _merge_two_entities(
            entity_by_id[canonical_id], entity_by_id[other_id]
        )
        id_map[other_id] = canonical_id
        merge_log.append({
            "merged": [other_id, canonical_id],
            "into": canonical_id,
            "similarity": round(sim, 4),
        })

    # Final entities: one per root, in order of first appearance so the output
    # is deterministic (iterating a set of IDs would not be).
    final_entities: list[dict] = []
    emitted: set = set()
    for eid in entity_ids:
        if eid not in emitted and find_canonical(eid) == eid:
            emitted.add(eid)
            final_entities.append(entity_by_id[eid])

    # Re-point relations to canonical IDs and deduplicate them by
    # (source_id, target_id, relation_type), keeping the first occurrence.
    unique_relations: dict[tuple, dict] = {}
    for rel in all_relations:
        new_rel = dict(rel)
        if "source_id" in new_rel:
            new_rel["source_id"] = find_canonical(new_rel["source_id"])
        if "target_id" in new_rel:
            new_rel["target_id"] = find_canonical(new_rel["target_id"])

        rel_key = (
            new_rel.get("source_id", ""),
            new_rel.get("target_id", ""),
            new_rel.get("relation_type", ""),
        )
        if rel_key not in unique_relations:
            unique_relations[rel_key] = new_rel

    return {
        "entities": final_entities,
        "relations": list(unique_relations.values()),
        "merge_log": merge_log,
    }
|
||||
@@ -0,0 +1,120 @@
|
||||
"""Tests para merge_graphs."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from merge_graphs import merge_graphs
|
||||
|
||||
|
||||
def test_dos_grafos_con_entity_duplicada_merge():
    """Identical names in two graphs collapse into one entity that keeps every field."""
    g1 = {
        "entities": [{"id": "1", "name": "Alice Corp", "type": "company"}],
        "relations": [],
    }
    g2 = {
        "entities": [{"id": "2", "name": "Alice Corp", "type": "company", "country": "US"}],
        "relations": [],
    }
    result = merge_graphs([g1, g2], similarity_threshold=0.95)
    # Identical names -> similarity 1.0 -> the entities must be merged.
    assert len(result["entities"]) == 1
    assert len(result["merge_log"]) == 1
    merged = result["entities"][0]
    # Check each field on its own: joining these checks with `or` would pass
    # even if "country" were lost in the merge.
    assert merged["name"] == "Alice Corp"
    assert merged["type"] == "company"
    assert merged.get("country") == "US"
    # Entity 2 has more non-None fields, so it is the canonical one.
    assert merged["id"] == "2"
|
||||
|
||||
|
||||
def test_entities_similares_pero_bajo_threshold_no_merge():
    """Entities whose names score below the threshold stay separate."""
    first = {"entities": [{"id": "1", "name": "Alice"}], "relations": []}
    second = {"entities": [{"id": "2", "name": "Bob"}], "relations": []}
    result = merge_graphs([first, second], similarity_threshold=0.85)
    # "Alice" and "Bob" are very different, so nothing is merged.
    entity_count = len(result["entities"])
    merge_count = len(result["merge_log"])
    assert entity_count == 2
    assert merge_count == 0
|
||||
|
||||
|
||||
def test_relaciones_re_apuntadas_correctamente():
    """Relations that reference a merged-away entity are re-pointed to its canonical ID."""
    alice = {"id": "1", "name": "Alice Corp"}
    bob = {"id": "2", "name": "Bob"}
    alice_duplicate = {"id": "3", "name": "Alice Corp"}  # duplicate of id=1
    g1 = {
        "entities": [alice, bob],
        "relations": [{"source_id": "2", "target_id": "1", "relation_type": "works_at"}],
    }
    g2 = {
        "entities": [alice_duplicate],
        "relations": [{"source_id": "3", "target_id": "2", "relation_type": "knows"}],
    }
    result = merge_graphs([g1, g2], similarity_threshold=0.95)
    # Entity 3 is merged into 1, leaving Alice Corp + Bob.
    assert len(result["entities"]) == 2
    # No relation may still reference the merged-away ID "3".
    for relation in result["relations"]:
        assert relation["source_id"] != "3"
        assert relation["target_id"] != "3"
|
||||
|
||||
|
||||
def test_merge_log_registra_cada_merge():
    """Every merge adds a merge_log entry with merged, into and similarity."""
    graphs = [
        {"entities": [{"id": "1", "name": "OpenAI"}], "relations": []},
        {"entities": [{"id": "2", "name": "OpenAI"}], "relations": []},
    ]
    result = merge_graphs(graphs, similarity_threshold=0.9)
    assert len(result["merge_log"]) == 1
    entry = result["merge_log"][0]
    for expected_key in ("merged", "into", "similarity"):
        assert expected_key in entry
    # Identical names give a similarity of exactly 1.0.
    assert entry["similarity"] == 1.0
|
||||
|
||||
|
||||
def test_tres_grafos_merge_transitivo():
    """The same name appearing in three graphs collapses into a single entity."""
    # A~B and B~C -> A, B and C must end up merged into one.
    graphs = [
        {"entities": [{"id": entity_id, "name": "Acme Corp"}], "relations": []}
        for entity_id in ("1", "2", "3")
    ]
    result = merge_graphs(graphs, similarity_threshold=0.9)
    assert len(result["entities"]) == 1
|
||||
|
||||
|
||||
def test_grafos_sin_overlap_concatenacion_simple():
    """Graphs with no similar entities are simply concatenated."""
    knows_1_2 = {"source_id": "1", "target_id": "2", "relation_type": "knows"}
    knows_3_4 = {"source_id": "3", "target_id": "4", "relation_type": "knows"}
    g1 = {
        "entities": [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}],
        "relations": [knows_1_2],
    }
    g2 = {
        "entities": [{"id": "3", "name": "Carol"}, {"id": "4", "name": "Dave"}],
        "relations": [knows_3_4],
    }
    result = merge_graphs([g1, g2], similarity_threshold=0.85)
    # No similar entities -> everything is carried over unchanged.
    sizes = {key: len(result[key]) for key in ("entities", "relations", "merge_log")}
    assert sizes["entities"] == 4
    assert sizes["relations"] == 2
    assert sizes["merge_log"] == 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every test in declaration order when the file is executed directly.
    for _test in (
        test_dos_grafos_con_entity_duplicada_merge,
        test_entities_similares_pero_bajo_threshold_no_merge,
        test_relaciones_re_apuntadas_correctamente,
        test_merge_log_registra_cada_merge,
        test_tres_grafos_merge_transitivo,
        test_grafos_sin_overlap_concatenacion_simple,
    ):
        _test()
    print("All tests passed.")
|
||||
@@ -0,0 +1,44 @@
|
||||
---
|
||||
name: pivot
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def pivot(rows: list[dict], index: str, columns: str, values: str, agg: str = 'sum') -> list[dict]"
|
||||
description: "Pivot table sin pandas. Agrupa por index, expande valores unicos de columns como nuevas columnas y agrega values con la funcion indicada (sum, count, mean, min, max, first, last)."
|
||||
tags: [datascience, tabular, pivot, transform, aggregation, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["collections"]
|
||||
tested: true
|
||||
tests:
|
||||
- "Pivot basico con sum"
|
||||
- "Pivot con count y mean"
|
||||
- "Valores faltantes rellenados con 0"
|
||||
- "Una sola fila"
|
||||
- "Multiples valores por celda requieren agregacion"
|
||||
test_file_path: "python/functions/datascience/pivot_test.py"
|
||||
file_path: "python/functions/datascience/pivot.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
rows = [
|
||||
{"region": "US", "product": "A", "sales": 10},
|
||||
{"region": "US", "product": "B", "sales": 20},
|
||||
{"region": "EU", "product": "A", "sales": 15},
|
||||
]
|
||||
pivot(rows, index="region", columns="product", values="sales")
|
||||
# [{"region": "US", "A": 10, "B": 20}, {"region": "EU", "A": 15, "B": 0}]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin dependencias externas (solo collections.defaultdict de stdlib).
|
||||
Preserva el orden de aparicion de los valores de index y columns.
|
||||
Valores numericos faltantes se rellenan con 0; no numericos con None.
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Pivot table sin pandas para datos tabulares list[dict]."""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def pivot(
    rows: list[dict],
    index: str,
    columns: str,
    values: str,
    agg: str = "sum",
) -> list[dict]:
    """Reshape long-format rows into a wide pivot table.

    Groups by ``index``, turns each distinct value of ``columns`` into a new
    column and aggregates the ``values`` column with the requested function.
    Row and column order follow first appearance in ``rows``. None values are
    ignored by every aggregation.

    Args:
        rows: List of dicts holding the data in long format.
        index: Name of the column that identifies each output row.
        columns: Name of the column whose distinct values become columns.
        values: Name of the column whose values are aggregated.
        agg: Aggregation function: sum, count, mean, min, max, first, last.

    Returns:
        List of dicts with one row per distinct ``index`` value and one column
        per distinct ``columns`` value. Cells with no data are filled with 0
        when ``agg`` is "count" or every collected value is numeric, and with
        None otherwise.

    Raises:
        ValueError: If ``agg`` is not a supported aggregation. This is checked
            up front, even when ``rows`` is empty.
    """
    aggregators = {
        "sum": sum,
        "count": len,
        "mean": lambda vals: sum(vals) / len(vals),
        "min": min,
        "max": max,
        "first": lambda vals: vals[0],
        "last": lambda vals: vals[-1],
    }
    if agg not in aggregators:
        raise ValueError(f"Funcion de agregacion no soportada: {agg}")
    aggregate = aggregators[agg]

    index_order: list = []
    col_values: list = []
    seen_index: set = set()
    seen_cols: set = set()
    # (index value, column value) -> non-None values collected for that cell.
    cells: dict[tuple, list] = defaultdict(list)
    is_numeric = True

    # Single pass: record first-appearance order and collect the cell values.
    for row in rows:
        idx = row.get(index)
        col = row.get(columns)
        val = row.get(values)
        if idx not in seen_index:
            seen_index.add(idx)
            index_order.append(idx)
        if col not in seen_cols:
            seen_cols.add(col)
            col_values.append(col)
        if val is not None:
            cells[(idx, col)].append(val)
            if not isinstance(val, (int, float)):
                is_numeric = False

    # A count of nothing is 0 whatever the value type; otherwise 0 only makes
    # sense as a filler for numeric data.
    fill = 0 if (agg == "count" or is_numeric) else None

    result = []
    for idx in index_order:
        record: dict = {index: idx}
        for col in col_values:
            # .get() avoids creating empty entries in the defaultdict.
            vals = cells.get((idx, col))
            record[col] = aggregate(vals) if vals else fill
        result.append(record)

    return result
|
||||
@@ -0,0 +1,78 @@
|
||||
"""Tests para pivot."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from pivot import pivot
|
||||
|
||||
|
||||
def test_pivot_basico_con_sum():
    """Basic pivot with sum."""
    rows = [
        {"region": region, "product": product, "sales": sales}
        for region, product, sales in (("US", "A", 10), ("US", "B", 20), ("EU", "A", 15))
    ]
    result = pivot(rows, index="region", columns="product", values="sales")
    assert len(result) == 2
    by_region = {record["region"]: record for record in result}
    us, eu = by_region["US"], by_region["EU"]
    assert us["A"] == 10
    assert us["B"] == 20
    assert eu["A"] == 15
    # EU has no product B, so the missing cell is filled with 0.
    assert eu["B"] == 0
|
||||
|
||||
|
||||
def test_pivot_con_count_y_mean():
    """Pivot with count and mean."""
    rows = [
        {"region": "US", "product": "A", "sales": 10},
        {"region": "US", "product": "A", "sales": 20},
        {"region": "EU", "product": "A", "sales": 15},
    ]

    def us_row(agg_name):
        # Pivot with the given aggregation and return the US record.
        table = pivot(rows, index="region", columns="product", values="sales", agg=agg_name)
        return next(record for record in table if record["region"] == "US")

    assert us_row("count")["A"] == 2
    assert us_row("mean")["A"] == 15.0
|
||||
|
||||
|
||||
def test_pivot_valores_faltantes_rellenados_con_0():
    """Missing values are filled with 0."""
    rows = [
        {"region": "US", "product": "A", "sales": 5},
        {"region": "EU", "product": "B", "sales": 8},
    ]
    table = pivot(rows, index="region", columns="product", values="sales")
    by_region = {record["region"]: record for record in table}
    # Each region lacks one of the two products.
    assert by_region["US"]["B"] == 0
    assert by_region["EU"]["A"] == 0
|
||||
|
||||
|
||||
def test_pivot_una_sola_fila():
    """A single row."""
    only_row = {"region": "US", "product": "A", "sales": 42}
    result = pivot([only_row], index="region", columns="product", values="sales")
    assert len(result) == 1
    record = result[0]
    assert record["region"] == "US"
    assert record["A"] == 42
|
||||
|
||||
|
||||
def test_pivot_multiples_valores_por_celda_requieren_agregacion():
    """Several values in one cell require an aggregation."""
    rows = [
        {"region": "US", "product": "A", "sales": 10},
        {"region": "US", "product": "A", "sales": 30},
    ]
    # Same two rows, three aggregations, checked in the original order.
    for agg_name, expected in (("sum", 40), ("min", 10), ("max", 30)):
        table = pivot(rows, index="region", columns="product", values="sales", agg=agg_name)
        assert table[0]["A"] == expected
|
||||
Reference in New Issue
Block a user