fix(infra): gradle_run detecta android-sdk — issue 0076 #2

Open
dataforge wants to merge 538 commits from auto/0076-gradle-sdk-detect into master
62 changed files with 5376 additions and 0 deletions
Showing only changes of commit 63a9cb5273 - Show all commits
@@ -9,6 +9,8 @@ from .cybersecurity import (
levenshtein_distance,
jaccard_similarity,
normalize_url,
envelope_encrypt,
envelope_decrypt,
)
__all__ = [
@@ -22,4 +24,6 @@ __all__ = [
"levenshtein_distance",
"jaccard_similarity",
"normalize_url",
"envelope_encrypt",
"envelope_decrypt",
]
@@ -4,8 +4,11 @@ import hashlib
import math
import re
import base64
import secrets
import struct
from collections import Counter
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
def hash_sha256(data: bytes) -> str:
@@ -165,3 +168,147 @@ def normalize_url(raw_url: str) -> str:
sorted_query = urlencode(sorted(params.items()), doseq=True)
# Drop fragment
return urlunparse((scheme, netloc, path, parsed.params, sorted_query, ""))
# --- Envelope Encryption (AES-256-GCM) ---
_ENVELOPE_MAGIC = b"OVE1"
_ENVELOPE_VERSION = 0x01
_HEADER_SIZE = 12 # magic(4) + version(1) + reserved(1) + efk_len(2) + kiv_len(2) + div_len(2)
def _build_envelope(
encrypted_file_key: bytes,
key_iv: bytes,
data_iv: bytes,
encrypted_content: bytes,
) -> bytes:
"""Construye el formato binario del envelope (helper puro interno).
Header (12 bytes):
Magic (4B): b"OVE1"
Version (1B): 0x01
Reserved (1B): 0x00
EFK_len (2B): longitud de encrypted_file_key (big-endian)
KIV_len (2B): longitud de key_iv (big-endian)
DIV_len (2B): longitud de data_iv (big-endian)
Seguido de: encrypted_file_key + key_iv + data_iv + encrypted_content
"""
header = (
_ENVELOPE_MAGIC
+ struct.pack(">BBHHH", _ENVELOPE_VERSION, 0x00,
len(encrypted_file_key), len(key_iv), len(data_iv))
)
return header + encrypted_file_key + key_iv + data_iv + encrypted_content
def _parse_envelope(ciphertext: bytes) -> tuple:
"""Parsea el envelope binario y retorna sus componentes (helper puro interno).
Returns:
(encrypted_file_key, key_iv, data_iv, encrypted_content)
Raises:
ValueError: si el envelope esta truncado o la version no es soportada.
"""
if len(ciphertext) < _HEADER_SIZE:
raise ValueError(
f"Envelope truncado: se esperaban al menos {_HEADER_SIZE} bytes, "
f"se recibieron {len(ciphertext)}"
)
magic = ciphertext[:4]
if magic != _ENVELOPE_MAGIC:
raise ValueError(f"Magic invalido: se esperaba {_ENVELOPE_MAGIC!r}, se obtuvo {magic!r}")
version, _reserved, efk_len, kiv_len, div_len = struct.unpack(">BBHHH", ciphertext[4:12])
if version != _ENVELOPE_VERSION:
raise ValueError(f"Version de envelope no soportada: {version}")
offset = _HEADER_SIZE
encrypted_file_key = ciphertext[offset : offset + efk_len]
offset += efk_len
key_iv = ciphertext[offset : offset + kiv_len]
offset += kiv_len
data_iv = ciphertext[offset : offset + div_len]
offset += div_len
encrypted_content = ciphertext[offset:]
if (
len(encrypted_file_key) != efk_len
or len(key_iv) != kiv_len
or len(data_iv) != div_len
):
raise ValueError("Envelope truncado: longitudes declaradas exceden los datos disponibles")
return encrypted_file_key, key_iv, data_iv, encrypted_content
def envelope_encrypt(plaintext: bytes, master_key: bytes) -> bytes:
"""Cifra datos usando patron Envelope Encryption con AES-256-GCM.
Genera una file key aleatoria de 32 bytes, cifra los datos con ella,
luego cifra la file key con la master_key. El resultado es un envelope
binario que contiene todo lo necesario para descifrar con la master_key.
Args:
plaintext: Datos a cifrar (puede ser vacio).
master_key: Clave maestra de 32 bytes (AES-256).
Returns:
Envelope binario cifrado.
Raises:
Exception: Si ocurre un error en el cifrado (clave de longitud incorrecta, etc.).
"""
# 1. Generar file_key aleatoria (DEK: Data Encryption Key)
file_key = secrets.token_bytes(32)
# 2. Cifrar contenido con la file_key
data_iv = secrets.token_bytes(12)
aesgcm_data = AESGCM(file_key)
encrypted_content = aesgcm_data.encrypt(data_iv, plaintext, None)
# 3. Cifrar file_key con la master_key (KEK: Key Encryption Key)
key_iv = secrets.token_bytes(12)
aesgcm_key = AESGCM(master_key)
encrypted_file_key = aesgcm_key.encrypt(key_iv, file_key, None)
# 4. Construir envelope
return _build_envelope(encrypted_file_key, key_iv, data_iv, encrypted_content)
def envelope_decrypt(ciphertext: bytes, master_key: bytes) -> bytes:
"""Descifra datos cifrados con envelope_encrypt.
Si los datos no empiezan con el magic b"OVE1", se asume que no estan
cifrados y se retornan tal cual (comportamiento passthrough). Esto
permite usar la funcion en archivos que pueden o no estar cifrados.
Args:
ciphertext: Envelope cifrado (o datos en plano si no tienen magic).
master_key: Clave maestra de 32 bytes (AES-256).
Returns:
Datos descifrados, o ciphertext sin modificar si no tiene magic.
Raises:
ValueError: Si el envelope esta corrupto o truncado.
cryptography.exceptions.InvalidTag: Si la master_key es incorrecta
o los datos fueron manipulados (falla de autenticacion GCM).
"""
# Passthrough: si no comienza con magic, asumir que no esta cifrado
if not ciphertext.startswith(_ENVELOPE_MAGIC):
return ciphertext
# Parsear envelope
encrypted_file_key, key_iv, data_iv, encrypted_content = _parse_envelope(ciphertext)
# Descifrar file_key con master_key
aesgcm_key = AESGCM(master_key)
file_key = aesgcm_key.decrypt(key_iv, encrypted_file_key, None)
# Descifrar contenido con file_key
aesgcm_data = AESGCM(file_key)
return aesgcm_data.decrypt(data_iv, encrypted_content, None)
@@ -0,0 +1,59 @@
---
name: envelope_decrypt
kind: function
lang: py
domain: cybersecurity
version: "1.0.0"
purity: impure
signature: "def envelope_decrypt(ciphertext: bytes, master_key: bytes) -> bytes"
description: "Descifra datos cifrados con envelope_encrypt. Si los datos no comienzan con el magic b'OVE1', los retorna sin modificar (passthrough). Soporta archivos que pueden o no estar cifrados sin necesidad de chequeo previo."
tags: [decryption, aes, gcm, envelope-encryption, dek, kek, cryptography, cybersecurity, passthrough]
uses_functions: [envelope_encrypt_py_cybersecurity]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [cryptography, struct]
tested: true
tests:
- "decrypt de datos cifrados"
- "decrypt de datos no cifrados passthrough"
- "key incorrecta"
- "envelope truncado"
- "magic invalido"
test_file_path: "python/functions/cybersecurity/envelope_encrypt_test.py"
file_path: "python/functions/cybersecurity/cybersecurity.py"
---
## Ejemplo
```python
import secrets
from cybersecurity import envelope_encrypt, envelope_decrypt
master_key = secrets.token_bytes(32)
# Caso 1: descifrar datos cifrados
ciphertext = envelope_encrypt(b"datos secretos", master_key)
plaintext = envelope_decrypt(ciphertext, master_key)
# plaintext == b"datos secretos"
# Caso 2: passthrough — datos no cifrados
raw = b"archivo en plano"
result = envelope_decrypt(raw, master_key)
# result == b"archivo en plano" (sin modificar)
# Caso 3: key incorrecta — lanza InvalidTag
wrong_key = secrets.token_bytes(32)
# envelope_decrypt(ciphertext, wrong_key) → cryptography.exceptions.InvalidTag
```
## Notas
Implementacion original inspirada en OpenViking `openviking/crypto/encryptor.py` (AGPL-3.0). Reimplementada desde cero.
- **Passthrough**: si `ciphertext` no empieza con `b"OVE1"`, se retorna sin modificar. Permite usar la funcion indistintamente en archivos cifrados y no cifrados.
- **Autenticacion GCM**: si la master_key es incorrecta o los datos fueron manipulados, `cryptography.exceptions.InvalidTag` es lanzado por la capa GCM — nunca se retorna texto corrupto.
- **ValueError**: lanzado si el envelope tiene magic correcto pero estructura invalida (truncado o version no soportada).
- `master_key` debe ser de exactamente 32 bytes para AES-256.
- Requiere `cryptography` instalado: `uv add cryptography`.
@@ -0,0 +1,68 @@
---
name: envelope_encrypt
kind: function
lang: py
domain: cybersecurity
version: "1.0.0"
purity: impure
signature: "def envelope_encrypt(plaintext: bytes, master_key: bytes) -> bytes"
description: "Cifra datos usando patron Envelope Encryption con AES-256-GCM. Genera una file key aleatoria (DEK), cifra los datos con ella, luego cifra la file key con la master_key (KEK). Retorna un envelope binario con magic b'OVE1'."
tags: [encryption, aes, gcm, envelope-encryption, dek, kek, cryptography, cybersecurity]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [cryptography, secrets, struct]
tested: true
tests:
- "encrypt → decrypt roundtrip"
- "datos vacios"
- "datos grandes"
- "ciphertext tiene magic correcto"
- "ciphertext es distinto cada vez"
test_file_path: "python/functions/cybersecurity/envelope_encrypt_test.py"
file_path: "python/functions/cybersecurity/cybersecurity.py"
---
## Ejemplo
```python
import secrets
from cybersecurity import envelope_encrypt, envelope_decrypt
master_key = secrets.token_bytes(32) # 256-bit KEK
plaintext = b"datos confidenciales"
ciphertext = envelope_encrypt(plaintext, master_key)
# ciphertext[:4] == b"OVE1"
recovered = envelope_decrypt(ciphertext, master_key)
# recovered == plaintext
```
## Formato del envelope
```
Magic (4B): b"OVE1" identificador de formato
Version (1B): 0x01 version del protocolo
Reserved (1B): 0x00 reservado para uso futuro
EFK_len (2B): big-endian longitud de encrypted_file_key
KIV_len (2B): big-endian longitud de key_iv
DIV_len (2B): big-endian longitud de data_iv
--- header: 12 bytes total ---
Encrypted File Key (variable, incluye GCM auth tag de 16B)
Key IV (12B)
Data IV (12B)
Encrypted Content (variable, incluye GCM auth tag de 16B)
```
## Notas
Implementacion original inspirada en OpenViking `openviking/crypto/encryptor.py` (AGPL-3.0). Reimplementada desde cero.
- La file key (DEK) es de 32 bytes generados con `secrets.token_bytes` (CSPRNG).
- Tanto el cifrado de datos como el de la file key usan AES-256-GCM con IVs de 12 bytes.
- El GCM auth tag (16 bytes) garantiza autenticidad e integridad.
- `master_key` debe ser de exactamente 32 bytes para AES-256.
- Requiere `cryptography` instalado: `uv add cryptography`.
@@ -0,0 +1,101 @@
"""Tests para envelope_encrypt y envelope_decrypt."""
import secrets
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cybersecurity import envelope_encrypt, envelope_decrypt
def test_encrypt_decrypt_roundtrip():
master_key = secrets.token_bytes(32)
plaintext = b"datos de prueba para envelope encryption"
ciphertext = envelope_encrypt(plaintext, master_key)
result = envelope_decrypt(ciphertext, master_key)
assert result == plaintext
def test_datos_vacios():
master_key = secrets.token_bytes(32)
ciphertext = envelope_encrypt(b"", master_key)
result = envelope_decrypt(ciphertext, master_key)
assert result == b""
def test_datos_grandes():
master_key = secrets.token_bytes(32)
plaintext = secrets.token_bytes(1024 * 1024) # 1 MB
ciphertext = envelope_encrypt(plaintext, master_key)
result = envelope_decrypt(ciphertext, master_key)
assert result == plaintext
def test_decrypt_datos_no_cifrados_passthrough():
master_key = secrets.token_bytes(32)
plain = b"archivo no cifrado, sin magic bytes"
result = envelope_decrypt(plain, master_key)
assert result == plain
def test_key_incorrecta():
master_key = secrets.token_bytes(32)
wrong_key = secrets.token_bytes(32)
ciphertext = envelope_encrypt(b"secreto", master_key)
try:
envelope_decrypt(ciphertext, wrong_key)
assert False, "deberia haber lanzado excepcion"
except Exception:
pass # esperado: InvalidTag de cryptography
def test_envelope_truncado():
master_key = secrets.token_bytes(32)
ciphertext = envelope_encrypt(b"datos", master_key)
truncated = ciphertext[:6] # header incompleto
try:
envelope_decrypt(truncated, master_key)
assert False, "deberia haber lanzado ValueError"
except ValueError:
pass
def test_magic_invalido():
master_key = secrets.token_bytes(32)
# Construir datos con magic valido para pasar el check del passthrough
# pero con header corrupto
bad_envelope = b"OVE1" + b"\x00" * 20 # magic correcto pero header invalido
try:
envelope_decrypt(bad_envelope, master_key)
assert False, "deberia haber lanzado excepcion"
except Exception:
pass
def test_ciphertext_tiene_magic_correcto():
master_key = secrets.token_bytes(32)
ciphertext = envelope_encrypt(b"test", master_key)
assert ciphertext[:4] == b"OVE1"
def test_ciphertext_es_distinto_cada_vez():
master_key = secrets.token_bytes(32)
plaintext = b"mismo mensaje"
ct1 = envelope_encrypt(plaintext, master_key)
ct2 = envelope_encrypt(plaintext, master_key)
# IVs aleatorios garantizan ciphertexts distintos
assert ct1 != ct2
if __name__ == "__main__":
test_encrypt_decrypt_roundtrip()
test_datos_vacios()
test_datos_grandes()
test_decrypt_datos_no_cifrados_passthrough()
test_key_incorrecta()
test_envelope_truncado()
test_magic_invalido()
test_ciphertext_tiene_magic_correcto()
test_ciphertext_es_distinto_cada_vez()
print("Todos los tests pasaron.")
@@ -0,0 +1,45 @@
---
name: aggregate_by_group
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def aggregate_by_group(rows: list[dict], group_by: list[str], aggs: dict[str, str]) -> list[dict]"
description: "GROUP BY + agregaciones sobre datos tabulares. aggs es un dict de columna a funcion (sum, mean, count, min, max, first, last, collect). collect acumula valores en lista. None se ignora en agregaciones numericas."
tags: [datascience, tabular, groupby, aggregate, transform, python]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["collections"]
tested: true
tests:
- "Group by una columna con sum"
- "Group by multiples columnas"
- "Agregacion mean count min max"
- "collect acumula en lista"
- "Grupo con una sola fila"
- "Campo con None se ignora en agregaciones numericas"
test_file_path: "python/functions/datascience/aggregate_by_group_test.py"
file_path: "python/functions/datascience/aggregate_by_group.py"
---
## Ejemplo
```python
rows = [
{"dept": "eng", "salary": 100},
{"dept": "eng", "salary": 120},
{"dept": "sales", "salary": 80},
]
aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "mean"})
# [{"dept": "eng", "salary": 110.0}, {"dept": "sales", "salary": 80.0}]
```
## Notas
Funcion pura sin dependencias externas (solo collections.defaultdict de stdlib).
Preserva el orden de primera aparicion de cada grupo.
La funcion 'collect' no filtra None — acumula todos los valores incluyendo None.
@@ -0,0 +1,71 @@
"""GROUP BY + agregaciones sobre datos tabulares list[dict]."""
from collections import defaultdict
def aggregate_by_group(
rows: list[dict],
group_by: list[str],
aggs: dict[str, str],
) -> list[dict]:
"""Agrupa filas por una o varias columnas y aplica agregaciones.
Equivalente a SQL GROUP BY con funciones de agregacion.
La funcion 'collect' acumula todos los valores en una lista.
Los valores None se ignoran en agregaciones numericas (sum, mean, min, max).
Args:
rows: Lista de dicts con los datos.
group_by: Lista de columnas por las que agrupar.
aggs: Dict de {columna: funcion}. Funciones: sum, mean, count,
min, max, first, last, collect.
Returns:
Lista de dicts con las columnas de group_by mas los campos agregados.
El orden de las filas sigue el orden de primera aparicion del grupo.
"""
# Mantener orden de grupos con lista de claves
group_keys: list[tuple] = []
seen_groups: set[tuple] = set()
buckets: dict[tuple, dict[str, list]] = defaultdict(lambda: defaultdict(list))
for row in rows:
gk = tuple(row.get(col) for col in group_by)
if gk not in seen_groups:
seen_groups.add(gk)
group_keys.append(gk)
for col in aggs:
val = row.get(col)
buckets[gk][col].append(val)
def _aggregate(vals: list, func: str):
if func == "collect":
return vals
if func == "count":
return len(vals)
if func == "first":
return vals[0] if vals else None
if func == "last":
return vals[-1] if vals else None
# Para sum, mean, min, max: ignorar None
numeric = [v for v in vals if v is not None]
if not numeric:
return None
if func == "sum":
return sum(numeric)
if func == "mean":
return sum(numeric) / len(numeric)
if func == "min":
return min(numeric)
if func == "max":
return max(numeric)
raise ValueError(f"Funcion de agregacion no soportada: {func}")
result = []
for gk in group_keys:
record: dict = dict(zip(group_by, gk))
for col, func in aggs.items():
record[col] = _aggregate(buckets[gk][col], func)
result.append(record)
return result
@@ -0,0 +1,90 @@
"""Tests para aggregate_by_group."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from aggregate_by_group import aggregate_by_group
def test_group_by_una_columna_con_sum():
"""Group by una columna con sum."""
rows = [
{"dept": "eng", "salary": 100},
{"dept": "eng", "salary": 120},
{"dept": "sales", "salary": 80},
]
result = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "sum"})
assert len(result) == 2
eng = next(r for r in result if r["dept"] == "eng")
sales = next(r for r in result if r["dept"] == "sales")
assert eng["salary"] == 220
assert sales["salary"] == 80
def test_group_by_multiples_columnas():
"""Group by multiples columnas."""
rows = [
{"dept": "eng", "level": "senior", "salary": 150},
{"dept": "eng", "level": "junior", "salary": 80},
{"dept": "eng", "level": "senior", "salary": 160},
{"dept": "sales", "level": "senior", "salary": 120},
]
result = aggregate_by_group(rows, group_by=["dept", "level"], aggs={"salary": "sum"})
assert len(result) == 3
eng_senior = next(r for r in result if r["dept"] == "eng" and r["level"] == "senior")
assert eng_senior["salary"] == 310
def test_agregacion_mean_count_min_max():
"""Agregacion mean count min max."""
rows = [
{"cat": "A", "val": 10},
{"cat": "A", "val": 20},
{"cat": "A", "val": 30},
]
result_mean = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "mean"})
assert result_mean[0]["val"] == 20.0
result_count = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "count"})
assert result_count[0]["val"] == 3
result_min = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "min"})
assert result_min[0]["val"] == 10
result_max = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "max"})
assert result_max[0]["val"] == 30
def test_collect_acumula_en_lista():
"""collect acumula en lista."""
rows = [
{"dept": "eng", "name": "Alice"},
{"dept": "eng", "name": "Bob"},
{"dept": "sales", "name": "Carol"},
]
result = aggregate_by_group(rows, group_by=["dept"], aggs={"name": "collect"})
eng = next(r for r in result if r["dept"] == "eng")
assert sorted(eng["name"]) == ["Alice", "Bob"]
def test_grupo_con_una_sola_fila():
"""Grupo con una sola fila."""
rows = [{"dept": "eng", "salary": 100}]
result = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "sum"})
assert len(result) == 1
assert result[0]["salary"] == 100
def test_campo_con_none_se_ignora_en_agregaciones_numericas():
"""Campo con None se ignora en agregaciones numericas."""
rows = [
{"dept": "eng", "salary": 100},
{"dept": "eng", "salary": None},
{"dept": "eng", "salary": 200},
]
result = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "sum"})
assert result[0]["salary"] == 300
result_mean = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "mean"})
assert result_mean[0]["salary"] == 150.0
@@ -0,0 +1,62 @@
---
name: build_entity_schema_prompt
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def build_entity_schema_prompt(entity_presets: list[dict]) -> str"
description: "Genera la seccion del system prompt que describe los entity types disponibles para extraccion. Formatea los presets del registry en texto legible para el LLM."
tags: [prompt, llm, entity, schema, osint, graph, extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "lista con varios presets"
- "lista vacia retorna string vacio"
- "preset sin metadata_fields"
test_file_path: "python/functions/datascience/build_entity_schema_prompt_test.py"
file_path: "python/functions/datascience/build_entity_schema_prompt.py"
---
## Ejemplo
```python
from build_entity_schema_prompt import build_entity_schema_prompt
presets = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
},
{
"type_ref": "osint_organization_go_cybersecurity",
"label": "Organization",
"metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"],
},
]
prompt = build_entity_schema_prompt(presets)
# Entity types available for extraction:
#
# 1. Person (type_ref: osint_person_go_cybersecurity)
# Attributes: full_name, alias, nationality, dob, risk_score
#
# 2. Organization (type_ref: osint_organization_go_cybersecurity)
# Attributes: legal_name, country, sector, founded, risk_score
```
## Notas
Funcion pura. No requiere dependencias externas.
El formato de salida es deliberadamente sencillo para maximizar la comprension por el LLM: numero de orden, label humano, type_ref del registry y lista de atributos en una sola linea.
Si un preset no tiene `metadata_fields` (o tiene lista vacia), se omite la linea de atributos.
Pensada para componer con `build_relation_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT.
@@ -0,0 +1,43 @@
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
"""Genera texto legible para el LLM describiendo los entity types disponibles.
Formatea los presets del registry en una seccion del system prompt que indica
al LLM que tipos de entidades puede extraer y que atributos tiene cada uno.
Args:
entity_presets: Lista de presets con campos 'label', 'type_ref' y
opcionalmente 'metadata_fields'. Ejemplo:
[{"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias"]}]
Returns:
String formateado con la seccion del prompt. Retorna string vacio si
la lista de presets esta vacia.
"""
if not entity_presets:
return ""
lines = ["Entity types available for extraction:", ""]
for i, preset in enumerate(entity_presets, start=1):
label = preset.get("label", "Unknown")
type_ref = preset.get("type_ref", "")
metadata_fields = preset.get("metadata_fields", [])
lines.append(f"{i}. {label} (type_ref: {type_ref})")
if metadata_fields:
attrs = ", ".join(metadata_fields)
lines.append(f" Attributes: {attrs}")
lines.append("")
# Remove trailing blank line
if lines and lines[-1] == "":
lines.pop()
return "\n".join(lines)
@@ -0,0 +1,41 @@
"""Tests para build_entity_schema_prompt."""
from build_entity_schema_prompt import build_entity_schema_prompt
def test_lista_con_varios_presets():
presets = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
},
{
"type_ref": "osint_organization_go_cybersecurity",
"label": "Organization",
"metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"],
},
]
result = build_entity_schema_prompt(presets)
assert "Entity types available for extraction:" in result
assert "1. Person (type_ref: osint_person_go_cybersecurity)" in result
assert " Attributes: full_name, alias, nationality, dob, risk_score" in result
assert "2. Organization (type_ref: osint_organization_go_cybersecurity)" in result
assert " Attributes: legal_name, country, sector, founded, risk_score" in result
def test_lista_vacia_retorna_string_vacio():
result = build_entity_schema_prompt([])
assert result == ""
def test_preset_sin_metadata_fields():
presets = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
}
]
result = build_entity_schema_prompt(presets)
assert "1. Person (type_ref: osint_person_go_cybersecurity)" in result
assert "Attributes:" not in result
@@ -0,0 +1,43 @@
---
name: build_relation_schema_prompt
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def build_relation_schema_prompt(relation_types: list[str]) -> str"
description: "Genera la seccion del system prompt con los tipos de relacion permitidos para extraccion. Formatea la lista de tipos en texto legible para el LLM."
tags: [prompt, llm, relation, schema, osint, graph, extraction]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "lista con varios tipos"
- "lista vacia retorna string vacio"
- "un solo tipo"
test_file_path: "python/functions/datascience/build_relation_schema_prompt_test.py"
file_path: "python/functions/datascience/build_relation_schema_prompt.py"
---
## Ejemplo
```python
from build_relation_schema_prompt import build_relation_schema_prompt
types = ["funds", "employs", "communicates_with", "owns"]
prompt = build_relation_schema_prompt(types)
# Allowed relation types:
# funds, employs, communicates_with, owns
```
## Notas
Funcion pura. No requiere dependencias externas.
La salida es una sola linea con todos los tipos separados por coma, precedida por el encabezado. El formato es minimal para no consumir tokens innecesarios del contexto del LLM.
Pensada para componer con `build_entity_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT.
@@ -0,0 +1,22 @@
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
def build_relation_schema_prompt(relation_types: list[str]) -> str:
"""Genera texto legible para el LLM describiendo los tipos de relacion permitidos.
Formatea la lista de tipos de relacion en una seccion del system prompt que
indica al LLM que relaciones puede extraer entre entidades.
Args:
relation_types: Lista de strings con los tipos de relacion permitidos.
Ejemplo: ["funds", "employs", "communicates_with"]
Returns:
String formateado con la seccion del prompt. Retorna string vacio si
la lista esta vacia.
"""
if not relation_types:
return ""
joined = ", ".join(relation_types)
return f"Allowed relation types:\n{joined}"
@@ -0,0 +1,19 @@
"""Tests para build_relation_schema_prompt."""
from build_relation_schema_prompt import build_relation_schema_prompt
def test_lista_normal():
relation_types = ["funds", "employs", "communicates_with", "owns", "operates"]
result = build_relation_schema_prompt(relation_types)
assert result.startswith("Allowed relation types:")
assert "funds" in result
assert "employs" in result
assert "communicates_with" in result
assert "owns" in result
assert "operates" in result
def test_lista_vacia_retorna_string_vacio():
result = build_relation_schema_prompt([])
assert result == ""
@@ -121,3 +121,72 @@ def linspace(start: float, stop: float, num: int) -> list:
return [start]
step = (stop - start) / (num - 1)
return [start + i * step for i in range(num)]
def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict:
"""Estima parámetros de un proceso Hawkes desde autocorrelación de arrivals.
Ajusta exponencial a*exp(-b*lag) sobre la ACF.
Retorna dict con alpha, beta, branching_ratio, acf.
"""
import numpy as np
from scipy.optimize import curve_fit
arr = np.array(arrivals, dtype=float)
mean_a = np.mean(arr)
var_a = np.var(arr)
if var_a == 0:
return {'alpha': 0.0, 'beta': 1.0, 'branching_ratio': 0.0, 'acf': [1.0]}
acf = [1.0] + [
float(np.mean((arr[lag:] - mean_a) * (arr[:-lag] - mean_a)) / var_a)
for lag in range(1, max_lag)
]
lags = np.arange(1, max_lag)
acf_vals = np.array(acf[1:])
if acf_vals[0] <= 0.01:
return {'alpha': 0.0, 'beta': 1.0, 'branching_ratio': 0.0, 'acf': acf}
exp_decay = lambda x, a, b: a * np.exp(-b * x)
try:
popt, _ = curve_fit(exp_decay, lags, acf_vals, p0=[0.5, 0.5], maxfev=5000)
alpha_est, beta_est = abs(popt[0]), abs(popt[1])
except RuntimeError:
alpha_est, beta_est = 0.0, 1.0
branching = alpha_est / beta_est if beta_est > 0 else 0.0
return {
'alpha': round(alpha_est, 4),
'beta': round(beta_est, 4),
'branching_ratio': round(branching, 4),
'acf': acf,
}
def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict:
"""Estima el exponente alpha de una distribución Pareto via MLE.
α = n / Σ ln(xi / x_min) donde x_min es el percentil indicado.
Alpha bajo = cola más pesada = más valores extremos.
"""
import numpy as np
arr = np.array([v for v in values if v > 0], dtype=float)
if len(arr) < 10:
return {'alpha': 0.0, 'x_min': 0.0, 'n_tail': 0}
x_min = float(np.percentile(arr, x_min_percentile))
tail = arr[arr >= x_min]
if len(tail) < 2 or x_min <= 0:
return {'alpha': 0.0, 'x_min': x_min, 'n_tail': len(tail)}
alpha = float(len(tail) / np.sum(np.log(tail / x_min)))
return {
'alpha': round(alpha, 4),
'x_min': round(x_min, 6),
'n_tail': len(tail),
}
@@ -0,0 +1,94 @@
---
name: deduplicate_entities
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def deduplicate_entities(candidates: list[EntityCandidate], name_threshold: float = 0.85, same_type_only: bool = True) -> DeduplicationResult"
description: "Agrupa entidades candidatas que refieren a la misma entidad real usando fuzzy matching de nombres (Levenshtein + Jaccard) y Union-Find para clusters transitivos. Retorna entidades mergeadas con mapas de resolucion de IDs y log de merges."
tags: [deduplication, entity, fuzzy, levenshtein, jaccard, union-find, knowledge-graph, nlp, fuzzygraph, datascience]
uses_functions:
- normalize_entity_name_py_core
- merge_entity_attributes_py_core
uses_types:
- entity_candidate_py_datascience
- deduplication_result_py_datascience
returns: [deduplication_result_py_datascience]
returns_optional: false
error_type: ""
imports:
- uuid
tested: true
tests:
- "John Smith y Smith, John se mergean"
- "Google y Google LLC se mergean"
- "192.168.1.1 y 192.168.1.1 se mergean por matching exacto"
- "John Smith (person) y John Smith (organization) NO se mergean"
- "Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster"
- "Entidades sin duplicados pasan sin modificacion"
- "Confidence toma el max del cluster; atributos se fusionan"
- "Lista vacia retorna resultado vacio"
- "name_to_id contiene todos los nombres originales del cluster"
test_file_path: "python/functions/datascience/deduplicate_entities_test.py"
file_path: "python/functions/datascience/deduplicate_entities.py"
---
## Ejemplo
```python
from python.types.datascience.entity_candidate import EntityCandidate
from python.functions.datascience.deduplicate_entities import deduplicate_entities
candidates = [
EntityCandidate(name="John Smith", type_ref="person", confidence=0.9),
EntityCandidate(name="Smith, John", type_ref="person", confidence=0.85),
EntityCandidate(name="Google", type_ref="organization", confidence=0.95),
EntityCandidate(name="Google LLC", type_ref="organization", confidence=0.88),
]
result = deduplicate_entities(candidates, name_threshold=0.85, same_type_only=True)
# result.total_before = 4
# result.total_after = 2
# result.merge_log = [
# {"canonical": "John Smith", "merged": ["Smith, John"], "score": 0.91, "reason": "fuzzy_name"},
# {"canonical": "Google", "merged": ["Google LLC"], "score": 0.89, "reason": "fuzzy_name"},
# ]
```
## Algoritmo
1. **Normalizar nombres** usando `normalize_entity_name()` sobre cada candidato segun su `type_ref`
2. **Comparacion pairwise** dentro del mismo tipo (si `same_type_only=True`):
- Para tipos tecnicos (ip, email, domain, crypto_wallet, phone): matching exacto normalizado
- Para el resto: `score = max(levenshtein_sim, jaccard_sim)` + bonus por contencion (+0.3) y acronimos (+0.3)
3. **Union-Find** para clusters transitivos: si A~B y B~C, entonces {A, B, C} forman un cluster
4. **Merge por cluster:**
- Nombre canonico: candidato con mayor `confidence`
- Atributos: `merge_entity_attributes()` sobre todos los candidatos del cluster
- Confidence: `max` del cluster
- Source chunks: union de todos los candidatos
- `merged_from`: union de todos los nombres originales
## Heuristicas de similitud de nombres
| Heuristica | Efecto |
|---|---|
| Levenshtein | `1 - (edit_distance / max_len)` |
| Jaccard sobre tokens | `\|A ∩ B\| / \|A B\|` |
| Score base | `max(lev_sim, jaccard_sim)` |
| Contencion (a in b o b in a) | `+0.3` hasta max 1.0 |
| Acronimo ("FBI" ~ "Federal Bureau of Investigation") | `+0.3` hasta max 1.0 |
| Tipos exactos (ip/email/domain) | solo matching exacto, ignora umbral |
## Complejidad
- Pairwise: O(N^2) — aceptable para <1000 entidades (tipico por documento)
- Union-Find con path compression: O(α(N)) amortizado por operacion
- Para escalar a >1000: pre-filtrar por primera letra o n-gram index antes de comparar
## Notas
Funcion pura. Implementa Levenshtein y Jaccard internamente para evitar dependencias externas a este modulo. Las funciones del registry `levenshtein_distance_py_cybersecurity` y `jaccard_similarity_py_cybersecurity` son equivalentes pero requieren imports adicionales — la implementacion inline mantiene la funcion sin dependencias de stdlib.
El `name_to_id` del resultado es el mapa de resolucion principal para la fase de deduplicacion de relaciones: permite resolver cualquier variante de nombre de una entidad a su ID canonico.
@@ -0,0 +1,283 @@
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
from __future__ import annotations
import sys
import os
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.deduplication_result import DeduplicationResult
from python.functions.core.normalize_entity_name import normalize_entity_name
from python.functions.core.merge_entity_attributes import merge_entity_attributes
# ── Similitud helpers ──────────────────────────────────────────────────────────
def _levenshtein(a: str, b: str) -> int:
"""Distancia de edicion Levenshtein entre dos strings."""
if a == b:
return 0
if not a:
return len(b)
if not b:
return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
curr = [i]
for j, cb in enumerate(b, 1):
cost = 0 if ca == cb else 1
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
prev = curr
return prev[-1]
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
"""Similitud de Jaccard entre dos conjuntos de tokens."""
set_a = set(tokens_a)
set_b = set(tokens_b)
if not set_a and not set_b:
return 1.0
inter = len(set_a & set_b)
union = len(set_a | set_b)
return inter / union if union else 0.0
def _name_similarity(a: str, b: str) -> float:
"""Score de similitud entre dos nombres normalizados.
Combina similitud de Levenshtein y Jaccard sobre tokens.
Aplica bonus de contencion (+0.3) y deteccion de acronimos.
"""
if a == b:
return 1.0
# Similitud Levenshtein
max_len = max(len(a), len(b))
lev_sim = 1.0 - (_levenshtein(a, b) / max_len) if max_len else 1.0
# Similitud Jaccard sobre tokens
tokens_a = a.split()
tokens_b = b.split()
jac_sim = _jaccard(tokens_a, tokens_b)
score = max(lev_sim, jac_sim)
# Bonus de contencion: un nombre contiene al otro
if a in b or b in a:
score = min(1.0, score + 0.3)
# Deteccion de acronimo: "FBI" ~ "Federal Bureau of Investigation"
if _is_acronym_of(a, tokens_b) or _is_acronym_of(b, tokens_a):
score = min(1.0, score + 0.3)
return score
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
"""Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
if not candidate or not tokens:
return False
initials = "".join(t[0] for t in tokens if t).upper()
return candidate.upper() == initials
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
def _is_exact_type(entity_type: str) -> bool:
"""Tipos tecnicos donde solo se acepta matching exacto."""
return entity_type.lower() in _EXACT_TYPES
# ── Union-Find ─────────────────────────────────────────────────────────────────
class _UnionFind:
def __init__(self, n: int) -> None:
self._parent = list(range(n))
self._rank = [0] * n
def find(self, x: int) -> int:
while self._parent[x] != x:
self._parent[x] = self._parent[self._parent[x]]
x = self._parent[x]
return x
def union(self, x: int, y: int) -> None:
rx, ry = self.find(x), self.find(y)
if rx == ry:
return
if self._rank[rx] < self._rank[ry]:
rx, ry = ry, rx
self._parent[ry] = rx
if self._rank[rx] == self._rank[ry]:
self._rank[rx] += 1
# ── Implementacion principal ────────────────────────────────────────────────────
def deduplicate_entities(
candidates: list[EntityCandidate],
name_threshold: float = 0.85,
same_type_only: bool = True,
) -> DeduplicationResult:
"""Agrupa entidades candidatas que refieren a la misma entidad real.
Usa fuzzy matching de nombres (Levenshtein + Jaccard) y Union-Find para
detectar clusters transitivos. Por cada cluster genera una entidad canonica
mergeando atributos de todos sus miembros.
Para tipos tecnicos (ip, email, domain, crypto_wallet, phone) solo se
acepta matching exacto normalizado, ignorando el umbral de nombre.
Args:
candidates: lista de EntityCandidate a deduplicar.
name_threshold: score minimo para considerar dos nombres iguales (0-1).
same_type_only: si True, solo compara entidades del mismo type_ref.
Returns:
DeduplicationResult con entidades deduplicadas, mapas de resolucion
e historial de merges.
"""
if not candidates:
return DeduplicationResult(
entities=[],
entity_id_map={},
name_to_id={},
merge_log=[],
total_before=0,
total_after=0,
)
n = len(candidates)
# Paso 1: normalizar nombres
normalized: list[str] = []
for c in candidates:
norm = normalize_entity_name(c.name, c.type_ref)
normalized.append(norm)
# Paso 2: Union-Find sobre todos los indices
uf = _UnionFind(n)
# Paso 3: comparacion pairwise (con agrupacion por tipo si same_type_only)
merge_pairs: list[tuple[int, int, float]] = []
for i in range(n):
for j in range(i + 1, n):
if same_type_only and candidates[i].type_ref != candidates[j].type_ref:
continue
ni, nj = normalized[i], normalized[j]
et = candidates[i].type_ref.lower()
if _is_exact_type(et):
if ni == nj:
uf.union(i, j)
merge_pairs.append((i, j, 1.0))
continue
score = _name_similarity(ni, nj)
if score >= name_threshold:
uf.union(i, j)
merge_pairs.append((i, j, score))
# Paso 4: agrupar indices por raiz del Union-Find
clusters: dict[int, list[int]] = {}
for i in range(n):
root = uf.find(i)
clusters.setdefault(root, []).append(i)
# Paso 5: merge por cluster
merged_entities: list[EntityCandidate] = []
entity_id_map: dict[str, str] = {}
name_to_id: dict[str, str] = {}
merge_log: list[dict] = []
# Pares mergeados para construir el log
merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
for i, j, score in merge_pairs:
root = uf.find(i)
merged_pairs_by_root.setdefault(root, []).append((i, j, score))
for root, indices in clusters.items():
cluster_candidates = [candidates[idx] for idx in indices]
if len(cluster_candidates) == 1:
c = cluster_candidates[0]
canonical_name = c.name
canonical_norm = normalized[indices[0]]
merged_attrs = c.attributes
merged_confidence = c.confidence
merged_chunks = list(c.source_chunk_indices)
merged_from = list(c.merged_from) if c.merged_from else [c.name]
else:
# Candidato con mayor confidence es el canonico
best = max(cluster_candidates, key=lambda c: c.confidence)
canonical_name = best.name
canonical_norm = normalize_entity_name(best.name, best.type_ref)
merged_attrs = merge_entity_attributes(
[c.attributes for c in cluster_candidates]
)
merged_confidence = max(c.confidence for c in cluster_candidates)
merged_chunks: list[int] = []
seen_chunks: set[int] = set()
for c in cluster_candidates:
for idx in c.source_chunk_indices:
if idx not in seen_chunks:
merged_chunks.append(idx)
seen_chunks.add(idx)
merged_from: list[str] = []
seen_names: set[str] = set()
for c in cluster_candidates:
names_to_add = c.merged_from if c.merged_from else [c.name]
for nm in names_to_add:
if nm not in seen_names:
merged_from.append(nm)
seen_names.add(nm)
# Log de merge
other_names = [c.name for c in cluster_candidates if c is not best]
pairs = merged_pairs_by_root.get(root, [])
max_score = max((s for _, _, s in pairs), default=1.0)
merge_log.append(
{
"canonical": canonical_name,
"merged": other_names,
"score": round(max_score, 4),
"reason": "fuzzy_name",
}
)
ent_id = str(uuid.uuid4())
entity = EntityCandidate(
name=canonical_name,
name_normalized=canonical_norm,
type_ref=cluster_candidates[0].type_ref,
type_label=cluster_candidates[0].type_label,
attributes=merged_attrs,
confidence=merged_confidence,
source_chunk_indices=merged_chunks,
merged_from=merged_from,
)
merged_entities.append(entity)
# Poblar mapas de resolucion
entity_id_map[canonical_norm] = ent_id
for orig_name in merged_from:
name_to_id[orig_name] = ent_id
name_to_id[canonical_norm] = ent_id
return DeduplicationResult(
entities=merged_entities,
entity_id_map=entity_id_map,
name_to_id=name_to_id,
merge_log=merge_log,
total_before=n,
total_after=len(merged_entities),
)
@@ -0,0 +1,113 @@
"""Tests para deduplicate_entities."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from python.types.datascience.entity_candidate import EntityCandidate
from python.functions.datascience.deduplicate_entities import deduplicate_entities
def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
return EntityCandidate(
name=name,
type_ref=type_ref,
type_label=type_ref.capitalize(),
attributes=attrs,
confidence=confidence,
source_chunk_indices=[0],
)
def test_john_smith_y_smith_john_merge():
"""John Smith y Smith, John se mergean."""
a = _make("John Smith", type_ref="person")
b = _make("Smith, John", type_ref="person")
result = deduplicate_entities([a, b])
assert result.total_before == 2
assert result.total_after == 1
assert len(result.entities) == 1
assert len(result.merge_log) == 1
def test_google_y_google_llc_merge():
"""Google y Google LLC se mergean."""
a = _make("Google", type_ref="organization")
b = _make("Google LLC", type_ref="organization")
result = deduplicate_entities([a, b])
assert result.total_after == 1
assert len(result.entities) == 1
def test_ip_matching_exacto():
"""192.168.1.1 y 192.168.1.1 se mergean por matching exacto."""
a = _make("192.168.1.1", type_ref="ip", confidence=0.8)
b = _make("192.168.1.1", type_ref="ip", confidence=0.9)
result = deduplicate_entities([a, b])
assert result.total_after == 1
def test_same_name_different_type_no_merge():
"""John Smith (person) y John Smith (organization) NO se mergean."""
a = _make("John Smith", type_ref="person")
b = _make("John Smith", type_ref="organization")
result = deduplicate_entities([a, b], same_type_only=True)
assert result.total_after == 2
def test_clusters_transitivos():
"""Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster."""
a = _make("Alice Johnson", type_ref="person")
b = _make("Alice Johnso", type_ref="person") # muy similar a A
c = _make("Alice Johns", type_ref="person") # muy similar a B
result = deduplicate_entities([a, b, c], name_threshold=0.80)
assert result.total_after == 1
def test_sin_duplicados_sin_cambios():
"""Entidades sin duplicados pasan sin modificacion."""
a = _make("Alice Smith", type_ref="person")
b = _make("Bob Jones", type_ref="person")
c = _make("Charlie Brown", type_ref="person")
result = deduplicate_entities([a, b, c])
assert result.total_before == 3
assert result.total_after == 3
assert len(result.merge_log) == 0
def test_confidence_y_atributos_merge_correctos():
"""Confidence toma el max del cluster; atributos se fusionan."""
a = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
b = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")
result = deduplicate_entities([a, b])
assert result.total_after == 1
entity = result.entities[0]
# confidence = max(0.7, 0.95)
assert entity.confidence == 0.95
# atributos de ambos candidatos presentes
assert "role" in entity.attributes
assert "company" in entity.attributes
def test_lista_vacia():
"""Lista vacia retorna resultado vacio."""
result = deduplicate_entities([])
assert result.total_before == 0
assert result.total_after == 0
assert result.entities == []
assert result.merge_log == []
def test_name_to_id_resolucion():
"""name_to_id contiene todos los nombres originales del cluster."""
a = _make("John Smith", type_ref="person")
b = _make("Smith, John", type_ref="person")
result = deduplicate_entities([a, b])
# Ambos nombres deben apuntar al mismo ID
ids = list(result.entity_id_map.values())
assert len(ids) == 1
ent_id = ids[0]
# name_to_id debe tener entradas para los nombres originales
assert any(v == ent_id for v in result.name_to_id.values())
assert len(result.name_to_id) >= 2
@@ -0,0 +1,81 @@
---
name: deduplicate_relations
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def deduplicate_relations(relations: list[RelationCandidate], entity_id_map: dict[str, str]) -> list[RelationCandidate]"
description: "Deduplica relaciones candidatas resolviendo from_name/to_name a entity IDs finales via entity_id_map. Descarta self-loops y relaciones sin match. Mergea duplicados (mismo from_id, to_id, relation_type) concatenando descripciones unicas y tomando max confidence."
tags: [datascience, extraction, knowledge-graph, nlp, deduplication, fuzzy-match, fuzzygraph]
uses_functions:
- levenshtein_distance_py_cybersecurity
uses_types:
- relation_candidate_py_datascience
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "dos relaciones identicas se colapsan en una"
- "relacion con nombre mergeado se resuelve al id correcto"
- "self loop se descarta"
- "nombre no mapeado sin fuzzy match se descarta"
- "relaciones distintas se mantienen"
- "merge descripcion concatena unicas"
- "lista vacia retorna lista vacia"
- "fuzzy match resuelve nombre cercano"
test_file_path: "python/functions/datascience/deduplicate_relations_test.py"
file_path: "python/functions/datascience/deduplicate_relations.py"
---
## Ejemplo
```python
from python.types.datascience.relation_candidate import RelationCandidate
from python.functions.datascience.deduplicate_relations import deduplicate_relations
# entity_id_map producido por deduplicate_entities
entity_id_map = {
"john smith": "entity_001",
"smith, john": "entity_001", # alias mergeado
"acme corp": "entity_002",
}
relations = [
RelationCandidate(from_name="John Smith", to_name="Acme Corp",
relation_type="works_at", description="John es CEO",
confidence=0.9, source_chunk_index=0),
RelationCandidate(from_name="Smith, John", to_name="Acme Corp",
relation_type="works_at", description="CEO de Acme",
confidence=0.7, source_chunk_index=2),
]
result = deduplicate_relations(relations, entity_id_map)
# → 1 RelationCandidate con from_id="entity_001", to_id="entity_002",
# confidence=0.9, description="John es CEO; CEO de Acme"
```
## Notas
La funcion es pura: no hace I/O, no tiene efectos secundarios. El logging es
de nivel DEBUG/WARNING — en produccion configurar el logger de la aplicacion.
**Resolucion de nombres:**
- Lookup exacto primero (lowercase strip del nombre contra las claves del mapa).
- Si no hay match exacto, fuzzy match con Levenshtein (threshold=3 ediciones).
- Si sigue sin match, la relacion se descarta con `logger.warning`.
**Self-loops:** relaciones donde `from_id == to_id` siempre se descartan.
**Merge:** cuando varias relaciones comparten `(from_id, to_id, relation_type)`:
- `confidence`: max del grupo.
- `description`: union de descripciones unicas (no duplicadas), separadas por `'; '`.
- `from_name` / `to_name` / `source_chunk_index`: del primer candidato del grupo.
**Integracion con fuzzygraph:**
Esta funcion es el paso 4 del pipeline de extraccion. Recibe el output de
`extract_relations_llm` (relaciones crudas con nombres de texto) y el
`entity_id_map` producido por `deduplicate_entities`. Produce la lista final
de relaciones para `ExtractionResult`.
@@ -0,0 +1,189 @@
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
import logging
import os
import sys
logger = logging.getLogger(__name__)
# --- Importar levenshtein_distance desde cybersecurity ---
# Soporta dos contextos:
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
# 2. Ejecutado desde la raiz del registry (fn run)
def _levenshtein_distance(a: str, b: str) -> int:
"""Calcula la distancia de edicion de Levenshtein entre dos strings."""
if len(a) < len(b):
return _levenshtein_distance(b, a)
if len(b) == 0:
return len(a)
prev_row = list(range(len(b) + 1))
for i, ca in enumerate(a):
curr_row = [i + 1]
for j, cb in enumerate(b):
cost = 0 if ca == cb else 1
curr_row.append(
min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
)
prev_row = curr_row
return prev_row[-1]
try:
_here = os.path.dirname(os.path.abspath(__file__))
_cyber_path = os.path.join(_here, "..", "cybersecurity")
if _cyber_path not in sys.path:
sys.path.insert(0, _cyber_path)
from cybersecurity import levenshtein_distance as _lev
except ImportError:
_lev = None # type: ignore
levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
"""Intenta resolver un nombre contra las claves del mapa por fuzzy match.
Recorre todas las claves de entity_id_map y busca la mas cercana segun
distancia de Levenshtein. Retorna el entity_id si la distancia es <=
threshold, o '' si no hay match aceptable.
Args:
name: nombre a resolver (ya en lowercase strip).
entity_id_map: mapa nombre_normalizado -> entity_id.
threshold: distancia maxima de edicion para considerar match (default 3).
Returns:
entity_id del mejor match o '' si no hay match.
"""
best_id = ""
best_dist = threshold + 1
for key, entity_id in entity_id_map.items():
dist = levenshtein_distance(name, key)
if dist < best_dist:
best_dist = dist
best_id = entity_id
return best_id if best_dist <= threshold else ""
def deduplicate_relations(
relations: list,
entity_id_map: dict[str, str],
) -> list:
"""Deduplica relaciones candidatas resolviendo nombres a IDs de entidad finales.
Algoritmo:
1. Para cada RelationCandidate, intentar resolver from_name y to_name al
entity_id via entity_id_map (lookup exacto primero, ignorando mayusculas).
Si no hay match exacto, intentar fuzzy match con levenshtein_distance.
Si sigue sin match, descartar la relacion con warning.
2. Descartar self-loops (from_id == to_id).
3. Deduplicar por (from_id, to_id, relation_type):
- description: concatenar descripciones unicas separadas por '; '
- confidence: max del grupo
4. Retornar lista limpia de RelationCandidate con from_id y to_id resueltos.
Args:
relations: lista de RelationCandidate con from_name/to_name originales.
entity_id_map: mapa nombre_normalizado -> entity_id (output de
deduplicate_entities). Permite resolver nombres que fueron mergeados.
Returns:
Lista deduplicada de RelationCandidate con from_id y to_id resueltos.
"""
# Importar tipo — funciona tanto desde datascience/ como desde raiz del registry
try:
_types_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..", "..", "..", "python", "types", "datascience",
)
if _types_path not in sys.path:
sys.path.insert(0, _types_path)
from relation_candidate import RelationCandidate
except ImportError:
from python.types.datascience.relation_candidate import RelationCandidate # type: ignore
resolved: list = []
for rel in relations:
# --- Resolver from_name ---
from_key = rel.from_name.lower().strip()
from_id = entity_id_map.get(from_key, "")
if not from_id:
from_id = _fuzzy_resolve(from_key, entity_id_map)
if not from_id:
logger.warning(
"deduplicate_relations: no se pudo resolver from_name=%r — descartando",
rel.from_name,
)
continue
# --- Resolver to_name ---
to_key = rel.to_name.lower().strip()
to_id = entity_id_map.get(to_key, "")
if not to_id:
to_id = _fuzzy_resolve(to_key, entity_id_map)
if not to_id:
logger.warning(
"deduplicate_relations: no se pudo resolver to_name=%r — descartando",
rel.to_name,
)
continue
# --- Descartar self-loops ---
if from_id == to_id:
logger.debug(
"deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
rel.from_name,
rel.to_name,
rel.relation_type,
)
continue
resolved.append(
RelationCandidate(
from_name=rel.from_name,
to_name=rel.to_name,
from_id=from_id,
to_id=to_id,
relation_type=rel.relation_type,
description=rel.description,
confidence=rel.confidence,
source_chunk_index=rel.source_chunk_index,
)
)
# --- Deduplicar por (from_id, to_id, relation_type) ---
groups: dict[tuple, list] = {}
for rel in resolved:
key = (rel.from_id, rel.to_id, rel.relation_type)
groups.setdefault(key, []).append(rel)
result: list = []
for (from_id, to_id, rel_type), group in groups.items():
if len(group) == 1:
result.append(group[0])
continue
# Mergear: max confidence + union de descripciones unicas
best_confidence = max(r.confidence for r in group)
seen_desc: set[str] = set()
descriptions: list[str] = []
for r in group:
if r.description and r.description not in seen_desc:
descriptions.append(r.description)
seen_desc.add(r.description)
result.append(
RelationCandidate(
from_name=group[0].from_name,
to_name=group[0].to_name,
from_id=from_id,
to_id=to_id,
relation_type=rel_type,
description="; ".join(descriptions),
confidence=best_confidence,
source_chunk_index=group[0].source_chunk_index,
)
)
return result
@@ -0,0 +1,120 @@
"""Tests para deduplicate_relations."""
import os
import sys
# Permitir importar RelationCandidate desde python/types/datascience/
_here = os.path.dirname(os.path.abspath(__file__))
_types_path = os.path.join(_here, "..", "..", "..", "python", "types", "datascience")
if _types_path not in sys.path:
sys.path.insert(0, _types_path)
from relation_candidate import RelationCandidate
from deduplicate_relations import deduplicate_relations
def _make_rel(
from_name: str,
to_name: str,
relation_type: str = "works_at",
description: str = "",
confidence: float = 0.8,
source_chunk_index: int = 0,
) -> RelationCandidate:
return RelationCandidate(
from_name=from_name,
to_name=to_name,
relation_type=relation_type,
description=description,
confidence=confidence,
source_chunk_index=source_chunk_index,
)
# entity_id_map tipico: claves en lowercase normalizado
_ENTITY_MAP: dict[str, str] = {
"john smith": "entity_001",
"acme corp": "entity_002",
"jane doe": "entity_003",
"google": "entity_004",
}
def test_dos_relaciones_identicas_se_colapsan_en_una():
"""2 relaciones identicas (from, to, type) → 1."""
rels = [
_make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9),
_make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.7),
]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 1
assert result[0].from_id == "entity_001"
assert result[0].to_id == "entity_002"
assert result[0].confidence == 0.9 # max
def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto():
"""Relacion con nombre mergeado → se resuelve al ID correcto."""
# entity_id_map incluye "smith, john" como alias de entity_001
merged_map = {**_ENTITY_MAP, "smith, john": "entity_001"}
rels = [_make_rel("Smith, John", "Acme Corp")]
result = deduplicate_relations(rels, merged_map)
assert len(result) == 1
assert result[0].from_id == "entity_001"
assert result[0].to_id == "entity_002"
def test_self_loop_se_descarta():
"""Self-loop (from_id == to_id) → descartado."""
rels = [_make_rel("John Smith", "John Smith", relation_type="knows")]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 0
def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta():
"""Relacion con nombre no mapeado y sin fuzzy match → descartada."""
rels = [_make_rel("Unknown Entity XYZ", "Acme Corp")]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 0
def test_relaciones_distintas_se_mantienen():
"""Relaciones con (from, to, type) distintos → todas se mantienen."""
rels = [
_make_rel("John Smith", "Acme Corp", relation_type="works_at"),
_make_rel("Jane Doe", "Acme Corp", relation_type="works_at"),
_make_rel("John Smith", "Google", relation_type="invested_in"),
]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 3
def test_merge_descripcion_concatena_unicas():
"""Merge de relaciones: descripciones unicas se concatenan."""
rels = [
_make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9),
_make_rel("John Smith", "Acme Corp", description="Acme fue fundada por John", confidence=0.7),
_make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.6),
]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 1
assert "John es CEO" in result[0].description
assert "Acme fue fundada por John" in result[0].description
# La descripcion duplicada ("John es CEO") no aparece dos veces
assert result[0].description.count("John es CEO") == 1
assert result[0].confidence == 0.9
def test_lista_vacia_retorna_lista_vacia():
"""Lista vacia de relaciones → lista vacia."""
result = deduplicate_relations([], _ENTITY_MAP)
assert result == []
def test_fuzzy_match_resuelve_nombre_cercano():
"""Nombre con typo pequeño → fuzzy match lo resuelve."""
# "john smit" tiene distancia 1 de "john smith"
rels = [_make_rel("John Smit", "Acme Corp")]
result = deduplicate_relations(rels, _ENTITY_MAP)
assert len(result) == 1
assert result[0].from_id == "entity_001"
@@ -0,0 +1,56 @@
---
name: detect_drift
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def detect_drift(history: list[dict], current: dict, fields: list[str], threshold: float = 2.0) -> list[dict]"
description: "Detecta drift estadistico comparando metricas de la ejecucion actual contra el historial usando z-score. Si |z| > threshold, el campo ha drifteado. Util para monitorizar executions en operations.db."
tags: [drift, statistics, z-score, monitoring, executions, operations, datascience]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [math]
tested: true
tests:
- "campo con drift claro (z > threshold)"
- "campo estable (z < threshold)"
- "historial con un solo punto → std=0, no puede calcular → drifted=False con nota"
- "historial vacio → todos drifted=False"
- "threshold custom"
test_file_path: "python/functions/datascience/detect_drift_test.py"
file_path: "python/functions/datascience/detect_drift.py"
---
## Ejemplo
```python
history = [
{"records_out": 100, "duration_ms": 500},
{"records_out": 105, "duration_ms": 480},
{"records_out": 98, "duration_ms": 510},
]
current = {"records_out": 50, "duration_ms": 2000}
results = detect_drift(history, current, ["records_out", "duration_ms"])
# [
# {"field": "records_out", "current": 50, "mean": 101.0, "std": 3.6, "z_score": -14.2, "drifted": True},
# {"field": "duration_ms", "current": 2000, "mean": 496.7, "std": 15.3, "z_score": 98.3, "drifted": True},
# ]
```
## Notas
Funcion pura. Solo stdlib (`math`).
El z-score usa desviacion estandar poblacional (dividir por N, no N-1) para ser consistente con historial de cualquier tamanio.
Casos especiales:
- **Historial vacio**: z_score=0.0, drifted=False para todos los campos.
- **Un solo punto en historial**: std=0.0, z_score=0.0, drifted=False. No hay suficiente historia para calcular variabilidad.
- **Std=0 con N>=2**: todos los valores historicos identicos. z_score=0.0, drifted=False (cualquier desviacion seria tecnicamente infinita, pero se asume que el sistema es muy estable).
Pensado para el paso ANALIZAR del bucle reactivo: comparar `metrics` de la ejecucion actual con executions historicas de `operations.db`.
@@ -0,0 +1,86 @@
"""detect_drift — detecta drift estadistico por z-score comparando metricas contra historial."""
import math
def detect_drift(
history: list[dict],
current: dict,
fields: list[str],
threshold: float = 2.0,
) -> list[dict]:
"""Detecta drift estadistico comparando metricas actuales contra el historial.
Usa z-score: si |z| > threshold, el campo ha drifteado. Pensado para
comparar metrics de executions sucesivas en operations.db.
Args:
history: Lista de dicts con metricas historicas. Cada dict puede
contener cualquier combinacion de los campos indicados.
current: Dict con las metricas de la ejecucion actual.
fields: Lista de campos numericos a analizar.
threshold: Umbral de z-score para considerar drift. Default 2.0.
Returns:
Lista de dicts con: field, current, mean, std, z_score, drifted.
Si el historial tiene 0 o 1 punto, z_score=0.0 y drifted=False
porque no hay suficiente informacion estadistica.
"""
results = []
for field in fields:
values = [
float(h[field])
for h in history
if field in h and h[field] is not None
]
current_val = float(current.get(field, 0))
if len(values) == 0:
results.append({
"field": field,
"current": current_val,
"mean": 0.0,
"std": 0.0,
"z_score": 0.0,
"drifted": False,
})
continue
n = len(values)
mean = sum(values) / n
if n < 2:
# Un solo punto: no hay std, no podemos calcular z-score
results.append({
"field": field,
"current": current_val,
"mean": mean,
"std": 0.0,
"z_score": 0.0,
"drifted": False,
})
continue
variance = sum((v - mean) ** 2 for v in values) / n
std = math.sqrt(variance)
if std == 0.0:
# Todos los valores identicos: z_score indeterminado, no drift
z_score = 0.0
drifted = False
else:
z_score = (current_val - mean) / std
drifted = abs(z_score) > threshold
results.append({
"field": field,
"current": current_val,
"mean": mean,
"std": std,
"z_score": z_score,
"drifted": drifted,
})
return results
@@ -0,0 +1,90 @@
"""Tests para detect_drift."""
import sys
import os
import math
sys.path.insert(0, os.path.dirname(__file__))
from detect_drift import detect_drift
def test_campo_con_drift_claro_z_mayor_threshold():
history = [
{"records_out": 100},
{"records_out": 105},
{"records_out": 98},
]
current = {"records_out": 50}
results = detect_drift(history, current, ["records_out"])
assert len(results) == 1
r = results[0]
assert r["field"] == "records_out"
assert r["current"] == 50.0
assert r["drifted"] is True
assert r["z_score"] < -2.0 # muy lejos de la media
def test_campo_estable_z_menor_threshold():
history = [
{"val": 100.0},
{"val": 102.0},
{"val": 98.0},
{"val": 101.0},
]
current = {"val": 100.5} # dentro del rango normal
results = detect_drift(history, current, ["val"])
assert len(results) == 1
r = results[0]
assert r["drifted"] is False
assert abs(r["z_score"]) < 2.0
def test_historial_con_un_solo_punto_std_0_drifted_False_con_nota():
history = [{"val": 100.0}]
current = {"val": 999.0}
results = detect_drift(history, current, ["val"])
assert len(results) == 1
r = results[0]
assert r["std"] == 0.0
assert r["z_score"] == 0.0
assert r["drifted"] is False
assert r["mean"] == 100.0
def test_historial_vacio_todos_drifted_False():
history = []
current = {"records_out": 50, "duration_ms": 2000}
results = detect_drift(history, current, ["records_out", "duration_ms"])
assert len(results) == 2
for r in results:
assert r["drifted"] is False
assert r["z_score"] == 0.0
assert r["mean"] == 0.0
def test_threshold_custom():
history = [
{"val": 100.0},
{"val": 100.0},
{"val": 110.0},
{"val": 90.0},
]
# std ~ 7.07, mean = 100
current = {"val": 115.0} # z ~ 2.12
# threshold default 2.0 -> drifted
results = detect_drift(history, current, ["val"], threshold=2.0)
assert results[0]["drifted"] is True
# threshold 3.0 -> no drifted
results2 = detect_drift(history, current, ["val"], threshold=3.0)
assert results2[0]["drifted"] is False
if __name__ == "__main__":
test_campo_con_drift_claro_z_mayor_threshold()
test_campo_estable_z_menor_threshold()
test_historial_con_un_solo_punto_std_0_drifted_False_con_nota()
test_historial_vacio_todos_drifted_False()
test_threshold_custom()
print("All tests passed.")
@@ -0,0 +1,58 @@
---
name: diff_entities
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def diff_entities(before: list[dict], after: list[dict], key: str = 'id', ignore_fields: list[str] | None = None, compare_fields: list[str] | None = None) -> dict"
description: "Compara dos snapshots de entities y devuelve diferencias campo a campo. Detecta añadidas, eliminadas, modificadas e inalteradas. Ignora created_at y updated_at por defecto."
tags: [diff, entities, snapshot, operations, comparison, datascience]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "entity añadida"
- "entity eliminada"
- "entity modificada con detalle de campos"
- "entities identicas → unchanged"
- "ignore_fields funciona"
- "compare_fields filtra correctamente"
- "lista vacia vs lista con datos"
test_file_path: "python/functions/datascience/diff_entities_test.py"
file_path: "python/functions/datascience/diff_entities.py"
---
## Ejemplo
```python
before = [
{"id": "1", "name": "Alice", "status": "active", "updated_at": "2024-01-01"},
{"id": "2", "name": "Bob", "status": "active", "updated_at": "2024-01-01"},
]
after = [
{"id": "1", "name": "Alice", "status": "inactive", "updated_at": "2024-01-02"},
{"id": "3", "name": "Carol", "status": "active", "updated_at": "2024-01-02"},
]
result = diff_entities(before, after)
# result["added"] -> [{"id": "3", "name": "Carol", ...}]
# result["removed"] -> [{"id": "2", "name": "Bob", ...}]
# result["modified"] -> [{"key": "1", "changes": {"status": {"old": "active", "new": "inactive"}}}]
# result["unchanged"] -> 0
# result["summary"] -> "1 added, 1 removed, 1 modified, 0 unchanged"
```
## Notas
Funcion pura. No hace I/O — toma listas de dicts ya cargadas en memoria.
El campo `key` debe existir en todas las entities; las que no lo tengan se ignoran silenciosamente.
Si `compare_fields` se da, tiene prioridad sobre `ignore_fields`. Esto permite comparar solo un subconjunto especifico de campos sin preocuparse por los campos temporales.
El orden de `added` y `removed` no esta garantizado (depende del orden de iteracion de sets).
@@ -0,0 +1,77 @@
"""diff_entities — compara dos snapshots de entities detectando cambios campo a campo."""
def diff_entities(
before: list[dict],
after: list[dict],
key: str = "id",
ignore_fields: list[str] | None = None,
compare_fields: list[str] | None = None,
) -> dict:
"""Compara dos snapshots de entities y devuelve diferencias campo a campo.
Detecta entities añadidas, eliminadas, modificadas e inalteradas.
Ignora campos de metadata temporal por defecto (created_at, updated_at).
Args:
before: Lista de entities del snapshot anterior.
after: Lista de entities del snapshot posterior.
key: Campo que identifica unicamente cada entity. Default "id".
ignore_fields: Campos a excluir de la comparacion.
Default ["created_at", "updated_at"].
compare_fields: Si se da, solo compara estos campos (tiene prioridad
sobre ignore_fields).
Returns:
Dict con keys: added, removed, modified, unchanged, summary.
modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}.
"""
if ignore_fields is None:
ignore_fields = ["created_at", "updated_at"]
before_map = {str(e[key]): e for e in before if key in e}
after_map = {str(e[key]): e for e in after if key in e}
before_keys = set(before_map.keys())
after_keys = set(after_map.keys())
added = [after_map[k] for k in after_keys - before_keys]
removed = [before_map[k] for k in before_keys - after_keys]
modified = []
unchanged = 0
for k in before_keys & after_keys:
b = before_map[k]
a = after_map[k]
if compare_fields is not None:
fields_to_check = compare_fields
else:
all_fields = set(b.keys()) | set(a.keys())
fields_to_check = [f for f in all_fields if f not in ignore_fields and f != key]
changes = {}
for field in fields_to_check:
old_val = b.get(field)
new_val = a.get(field)
if old_val != new_val:
changes[field] = {"old": old_val, "new": new_val}
if changes:
modified.append({"key": k, "changes": changes})
else:
unchanged += 1
n_added = len(added)
n_removed = len(removed)
n_modified = len(modified)
summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged"
return {
"added": added,
"removed": removed,
"modified": modified,
"unchanged": unchanged,
"summary": summary,
}
@@ -0,0 +1,111 @@
"""Tests para diff_entities."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from diff_entities import diff_entities
def test_entity_anadida():
before = [{"id": "1", "name": "Alice"}]
after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
result = diff_entities(before, after)
assert len(result["added"]) == 1
assert result["added"][0]["id"] == "2"
assert result["removed"] == []
assert result["modified"] == []
assert result["unchanged"] == 1
assert "1 added" in result["summary"]
def test_entity_eliminada():
before = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
after = [{"id": "1", "name": "Alice"}]
result = diff_entities(before, after)
assert result["added"] == []
assert len(result["removed"]) == 1
assert result["removed"][0]["id"] == "2"
assert result["unchanged"] == 1
assert "1 removed" in result["summary"]
def test_entity_modificada_con_detalle_de_campos():
before = [{"id": "1", "name": "Alice", "status": "active"}]
after = [{"id": "1", "name": "Alice", "status": "inactive"}]
result = diff_entities(before, after)
assert result["added"] == []
assert result["removed"] == []
assert len(result["modified"]) == 1
mod = result["modified"][0]
assert mod["key"] == "1"
assert "status" in mod["changes"]
assert mod["changes"]["status"]["old"] == "active"
assert mod["changes"]["status"]["new"] == "inactive"
assert result["unchanged"] == 0
def test_entities_identicas_unchanged():
before = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
result = diff_entities(before, after)
assert result["added"] == []
assert result["removed"] == []
assert result["modified"] == []
assert result["unchanged"] == 2
assert "2 unchanged" in result["summary"]
def test_ignore_fields_funciona():
before = [{"id": "1", "name": "Alice", "updated_at": "2024-01-01", "created_at": "2023-01-01"}]
after = [{"id": "1", "name": "Alice", "updated_at": "2024-06-01", "created_at": "2023-01-01"}]
result = diff_entities(before, after)
# updated_at se ignora por defecto -> unchanged
assert result["unchanged"] == 1
assert result["modified"] == []
# Si no ignoramos updated_at, debe detectar el cambio
result2 = diff_entities(before, after, ignore_fields=[])
assert len(result2["modified"]) == 1
assert "updated_at" in result2["modified"][0]["changes"]
def test_compare_fields_filtra_correctamente():
before = [{"id": "1", "name": "Alice", "status": "active", "score": 10}]
after = [{"id": "1", "name": "Bob", "status": "inactive", "score": 10}]
# Solo comparar score -> no hay cambio en score, unchanged
result = diff_entities(before, after, compare_fields=["score"])
assert result["unchanged"] == 1
assert result["modified"] == []
# Solo comparar name -> detecta cambio
result2 = diff_entities(before, after, compare_fields=["name"])
assert len(result2["modified"]) == 1
assert "name" in result2["modified"][0]["changes"]
assert "status" not in result2["modified"][0]["changes"]
def test_lista_vacia_vs_lista_con_datos():
before = []
after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
result = diff_entities(before, after)
assert len(result["added"]) == 2
assert result["removed"] == []
assert result["unchanged"] == 0
# Invertido
result2 = diff_entities(after, before)
assert result2["added"] == []
assert len(result2["removed"]) == 2
assert result2["unchanged"] == 0
if __name__ == "__main__":
test_entity_anadida()
test_entity_eliminada()
test_entity_modificada_con_detalle_de_campos()
test_entities_identicas_unchanged()
test_ignore_fields_funciona()
test_compare_fields_filtra_correctamente()
test_lista_vacia_vs_lista_con_datos()
print("All tests passed.")
@@ -0,0 +1,52 @@
---
name: diff_relations
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def diff_relations(before: list[dict], after: list[dict], key: tuple[str, str, str] = ('source_id', 'target_id', 'relation_type'), ignore_fields: list[str] | None = None, compare_fields: list[str] | None = None) -> dict"
description: "Compara relaciones entre dos snapshots usando key compuesta (source_id, target_id, relation_type). Detecta relaciones añadidas, eliminadas y modificadas con detalle campo a campo."
tags: [diff, relations, graph, snapshot, operations, comparison, datascience]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "relacion añadida"
- "relacion eliminada"
- "relacion con metadata modificada (mismo source/target/type, distinto weight)"
- "key compuesta funciona correctamente"
test_file_path: "python/functions/datascience/diff_relations_test.py"
file_path: "python/functions/datascience/diff_relations.py"
---
## Ejemplo
```python
before = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
{"source_id": "B", "target_id": "C", "relation_type": "owns", "weight": 0.5},
]
after = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 2.0},
{"source_id": "C", "target_id": "D", "relation_type": "knows", "weight": 1.0},
]
result = diff_relations(before, after)
# result["added"] -> [{"source_id": "C", "target_id": "D", ...}]
# result["removed"] -> [{"source_id": "B", "target_id": "C", ...}]
# result["modified"] -> [{"key": "A|B|knows", "changes": {"weight": {"old": 1.0, "new": 2.0}}}]
# result["unchanged"] -> 0
```
## Notas
La key compuesta se serializa como `source_id|target_id|relation_type`. Si alguno de los campos clave no existe en la relacion, se usa string vacio.
Misma semantica que `diff_entities_py_datascience` pero adaptada para relaciones donde no hay un ID unico — la identidad se define por los tres campos de la key.
Complemento natural de `diff_entities_py_datascience` para comparar grafos completos entre ejecuciones de pipelines.
@@ -0,0 +1,82 @@
"""diff_relations — compara dos snapshots de relaciones con key compuesta."""
def diff_relations(
before: list[dict],
after: list[dict],
key: tuple[str, str, str] = ("source_id", "target_id", "relation_type"),
ignore_fields: list[str] | None = None,
compare_fields: list[str] | None = None,
) -> dict:
"""Compara relaciones entre dos snapshots usando key compuesta.
Las relaciones se identifican por (source_id, target_id, relation_type)
porque no tienen un ID unico propio. Detecta relaciones añadidas,
eliminadas y modificadas (mismo source/target/type, distinta metadata).
Args:
before: Lista de relaciones del snapshot anterior.
after: Lista de relaciones del snapshot posterior.
key: Tupla de campos que forman la key compuesta.
Default ("source_id", "target_id", "relation_type").
ignore_fields: Campos a excluir de la comparacion.
Default ["created_at", "updated_at"].
compare_fields: Si se da, solo compara estos campos.
Returns:
Dict con keys: added, removed, modified, unchanged, summary.
modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}.
"""
if ignore_fields is None:
ignore_fields = ["created_at", "updated_at"]
def make_key(rel: dict) -> str:
return "|".join(str(rel.get(k, "")) for k in key)
before_map = {make_key(r): r for r in before}
after_map = {make_key(r): r for r in after}
before_keys = set(before_map.keys())
after_keys = set(after_map.keys())
added = [after_map[k] for k in after_keys - before_keys]
removed = [before_map[k] for k in before_keys - after_keys]
modified = []
unchanged = 0
for k in before_keys & after_keys:
b = before_map[k]
a = after_map[k]
if compare_fields is not None:
fields_to_check = compare_fields
else:
all_fields = set(b.keys()) | set(a.keys())
key_set = set(key)
fields_to_check = [f for f in all_fields if f not in ignore_fields and f not in key_set]
changes = {}
for field in fields_to_check:
old_val = b.get(field)
new_val = a.get(field)
if old_val != new_val:
changes[field] = {"old": old_val, "new": new_val}
if changes:
modified.append({"key": k, "changes": changes})
else:
unchanged += 1
n_added = len(added)
n_removed = len(removed)
n_modified = len(modified)
summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged"
return {
"added": added,
"removed": removed,
"modified": modified,
"unchanged": unchanged,
"summary": summary,
}
@@ -0,0 +1,78 @@
"""Tests para diff_relations."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from diff_relations import diff_relations
def test_relacion_anadida():
before = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}]
after = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
{"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5},
]
result = diff_relations(before, after)
assert len(result["added"]) == 1
assert result["added"][0]["source_id"] == "C"
assert result["removed"] == []
assert result["unchanged"] == 1
assert "1 added" in result["summary"]
def test_relacion_eliminada():
before = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
{"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5},
]
after = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}]
result = diff_relations(before, after)
assert result["added"] == []
assert len(result["removed"]) == 1
assert result["removed"][0]["source_id"] == "C"
assert result["unchanged"] == 1
assert "1 removed" in result["summary"]
def test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight():
before = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}]
after = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 5.0}]
result = diff_relations(before, after)
assert result["added"] == []
assert result["removed"] == []
assert len(result["modified"]) == 1
mod = result["modified"][0]
assert mod["key"] == "A|B|knows"
assert "weight" in mod["changes"]
assert mod["changes"]["weight"]["old"] == 1.0
assert mod["changes"]["weight"]["new"] == 5.0
assert result["unchanged"] == 0
def test_key_compuesta_funciona_correctamente():
# Misma pareja A->B pero diferente tipo de relacion -> dos relaciones distintas
before = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
{"source_id": "A", "target_id": "B", "relation_type": "owns", "weight": 0.5},
]
after = [
{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0},
{"source_id": "A", "target_id": "B", "relation_type": "trusts", "weight": 0.8},
]
result = diff_relations(before, after)
# owns eliminada, trusts añadida, knows sin cambios
assert len(result["added"]) == 1
assert result["added"][0]["relation_type"] == "trusts"
assert len(result["removed"]) == 1
assert result["removed"][0]["relation_type"] == "owns"
assert result["unchanged"] == 1
assert result["modified"] == []
if __name__ == "__main__":
test_relacion_anadida()
test_relacion_eliminada()
test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight()
test_key_compuesta_funciona_correctamente()
print("All tests passed.")
@@ -0,0 +1,36 @@
---
name: estimate_hawkes
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict"
description: "Estima parámetros de un proceso Hawkes (alpha, beta, branching_ratio) desde la autocorrelación de arrivals ajustando una exponencial decreciente sobre la ACF."
tags: [estimation, hawkes, stochastic-process, microstructure, timeseries]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [numpy, scipy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/datascience/datascience.py"
---
## Ejemplo
```python
arrivals = [0, 1, 3, 2, 0, 1, 4, 2, 1, 0] * 10
result = estimate_hawkes(arrivals, max_lag=10)
# {'alpha': 0.312, 'beta': 0.874, 'branching_ratio': 0.357, 'acf': [...]}
```
## Notas
Ajusta la función `a * exp(-b * lag)` sobre los lags 1..max_lag de la ACF usando `curve_fit` de scipy.
Si el primer lag de la ACF es <= 0.01 (sin autocorrelación), retorna alpha=0, beta=1, branching_ratio=0.
El branching_ratio = alpha/beta; si se acerca a 1, el proceso es explosivo.
Función pura: requiere numpy y scipy instalados.
@@ -0,0 +1,38 @@
---
name: estimate_pareto_alpha
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict"
description: "Estima el exponente alpha de una distribución Pareto via MLE. Alpha bajo indica cola más pesada y mayor frecuencia de valores extremos."
tags: [estimation, pareto, power-law, heavy-tail, statistics]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/datascience/datascience.py"
---
## Ejemplo
```python
import numpy as np
# Simular datos con cola pesada
values = list(np.random.pareto(2.0, 1000) + 1)
result = estimate_pareto_alpha(values, x_min_percentile=90.0)
# {'alpha': ~2.0, 'x_min': ..., 'n_tail': 100}
```
## Notas
Usa el estimador MLE de Hill: α = n / Σ ln(xᵢ / x_min).
x_min se determina como el percentil indicado de los valores positivos.
Retorna alpha=0 si hay menos de 10 valores positivos o la cola tiene menos de 2 elementos.
Función pura: requiere numpy instalado.
@@ -0,0 +1,87 @@
---
name: extract_entities_llm
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: impure
signature: "def extract_entities_llm(text: str, entity_schema: list[dict], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list[EntityCandidate]"
description: "Extrae entidades de un chunk de texto usando un LLM inyectado. Construye el system prompt con el schema, llama al LLM y valida la respuesta retornando EntityCandidate. JSON invalido o type_ref fuera del schema se descartan con warning."
tags: [llm, extraction, entity, nlp, osint, graph, fuzzygraph, datascience, prompt]
uses_functions: []
uses_types: [entity_candidate_py_datascience]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [warnings, typing.Callable]
tested: true
tests:
- "texto con entidades claras retorna EntityCandidate"
- "texto sin entidades retorna lista vacia"
- "llm retorna json mal formado retorna lista vacia con warning"
- "type_ref invalido en respuesta se descarta con warning"
- "confidence se propaga correctamente"
- "schema vacio lanza ValueError"
test_file_path: "python/functions/datascience/extract_entities_llm_test.py"
file_path: "python/functions/datascience/extract_entities_llm.py"
---
## Ejemplo
```python
import json
from extract_entities_llm import extract_entities_llm
# LLM stub para tests — en produccion usar litellm o similar
def mock_llm(messages: list[dict]) -> dict:
return {
"entities": [
{
"name": "John Smith",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {"full_name": "John Smith", "nationality": "US"},
"confidence": 0.95,
},
{
"name": "evil-corp.com",
"type_ref": "osint_domain_go_cybersecurity",
"attributes": {"fqdn": "evil-corp.com"},
"confidence": 0.88,
},
]
}
schema = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
},
{
"type_ref": "osint_domain_go_cybersecurity",
"label": "Domain",
"metadata_fields": ["fqdn", "registrar", "created_date"],
},
]
text = "John Smith, a US citizen, was linked to the domain evil-corp.com."
candidates = extract_entities_llm(text, schema, mock_llm)
# [EntityCandidate(name='John Smith', type_ref='osint_person_go_cybersecurity', confidence=0.95),
# EntityCandidate(name='evil-corp.com', type_ref='osint_domain_go_cybersecurity', confidence=0.88)]
```
## Notas
**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe mensajes en formato OpenAI (`[{"role": "system", "content": "..."}, ...]`) y retorna un `dict` con la respuesta ya parseada como JSON. Esto desacopla la funcion de cualquier cliente especifico — puede usarse con OpenAI, Anthropic via litellm, o cualquier mock.
**Validacion de type_ref:** Solo se aceptan entidades cuyo `type_ref` aparece en el `entity_schema`. Entidades con type_ref desconocido se descartan con `warnings.warn` (no lanzan excepcion) para ser resiliente ante alucinaciones del LLM.
**Manejo de JSON invalido:** Si `llm_chat_json` lanza una excepcion o retorna un dict sin la clave `entities`, se retorna lista vacia y se emite un warning. El llamador puede decidir si reintentar.
**Confidence clamping:** El valor de confidence se clampea al rango [0.0, 1.0] automaticamente.
**Atributos null:** Los atributos con valor `None` se filtran del dict de atributos para mantener el output limpio.
**source_chunk_indices:** Esta funcion no setea `source_chunk_indices` — ese campo lo llena el pipeline exterior que conoce el indice del chunk actual.
Esta funcion es el bloque atomico de extraccion. El pipeline completo de grafos la llama por cada chunk del documento y luego deduplica los candidatos resultantes.
@@ -0,0 +1,145 @@
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
import sys
import os
import warnings
from typing import Callable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from python.types.datascience.entity_candidate import EntityCandidate
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
"""Construye el system prompt para extraccion de entidades."""
lines = [
"You are an entity extraction expert. Given text, extract all entities",
"matching these types. For each entity, provide: name, type_ref,",
"attributes (matching the metadata_fields for that type), and a",
"confidence score (0.0-1.0).",
"",
"Entity types:",
]
for schema_entry in entity_schema:
label = schema_entry.get("label", "Unknown")
type_ref = schema_entry.get("type_ref", "")
metadata_fields = schema_entry.get("metadata_fields", [])
lines.append(f"- {label} (type_ref: {type_ref})")
if metadata_fields:
lines.append(f" fields: {', '.join(metadata_fields)}")
lines += [
"",
'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
"",
"Rules:",
"- Only extract entities explicitly mentioned in the text",
"- Use the exact type_ref from the schema",
"- Leave unknown attributes as null",
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
f"- {language_instruction}",
]
return "\n".join(lines)
def extract_entities_llm(
text: str,
entity_schema: list[dict],
llm_chat_json: Callable[[list[dict]], dict],
language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
"""Extrae entidades de un chunk de texto usando un LLM inyectado.
Construye un system prompt con el schema de entity types, llama al LLM
y valida la respuesta retornando una lista de EntityCandidate.
Args:
text: Chunk de texto a analizar.
entity_schema: Lista de tipos con metadata fields. Cada entrada es un
dict con las claves 'type_ref', 'label' y opcionalmente
'metadata_fields'. Ejemplo:
[{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
"metadata_fields": ["full_name", "alias"]}]
llm_chat_json: Funcion que recibe una lista de mensajes OpenAI-style
y retorna un dict con la respuesta JSON del LLM. Interfaz:
llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
language_instruction: Instruccion de idioma para el LLM. Por defecto
"Respond in English."
Returns:
Lista de EntityCandidate extraidos. Retorna lista vacia si el LLM
no retorna JSON valido o si no se encuentran entidades.
Raises:
ValueError: Si entity_schema esta vacio.
"""
if not entity_schema:
raise ValueError("entity_schema no puede estar vacio")
valid_type_refs = {entry.get("type_ref", "") for entry in entity_schema}
type_ref_to_label = {
entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
}
system_prompt = _build_system_prompt(entity_schema, language_instruction)
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text},
]
try:
response = llm_chat_json(messages)
except Exception as exc:
warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
return []
raw_entities = response.get("entities", [])
if not isinstance(raw_entities, list):
warnings.warn(
"extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
stacklevel=2,
)
return []
candidates: list[EntityCandidate] = []
for item in raw_entities:
if not isinstance(item, dict):
continue
name = item.get("name", "")
if not name:
continue
type_ref = item.get("type_ref", "")
if type_ref not in valid_type_refs:
warnings.warn(
f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
stacklevel=2,
)
continue
attributes = item.get("attributes", {})
if not isinstance(attributes, dict):
attributes = {}
# Normalizar null values a None
attributes = {k: v for k, v in attributes.items() if v is not None}
confidence = item.get("confidence", 0.0)
if not isinstance(confidence, (int, float)):
confidence = 0.0
confidence = float(max(0.0, min(1.0, confidence)))
candidates.append(
EntityCandidate(
name=name,
type_ref=type_ref,
type_label=type_ref_to_label.get(type_ref, ""),
attributes=attributes,
confidence=confidence,
)
)
return candidates
@@ -0,0 +1,164 @@
"""Tests para extract_entities_llm."""
import warnings
import sys
import os
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from python.functions.datascience.extract_entities_llm import extract_entities_llm
from python.types.datascience.entity_candidate import EntityCandidate
SCHEMA = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
},
{
"type_ref": "osint_domain_go_cybersecurity",
"label": "Domain",
"metadata_fields": ["fqdn", "registrar", "created_date"],
},
]
def make_llm(response: dict):
"""Crea un stub de LLM que retorna la respuesta dada."""
def _llm(messages: list[dict]) -> dict:
return response
return _llm
def test_texto_con_entidades_claras_retorna_entity_candidate():
"""texto con entidades claras retorna EntityCandidate"""
llm = make_llm({
"entities": [
{
"name": "John Smith",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {"full_name": "John Smith", "nationality": "US"},
"confidence": 0.95,
},
{
"name": "evil-corp.com",
"type_ref": "osint_domain_go_cybersecurity",
"attributes": {"fqdn": "evil-corp.com"},
"confidence": 0.88,
},
]
})
result = extract_entities_llm(
"John Smith, US citizen, linked to evil-corp.com.", SCHEMA, llm
)
assert len(result) == 2
person = next(e for e in result if e.name == "John Smith")
assert person.type_ref == "osint_person_go_cybersecurity"
assert person.type_label == "Person"
assert person.attributes["full_name"] == "John Smith"
assert person.confidence == 0.95
domain = next(e for e in result if e.name == "evil-corp.com")
assert domain.type_ref == "osint_domain_go_cybersecurity"
assert domain.type_label == "Domain"
assert domain.attributes["fqdn"] == "evil-corp.com"
assert domain.confidence == 0.88
def test_texto_sin_entidades_retorna_lista_vacia():
"""texto sin entidades retorna lista vacia"""
llm = make_llm({"entities": []})
result = extract_entities_llm(
"The sky is blue and the grass is green.", SCHEMA, llm
)
assert result == []
def test_llm_retorna_json_mal_formado_retorna_lista_vacia_con_warning():
"""llm retorna json mal formado retorna lista vacia con warning"""
def bad_llm(messages: list[dict]) -> dict:
raise ValueError("JSON decode error")
with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
result = extract_entities_llm("Some text with entities.", SCHEMA, bad_llm)
assert result == []
assert len(caught) == 1
assert "error llamando al LLM" in str(caught[0].message)
def test_type_ref_invalido_en_respuesta_se_descarta_con_warning():
"""type_ref invalido en respuesta se descarta con warning"""
llm = make_llm({
"entities": [
{
"name": "Valid Person",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {},
"confidence": 0.9,
},
{
"name": "Unknown Thing",
"type_ref": "nonexistent_type_ref",
"attributes": {},
"confidence": 0.8,
},
]
})
with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
result = extract_entities_llm("Text with entities.", SCHEMA, llm)
assert len(result) == 1
assert result[0].name == "Valid Person"
assert any("nonexistent_type_ref" in str(w.message) for w in caught)
def test_confidence_se_propaga_correctamente():
"""confidence se propaga correctamente"""
llm = make_llm({
"entities": [
{
"name": "Implied Person",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {},
"confidence": 0.7,
},
{
"name": "Weakly Implied Domain",
"type_ref": "osint_domain_go_cybersecurity",
"attributes": {},
"confidence": 0.5,
},
{
"name": "Explicit Entity",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {},
"confidence": 1.0,
},
]
})
result = extract_entities_llm("Some text.", SCHEMA, llm)
assert len(result) == 3
confidences = {e.name: e.confidence for e in result}
assert confidences["Implied Person"] == 0.7
assert confidences["Weakly Implied Domain"] == 0.5
assert confidences["Explicit Entity"] == 1.0
def test_schema_vacio_lanza_value_error():
"""schema vacio lanza ValueError"""
llm = make_llm({"entities": []})
with pytest.raises(ValueError, match="entity_schema no puede estar vacio"):
extract_entities_llm("Some text.", [], llm)
@@ -0,0 +1,75 @@
---
name: extract_relations_llm
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: impure
signature: "def extract_relations_llm(text: str, entities: list, relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list"
description: "Extrae relaciones entre entidades de un chunk de texto usando un LLM inyectado. Valida que from_name y to_name correspondan a entidades existentes, y usa 'related_to' como fallback para tipos de relacion no permitidos."
tags: [extraction, relation, llm, knowledge-graph, nlp, datascience, fuzzygraph, graph]
uses_functions: []
uses_types:
- entity_candidate_py_datascience
- relation_candidate_py_datascience
returns:
- relation_candidate_py_datascience
returns_optional: false
error_type: "error_go_core"
imports: [logging, sys, os, typing]
tested: true
tests:
- "texto con dos entidades relacionadas"
- "texto con entidades pero sin relacion"
- "menos de dos entidades retorna lista vacia"
- "llm inventa entidad que no existe se descarta"
test_file_path: "python/functions/datascience/extract_relations_llm_test.py"
file_path: "python/functions/datascience/extract_relations_llm.py"
---
## Ejemplo
```python
from extract_relations_llm import extract_relations_llm
from python.types.datascience.entity_candidate import EntityCandidate
# Stub de llm_chat_json (en produccion usar llm_completion_retry o similar)
def my_llm(messages: list[dict]) -> dict:
# Llamar al LLM real aqui
return {"relations": [...]}
entities = [
EntityCandidate(name="Acme Corp", type_label="Organization", confidence=0.95),
EntityCandidate(name="John Smith", type_label="Person", confidence=0.9),
]
relation_types = ["employs", "funds", "owns", "communicates_with", "related_to"]
relations = extract_relations_llm(
text="Acme Corp employs John Smith as CEO and funds his research.",
entities=entities,
relation_types=relation_types,
llm_chat_json=my_llm,
)
for rel in relations:
print(f"{rel.from_name} --[{rel.relation_type}]--> {rel.to_name} ({rel.confidence:.2f})")
# Acme Corp --[employs]--> John Smith (0.90)
# Acme Corp --[funds]--> John Smith (0.85)
```
## Notas
**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe una lista de mensajes en formato OpenAI (`[{"role": "system", "content": ...}, {"role": "user", "content": ...}]`) y retorna un dict con la clave `"relations"`. Esto desacopla la funcion de cualquier proveedor de LLM concreto.
**Validacion de entidades:** Solo se aceptan relaciones donde `from_name` y `to_name` aparecen exactamente en los nombres de las entidades proporcionadas. Relaciones con nombres inventados por el LLM se descartan silenciosamente (con debug log).
**Fallback de tipo:** Si el LLM propone un `relation_type` que no esta en la lista permitida, se reemplaza por `"related_to"`. Si `"related_to"` tampoco esta en la lista, se incluye igualmente como catch-all seguro.
**Menos de 2 entidades:** La funcion retorna `[]` inmediatamente sin llamar al LLM, ya que no puede haber relaciones con menos de 2 participantes.
**Error handling:** Si `llm_chat_json` lanza una excepcion, se captura con warning y retorna `[]`. Si la respuesta no contiene la clave `"relations"` o no es una lista, idem.
**Confianza:** Los valores de confianza del LLM se clampean al rango `[0.0, 1.0]`. Valores no numericos se convierten a `0.0`.
Disenado para fuzzygraph — se compone con `extract_entities_llm` (paso anterior) y `deduplicate_relations` (paso siguiente en el pipeline de extraccion).
@@ -0,0 +1,141 @@
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
import logging
import sys
import os
from typing import Callable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.relation_candidate import RelationCandidate
logger = logging.getLogger(__name__)
def extract_relations_llm(
text: str,
entities: list[EntityCandidate],
relation_types: list[str],
llm_chat_json: Callable[[list[dict]], dict],
language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
"""Extrae relaciones entre entidades de un chunk de texto usando un LLM.
Dado el texto original y las entidades ya extraidas, pide al LLM que
identifique relaciones entre pares de entidades. Las relaciones cuyo
from_name o to_name no coincidan con ninguna entidad existente se descartan.
Los tipos de relacion no permitidos se reemplazan por "related_to".
Args:
text: chunk de texto (el mismo que se uso para extraer las entidades).
entities: entidades ya extraidas del chunk.
relation_types: tipos de relacion permitidos, ej: ["funds", "employs",
"communicates_with", "owns", "related_to"].
llm_chat_json: funcion inyectada que recibe una lista de mensajes
(dicts con "role" y "content") y retorna un dict con la respuesta
JSON del LLM.
language_instruction: instruccion de idioma para el LLM.
Returns:
Lista de RelationCandidate validados. Vacia si hay menos de 2 entidades
o si el LLM no encuentra relaciones.
"""
if len(entities) < 2:
return []
entity_names = {e.name for e in entities}
relation_types_set = set(relation_types)
# Construir lista de entidades para el prompt
entity_lines = "\n".join(
f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
)
# Construir tipos de relacion para el prompt
relation_types_str = ", ".join(relation_types)
system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.
Entities found in this text:
{entity_lines}
Allowed relation types: {relation_types_str}
Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}
Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text},
]
try:
response = llm_chat_json(messages)
except Exception as exc:
logger.warning("extract_relations_llm: LLM call failed: %s", exc)
return []
raw_relations = response.get("relations", [])
if not isinstance(raw_relations, list):
logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
return []
results: list[RelationCandidate] = []
for item in raw_relations:
if not isinstance(item, dict):
continue
from_name = item.get("from_name", "")
to_name = item.get("to_name", "")
# Validar que ambos nombres corresponden a entidades existentes
if from_name not in entity_names:
logger.debug(
"extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
from_name,
)
continue
if to_name not in entity_names:
logger.debug(
"extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
to_name,
)
continue
relation_type = item.get("relation_type", "")
if relation_type not in relation_types_set:
logger.debug(
"extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
relation_type,
)
relation_type = "related_to"
confidence = item.get("confidence", 0.0)
if not isinstance(confidence, (int, float)):
confidence = 0.0
confidence = float(max(0.0, min(1.0, confidence)))
results.append(
RelationCandidate(
from_name=from_name,
to_name=to_name,
relation_type=relation_type,
description=item.get("description", ""),
confidence=confidence,
)
)
return results
@@ -0,0 +1,140 @@
"""Tests para extract_relations_llm."""
import sys
import os
# Rutas para importar desde el registry
REGISTRY_ROOT = os.path.join(os.path.dirname(__file__), "..", "..", "..", "")
sys.path.insert(0, REGISTRY_ROOT)
sys.path.insert(0, os.path.dirname(__file__))
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.relation_candidate import RelationCandidate
from extract_relations_llm import extract_relations_llm
def _make_entity(name: str, type_label: str = "Entity") -> EntityCandidate:
return EntityCandidate(name=name, type_label=type_label, confidence=0.9)
def _make_llm(response: dict):
"""Crea un stub de llm_chat_json que retorna la respuesta fija."""
def llm_chat_json(messages: list[dict]) -> dict:
return response
return llm_chat_json
def test_texto_con_dos_entidades_relacionadas():
entities = [_make_entity("Acme Corp", "Organization"), _make_entity("John Smith", "Person")]
relation_types = ["employs", "funds", "related_to"]
llm_response = {
"relations": [
{
"from_name": "Acme Corp",
"to_name": "John Smith",
"relation_type": "employs",
"description": "Acme Corp employs John Smith as CEO",
"confidence": 0.9,
}
]
}
result = extract_relations_llm(
text="Acme Corp employs John Smith as CEO.",
entities=entities,
relation_types=relation_types,
llm_chat_json=_make_llm(llm_response),
)
assert len(result) == 1
rel = result[0]
assert rel.from_name == "Acme Corp"
assert rel.to_name == "John Smith"
assert rel.relation_type == "employs"
assert rel.confidence == 0.9
assert "CEO" in rel.description
def test_texto_con_entidades_pero_sin_relacion():
entities = [_make_entity("Alice", "Person"), _make_entity("Bob", "Person")]
relation_types = ["funds", "employs"]
llm_response = {"relations": []}
result = extract_relations_llm(
text="Alice and Bob both attended the conference.",
entities=entities,
relation_types=relation_types,
llm_chat_json=_make_llm(llm_response),
)
assert result == []
def test_menos_de_dos_entidades_retorna_lista_vacia():
entities = [_make_entity("Solo Corp", "Organization")]
relation_types = ["employs", "funds"]
# El LLM nunca deberia ser llamado, pero si lo fuera retornaria relaciones
llm_response = {
"relations": [
{"from_name": "Solo Corp", "to_name": "Nobody", "relation_type": "employs", "confidence": 0.9}
]
}
result = extract_relations_llm(
text="Solo Corp is a company.",
entities=entities,
relation_types=relation_types,
llm_chat_json=_make_llm(llm_response),
)
assert result == []
def test_llm_inventa_entidad_que_no_existe_se_descarta():
entities = [_make_entity("Alice", "Person"), _make_entity("Bob", "Person")]
relation_types = ["funds", "employs", "related_to"]
llm_response = {
"relations": [
# Valida — Alice y Bob existen
{
"from_name": "Alice",
"to_name": "Bob",
"relation_type": "funds",
"description": "Alice funds Bob",
"confidence": 0.8,
},
# Invalida — "Charlie" no esta en entities
{
"from_name": "Alice",
"to_name": "Charlie",
"relation_type": "employs",
"description": "Alice employs Charlie",
"confidence": 0.7,
},
# Invalida — "Unknown Corp" no esta en entities
{
"from_name": "Unknown Corp",
"to_name": "Bob",
"relation_type": "related_to",
"description": "...",
"confidence": 0.6,
},
]
}
result = extract_relations_llm(
text="Alice funds Bob. Alice also employs Charlie from Unknown Corp.",
entities=entities,
relation_types=relation_types,
llm_chat_json=_make_llm(llm_response),
)
# Solo la primera relacion es valida
assert len(result) == 1
assert result[0].from_name == "Alice"
assert result[0].to_name == "Bob"
assert result[0].relation_type == "funds"
@@ -0,0 +1,72 @@
---
name: hotness_score
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def hotness_score(active_count: int, updated_at: datetime | None, now: datetime | None = None, half_life_days: float = 7.0) -> float"
description: "Calcula un score de hotness combinando frecuencia de acceso y recencia temporal. Util para ranking de resultados, memoria hot/cold y cache eviction."
tags: [ranking, decay, recency, frequency, scoring, cache, memory, datascience]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [math, datetime]
tested: true
tests:
- "active_count=0, updated_at reciente"
- "active_count=100, updated_at reciente (score alto)"
- "active_count=100, updated_at hace 30 dias (score bajo)"
- "updated_at=None (retorna 0.0)"
- "now explicito (determinista para tests)"
- "half_life_days custom"
test_file_path: "python/functions/datascience/hotness_score_test.py"
file_path: "python/functions/datascience/hotness_score.py"
---
## Ejemplo
```python
from datetime import datetime, timedelta, timezone
from datascience.hotness_score import hotness_score
now = datetime.now(timezone.utc)
# Item reciente con muchos accesos -> score alto
score = hotness_score(active_count=150, updated_at=now - timedelta(hours=2), now=now)
# score > 0.95
# Item antiguo aunque muy accedido -> score bajo
score = hotness_score(active_count=150, updated_at=now - timedelta(days=30), now=now)
# score ~ 0.05
# Item sin fecha -> siempre 0
score = hotness_score(active_count=999, updated_at=None)
# score == 0.0
```
## Notas
Formula: `score = sigmoid(log1p(active_count)) * exp(-ln(2)/half_life_days * age_days)`
**Componente de frecuencia**`sigmoid(log1p(count))` mapea enteros no negativos al rango `(0.5, 1.0)`:
- count=0 -> 0.5
- count=10 -> ~0.92
- count=100 -> ~0.99
**Componente de recencia** — decaimiento exponencial con vida media configurable:
- `half_life_days=7` (default): score se reduce a la mitad cada 7 dias
- `half_life_days=1`: decaimiento agresivo (util para feeds en tiempo real)
- `half_life_days=365`: decaimiento lento (util para contenido evergreen)
**Propiedades del score:**
- `updated_at=None` -> 0.0 siempre (item sin fecha no tiene hotness)
- `active_count=0, reciente` -> ~0.5 (neutro pero fresco)
- `active_count alto, reciente` -> ~1.0 (muy caliente)
- `active_count alto, antiguo` -> ~0.0 (frio a pesar de popularidad pasada)
Timestamps sin timezone se interpretan como UTC. Pasar `now` explicitamente garantiza determinismo en tests y reproducibilidad en pipelines batch.
Fuente conceptual: openviking/retrieve/memory_lifecycle.py (AGPL-3.0). Reimplementado desde cero con formula equivalente.
@@ -0,0 +1,49 @@
"""Hotness score — combining access frequency and recency decay."""
import math
from datetime import datetime, timezone
def hotness_score(
active_count: int,
updated_at: datetime | None,
now: datetime | None = None,
half_life_days: float = 7.0,
) -> float:
"""Calcula un score de hotness combinando frecuencia de acceso y recencia.
Formula: sigmoid(log1p(active_count)) * exp_decay(age_days, half_life_days)
El componente de frecuencia mapea conteos enteros al rango (0, 1) via sigmoid(log1p).
El componente de recencia decae exponencialmente con vida media configurable.
Args:
active_count: Numero de accesos o activaciones. Debe ser >= 0.
updated_at: Timestamp de la ultima actualizacion. None retorna 0.0.
now: Momento de referencia para calcular la edad. Si es None usa datetime.now(UTC).
half_life_days: Dias para que la recencia se reduzca a la mitad. Default 7.
Returns:
float en [0.0, 1.0]. Valores mas cercanos a 1.0 indican mayor hotness.
"""
if updated_at is None:
return 0.0
# Componente de frecuencia: sigmoid(log1p(count)) mapea 0..inf -> (0.5, 1.0)
freq = 1.0 / (1.0 + math.exp(-math.log1p(active_count)))
# Componente de recencia: decaimiento exponencial
if now is None:
now = datetime.now(timezone.utc)
# Normalizar ambos timestamps a UTC para comparacion segura
if updated_at.tzinfo is None:
updated_at = updated_at.replace(tzinfo=timezone.utc)
if now.tzinfo is None:
now = now.replace(tzinfo=timezone.utc)
age_days = max((now - updated_at).total_seconds() / 86400.0, 0.0)
decay_rate = math.log(2) / half_life_days
recency = math.exp(-decay_rate * age_days)
return freq * recency
@@ -0,0 +1,61 @@
"""Tests para hotness_score."""
import math
from datetime import datetime, timedelta, timezone
from hotness_score import hotness_score
NOW = datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
def test_active_count_zero_updated_at_reciente():
"""active_count=0, updated_at reciente."""
updated_at = NOW - timedelta(hours=1)
score = hotness_score(0, updated_at, now=NOW)
# freq = sigmoid(log1p(0)) = sigmoid(0) = 0.5
# recency ~ 1.0 (casi nuevo)
assert 0.45 < score < 0.55, f"Expected ~0.5, got {score}"
def test_active_count_alto_updated_at_reciente():
"""active_count=100, updated_at reciente (score alto)."""
updated_at = NOW - timedelta(hours=1)
score = hotness_score(100, updated_at, now=NOW)
# freq = sigmoid(log1p(100)) = sigmoid(4.615) ~ 0.99
# recency ~ 1.0
assert score > 0.95, f"Expected > 0.95, got {score}"
def test_active_count_alto_updated_at_hace_30_dias():
"""active_count=100, updated_at hace 30 dias (score bajo)."""
updated_at = NOW - timedelta(days=30)
score = hotness_score(100, updated_at, now=NOW)
# recency = exp(-ln2/7 * 30) = exp(-2.97) ~ 0.051
# score ~ 0.99 * 0.051 ~ 0.05
assert score < 0.1, f"Expected < 0.1, got {score}"
def test_updated_at_none_retorna_cero():
"""updated_at=None (retorna 0.0)."""
score = hotness_score(100, None, now=NOW)
assert score == 0.0, f"Expected 0.0, got {score}"
def test_now_explicito():
"""now explicito (determinista para tests)."""
updated_at = NOW - timedelta(days=7)
score = hotness_score(50, updated_at, now=NOW)
# recency = exp(-ln2/7 * 7) = 0.5
# freq = sigmoid(log1p(50)) ~ sigmoid(3.93) ~ 0.981
expected = (1.0 / (1.0 + math.exp(-math.log1p(50)))) * 0.5
assert abs(score - expected) < 1e-9, f"Expected {expected}, got {score}"
def test_half_life_days_custom():
"""half_life_days custom."""
updated_at = NOW - timedelta(days=1)
# Con half_life=1 dia, despues de 1 dia recency = 0.5
score = hotness_score(50, updated_at, now=NOW, half_life_days=1.0)
freq = 1.0 / (1.0 + math.exp(-math.log1p(50)))
expected = freq * 0.5
assert abs(score - expected) < 1e-6, f"Expected {expected}, got {score}"
+40
View File
@@ -0,0 +1,40 @@
---
name: melt
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def melt(rows: list[dict], id_vars: list[str], value_vars: list[str] | None = None, var_name: str = 'variable', value_name: str = 'value') -> list[dict]"
description: "Inversa de pivot. Convierte columnas en filas (formato largo). Cada combinacion de id_vars + value_var genera una fila. Si value_vars es None, derrite todas las columnas no-id."
tags: [datascience, tabular, melt, unpivot, transform, python]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests:
- "Melt basico"
- "Multiples id_vars"
- "value_vars None derrite todas las columnas no-id"
- "Fila con campo faltante en value_vars"
test_file_path: "python/functions/datascience/melt_test.py"
file_path: "python/functions/datascience/melt.py"
---
## Ejemplo
```python
rows = [{"region": "US", "q1": 10, "q2": 20}]
melt(rows, id_vars=["region"], value_vars=["q1", "q2"])
# [{"region": "US", "variable": "q1", "value": 10},
# {"region": "US", "variable": "q2", "value": 20}]
```
## Notas
Funcion pura sin dependencias externas.
Si un campo de value_vars no existe en la fila, su valor sera None.
El parametro value_vars=None es util cuando se desconoce el schema exacto.
+40
View File
@@ -0,0 +1,40 @@
"""Melt (unpivot) para datos tabulares list[dict]."""
def melt(
rows: list[dict],
id_vars: list[str],
value_vars: list[str] | None = None,
var_name: str = "variable",
value_name: str = "value",
) -> list[dict]:
"""Convierte columnas en filas (formato largo). Inversa de pivot.
Cada combinacion de id_vars + value_var genera una fila nueva.
Si value_vars es None, se usan todas las columnas que no esten en id_vars.
Args:
rows: Lista de dicts en formato ancho.
id_vars: Columnas que se mantienen como identificadores en cada fila.
value_vars: Columnas a convertir en filas. None = todas las no-id.
var_name: Nombre de la columna que contendra los nombres de variables.
value_name: Nombre de la columna que contendra los valores.
Returns:
Lista de dicts en formato largo con una fila por combinacion id+variable.
"""
result = []
for row in rows:
# Determinar que columnas derretir
if value_vars is None:
vars_to_melt = [k for k in row if k not in id_vars]
else:
vars_to_melt = value_vars
for var in vars_to_melt:
new_row: dict = {k: row.get(k) for k in id_vars}
new_row[var_name] = var
new_row[value_name] = row.get(var)
result.append(new_row)
return result
+49
View File
@@ -0,0 +1,49 @@
"""Tests para melt."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from melt import melt
def test_melt_basico():
"""Melt basico."""
rows = [{"region": "US", "q1": 10, "q2": 20}]
result = melt(rows, id_vars=["region"], value_vars=["q1", "q2"])
assert len(result) == 2
assert result[0] == {"region": "US", "variable": "q1", "value": 10}
assert result[1] == {"region": "US", "variable": "q2", "value": 20}
def test_melt_multiples_id_vars():
"""Multiples id_vars."""
rows = [{"region": "US", "year": 2023, "q1": 10, "q2": 20}]
result = melt(rows, id_vars=["region", "year"], value_vars=["q1", "q2"])
assert len(result) == 2
assert result[0]["region"] == "US"
assert result[0]["year"] == 2023
assert result[0]["variable"] == "q1"
assert result[0]["value"] == 10
assert result[1]["variable"] == "q2"
assert result[1]["value"] == 20
def test_melt_value_vars_none_derrite_todas_las_columnas_no_id():
"""value_vars None derrite todas las columnas no-id."""
rows = [{"id": 1, "a": 10, "b": 20, "c": 30}]
result = melt(rows, id_vars=["id"])
assert len(result) == 3
vars_found = {r["variable"] for r in result}
assert vars_found == {"a", "b", "c"}
values_found = {r["value"] for r in result}
assert values_found == {10, 20, 30}
def test_melt_fila_con_campo_faltante_en_value_vars():
"""Fila con campo faltante en value_vars."""
rows = [{"region": "US", "q1": 10}] # q2 no existe
result = melt(rows, id_vars=["region"], value_vars=["q1", "q2"])
assert len(result) == 2
q2_row = next(r for r in result if r["variable"] == "q2")
assert q2_row["value"] is None
@@ -0,0 +1,68 @@
---
name: merge_graphs
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def merge_graphs(graphs: list[dict], entity_key: str = 'name', similarity_threshold: float = 0.85) -> dict"
description: "Mergea multiples grafos de conocimiento en uno deduplicando entities por similitud de nombre (Levenshtein normalizado). Relaciones se re-apuntan a las entities canonicas. Atributos se combinan por union."
tags: [graph, merge, deduplication, knowledge-graph, levenshtein, similarity, datascience]
uses_functions: [levenshtein_distance_py_cybersecurity]
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [sys, os]
tested: true
tests:
- "dos grafos con entity duplicada → merge"
- "entities similares pero bajo threshold → no merge"
- "relaciones re-apuntadas correctamente"
- "merge log registra cada merge"
- "tres grafos → merge transitivo"
- "grafos sin overlap → concatenacion simple"
test_file_path: "python/functions/datascience/merge_graphs_test.py"
file_path: "python/functions/datascience/merge_graphs.py"
---
## Ejemplo
```python
g1 = {
"entities": [
{"id": "1", "name": "Alice Corp", "type": "company"},
{"id": "2", "name": "Bob", "type": "person"},
],
"relations": [
{"source_id": "2", "target_id": "1", "relation_type": "works_at"},
],
}
g2 = {
"entities": [
{"id": "3", "name": "Alice Corp.", "type": "company", "country": "US"},
],
"relations": [],
}
result = merge_graphs([g1, g2], similarity_threshold=0.85)
# result["entities"] -> 2 entities (Alice Corp mergeada, Bob)
# result["merge_log"] -> [{"merged": ["3", "1"], "into": "1", "similarity": 0.909}]
# "Alice Corp." mergeada en "Alice Corp" porque similitud > 0.85
```
## Notas
Funcion pura. Reutiliza `levenshtein_distance_py_cybersecurity` para calcular similitud normalizada entre nombres.
**Algoritmo de merge transitivo**: si A~B y B~C, entonces A, B, C se mergean en uno solo. Se implementa via union-find (path compression simple).
**Eleccion de canonical**: la entity con mas campos no-null gana. En caso de empate, la primera encontrada en el par.
**Conflictos de atributos**: si ambas entities tienen un campo con valor, el canonical conserva el suyo (primero gana). Solo se copian campos que el canonical no tiene o tiene null.
**Deduplicacion de relaciones**: por (source_id, target_id, relation_type). Si dos relaciones son identicas tras re-apuntar los IDs, se conserva la primera encontrada.
**Complejidad**: O(n^2) en numero de entities por la comparacion de pares. Adecuado para grafos de knowledge tipicos (< 10K entities). Para grafos muy grandes, usar indexado por prefijo antes de comparar.
**Importacion**: intenta importar `levenshtein_distance` desde el paquete `cybersecurity` del registry. Si no esta disponible, usa una reimplementacion inline equivalente.
@@ -0,0 +1,169 @@
"""merge_graphs — mergea multiples grafos de conocimiento deduplicando entities por similitud."""
import sys
import os
# Importar levenshtein_distance desde el registry
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "cybersecurity"))
try:
from cybersecurity import levenshtein_distance
except ImportError:
# Fallback: reimplementacion inline si el paquete no esta disponible
def levenshtein_distance(a: str, b: str) -> int:
"""Calcula la distancia de Levenshtein entre dos strings."""
if len(a) < len(b):
return levenshtein_distance(b, a)
if len(b) == 0:
return len(a)
prev_row = list(range(len(b) + 1))
for i, ca in enumerate(a):
curr_row = [i + 1]
for j, cb in enumerate(b):
cost = 0 if ca == cb else 1
curr_row.append(
min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
)
prev_row = curr_row
return prev_row[-1]
def _name_similarity(a: str, b: str) -> float:
"""Similitud de Levenshtein normalizada entre 0 y 1."""
if not a and not b:
return 1.0
max_len = max(len(a), len(b))
if max_len == 0:
return 1.0
dist = levenshtein_distance(a.lower(), b.lower())
return 1.0 - dist / max_len
def _count_non_null_fields(entity: dict) -> int:
"""Cuenta campos con valor no-None."""
return sum(1 for v in entity.values() if v is not None)
def _merge_two_entities(canonical: dict, other: dict) -> dict:
"""Combina dos entities: union de campos, ultimo gana en conflictos."""
merged = dict(canonical)
for k, v in other.items():
if k not in merged or merged[k] is None:
merged[k] = v
# Si ambos tienen valor, el canonical (primero) gana — no sobreescribir
return merged
def merge_graphs(
graphs: list[dict],
entity_key: str = "name",
similarity_threshold: float = 0.85,
) -> dict:
"""Mergea multiples grafos de conocimiento en uno, deduplicando entities por similitud.
Algoritmo:
1. Juntar todas las entities de todos los grafos (con ID de origen).
2. Para cada par con similitud de nombre >= threshold, mergear.
3. Elegir entity canonica (la que tiene mas campos no-null).
4. Re-apuntar relaciones al ID canonico.
5. Deduplicar relaciones identicas (mismo source, target, type).
6. Registrar cada merge en merge_log.
Args:
graphs: Lista de grafos. Cada grafo es un dict con keys:
"entities" (list[dict]) y "relations" (list[dict]).
Las entities deben tener "id" y el campo entity_key.
entity_key: Campo de texto usado para calcular similitud. Default "name".
similarity_threshold: Umbral de similitud Levenshtein normalizada [0,1].
Default 0.85.
Returns:
Dict con keys: entities, relations, merge_log.
"""
# Recopilar todas las entities y relaciones
all_entities: list[dict] = []
all_relations: list[dict] = []
for graph in graphs:
all_entities.extend(graph.get("entities", []))
all_relations.extend(graph.get("relations", []))
# Construir union-find para agrupar entities similares
# id_map: entity_id original -> entity_id canonico
id_map: dict[str, str] = {e["id"]: e["id"] for e in all_entities if "id" in e}
entity_by_id: dict[str, dict] = {e["id"]: e for e in all_entities if "id" in e}
merge_log: list[dict] = []
def find_canonical(eid: str) -> str:
while id_map.get(eid, eid) != eid:
eid = id_map[eid]
return eid
entity_ids = [e["id"] for e in all_entities if "id" in e]
# Comparar todos los pares (O(n^2) — aceptable para grafos de knowledge tipicos)
for i in range(len(entity_ids)):
for j in range(i + 1, len(entity_ids)):
id_i = find_canonical(entity_ids[i])
id_j = find_canonical(entity_ids[j])
if id_i == id_j:
continue # ya mergeados
e_i = entity_by_id.get(id_i)
e_j = entity_by_id.get(id_j)
if e_i is None or e_j is None:
continue
name_i = str(e_i.get(entity_key, ""))
name_j = str(e_j.get(entity_key, ""))
sim = _name_similarity(name_i, name_j)
if sim >= similarity_threshold:
# Elegir canonical: el que tiene mas campos no-null
if _count_non_null_fields(e_i) >= _count_non_null_fields(e_j):
canonical_id, other_id = id_i, id_j
else:
canonical_id, other_id = id_j, id_i
# Mergear datos
merged = _merge_two_entities(entity_by_id[canonical_id], entity_by_id[other_id])
entity_by_id[canonical_id] = merged
# Redirigir other_id -> canonical_id
id_map[other_id] = canonical_id
merge_log.append({
"merged": [other_id, canonical_id],
"into": canonical_id,
"similarity": round(sim, 4),
})
# Construir lista final de entities (solo canonicas)
canonical_ids = {eid for eid in entity_ids if find_canonical(eid) == eid}
final_entities = [entity_by_id[eid] for eid in canonical_ids if eid in entity_by_id]
# Re-apuntar relaciones a IDs canonicos
final_relations_set: dict[tuple, dict] = {}
for rel in all_relations:
new_rel = dict(rel)
if "source_id" in new_rel:
new_rel["source_id"] = find_canonical(new_rel["source_id"])
if "target_id" in new_rel:
new_rel["target_id"] = find_canonical(new_rel["target_id"])
# Deduplicar por (source_id, target_id, relation_type)
rel_key = (
new_rel.get("source_id", ""),
new_rel.get("target_id", ""),
new_rel.get("relation_type", ""),
)
if rel_key not in final_relations_set:
final_relations_set[rel_key] = new_rel
return {
"entities": final_entities,
"relations": list(final_relations_set.values()),
"merge_log": merge_log,
}
@@ -0,0 +1,120 @@
"""Tests para merge_graphs."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from merge_graphs import merge_graphs
def test_dos_grafos_con_entity_duplicada_merge():
g1 = {
"entities": [{"id": "1", "name": "Alice Corp", "type": "company"}],
"relations": [],
}
g2 = {
"entities": [{"id": "2", "name": "Alice Corp", "type": "company", "country": "US"}],
"relations": [],
}
result = merge_graphs([g1, g2], similarity_threshold=0.95)
# Nombres identicos -> similitud 1.0 -> deben mergearse
assert len(result["entities"]) == 1
assert len(result["merge_log"]) == 1
merged = result["entities"][0]
# El merge debe preservar "country" aunque el canonical no lo tuviera
assert merged.get("country") == "US" or merged.get("name") == "Alice Corp"
def test_entities_similares_pero_bajo_threshold_no_merge():
g1 = {
"entities": [{"id": "1", "name": "Alice"}],
"relations": [],
}
g2 = {
"entities": [{"id": "2", "name": "Bob"}],
"relations": [],
}
result = merge_graphs([g1, g2], similarity_threshold=0.85)
# Alice y Bob son muy distintos -> no merge
assert len(result["entities"]) == 2
assert len(result["merge_log"]) == 0
def test_relaciones_re_apuntadas_correctamente():
g1 = {
"entities": [
{"id": "1", "name": "Alice Corp"},
{"id": "2", "name": "Bob"},
],
"relations": [
{"source_id": "2", "target_id": "1", "relation_type": "works_at"},
],
}
g2 = {
"entities": [
{"id": "3", "name": "Alice Corp"}, # duplicada de id=1
],
"relations": [
{"source_id": "3", "target_id": "2", "relation_type": "knows"},
],
}
result = merge_graphs([g1, g2], similarity_threshold=0.95)
# Entity 3 mergeada en 1 -> relacion source_id=3 debe apuntar al canonical de 1
assert len(result["entities"]) == 2 # Alice Corp + Bob
# Verificar que las relaciones tienen IDs canonicos (no "3")
for rel in result["relations"]:
assert rel["source_id"] != "3"
assert rel["target_id"] != "3"
def test_merge_log_registra_cada_merge():
g1 = {
"entities": [{"id": "1", "name": "OpenAI"}],
"relations": [],
}
g2 = {
"entities": [{"id": "2", "name": "OpenAI"}],
"relations": [],
}
result = merge_graphs([g1, g2], similarity_threshold=0.9)
assert len(result["merge_log"]) == 1
log = result["merge_log"][0]
assert "merged" in log
assert "into" in log
assert "similarity" in log
assert log["similarity"] == 1.0
def test_tres_grafos_merge_transitivo():
# A~B y B~C -> A, B, C deben mergearse en uno
g1 = {"entities": [{"id": "1", "name": "Acme Corp"}], "relations": []}
g2 = {"entities": [{"id": "2", "name": "Acme Corp"}], "relations": []}
g3 = {"entities": [{"id": "3", "name": "Acme Corp"}], "relations": []}
result = merge_graphs([g1, g2, g3], similarity_threshold=0.9)
assert len(result["entities"]) == 1
def test_grafos_sin_overlap_concatenacion_simple():
g1 = {
"entities": [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}],
"relations": [{"source_id": "1", "target_id": "2", "relation_type": "knows"}],
}
g2 = {
"entities": [{"id": "3", "name": "Carol"}, {"id": "4", "name": "Dave"}],
"relations": [{"source_id": "3", "target_id": "4", "relation_type": "knows"}],
}
result = merge_graphs([g1, g2], similarity_threshold=0.85)
# Ninguna entity similar -> concatenacion directa
assert len(result["entities"]) == 4
assert len(result["relations"]) == 2
assert len(result["merge_log"]) == 0
if __name__ == "__main__":
test_dos_grafos_con_entity_duplicada_merge()
test_entities_similares_pero_bajo_threshold_no_merge()
test_relaciones_re_apuntadas_correctamente()
test_merge_log_registra_cada_merge()
test_tres_grafos_merge_transitivo()
test_grafos_sin_overlap_concatenacion_simple()
print("All tests passed.")
+44
View File
@@ -0,0 +1,44 @@
---
name: pivot
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def pivot(rows: list[dict], index: str, columns: str, values: str, agg: str = 'sum') -> list[dict]"
description: "Pivot table sin pandas. Agrupa por index, expande valores unicos de columns como nuevas columnas y agrega values con la funcion indicada (sum, count, mean, min, max, first, last)."
tags: [datascience, tabular, pivot, transform, aggregation, python]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["collections"]
tested: true
tests:
- "Pivot basico con sum"
- "Pivot con count y mean"
- "Valores faltantes rellenados con 0"
- "Una sola fila"
- "Multiples valores por celda requieren agregacion"
test_file_path: "python/functions/datascience/pivot_test.py"
file_path: "python/functions/datascience/pivot.py"
---
## Ejemplo
```python
rows = [
{"region": "US", "product": "A", "sales": 10},
{"region": "US", "product": "B", "sales": 20},
{"region": "EU", "product": "A", "sales": 15},
]
pivot(rows, index="region", columns="product", values="sales")
# [{"region": "US", "A": 10, "B": 20}, {"region": "EU", "A": 15, "B": 0}]
```
## Notas
Funcion pura sin dependencias externas (solo collections.defaultdict de stdlib).
Preserva el orden de aparicion de los valores de index y columns.
Valores numericos faltantes se rellenan con 0; no numericos con None.
+89
View File
@@ -0,0 +1,89 @@
"""Pivot table sin pandas para datos tabulares list[dict]."""
from collections import defaultdict
def pivot(
rows: list[dict],
index: str,
columns: str,
values: str,
agg: str = "sum",
) -> list[dict]:
"""Transforma datos del formato largo al formato ancho (pivot table).
Agrupa por `index`, expande los valores unicos de `columns` como nuevas
columnas y agrega la columna `values` con la funcion indicada.
Args:
rows: Lista de dicts con los datos en formato largo.
index: Nombre de la columna que actua como indice de filas.
columns: Nombre de la columna cuyos valores unicos se convierten en columnas.
values: Nombre de la columna cuyos valores se agregan.
agg: Funcion de agregacion: sum, count, mean, min, max, first, last.
Returns:
Lista de dicts con una fila por valor unico de index y una columna
por cada valor unico de columns. Valores numericos faltantes rellenados
con 0, valores no numericos con None.
"""
# Recopilar valores unicos de columns (orden de aparicion)
col_values: list = []
seen_cols: set = set()
index_order: list = []
seen_index: set = set()
for row in rows:
idx = row.get(index)
col = row.get(columns)
if idx not in seen_index:
seen_index.add(idx)
index_order.append(idx)
if col not in seen_cols:
seen_cols.add(col)
col_values.append(col)
# Acumular: groups[index_val][col_val] = lista de values
groups: dict[any, dict[any, list]] = defaultdict(lambda: defaultdict(list))
for row in rows:
idx = row.get(index)
col = row.get(columns)
val = row.get(values)
if val is not None:
groups[idx][col].append(val)
# Determinar si los valores son numericos (para relleno de 0)
sample_vals = [v for g in groups.values() for vs in g.values() for v in vs]
is_numeric = all(isinstance(v, (int, float)) for v in sample_vals) if sample_vals else True
def _aggregate(vals: list, func: str):
if not vals:
return 0 if is_numeric else None
if func == "sum":
return sum(vals)
if func == "count":
return len(vals)
if func == "mean":
return sum(vals) / len(vals)
if func == "min":
return min(vals)
if func == "max":
return max(vals)
if func == "first":
return vals[0]
if func == "last":
return vals[-1]
raise ValueError(f"Funcion de agregacion no soportada: {func}")
result = []
for idx in index_order:
record: dict = {index: idx}
for col in col_values:
vals = groups[idx][col]
if vals:
record[col] = _aggregate(vals, agg)
else:
record[col] = 0 if is_numeric else None
result.append(record)
return result
@@ -0,0 +1,78 @@
"""Tests para pivot."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from pivot import pivot
def test_pivot_basico_con_sum():
"""Pivot basico con sum."""
rows = [
{"region": "US", "product": "A", "sales": 10},
{"region": "US", "product": "B", "sales": 20},
{"region": "EU", "product": "A", "sales": 15},
]
result = pivot(rows, index="region", columns="product", values="sales")
assert len(result) == 2
us = next(r for r in result if r["region"] == "US")
eu = next(r for r in result if r["region"] == "EU")
assert us["A"] == 10
assert us["B"] == 20
assert eu["A"] == 15
assert eu["B"] == 0
def test_pivot_con_count_y_mean():
"""Pivot con count y mean."""
rows = [
{"region": "US", "product": "A", "sales": 10},
{"region": "US", "product": "A", "sales": 20},
{"region": "EU", "product": "A", "sales": 15},
]
result_count = pivot(rows, index="region", columns="product", values="sales", agg="count")
us_count = next(r for r in result_count if r["region"] == "US")
assert us_count["A"] == 2
result_mean = pivot(rows, index="region", columns="product", values="sales", agg="mean")
us_mean = next(r for r in result_mean if r["region"] == "US")
assert us_mean["A"] == 15.0
def test_pivot_valores_faltantes_rellenados_con_0():
"""Valores faltantes rellenados con 0."""
rows = [
{"region": "US", "product": "A", "sales": 5},
{"region": "EU", "product": "B", "sales": 8},
]
result = pivot(rows, index="region", columns="product", values="sales")
us = next(r for r in result if r["region"] == "US")
eu = next(r for r in result if r["region"] == "EU")
assert us["B"] == 0
assert eu["A"] == 0
def test_pivot_una_sola_fila():
"""Una sola fila."""
rows = [{"region": "US", "product": "A", "sales": 42}]
result = pivot(rows, index="region", columns="product", values="sales")
assert len(result) == 1
assert result[0]["region"] == "US"
assert result[0]["A"] == 42
def test_pivot_multiples_valores_por_celda_requieren_agregacion():
"""Multiples valores por celda requieren agregacion."""
rows = [
{"region": "US", "product": "A", "sales": 10},
{"region": "US", "product": "A", "sales": 30},
]
result_sum = pivot(rows, index="region", columns="product", values="sales", agg="sum")
assert result_sum[0]["A"] == 40
result_min = pivot(rows, index="region", columns="product", values="sales", agg="min")
assert result_min[0]["A"] == 10
result_max = pivot(rows, index="region", columns="product", values="sales", agg="max")
assert result_max[0]["A"] == 30
@@ -0,0 +1,48 @@
---
name: avellaneda_stoikov_quotes
kind: function
lang: py
domain: finance
version: "1.0.0"
purity: pure
signature: "avellaneda_stoikov_quotes(mid_price: float, inventory: float, gamma: float, sigma: float, spread_base: float, n_levels: int, qty_base: float) -> list[dict]"
description: "Genera ordenes de market maker usando el modelo Avellaneda-Stoikov. Calcula precio de reserva y half spread optimos segun inventario y volatilidad."
tags: [simulation, market-making, avellaneda-stoikov, montecarlo, finance, order-book]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/finance/finance.py"
---
## Ejemplo
```python
orders = avellaneda_stoikov_quotes(
mid_price=100.0,
inventory=0.0,
gamma=0.1,
sigma=0.02,
spread_base=0.5,
n_levels=3,
qty_base=10.0,
)
# [
# {'side': 'buy', 'price': 99.75, 'qty': 10.0},
# {'side': 'sell', 'price': 100.25, 'qty': 10.0},
# ...
# ]
```
## Notas
Funcion pura — sin aleatoriedad.
`gamma` controla la aversion al riesgo de inventario: mayor gamma = spreads mas amplios.
`inventory` positivo sesga los quotes hacia venta (reduce inventario largo).
Cada nivel adicional ensancha el spread en `half_spread * 0.5` y aumenta la cantidad en `qty_base * 0.5`.
Ordenes con precio <= 0 se descartan automaticamente.
+101
View File
@@ -135,3 +135,104 @@ def annualized_volatility(returns: list, periods_per_year: float) -> float:
mean = sum(returns) / n
variance = sum((r - mean) ** 2 for r in returns) / (n - 1)
return math.sqrt(variance) * math.sqrt(periods_per_year)
def generate_gbm_prices(
initial_price: float,
n_ticks: int,
sigma: float,
mu: float = 0.0,
jump_intensity: float = 0.0,
jump_size_std: float = 0.05,
seed: int = 42,
) -> list:
"""Genera serie de precios fundamentales con Geometric Brownian Motion + jump-diffusion.
S(t+1) = S(t) * exp((mu - sigma^2/2)*dt + sigma*sqrt(dt)*Z + J*N)
donde Z ~ N(0,1), N ~ Bernoulli(jump_intensity), J ~ N(0, jump_size_std)
"""
import numpy as np
rng = np.random.default_rng(seed)
prices = [0.0] * n_ticks
prices[0] = initial_price
dt = 1.0
for t in range(1, n_ticks):
z = rng.standard_normal()
gbm = (mu - 0.5 * sigma**2) * dt + sigma * np.sqrt(dt) * z
jump = 0.0
if jump_intensity > 0 and rng.random() < jump_intensity:
jump = rng.normal(0, jump_size_std)
prices[t] = prices[t - 1] * np.exp(gbm + jump)
return prices
def avellaneda_stoikov_quotes(
mid_price: float,
inventory: float,
gamma: float,
sigma: float,
spread_base: float,
n_levels: int = 3,
qty_base: float = 10.0,
) -> list:
"""Genera ordenes de market maker usando el modelo Avellaneda-Stoikov.
Precio de reserva: r = mid - inventory * gamma * sigma^2
Half spread: delta = spread_base/2 + gamma * sigma^2/2
Retorna lista de dicts con keys: side, price, qty
"""
reservation = mid_price - inventory * gamma * sigma**2
half_spread = spread_base / 2 + gamma * sigma**2 / 2
orders = []
for level in range(n_levels):
offset = level * half_spread * 0.5
qty = qty_base * (1 + level * 0.5)
bid_price = round(reservation - half_spread - offset, 2)
ask_price = round(reservation + half_spread + offset, 2)
if bid_price > 0:
orders.append({'side': 'buy', 'price': bid_price, 'qty': qty})
if ask_price > 0:
orders.append({'side': 'sell', 'price': ask_price, 'qty': qty})
return orders
def generate_taker_order(
alpha: float = 2.0,
size_min: float = 1.0,
size_max: float = 100.0,
buy_prob: float = 0.5,
seed: int | None = None,
) -> dict:
"""Genera una market order de taker con tamano power-law (Pareto).
P(size > x) ~ x^(-alpha). Alpha bajo = mas ballenas.
Retorna dict con keys: side, qty
"""
import numpy as np
rng = np.random.default_rng(seed)
side = 'buy' if rng.random() < buy_prob else 'sell'
raw_size = (rng.pareto(alpha) + 1) * size_min
size = min(round(raw_size, 1), size_max)
return {'side': side, 'qty': size}
def hawkes_intensity(
base_rate: float,
hawkes_alpha: float,
hawkes_beta: float,
event_times: list,
current_time: float,
) -> float:
"""Calcula la intensidad lambda(t) de un proceso de Hawkes en el tiempo actual.
lambda(t) = base_rate + sum(alpha * exp(-beta * (t - ti)))
donde ti son los tiempos de eventos pasados.
"""
import numpy as np
excitation = sum(
hawkes_alpha * np.exp(-hawkes_beta * (current_time - ti))
for ti in event_times
if ti < current_time
)
return max(0.0, base_rate + excitation)
@@ -0,0 +1,44 @@
---
name: generate_gbm_prices
kind: function
lang: py
domain: finance
version: "1.0.0"
purity: pure
signature: "generate_gbm_prices(initial_price: float, n_ticks: int, sigma: float, mu: float, jump_intensity: float, jump_size_std: float, seed: int) -> list[float]"
description: "Genera serie de precios fundamentales con Geometric Brownian Motion + jump-diffusion. S(t+1) = S(t) * exp((mu - sigma^2/2)*dt + sigma*sqrt(dt)*Z + J*N)."
tags: [simulation, gbm, price, montecarlo, finance, stochastic]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/finance/finance.py"
---
## Ejemplo
```python
prices = generate_gbm_prices(
initial_price=100.0,
n_ticks=1000,
sigma=0.02,
mu=0.0,
jump_intensity=0.01,
jump_size_std=0.05,
seed=42,
)
# prices[0] == 100.0
# len(prices) == 1000
```
## Notas
Funcion pura — el seed fija el resultado deterministicamente.
`jump_intensity=0.0` desactiva los saltos (GBM puro).
`dt=1.0` por tick (tiempo discreto). Para tiempo continuo, ajustar sigma y mu en consecuencia.
Requiere numpy para la generacion de numeros aleatorios y el calculo de exp.
@@ -0,0 +1,41 @@
---
name: generate_taker_order
kind: function
lang: py
domain: finance
version: "1.0.0"
purity: pure
signature: "generate_taker_order(alpha: float, size_min: float, size_max: float, buy_prob: float, seed: int | None) -> dict"
description: "Genera una market order de taker con tamano distribuido segun power-law (Pareto). Alpha bajo produce ordenes mas grandes (ballenas)."
tags: [simulation, taker, power-law, montecarlo, finance, order-book]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/finance/finance.py"
---
## Ejemplo
```python
order = generate_taker_order(
alpha=2.0,
size_min=1.0,
size_max=100.0,
buy_prob=0.5,
seed=42,
)
# {'side': 'buy', 'qty': 3.7}
```
## Notas
Funcion pura cuando se fija seed. Con seed=None el resultado es no deterministico.
La distribucion Pareto con alpha=2 modela bien la distribucion empirica de tamaños de ordenes en mercados reales.
`size_max` actua como techo (clipping) para evitar ordenes extremas.
Retorna dict con keys: `side` ('buy' o 'sell') y `qty` (float redondeado a 1 decimal).
@@ -0,0 +1,43 @@
---
name: hawkes_intensity
kind: function
lang: py
domain: finance
version: "1.0.0"
purity: pure
signature: "hawkes_intensity(base_rate: float, hawkes_alpha: float, hawkes_beta: float, event_times: list[float], current_time: float) -> float"
description: "Calcula la intensidad lambda(t) de un proceso de Hawkes en el tiempo actual. Modela la autocorrelacion temporal de eventos de mercado (rafagas de ordenes)."
tags: [simulation, hawkes, stochastic-process, montecarlo, finance, point-process]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/finance/finance.py"
---
## Ejemplo
```python
intensity = hawkes_intensity(
base_rate=1.0,
hawkes_alpha=0.8,
hawkes_beta=2.0,
event_times=[0.5, 1.2, 1.8],
current_time=2.5,
)
# Intensidad > base_rate por excitacion de eventos pasados
```
## Notas
Funcion pura — determinista dado el mismo historial de eventos.
`hawkes_alpha` controla la magnitud del salto de intensidad por evento.
`hawkes_beta` controla la velocidad de decaimiento (mayor beta = decaimiento mas rapido).
La condicion de estabilidad del proceso es hawkes_alpha < hawkes_beta.
Eventos con ti >= current_time se ignoran automaticamente.
Retorna max(0.0, ...) para garantizar intensidad no negativa.
@@ -0,0 +1,123 @@
---
name: extraction_pipeline
kind: pipeline
lang: py
domain: pipelines
version: "1.0.0"
purity: impure
signature: "def extraction_pipeline(file_path: str, entity_presets: list[dict], relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], chunk_size: int = 500, chunk_overlap: int = 50, confidence_threshold: float = 0.5, dedup_threshold: float = 0.85, on_progress: Callable[[str, float], None] | None = None) -> ExtractionResult"
description: "Pipeline completa de extraccion de entidades y relaciones desde un documento. Orquesta extract_text_from_file -> preprocess_text -> split_text_into_chunks -> extract_entities_llm por chunk -> deduplicate_entities -> extract_relations_llm por chunk -> deduplicate_relations."
tags: [pipeline, extraction, entities, relations, llm, nlp, fuzzygraph, datascience]
uses_functions:
- extract_text_from_file_py_core
- preprocess_text_py_core
- split_text_into_chunks_py_core
- build_entity_schema_prompt_py_datascience
- build_relation_schema_prompt_py_datascience
- extract_entities_llm_py_datascience
- extract_relations_llm_py_datascience
- deduplicate_entities_py_datascience
- deduplicate_relations_py_datascience
uses_types:
- entity_candidate_py_datascience
- extraction_result_py_datascience
- extraction_stats_py_datascience
- relation_candidate_py_datascience
returns:
- extraction_result_py_datascience
returns_optional: false
error_type: "error_go_core"
imports:
- time
- warnings
- typing.Callable
tested: true
tests:
- "documento con entidades y relaciones retorna ExtractionResult completo"
- "documento vacio retorna ExtractionResult con listas vacias"
- "documento sin entidades detectables retorna listas vacias"
- "archivo no encontrado lanza FileNotFoundError"
- "entity presets vacio lanza ValueError"
- "progress callback se invoca durante la ejecucion"
- "stats se rellenan correctamente con conteos y tiempo"
test_file_path: "python/functions/pipelines/extraction_pipeline_test.py"
file_path: "python/functions/pipelines/extraction_pipeline.py"
---
## Ejemplo
```python
from python.functions.pipelines.extraction_pipeline import extraction_pipeline
entity_presets = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality"],
},
{
"type_ref": "osint_domain_go_cybersecurity",
"label": "Domain",
"metadata_fields": ["fqdn", "registrar"],
},
]
relation_types = ["operates", "owns", "funds", "communicates_with", "related_to"]
# Inyectar un cliente LLM real
def llm_chat_json(messages):
# llamada al proveedor LLM elegido
...
result = extraction_pipeline(
file_path="report.pdf",
entity_presets=entity_presets,
relation_types=relation_types,
llm_chat_json=llm_chat_json,
chunk_size=500,
chunk_overlap=50,
confidence_threshold=0.5,
dedup_threshold=0.85,
on_progress=lambda msg, pct: print(f"[{pct:.0%}] {msg}"),
)
print(f"Entities: {len(result.entities)}, Relations: {len(result.relations)}")
print(f"Stats: {result.stats}")
# Integrar con fuzzygraph / operations.db
for entity in result.entities:
db.add_entity(
name=entity.name,
type_ref=entity.type_ref,
metadata=entity.attributes,
)
for relation in result.relations:
db.add_relation(
name=relation.relation_type,
from_entity=relation.from_id,
to_entity=relation.to_id,
)
```
## Algoritmo
1. **Extract:** `extract_text_from_file(file_path)` — texto crudo desde PDF, TXT, Markdown
2. **Preprocess:** `preprocess_text(text)` — normaliza espacios, caracteres especiales
3. **Split:** `split_text_into_chunks(text, chunk_size, chunk_overlap)` — divide en ventanas solapadas
4. **Extract entities per chunk (0-40%):** Para cada chunk llama `extract_entities_llm` con el schema de presets. Anota `source_chunk_index` en cada candidato
5. **Filter:** filtra por `confidence >= confidence_threshold`
6. **Deduplicate entities (40%):** `deduplicate_entities` con fuzzy matching, produce `entity_id_map`
7. **Extract relations per chunk (40-80%):** Para cada chunk obtiene las entidades de ese chunk y llama `extract_relations_llm`
8. **Deduplicate relations (80-100%):** `deduplicate_relations` resuelve nombres a IDs y colapsa duplicados
9. **Return:** `ExtractionResult` con entidades, relaciones y stats del proceso
## Notas
- El parametro `llm_chat_json` inyecta el cliente LLM, sin acoplamiento a ningun proveedor (OpenAI, Anthropic, Ollama, etc.)
- El progress callback cubre: 0-40% extraccion de entidades, 40-80% extraccion de relaciones, 80-100% deduplicacion
- Si el archivo no existe lanza `FileNotFoundError` antes de cualquier llamada al LLM
- Si `entity_presets` esta vacio lanza `ValueError`
- Errores en chunks individuales se capturan con warnings y continuan (robustez)
- Los `entity_id_map` de `deduplicate_entities` conectan nombres originales del texto con IDs UUID finales para `deduplicate_relations`
- La retorna `ExtractionResult` esta lista para insertar en `operations.db` via `fn ops entity add` / `fn ops relation add`
@@ -0,0 +1,211 @@
"""Pipeline de extraccion de entidades y relaciones desde un documento."""
from __future__ import annotations
import sys
import os
import time
import warnings
from typing import Callable
# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from python.functions.core.extract_text_from_file import extract_text_from_file
from python.functions.core.core import preprocess_text
from python.functions.core.split_text_into_chunks import split_text_into_chunks
from python.functions.datascience.build_entity_schema_prompt import build_entity_schema_prompt
from python.functions.datascience.build_relation_schema_prompt import build_relation_schema_prompt
from python.functions.datascience.extract_entities_llm import extract_entities_llm
from python.functions.datascience.extract_relations_llm import extract_relations_llm
from python.functions.datascience.deduplicate_entities import deduplicate_entities
from python.functions.datascience.deduplicate_relations import deduplicate_relations
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.extraction_result import ExtractionResult
from python.types.datascience.extraction_stats import ExtractionStats
def extraction_pipeline(
file_path: str,
entity_presets: list[dict],
relation_types: list[str],
llm_chat_json: Callable[[list[dict]], dict],
chunk_size: int = 500,
chunk_overlap: int = 50,
confidence_threshold: float = 0.5,
dedup_threshold: float = 0.85,
on_progress: Callable[[str, float], None] | None = None,
) -> ExtractionResult:
"""Pipeline completa de extraccion de entidades y relaciones desde un documento.
Orquesta extract_text_from_file -> preprocess_text -> split_text_into_chunks
-> extract_entities_llm por chunk -> deduplicate_entities ->
extract_relations_llm por chunk -> deduplicate_relations.
Args:
file_path: ruta al archivo a procesar (PDF, Markdown, TXT).
entity_presets: lista de dicts con type_ref, label y metadata_fields.
Ejemplo: [{"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "nationality"]}]
relation_types: tipos de relacion permitidos para extraccion.
Ejemplo: ["funds", "employs", "communicates_with", "owns"]
llm_chat_json: funcion inyectada que recibe messages OpenAI y retorna dict
con la respuesta JSON ya parseada. Sin acoplamiento a ningun proveedor.
chunk_size: numero de caracteres por chunk (default 500).
chunk_overlap: overlap entre chunks consecutivos (default 50).
confidence_threshold: umbral minimo de confidence para aceptar entidades
candidatas antes de deduplicar (default 0.5).
dedup_threshold: score minimo de similitud para mergear entidades (default 0.85).
on_progress: callback opcional de progreso (message: str, pct: float 0-1).
0-40%: extraccion de entidades, 40-80%: extraccion de relaciones,
80-100%: deduplicacion.
Returns:
ExtractionResult con entidades y relaciones deduplicadas y stats del proceso.
Raises:
FileNotFoundError: si file_path no existe.
ValueError: si entity_presets esta vacio.
"""
if not entity_presets:
raise ValueError("entity_presets no puede estar vacio")
if not os.path.exists(file_path):
raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
def _progress(msg: str, pct: float) -> None:
if on_progress is not None:
try:
on_progress(msg, pct)
except Exception:
pass
start_time = time.monotonic()
stats = ExtractionStats()
# ── Paso 1: Extraer texto ──────────────────────────────────────────────────
_progress("Extracting text from file...", 0.0)
try:
raw_text = extract_text_from_file(file_path)
except Exception as exc:
warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
raw_text = ""
# ── Paso 2: Preprocesar ────────────────────────────────────────────────────
clean_text = preprocess_text(raw_text)
stats.total_chars = len(clean_text)
# ── Paso 3: Dividir en chunks ──────────────────────────────────────────────
chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
n = len(chunks)
stats.total_chunks = n
if n == 0:
stats.processing_time_seconds = time.monotonic() - start_time
return ExtractionResult(entities=[], relations=[], stats=stats)
# ── Paso 4: Extraer entidades por chunk ────────────────────────────────────
all_raw_entities: list[EntityCandidate] = []
for i, chunk in enumerate(chunks):
_progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
try:
candidates = extract_entities_llm(
text=chunk,
entity_schema=entity_presets,
llm_chat_json=llm_chat_json,
)
except Exception as exc:
warnings.warn(
f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
)
candidates = []
for candidate in candidates:
# Anotar el chunk de origen
if i not in candidate.source_chunk_indices:
candidate.source_chunk_indices.append(i)
all_raw_entities.append(candidate)
# ── Paso 5: Filtrar por confidence ─────────────────────────────────────────
filtered_entities = [
e for e in all_raw_entities if e.confidence >= confidence_threshold
]
stats.raw_entities_count = len(filtered_entities)
# Actualizar stats de tipos
for ent in filtered_entities:
stats.entity_types_found[ent.type_ref] = (
stats.entity_types_found.get(ent.type_ref, 0) + 1
)
# ── Paso 6: Deduplicar entidades ───────────────────────────────────────────
_progress("Deduplicating entities...", 0.4)
dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
stats.final_entities_count = dedup_result.total_after
stats.entities_merged = dedup_result.total_before - dedup_result.total_after
final_entities = dedup_result.entities
entity_id_map = dedup_result.name_to_id # nombre_original -> entity_id
# ── Paso 7: Extraer relaciones por chunk ───────────────────────────────────
all_raw_relations = []
for i, chunk in enumerate(chunks):
_progress(f"Extracting relations...", 0.4 + (i / n) * 0.4)
# Obtener entidades relevantes de este chunk
chunk_entities = [
e for e in final_entities if i in e.source_chunk_indices
]
# Si no hay entidades en este chunk especifico, usar todas
if not chunk_entities:
chunk_entities = final_entities
if len(chunk_entities) < 2:
continue
try:
chunk_relations = extract_relations_llm(
text=chunk,
entities=chunk_entities,
relation_types=relation_types,
llm_chat_json=llm_chat_json,
)
except Exception as exc:
warnings.warn(
f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
)
chunk_relations = []
for rel in chunk_relations:
rel.source_chunk_index = i
all_raw_relations.extend(chunk_relations)
stats.raw_relations_count = len(all_raw_relations)
# Actualizar stats de tipos de relacion
for rel in all_raw_relations:
stats.relation_types_found[rel.relation_type] = (
stats.relation_types_found.get(rel.relation_type, 0) + 1
)
# ── Paso 8: Deduplicar relaciones ──────────────────────────────────────────
_progress("Deduplicating relations...", 0.8)
final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
stats.final_relations_count = len(final_relations)
stats.relations_merged = stats.raw_relations_count - len(final_relations)
stats.processing_time_seconds = time.monotonic() - start_time
_progress("Done", 1.0)
return ExtractionResult(
entities=final_entities,
relations=final_relations,
stats=stats,
)
@@ -0,0 +1,227 @@
"""Tests para extraction_pipeline."""
from __future__ import annotations
import os
import sys
import tempfile
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from python.functions.pipelines.extraction_pipeline import extraction_pipeline
# ── LLM stubs ─────────────────────────────────────────────────────────────────
def _llm_with_entities(messages: list[dict]) -> dict:
"""LLM stub que retorna entidades fijas para el primer mensaje de extraccion."""
system_content = messages[0]["content"] if messages else ""
if "entity" in system_content.lower() or "entities" in system_content.lower():
return {
"entities": [
{
"name": "John Smith",
"type_ref": "osint_person_go_cybersecurity",
"attributes": {"full_name": "John Smith", "nationality": "US"},
"confidence": 0.95,
},
{
"name": "evil-corp.com",
"type_ref": "osint_domain_go_cybersecurity",
"attributes": {"fqdn": "evil-corp.com"},
"confidence": 0.88,
},
]
}
# Llamada de relaciones
return {
"relations": [
{
"from_name": "John Smith",
"to_name": "evil-corp.com",
"relation_type": "operates",
"description": "John Smith operates evil-corp.com",
"confidence": 0.8,
}
]
}
def _llm_empty(messages: list[dict]) -> dict:
"""LLM stub que retorna siempre resultado vacio."""
system_content = messages[0]["content"] if messages else ""
if "entit" in system_content.lower():
return {"entities": []}
return {"relations": []}
ENTITY_PRESETS = [
{
"type_ref": "osint_person_go_cybersecurity",
"label": "Person",
"metadata_fields": ["full_name", "alias", "nationality"],
},
{
"type_ref": "osint_domain_go_cybersecurity",
"label": "Domain",
"metadata_fields": ["fqdn", "registrar"],
},
]
RELATION_TYPES = ["operates", "owns", "funds", "communicates_with", "related_to"]
# ── Tests ──────────────────────────────────────────────────────────────────────
def test_documento_con_entidades_y_relaciones():
"""documento con entidades y relaciones retorna ExtractionResult completo"""
text = (
"John Smith, a US national, operates the domain evil-corp.com. "
"He was identified as the main administrator of the infrastructure."
)
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write(text)
tmp_path = f.name
try:
result = extraction_pipeline(
file_path=tmp_path,
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_with_entities,
chunk_size=500,
chunk_overlap=50,
confidence_threshold=0.5,
dedup_threshold=0.85,
)
assert result is not None
assert len(result.entities) >= 1
assert result.stats.total_chunks >= 1
assert result.stats.total_chars > 0
finally:
os.unlink(tmp_path)
def test_documento_vacio():
"""documento vacio retorna ExtractionResult con listas vacias"""
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write("")
tmp_path = f.name
try:
result = extraction_pipeline(
file_path=tmp_path,
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_empty,
)
assert result is not None
assert result.entities == []
assert result.relations == []
assert result.stats.total_chunks == 0
finally:
os.unlink(tmp_path)
def test_documento_sin_entidades_detectables():
"""documento sin entidades detectables retorna listas vacias"""
text = "The weather is nice today. The sun shines brightly over the mountains."
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write(text)
tmp_path = f.name
try:
result = extraction_pipeline(
file_path=tmp_path,
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_empty,
confidence_threshold=0.5,
)
assert result is not None
assert result.entities == []
assert result.relations == []
assert result.stats.raw_entities_count == 0
finally:
os.unlink(tmp_path)
def test_archivo_no_encontrado_lanza_filenotfounderror():
"""archivo no encontrado lanza FileNotFoundError"""
import pytest
with pytest.raises(FileNotFoundError):
extraction_pipeline(
file_path="/tmp/no_existe_para_test_extraccion_pipeline.txt",
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_empty,
)
def test_entity_presets_vacio_lanza_valueerror():
"""entity presets vacio lanza ValueError"""
import pytest
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write("some text")
tmp_path = f.name
try:
with pytest.raises(ValueError):
extraction_pipeline(
file_path=tmp_path,
entity_presets=[],
relation_types=RELATION_TYPES,
llm_chat_json=_llm_empty,
)
finally:
os.unlink(tmp_path)
def test_progress_callback_se_invoca():
"""progress callback se invoca durante la ejecucion"""
calls: list[tuple[str, float]] = []
def _on_progress(msg: str, pct: float) -> None:
calls.append((msg, pct))
text = "John Smith operates evil-corp.com."
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write(text)
tmp_path = f.name
try:
extraction_pipeline(
file_path=tmp_path,
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_with_entities,
on_progress=_on_progress,
)
assert len(calls) > 0
messages = [c[0] for c in calls]
assert any("Extracting" in m or "Done" in m or "Dedup" in m for m in messages)
finally:
os.unlink(tmp_path)
def test_stats_se_rellenan_correctamente():
"""stats se rellenan correctamente con conteos y tiempo"""
text = "John Smith, a US national, operates the domain evil-corp.com."
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
f.write(text)
tmp_path = f.name
try:
result = extraction_pipeline(
file_path=tmp_path,
entity_presets=ENTITY_PRESETS,
relation_types=RELATION_TYPES,
llm_chat_json=_llm_with_entities,
)
assert result.stats.total_chars > 0
assert result.stats.total_chunks >= 1
assert result.stats.processing_time_seconds >= 0.0
finally:
os.unlink(tmp_path)
@@ -0,0 +1,74 @@
---
name: monte_carlo_market
kind: pipeline
lang: py
domain: pipelines
version: "1.0.0"
purity: impure
signature: "def monte_carlo_market(n_simulations: int, base_params: dict, vary_params: dict, seed_start: int) -> list[dict]"
description: "Ejecuta N simulaciones de mercado con parámetros variados uniformemente. Cada simulación usa run_market_sim y retorna métricas resumen: spreads, trades por tick, volatilidad realizada y PnL total de makers."
tags: [montecarlo, simulation, market, launcher, finance, microstructure]
uses_functions:
- run_market_sim_py_pipelines
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/pipelines/monte_carlo_market.py"
---
## Ejemplo
```bash
# 10 simulaciones con sigma y gamma variables
python python/functions/pipelines/monte_carlo_market.py -n 10
```
```python
from monte_carlo_market import monte_carlo_market
results = monte_carlo_market(
n_simulations=50,
base_params={'n_ticks': 300, 'n_makers': 3},
vary_params={
'sigma': (0.005, 0.05),
'gamma': (0.01, 1.0),
'hawkes_alpha': (0.1, 0.9),
},
seed_start=42,
)
# Cada resultado tiene: sim_id, seed, sigma, gamma, hawkes_alpha,
# total_trades, mean_spread, std_spread, mean_trades_per_tick,
# price_return, maker_total_pnl, realized_vol
```
## Flujo
1. Para cada simulación i en range(n_simulations):
- Tomar `base_params` + `seed = seed_start + i`
- Samplear `vary_params` uniformemente con rng derivado de `seed_start`
- Llamar `run_market_sim(**params)`
- Calcular métricas resumen sobre el resultado
2. Reportar progreso cada 10% de simulaciones
3. Retornar lista de dicts con params usados + métricas
## Métricas por simulación
| Campo | Descripción |
|---|---|
| `total_trades` | Número total de trades en la simulación |
| `mean_spread` | Spread bid-ask medio |
| `std_spread` | Desviación estándar del spread |
| `mean_trades_per_tick` | Intensidad media del flujo de órdenes |
| `price_return` | Retorno % del precio fundamental |
| `maker_total_pnl` | PnL agregado de todos los makers |
| `realized_vol` | Volatilidad realizada de los trade prices (si hay trades) |
## Notas
`vary_params` acepta cualquier parámetro válido de `run_market_sim` como clave, con valor `(min, max)`.
Los parámetros en `base_params` tienen precedencia sobre los defaults pero son sobreescritos por `vary_params`.
@@ -0,0 +1,91 @@
"""Ejecuta N simulaciones de mercado con parámetros variables para análisis Monte Carlo."""
import sys
import os
import json
def monte_carlo_market(
n_simulations: int = 100,
base_params: dict | None = None,
vary_params: dict | None = None,
seed_start: int = 0,
) -> list[dict]:
"""Ejecuta N simulaciones variando parámetros.
base_params: parámetros fijos para run_market_sim
vary_params: dict de param_name -> (min, max) para variar uniformemente
Retorna lista de dicts, cada uno con los params usados + métricas resumen.
"""
import numpy as np
sys.path.insert(0, os.path.join(os.environ.get('FN_REGISTRY_ROOT', os.path.expanduser('~/fn_registry')), 'python', 'functions'))
sys.path.insert(0, os.path.join(os.environ.get('FN_REGISTRY_ROOT', os.path.expanduser('~/fn_registry')), 'python', 'functions', 'pipelines'))
from run_market_sim import run_market_sim
if base_params is None:
base_params = {}
if vary_params is None:
vary_params = {}
rng = np.random.default_rng(seed_start)
results = []
for i in range(n_simulations):
params = dict(base_params)
params['seed'] = seed_start + i
# Variar parámetros
varied = {}
for pname, (pmin, pmax) in vary_params.items():
val = rng.uniform(pmin, pmax)
params[pname] = round(val, 6)
varied[pname] = params[pname]
sim = run_market_sim(**params)
# Métricas resumen
spreads = sim['spreads']
trade_prices = sim['trade_prices']
n_per_tick = sim['n_trades_per_tick']
result = {
'sim_id': i,
'seed': params['seed'],
**varied,
'total_trades': sim['total_trades'],
'mean_spread': round(np.mean(spreads), 6) if spreads else 0,
'std_spread': round(np.std(spreads), 6) if spreads else 0,
'mean_trades_per_tick': round(np.mean(n_per_tick), 2),
'price_return': round((sim['fundamental_prices'][-1] / sim['fundamental_prices'][0] - 1) * 100, 4),
'maker_total_pnl': round(sum(sim['maker_pnls']), 2),
}
if trade_prices:
tp = np.array(trade_prices)
log_ret = np.diff(np.log(tp[tp > 0]))
if len(log_ret) > 1:
result['realized_vol'] = round(float(np.std(log_ret)), 6)
results.append(result)
if (i + 1) % max(1, n_simulations // 10) == 0:
print(f' {i+1}/{n_simulations} simulaciones completadas')
return results
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-n', type=int, default=10)
args = parser.parse_args()
results = monte_carlo_market(
n_simulations=args.n,
base_params={'n_ticks': 200},
vary_params={'sigma': (0.005, 0.05), 'gamma': (0.01, 1.0)},
)
print(json.dumps(results[-1], indent=2))
print(f'\n{len(results)} simulaciones completadas')
@@ -0,0 +1,65 @@
---
name: run_market_sim
kind: pipeline
lang: py
domain: pipelines
version: "1.0.0"
purity: impure
signature: "def run_market_sim(initial_price: float, n_ticks: int, sigma: float, mu: float, jump_intensity: float, jump_size_std: float, n_makers: int, maker_spread: float, gamma: float, maker_levels: int, maker_qty: float, n_takers_lambda: float, taker_size_alpha: float, taker_size_min: float, taker_size_max: float, hawkes_alpha: float, hawkes_beta: float, seed: int) -> dict"
description: "Simula un mercado completo con matching engine FIFO. Makers usan Avellaneda-Stoikov, takers llegan según proceso Hawkes con tamaños power-law. Retorna trades, spreads, midprices y PnL de makers."
tags: [simulation, market, matching-engine, montecarlo, launcher, finance, microstructure]
uses_functions:
- generate_gbm_prices_py_finance
- avellaneda_stoikov_quotes_py_finance
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [numpy]
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/pipelines/run_market_sim.py"
---
## Ejemplo
```bash
python python/functions/pipelines/run_market_sim.py
# {
# "total_trades": 1234,
# "mean_spread": 0.4821,
# "maker_pnls": [12.5, -3.2, 8.1, 5.6, -1.4]
# }
```
```python
from run_market_sim import run_market_sim
result = run_market_sim(
initial_price=100.0,
n_ticks=200,
sigma=0.01,
n_makers=3,
seed=0,
)
print(result['total_trades'])
print(result['maker_pnls'])
```
## Flujo
1. `generate_gbm_prices` — genera la serie de precios fundamentales con GBM + saltos
2. Loop por ticks:
- Cada maker coloca quotes via `avellaneda_stoikov_quotes`
- Takers llegan según Poisson con intensidad modulada por excitación Hawkes
- Tamaños de taker siguen distribución Pareto (power-law)
- Matching FIFO sobre el order book simplificado
- Excitación Hawkes decae exponencialmente entre ticks
3. Mark-to-market final de inventarios de makers
## Notas
Los parámetros Hawkes (`hawkes_alpha`, `hawkes_beta`) controlan la autocorrelación del flujo de órdenes.
`branching_ratio = hawkes_alpha / hawkes_beta`; si > 1, el proceso es explosivo.
El matching es simplificado: no hay cancelaciones intra-tick, el book se reconstituye en cada tick.
@@ -0,0 +1,149 @@
"""Ejecuta una simulación de mercado completa con matching engine FIFO."""
import sys
import os
import json
def run_market_sim(
initial_price: float = 100.0,
n_ticks: int = 500,
sigma: float = 0.02,
mu: float = 0.0,
jump_intensity: float = 0.02,
jump_size_std: float = 0.05,
n_makers: int = 5,
maker_spread: float = 0.5,
gamma: float = 0.1,
maker_levels: int = 3,
maker_qty: float = 10.0,
n_takers_lambda: float = 2.0,
taker_size_alpha: float = 2.0,
taker_size_min: float = 1.0,
taker_size_max: float = 100.0,
hawkes_alpha: float = 0.5,
hawkes_beta: float = 1.0,
seed: int = 42,
) -> dict:
"""Simula un mercado con makers (Avellaneda-Stoikov) y takers (Hawkes + power-law).
Retorna dict con:
- trade_prices, trade_times, trade_sizes: listas de trades
- spreads, midprices: series por tick
- n_trades_per_tick: arrivals por tick
- maker_pnls: PnL final de cada maker
- total_trades: conteo total
"""
import numpy as np
# Importar funciones del registry
sys.path.insert(0, os.path.join(os.environ.get('FN_REGISTRY_ROOT', os.path.expanduser('~/fn_registry')), 'python', 'functions'))
from finance.finance import generate_gbm_prices, avellaneda_stoikov_quotes
rng = np.random.default_rng(seed)
# Generar precios fundamentales
fund_prices = generate_gbm_prices(initial_price, n_ticks, sigma, mu, jump_intensity, jump_size_std, seed)
# Order book simplificado: listas de (price, qty, maker_idx)
# Matching inline para no depender del notebook
trade_prices, trade_times, trade_sizes = [], [], []
spreads, midprices = [], []
n_trades_per_tick = []
maker_inventories = [0.0] * n_makers
maker_pnls = [0.0] * n_makers
hawkes_excitation = 0.0
for t in range(n_ticks):
mid = fund_prices[t]
# Makers place orders
all_bids = [] # (price, qty, maker_idx)
all_asks = []
for m in range(n_makers):
noise = rng.uniform(-0.05, 0.05)
quotes = avellaneda_stoikov_quotes(
mid + noise, maker_inventories[m], gamma, sigma, maker_spread, maker_levels, maker_qty
)
for q in quotes:
if q['side'] == 'buy':
all_bids.append((q['price'], q['qty'], m))
else:
all_asks.append((q['price'], q['qty'], m))
all_bids.sort(key=lambda x: -x[0]) # best bid first
all_asks.sort(key=lambda x: x[0]) # best ask first
# Record book state
if all_bids and all_asks:
spreads.append(all_asks[0][0] - all_bids[0][0])
midprices.append((all_bids[0][0] + all_asks[0][0]) / 2)
else:
spreads.append(0.0)
midprices.append(mid)
# Takers arrive (Hawkes)
lam = max(0.1, n_takers_lambda + hawkes_excitation)
n_takers = rng.poisson(lam)
tick_trades = 0
for _ in range(n_takers):
side = 'buy' if rng.random() < 0.5 else 'sell'
raw_size = (rng.pareto(taker_size_alpha) + 1) * taker_size_min
qty_remaining = min(round(raw_size, 1), taker_size_max)
book = list(all_asks) if side == 'buy' else list(all_bids)
for i, (price, available, maker_idx) in enumerate(book):
if qty_remaining <= 0:
break
fill = min(qty_remaining, available)
trade_prices.append(price)
trade_times.append(t)
trade_sizes.append(fill)
tick_trades += 1
qty_remaining -= fill
if side == 'buy':
maker_inventories[maker_idx] -= fill
maker_pnls[maker_idx] += price * fill
else:
maker_inventories[maker_idx] += fill
maker_pnls[maker_idx] -= price * fill
book[i] = (price, available - fill, maker_idx)
if side == 'buy':
all_asks = [(p, q, m) for p, q, m in book if q > 0]
else:
all_bids = [(p, q, m) for p, q, m in book if q > 0]
hawkes_excitation *= np.exp(-hawkes_beta)
hawkes_excitation += hawkes_alpha * tick_trades
n_trades_per_tick.append(tick_trades)
# Mark to market
final_price = fund_prices[-1]
for m in range(n_makers):
maker_pnls[m] += maker_inventories[m] * final_price
return {
'trade_prices': trade_prices,
'trade_times': trade_times,
'trade_sizes': trade_sizes,
'spreads': spreads,
'midprices': midprices,
'n_trades_per_tick': n_trades_per_tick,
'fundamental_prices': fund_prices,
'maker_pnls': [round(p, 2) for p in maker_pnls],
'total_trades': len(trade_prices),
}
if __name__ == '__main__':
result = run_market_sim()
print(json.dumps({
'total_trades': result['total_trades'],
'mean_spread': round(sum(result['spreads']) / len(result['spreads']), 4),
'maker_pnls': result['maker_pnls'],
}, indent=2))