import secrets
import struct

# --- Envelope Encryption (AES-256-GCM) ---

_ENVELOPE_MAGIC = b"OVE1"
_ENVELOPE_VERSION = 0x01
_HEADER_SIZE = 12  # magic(4) + version(1) + reserved(1) + efk_len(2) + kiv_len(2) + div_len(2)
_HEADER_FIELDS = struct.Struct(">BBHHH")  # version, reserved, efk_len, kiv_len, div_len
_KEY_SIZE = 32  # bytes: AES-256 for both the master key (KEK) and the file key (DEK)
_IV_SIZE = 12  # bytes: nonce length used for both GCM encryptions


def _require_master_key(master_key: bytes) -> None:
    """Reject master keys that are not exactly 32 bytes long.

    AESGCM itself also accepts 16- and 24-byte keys, which would silently
    turn the documented AES-256 scheme into AES-128/192.

    Raises:
        ValueError: if ``master_key`` is not 32 bytes long.
    """
    if len(master_key) != _KEY_SIZE:
        raise ValueError(
            f"master_key debe tener exactamente {_KEY_SIZE} bytes (AES-256), "
            f"se recibieron {len(master_key)}"
        )


def _load_aesgcm():
    """Return the AESGCM class, importing ``cryptography`` only when needed.

    The import is deferred so that the framing helpers, the passthrough path
    of ``envelope_decrypt`` and the rest of this module stay usable when the
    ``cryptography`` package is not installed.
    """
    from cryptography.hazmat.primitives.ciphers.aead import AESGCM

    return AESGCM


def _build_envelope(
    encrypted_file_key: bytes,
    key_iv: bytes,
    data_iv: bytes,
    encrypted_content: bytes,
) -> bytes:
    """Serialize the envelope components into the binary format (pure helper).

    Header (12 bytes):
        Magic    (4B): b"OVE1"
        Version  (1B): 0x01
        Reserved (1B): 0x00
        EFK_len  (2B): length of encrypted_file_key (big-endian)
        KIV_len  (2B): length of key_iv (big-endian)
        DIV_len  (2B): length of data_iv (big-endian)
    Followed by: encrypted_file_key + key_iv + data_iv + encrypted_content

    Raises:
        struct.error: if any of the three lengths does not fit in 16 bits.
    """
    header = _ENVELOPE_MAGIC + _HEADER_FIELDS.pack(
        _ENVELOPE_VERSION,
        0x00,
        len(encrypted_file_key),
        len(key_iv),
        len(data_iv),
    )
    return b"".join((header, encrypted_file_key, key_iv, data_iv, encrypted_content))


def _parse_envelope(ciphertext: bytes) -> tuple[bytes, bytes, bytes, bytes]:
    """Split a binary envelope into its components (pure helper).

    Returns:
        (encrypted_file_key, key_iv, data_iv, encrypted_content)

    Raises:
        ValueError: if the envelope is truncated, the magic is wrong, or the
            version is not supported.
    """
    if len(ciphertext) < _HEADER_SIZE:
        raise ValueError(
            f"Envelope truncado: se esperaban al menos {_HEADER_SIZE} bytes, "
            f"se recibieron {len(ciphertext)}"
        )

    magic = ciphertext[:4]
    if magic != _ENVELOPE_MAGIC:
        raise ValueError(f"Magic invalido: se esperaba {_ENVELOPE_MAGIC!r}, se obtuvo {magic!r}")

    version, _reserved, efk_len, kiv_len, div_len = _HEADER_FIELDS.unpack_from(ciphertext, 4)
    if version != _ENVELOPE_VERSION:
        raise ValueError(f"Version de envelope no soportada: {version}")

    # The three variable-length fields must be fully present; whatever follows
    # them (possibly nothing) is the encrypted content.
    key_iv_start = _HEADER_SIZE + efk_len
    data_iv_start = key_iv_start + kiv_len
    content_start = data_iv_start + div_len
    if len(ciphertext) < content_start:
        raise ValueError("Envelope truncado: longitudes declaradas exceden los datos disponibles")

    return (
        ciphertext[_HEADER_SIZE:key_iv_start],
        ciphertext[key_iv_start:data_iv_start],
        ciphertext[data_iv_start:content_start],
        ciphertext[content_start:],
    )


def envelope_encrypt(plaintext: bytes, master_key: bytes) -> bytes:
    """Encrypt data with the envelope-encryption pattern using AES-256-GCM.

    A random 32-byte file key (DEK) encrypts the data; the file key is then
    encrypted with ``master_key`` (KEK). The returned envelope carries
    everything needed to decrypt given the master key.

    Args:
        plaintext: Data to encrypt (may be empty).
        master_key: 32-byte master key (AES-256).

    Returns:
        The binary envelope, starting with the magic b"OVE1".

    Raises:
        ValueError: if ``master_key`` is not exactly 32 bytes long.
    """
    # Validate before generating any key material or touching the crypto backend.
    _require_master_key(master_key)
    aesgcm_cls = _load_aesgcm()

    # 1. Random file key (DEK) and a fresh nonce for the content.
    file_key = secrets.token_bytes(_KEY_SIZE)
    data_iv = secrets.token_bytes(_IV_SIZE)
    encrypted_content = aesgcm_cls(file_key).encrypt(data_iv, plaintext, None)

    # 2. Wrap the file key with the master key (KEK) under its own nonce.
    key_iv = secrets.token_bytes(_IV_SIZE)
    encrypted_file_key = aesgcm_cls(master_key).encrypt(key_iv, file_key, None)

    # 3. Frame everything into the envelope.
    return _build_envelope(encrypted_file_key, key_iv, data_iv, encrypted_content)


def envelope_decrypt(ciphertext: bytes, master_key: bytes) -> bytes:
    """Decrypt data produced by ``envelope_encrypt``.

    Input that does not start with the magic b"OVE1" is assumed to be
    unencrypted and is returned unchanged (passthrough), so the function can
    be applied to files that may or may not be encrypted.

    NOTE(review): because of the passthrough, unencrypted input is accepted
    without any authentication, and plain data that happens to start with
    b"OVE1" is parsed as an envelope. No associated data is passed to GCM, so
    the header fields themselves are not authenticated.

    Args:
        ciphertext: Encrypted envelope (or plain data without the magic).
        master_key: 32-byte master key (AES-256).

    Returns:
        The decrypted data, or ``ciphertext`` itself when it has no magic.

    Raises:
        ValueError: if the envelope is corrupt or truncated, or if
            ``master_key`` is not exactly 32 bytes long.
        cryptography.exceptions.InvalidTag: if the master key is wrong or the
            data was tampered with (GCM authentication failure).
    """
    # Passthrough: no magic means "not encrypted"; the key is not inspected.
    if not ciphertext.startswith(_ENVELOPE_MAGIC):
        return ciphertext

    encrypted_file_key, key_iv, data_iv, encrypted_content = _parse_envelope(ciphertext)

    _require_master_key(master_key)
    aesgcm_cls = _load_aesgcm()

    # Unwrap the file key with the master key, then decrypt the content with it.
    file_key = aesgcm_cls(master_key).decrypt(key_iv, encrypted_file_key, None)
    return aesgcm_cls(file_key).decrypt(data_iv, encrypted_content, None)
+tags: [decryption, aes, gcm, envelope-encryption, dek, kek, cryptography, cybersecurity, passthrough] +uses_functions: [envelope_encrypt_py_cybersecurity] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [cryptography, struct] +tested: true +tests: + - "decrypt de datos cifrados" + - "decrypt de datos no cifrados passthrough" + - "key incorrecta" + - "envelope truncado" + - "magic invalido" +test_file_path: "python/functions/cybersecurity/envelope_encrypt_test.py" +file_path: "python/functions/cybersecurity/cybersecurity.py" +--- + +## Ejemplo + +```python +import secrets +from cybersecurity import envelope_encrypt, envelope_decrypt + +master_key = secrets.token_bytes(32) + +# Caso 1: descifrar datos cifrados +ciphertext = envelope_encrypt(b"datos secretos", master_key) +plaintext = envelope_decrypt(ciphertext, master_key) +# plaintext == b"datos secretos" + +# Caso 2: passthrough — datos no cifrados +raw = b"archivo en plano" +result = envelope_decrypt(raw, master_key) +# result == b"archivo en plano" (sin modificar) + +# Caso 3: key incorrecta — lanza InvalidTag +wrong_key = secrets.token_bytes(32) +# envelope_decrypt(ciphertext, wrong_key) → cryptography.exceptions.InvalidTag +``` + +## Notas + +Implementacion original inspirada en OpenViking `openviking/crypto/encryptor.py` (AGPL-3.0). Reimplementada desde cero. + +- **Passthrough**: si `ciphertext` no empieza con `b"OVE1"`, se retorna sin modificar. Permite usar la funcion indistintamente en archivos cifrados y no cifrados. +- **Autenticacion GCM**: si la master_key es incorrecta o los datos fueron manipulados, `cryptography.exceptions.InvalidTag` es lanzado por la capa GCM — nunca se retorna texto corrupto. +- **ValueError**: lanzado si el envelope tiene magic correcto pero estructura invalida (truncado o version no soportada). +- `master_key` debe ser de exactamente 32 bytes para AES-256. +- Requiere `cryptography` instalado: `uv add cryptography`. 
diff --git a/python/functions/cybersecurity/envelope_encrypt.md b/python/functions/cybersecurity/envelope_encrypt.md new file mode 100644 index 00000000..0ab7b4a0 --- /dev/null +++ b/python/functions/cybersecurity/envelope_encrypt.md @@ -0,0 +1,68 @@ +--- +name: envelope_encrypt +kind: function +lang: py +domain: cybersecurity +version: "1.0.0" +purity: impure +signature: "def envelope_encrypt(plaintext: bytes, master_key: bytes) -> bytes" +description: "Cifra datos usando patron Envelope Encryption con AES-256-GCM. Genera una file key aleatoria (DEK), cifra los datos con ella, luego cifra la file key con la master_key (KEK). Retorna un envelope binario con magic b'OVE1'." +tags: [encryption, aes, gcm, envelope-encryption, dek, kek, cryptography, cybersecurity] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [cryptography, secrets, struct] +tested: true +tests: + - "encrypt → decrypt roundtrip" + - "datos vacios" + - "datos grandes" + - "ciphertext tiene magic correcto" + - "ciphertext es distinto cada vez" +test_file_path: "python/functions/cybersecurity/envelope_encrypt_test.py" +file_path: "python/functions/cybersecurity/cybersecurity.py" +--- + +## Ejemplo + +```python +import secrets +from cybersecurity import envelope_encrypt, envelope_decrypt + +master_key = secrets.token_bytes(32) # 256-bit KEK +plaintext = b"datos confidenciales" + +ciphertext = envelope_encrypt(plaintext, master_key) +# ciphertext[:4] == b"OVE1" + +recovered = envelope_decrypt(ciphertext, master_key) +# recovered == plaintext +``` + +## Formato del envelope + +``` +Magic (4B): b"OVE1" identificador de formato +Version (1B): 0x01 version del protocolo +Reserved (1B): 0x00 reservado para uso futuro +EFK_len (2B): big-endian longitud de encrypted_file_key +KIV_len (2B): big-endian longitud de key_iv +DIV_len (2B): big-endian longitud de data_iv +--- header: 12 bytes total --- +Encrypted File Key (variable, incluye GCM auth tag 
"""Tests for envelope_encrypt and envelope_decrypt."""

import os
import secrets
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from cryptography.exceptions import InvalidTag

from cybersecurity import envelope_encrypt, envelope_decrypt


def test_encrypt_decrypt_roundtrip():
    master_key = secrets.token_bytes(32)
    plaintext = b"datos de prueba para envelope encryption"
    ciphertext = envelope_encrypt(plaintext, master_key)
    result = envelope_decrypt(ciphertext, master_key)
    assert result == plaintext


def test_datos_vacios():
    master_key = secrets.token_bytes(32)
    ciphertext = envelope_encrypt(b"", master_key)
    result = envelope_decrypt(ciphertext, master_key)
    assert result == b""


def test_datos_grandes():
    master_key = secrets.token_bytes(32)
    plaintext = secrets.token_bytes(1024 * 1024)  # 1 MB
    ciphertext = envelope_encrypt(plaintext, master_key)
    result = envelope_decrypt(ciphertext, master_key)
    assert result == plaintext


def test_decrypt_datos_no_cifrados_passthrough():
    master_key = secrets.token_bytes(32)
    plain = b"archivo no cifrado, sin magic bytes"
    result = envelope_decrypt(plain, master_key)
    assert result == plain


def test_key_incorrecta():
    master_key = secrets.token_bytes(32)
    wrong_key = secrets.token_bytes(32)
    ciphertext = envelope_encrypt(b"secreto", master_key)
    # The failure assertion lives in the `else` branch: placed inside the
    # `try`, its AssertionError would be caught by a broad handler and the
    # test could never fail.
    try:
        envelope_decrypt(ciphertext, wrong_key)
    except InvalidTag:
        pass  # expected: GCM authentication failure
    else:
        assert False, "deberia haber lanzado excepcion"


def test_envelope_truncado():
    master_key = secrets.token_bytes(32)
    ciphertext = envelope_encrypt(b"datos", master_key)
    truncated = ciphertext[:6]  # incomplete header
    try:
        envelope_decrypt(truncated, master_key)
    except ValueError:
        pass
    else:
        assert False, "deberia haber lanzado ValueError"


def test_magic_invalido():
    master_key = secrets.token_bytes(32)
    # Valid magic (so the passthrough check is skipped) followed by a zeroed
    # header: version 0x00 is not supported, so parsing must raise ValueError.
    bad_envelope = b"OVE1" + b"\x00" * 20
    try:
        envelope_decrypt(bad_envelope, master_key)
    except ValueError:
        pass
    else:
        assert False, "deberia haber lanzado excepcion"


def test_ciphertext_tiene_magic_correcto():
    master_key = secrets.token_bytes(32)
    ciphertext = envelope_encrypt(b"test", master_key)
    assert ciphertext[:4] == b"OVE1"


def test_ciphertext_es_distinto_cada_vez():
    master_key = secrets.token_bytes(32)
    plaintext = b"mismo mensaje"
    ct1 = envelope_encrypt(plaintext, master_key)
    ct2 = envelope_encrypt(plaintext, master_key)
    # Fresh random file key and IVs on every call make the envelopes differ.
    assert ct1 != ct2


if __name__ == "__main__":
    test_encrypt_decrypt_roundtrip()
    test_datos_vacios()
    test_datos_grandes()
    test_decrypt_datos_no_cifrados_passthrough()
    test_key_incorrecta()
    test_envelope_truncado()
    test_magic_invalido()
    test_ciphertext_tiene_magic_correcto()
    test_ciphertext_es_distinto_cada_vez()
    print("Todos los tests pasaron.")
b/python/functions/datascience/aggregate_by_group.md new file mode 100644 index 00000000..f3010863 --- /dev/null +++ b/python/functions/datascience/aggregate_by_group.md @@ -0,0 +1,45 @@ +--- +name: aggregate_by_group +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def aggregate_by_group(rows: list[dict], group_by: list[str], aggs: dict[str, str]) -> list[dict]" +description: "GROUP BY + agregaciones sobre datos tabulares. aggs es un dict de columna a funcion (sum, mean, count, min, max, first, last, collect). collect acumula valores en lista. None se ignora en agregaciones numericas." +tags: [datascience, tabular, groupby, aggregate, transform, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: ["collections"] +tested: true +tests: + - "Group by una columna con sum" + - "Group by multiples columnas" + - "Agregacion mean count min max" + - "collect acumula en lista" + - "Grupo con una sola fila" + - "Campo con None se ignora en agregaciones numericas" +test_file_path: "python/functions/datascience/aggregate_by_group_test.py" +file_path: "python/functions/datascience/aggregate_by_group.py" +--- + +## Ejemplo + +```python +rows = [ + {"dept": "eng", "salary": 100}, + {"dept": "eng", "salary": 120}, + {"dept": "sales", "salary": 80}, +] +aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "mean"}) +# [{"dept": "eng", "salary": 110.0}, {"dept": "sales", "salary": 80.0}] +``` + +## Notas + +Funcion pura sin dependencias externas (solo collections.defaultdict de stdlib). +Preserva el orden de primera aparicion de cada grupo. +La funcion 'collect' no filtra None — acumula todos los valores incluyendo None. 
"""GROUP BY + aggregations over tabular ``list[dict]`` data."""

from collections import defaultdict

# A tuple (not a set) so that membership is decided by equality and an
# unhashable aggregation name is still rejected with ValueError.
_SUPPORTED_AGGS = ("sum", "mean", "count", "min", "max", "first", "last", "collect")


def aggregate_by_group(
    rows: list[dict],
    group_by: list[str],
    aggs: dict[str, str],
) -> list[dict]:
    """Group rows by one or more columns and apply aggregations.

    Equivalent to SQL GROUP BY with aggregate functions. ``collect`` gathers
    every value of the group into a list, and ``count`` counts every row of
    the group; neither filters out None. None values are ignored by the
    numeric aggregations (sum, mean, min, max), which yield None when a group
    has no non-None value.

    Args:
        rows: List of dicts holding the data. Missing keys are read as None.
        group_by: Columns to group by.
        aggs: Mapping ``{column: function}``. Functions: sum, mean, count,
            min, max, first, last, collect.

    Returns:
        One dict per group with the ``group_by`` columns plus the aggregated
        columns, in order of first appearance of each group.

    Raises:
        ValueError: if ``aggs`` names an unsupported function. This is checked
            up front, so it is raised even for empty input or for columns
            whose values are all None.
    """
    for func in aggs.values():
        if func not in _SUPPORTED_AGGS:
            raise ValueError(f"Funcion de agregacion no soportada: {func}")

    # Group keys in order of first appearance, plus the per-group, per-column values.
    group_keys: list[tuple] = []
    seen_groups: set[tuple] = set()
    buckets: dict[tuple, dict[str, list]] = defaultdict(lambda: defaultdict(list))

    for row in rows:
        gk = tuple(row.get(col) for col in group_by)
        if gk not in seen_groups:
            seen_groups.add(gk)
            group_keys.append(gk)
        for col in aggs:
            buckets[gk][col].append(row.get(col))

    numeric_funcs = {
        "sum": sum,
        "mean": lambda xs: sum(xs) / len(xs),
        "min": min,
        "max": max,
    }

    def _aggregate(vals: list, func: str):
        if func == "collect":
            return vals
        if func == "count":
            return len(vals)
        if func == "first":
            return vals[0] if vals else None
        if func == "last":
            return vals[-1] if vals else None
        # sum, mean, min, max: ignore None
        numeric = [v for v in vals if v is not None]
        return numeric_funcs[func](numeric) if numeric else None

    result = []
    for gk in group_keys:
        record: dict = dict(zip(group_by, gk))
        for col, func in aggs.items():
            record[col] = _aggregate(buckets[gk][col], func)
        result.append(record)

    return result
r["dept"] == "eng") + sales = next(r for r in result if r["dept"] == "sales") + assert eng["salary"] == 220 + assert sales["salary"] == 80 + + +def test_group_by_multiples_columnas(): + """Group by multiples columnas.""" + rows = [ + {"dept": "eng", "level": "senior", "salary": 150}, + {"dept": "eng", "level": "junior", "salary": 80}, + {"dept": "eng", "level": "senior", "salary": 160}, + {"dept": "sales", "level": "senior", "salary": 120}, + ] + result = aggregate_by_group(rows, group_by=["dept", "level"], aggs={"salary": "sum"}) + assert len(result) == 3 + eng_senior = next(r for r in result if r["dept"] == "eng" and r["level"] == "senior") + assert eng_senior["salary"] == 310 + + +def test_agregacion_mean_count_min_max(): + """Agregacion mean count min max.""" + rows = [ + {"cat": "A", "val": 10}, + {"cat": "A", "val": 20}, + {"cat": "A", "val": 30}, + ] + result_mean = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "mean"}) + assert result_mean[0]["val"] == 20.0 + + result_count = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "count"}) + assert result_count[0]["val"] == 3 + + result_min = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "min"}) + assert result_min[0]["val"] == 10 + + result_max = aggregate_by_group(rows, group_by=["cat"], aggs={"val": "max"}) + assert result_max[0]["val"] == 30 + + +def test_collect_acumula_en_lista(): + """collect acumula en lista.""" + rows = [ + {"dept": "eng", "name": "Alice"}, + {"dept": "eng", "name": "Bob"}, + {"dept": "sales", "name": "Carol"}, + ] + result = aggregate_by_group(rows, group_by=["dept"], aggs={"name": "collect"}) + eng = next(r for r in result if r["dept"] == "eng") + assert sorted(eng["name"]) == ["Alice", "Bob"] + + +def test_grupo_con_una_sola_fila(): + """Grupo con una sola fila.""" + rows = [{"dept": "eng", "salary": 100}] + result = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "sum"}) + assert len(result) == 1 + assert result[0]["salary"] == 100 + + +def 
test_campo_con_none_se_ignora_en_agregaciones_numericas(): + """Campo con None se ignora en agregaciones numericas.""" + rows = [ + {"dept": "eng", "salary": 100}, + {"dept": "eng", "salary": None}, + {"dept": "eng", "salary": 200}, + ] + result = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "sum"}) + assert result[0]["salary"] == 300 + + result_mean = aggregate_by_group(rows, group_by=["dept"], aggs={"salary": "mean"}) + assert result_mean[0]["salary"] == 150.0 diff --git a/python/functions/datascience/build_entity_schema_prompt.md b/python/functions/datascience/build_entity_schema_prompt.md new file mode 100644 index 00000000..5fe10aa4 --- /dev/null +++ b/python/functions/datascience/build_entity_schema_prompt.md @@ -0,0 +1,62 @@ +--- +name: build_entity_schema_prompt +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def build_entity_schema_prompt(entity_presets: list[dict]) -> str" +description: "Genera la seccion del system prompt que describe los entity types disponibles para extraccion. Formatea los presets del registry en texto legible para el LLM." 
+tags: [prompt, llm, entity, schema, osint, graph, extraction] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "lista con varios presets" + - "lista vacia retorna string vacio" + - "preset sin metadata_fields" +test_file_path: "python/functions/datascience/build_entity_schema_prompt_test.py" +file_path: "python/functions/datascience/build_entity_schema_prompt.py" +--- + +## Ejemplo + +```python +from build_entity_schema_prompt import build_entity_schema_prompt + +presets = [ + { + "type_ref": "osint_person_go_cybersecurity", + "label": "Person", + "metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"], + }, + { + "type_ref": "osint_organization_go_cybersecurity", + "label": "Organization", + "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"], + }, +] + +prompt = build_entity_schema_prompt(presets) +# Entity types available for extraction: +# +# 1. Person (type_ref: osint_person_go_cybersecurity) +# Attributes: full_name, alias, nationality, dob, risk_score +# +# 2. Organization (type_ref: osint_organization_go_cybersecurity) +# Attributes: legal_name, country, sector, founded, risk_score +``` + +## Notas + +Funcion pura. No requiere dependencias externas. + +El formato de salida es deliberadamente sencillo para maximizar la comprension por el LLM: numero de orden, label humano, type_ref del registry y lista de atributos en una sola linea. + +Si un preset no tiene `metadata_fields` (o tiene lista vacia), se omite la linea de atributos. + +Pensada para componer con `build_relation_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT. 
"""Build the system-prompt section that describes the entity types available for extraction."""


def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the registry presets as LLM-readable text.

    Each preset becomes a numbered entry showing its label and type_ref,
    followed by an attributes line when it declares metadata fields. Entries
    are separated by a blank line.

    Args:
        entity_presets: Presets with the keys 'label', 'type_ref' and,
            optionally, 'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity",
              "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]

    Returns:
        The formatted prompt section, or an empty string when there are no
        presets.
    """
    if not entity_presets:
        return ""

    def _render(position: int, preset: dict) -> str:
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        entry = f"{position}. {label} (type_ref: {type_ref})"
        fields = preset.get("metadata_fields", [])
        if not fields:
            return entry
        # NOTE(review): the whitespace inside this literal is reproduced as it
        # appears in the reviewed text, where runs of spaces were collapsed;
        # confirm the intended indent before "Attributes:".
        return entry + "\n" + f" Attributes: {', '.join(fields)}"

    entries = [_render(i, preset) for i, preset in enumerate(entity_presets, start=1)]
    # Heading, blank line, then the entries separated by one blank line each.
    return "Entity types available for extraction:\n\n" + "\n\n".join(entries)
Person (type_ref: osint_person_go_cybersecurity)" in result + assert "Attributes:" not in result diff --git a/python/functions/datascience/build_relation_schema_prompt.md b/python/functions/datascience/build_relation_schema_prompt.md new file mode 100644 index 00000000..e76ffa85 --- /dev/null +++ b/python/functions/datascience/build_relation_schema_prompt.md @@ -0,0 +1,43 @@ +--- +name: build_relation_schema_prompt +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def build_relation_schema_prompt(relation_types: list[str]) -> str" +description: "Genera la seccion del system prompt con los tipos de relacion permitidos para extraccion. Formatea la lista de tipos en texto legible para el LLM." +tags: [prompt, llm, relation, schema, osint, graph, extraction] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "lista con varios tipos" + - "lista vacia retorna string vacio" + - "un solo tipo" +test_file_path: "python/functions/datascience/build_relation_schema_prompt_test.py" +file_path: "python/functions/datascience/build_relation_schema_prompt.py" +--- + +## Ejemplo + +```python +from build_relation_schema_prompt import build_relation_schema_prompt + +types = ["funds", "employs", "communicates_with", "owns"] +prompt = build_relation_schema_prompt(types) +# Allowed relation types: +# funds, employs, communicates_with, owns +``` + +## Notas + +Funcion pura. No requiere dependencias externas. + +La salida es una sola linea con todos los tipos separados por coma, precedida por el encabezado. El formato es minimal para no consumir tokens innecesarios del contexto del LLM. + +Pensada para componer con `build_entity_schema_prompt` al construir el system prompt completo de extraccion de grafos OSINT. 
"""Build the system-prompt section listing the allowed relation types."""


def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the allowed relation types as LLM-readable text.

    The output is a heading line followed by a single line with all types
    separated by ", ".

    Args:
        relation_types: Allowed relation type names.
            Example: ["funds", "employs", "communicates_with"]

    Returns:
        The formatted prompt section, or an empty string when the list is
        empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict:
    """Estimate Hawkes-process parameters from the autocorrelation of arrivals.

    Computes the autocorrelation function (ACF) of ``arrivals`` for lags
    ``1 .. max_lag - 1`` and fits ``a * exp(-b * lag)`` to it.

    Args:
        arrivals: Series of arrival counts.
        max_lag: Exclusive upper bound for the lags. Lags are additionally
            capped at ``len(arrivals) - 1`` so that every lag has at least one
            overlapping pair of observations.

    Returns:
        Dict with ``alpha``, ``beta``, ``branching_ratio`` (alpha / beta) and
        ``acf`` (list whose first element is 1.0 for lag 0). The fallback
        ``alpha=0.0, beta=1.0, branching_ratio=0.0`` is returned when the
        series has fewer than two points or zero variance, when there are
        fewer than two lags to fit, when the lag-1 autocorrelation is at most
        0.01, or when the fit fails.
    """
    import numpy as np
    from scipy.optimize import curve_fit

    def _fallback(acf_values: list) -> dict:
        return {'alpha': 0.0, 'beta': 1.0, 'branching_ratio': 0.0, 'acf': acf_values}

    arr = np.asarray(arrivals, dtype=float)
    if arr.size < 2:
        # Mean/variance of an empty series are NaN; a single point has no lags.
        return _fallback([1.0])

    mean_a = np.mean(arr)
    var_a = np.var(arr)
    if var_a == 0:
        return _fallback([1.0])

    # A lag >= len(arr) would average an empty slice (NaN), so cap the range.
    lags = np.arange(1, min(max_lag, arr.size))
    acf = [1.0] + [
        float(np.mean((arr[lag:] - mean_a) * (arr[:-lag] - mean_a)) / var_a)
        for lag in lags
    ]
    acf_vals = np.array(acf[1:])

    # The two-parameter fit needs at least two points, and without positive
    # lag-1 autocorrelation there is no decay to fit.
    if acf_vals.size < 2 or acf_vals[0] <= 0.01:
        return _fallback(acf)

    def _exp_decay(x, a, b):
        return a * np.exp(-b * x)

    try:
        popt, _ = curve_fit(_exp_decay, lags, acf_vals, p0=[0.5, 0.5], maxfev=5000)
        alpha_est, beta_est = abs(float(popt[0])), abs(float(popt[1]))
    except (RuntimeError, ValueError):
        # RuntimeError: the fit did not converge; ValueError: non-finite input.
        alpha_est, beta_est = 0.0, 1.0

    branching = alpha_est / beta_est if beta_est > 0 else 0.0
    return {
        'alpha': round(alpha_est, 4),
        'beta': round(beta_est, 4),
        'branching_ratio': round(branching, 4),
        'acf': acf,
    }


def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict:
    """Estimate the Pareto tail exponent alpha by maximum likelihood.

    alpha = n / sum(ln(x_i / x_min)) over the tail ``x_i >= x_min``, where
    ``x_min`` is the given percentile of the strictly positive values. A lower
    alpha means a heavier tail, i.e. more extreme values.

    Args:
        values: Observations; values that are not > 0 are discarded.
        x_min_percentile: Percentile (0-100) of the positive values used as
            ``x_min``.

    Returns:
        Dict with ``alpha``, ``x_min`` and ``n_tail``. ``alpha`` is 0.0 when
        it cannot be estimated: fewer than 10 positive values, fewer than 2
        tail points, or a tail in which every value equals ``x_min``.
    """
    import numpy as np

    arr = np.array([v for v in values if v > 0], dtype=float)
    if len(arr) < 10:
        return {'alpha': 0.0, 'x_min': 0.0, 'n_tail': 0}

    x_min = float(np.percentile(arr, x_min_percentile))
    tail = arr[arr >= x_min]
    n_tail = int(tail.size)

    if n_tail < 2 or x_min <= 0:
        return {'alpha': 0.0, 'x_min': x_min, 'n_tail': n_tail}

    log_sum = float(np.sum(np.log(tail / x_min)))
    if log_sum <= 0.0:
        # Every tail value equals x_min: the estimator would divide by zero.
        return {'alpha': 0.0, 'x_min': round(x_min, 6), 'n_tail': n_tail}

    return {
        'alpha': round(n_tail / log_sum, 4),
        'x_min': round(x_min, 6),
        'n_tail': n_tail,
    }
+tags: [deduplication, entity, fuzzy, levenshtein, jaccard, union-find, knowledge-graph, nlp, fuzzygraph, datascience] +uses_functions: + - normalize_entity_name_py_core + - merge_entity_attributes_py_core +uses_types: + - entity_candidate_py_datascience + - deduplication_result_py_datascience +returns: [deduplication_result_py_datascience] +returns_optional: false +error_type: "" +imports: + - uuid +tested: true +tests: + - "John Smith y Smith, John se mergean" + - "Google y Google LLC se mergean" + - "192.168.1.1 y 192.168.1.1 se mergean por matching exacto" + - "John Smith (person) y John Smith (organization) NO se mergean" + - "Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster" + - "Entidades sin duplicados pasan sin modificacion" + - "Confidence toma el max del cluster; atributos se fusionan" + - "Lista vacia retorna resultado vacio" + - "name_to_id contiene todos los nombres originales del cluster" +test_file_path: "python/functions/datascience/deduplicate_entities_test.py" +file_path: "python/functions/datascience/deduplicate_entities.py" +--- + +## Ejemplo + +```python +from python.types.datascience.entity_candidate import EntityCandidate +from python.functions.datascience.deduplicate_entities import deduplicate_entities + +candidates = [ + EntityCandidate(name="John Smith", type_ref="person", confidence=0.9), + EntityCandidate(name="Smith, John", type_ref="person", confidence=0.85), + EntityCandidate(name="Google", type_ref="organization", confidence=0.95), + EntityCandidate(name="Google LLC", type_ref="organization", confidence=0.88), +] + +result = deduplicate_entities(candidates, name_threshold=0.85, same_type_only=True) +# result.total_before = 4 +# result.total_after = 2 +# result.merge_log = [ +# {"canonical": "John Smith", "merged": ["Smith, John"], "score": 0.91, "reason": "fuzzy_name"}, +# {"canonical": "Google", "merged": ["Google LLC"], "score": 0.89, "reason": "fuzzy_name"}, +# ] +``` + +## Algoritmo + +1. 
**Normalizar nombres** usando `normalize_entity_name()` sobre cada candidato segun su `type_ref` +2. **Comparacion pairwise** dentro del mismo tipo (si `same_type_only=True`): + - Para tipos tecnicos (ip, email, domain, crypto_wallet, phone): matching exacto normalizado + - Para el resto: `score = max(levenshtein_sim, jaccard_sim)` + bonus por contencion (+0.3) y acronimos (+0.3) +3. **Union-Find** para clusters transitivos: si A~B y B~C, entonces {A, B, C} forman un cluster +4. **Merge por cluster:** + - Nombre canonico: candidato con mayor `confidence` + - Atributos: `merge_entity_attributes()` sobre todos los candidatos del cluster + - Confidence: `max` del cluster + - Source chunks: union de todos los candidatos + - `merged_from`: union de todos los nombres originales + +## Heuristicas de similitud de nombres + +| Heuristica | Efecto | +|---|---| +| Levenshtein | `1 - (edit_distance / max_len)` | +| Jaccard sobre tokens | `\|A ∩ B\| / \|A ∪ B\|` | +| Score base | `max(lev_sim, jaccard_sim)` | +| Contencion (a in b o b in a) | `+0.3` hasta max 1.0 | +| Acronimo ("FBI" ~ "Federal Bureau of Investigation") | `+0.3` hasta max 1.0 | +| Tipos exactos (ip/email/domain) | solo matching exacto, ignora umbral | + +## Complejidad + +- Pairwise: O(N^2) — aceptable para <1000 entidades (tipico por documento) +- Union-Find con path compression: O(α(N)) amortizado por operacion +- Para escalar a >1000: pre-filtrar por primera letra o n-gram index antes de comparar + +## Notas + +Funcion pura. Implementa Levenshtein y Jaccard internamente para evitar dependencias externas a este modulo. Las funciones del registry `levenshtein_distance_py_cybersecurity` y `jaccard_similarity_py_cybersecurity` son equivalentes pero requieren imports adicionales — la implementacion inline mantiene la funcion sin dependencias fuera de la stdlib. 
"""Deduplicate candidate entities using fuzzy name matching."""

from __future__ import annotations

import sys
import os
import uuid

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.deduplication_result import DeduplicationResult
from python.functions.core.normalize_entity_name import normalize_entity_name
from python.functions.core.merge_entity_attributes import merge_entity_attributes


# -- Similarity helpers -------------------------------------------------------

def _levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between two strings."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)
    # Classic two-row dynamic programme: `prev` is the row for a[:i-1].
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
    """Return the Jaccard similarity of two token collections (as sets)."""
    set_a = set(tokens_a)
    set_b = set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    inter = len(set_a & set_b)
    union = len(set_a | set_b)
    return inter / union if union else 0.0


def _name_similarity(a: str, b: str) -> float:
    """Score the similarity of two normalized names in the range 0..1.

    The base score is the larger of the Levenshtein similarity and the
    Jaccard similarity over whitespace tokens. A +0.3 bonus (capped at 1.0)
    is added when one name contains the other, and again when one name is
    the acronym of the other.
    """
    if a == b:
        return 1.0

    max_len = max(len(a), len(b))
    lev_sim = 1.0 - (_levenshtein(a, b) / max_len) if max_len else 1.0

    tokens_a = a.split()
    tokens_b = b.split()
    jac_sim = _jaccard(tokens_a, tokens_b)

    score = max(lev_sim, jac_sim)

    # Containment bonus: one name is a substring of the other.
    if a in b or b in a:
        score = min(1.0, score + 0.3)

    # Acronym bonus: "FBI" ~ "Federal Bureau of Investigation".
    if _is_acronym_of(a, tokens_b) or _is_acronym_of(b, tokens_a):
        score = min(1.0, score + 0.3)

    return score


def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
    """Return True when ``candidate`` equals the initials of ``tokens``."""
    if not candidate or not tokens:
        return False
    initials = "".join(t[0] for t in tokens if t).upper()
    return candidate.upper() == initials


_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}


def _is_exact_type(entity_type: str) -> bool:
    """Return True for technical types that only accept exact matching."""
    return entity_type.lower() in _EXACT_TYPES


# -- Union-Find ---------------------------------------------------------------

class _UnionFind:
    """Disjoint-set forest over the indices ``0 .. n-1``."""

    def __init__(self, n: int) -> None:
        self._parent = list(range(n))
        self._rank = [0] * n

    def find(self, x: int) -> int:
        """Return the root of ``x``, halving the path on the way up."""
        while self._parent[x] != x:
            self._parent[x] = self._parent[self._parent[x]]
            x = self._parent[x]
        return x

    def union(self, x: int, y: int) -> None:
        """Merge the sets containing ``x`` and ``y`` (union by rank)."""
        rx, ry = self.find(x), self.find(y)
        if rx == ry:
            return
        if self._rank[rx] < self._rank[ry]:
            rx, ry = ry, rx
        self._parent[ry] = rx
        if self._rank[rx] == self._rank[ry]:
            self._rank[rx] += 1


# -- Main implementation ------------------------------------------------------

def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are normalized, compared pairwise, and linked through Union-Find so
    that matches are transitive (A~B and B~C puts A, B and C in one cluster).
    Each cluster becomes one canonical entity whose attributes are merged
    from all of its members.

    For technical types (ip, email, domain, crypto_wallet, phone) only an
    exact match of the normalized names is accepted, ignoring
    ``name_threshold``. When ``same_type_only`` is False this applies if
    either candidate of the pair has a technical type, so the outcome does
    not depend on the order of ``candidates``.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: Minimum similarity (0-1) for two names to match.
        same_type_only: If True, only candidates with the same ``type_ref``
            are compared.

    Returns:
        DeduplicationResult with the merged entities, the resolution maps
        and the merge log.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize every name once.
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]

    # Steps 2-3: pairwise comparison, linking matches in the Union-Find.
    uf = _UnionFind(n)
    merge_pairs: list[tuple[int, int, float]] = []

    for i in range(n):
        ci = candidates[i]
        for j in range(i + 1, n):
            cj = candidates[j]
            if same_type_only and ci.type_ref != cj.type_ref:
                continue

            if _is_exact_type(ci.type_ref) or _is_exact_type(cj.type_ref):
                if normalized[i] == normalized[j]:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue

            score = _name_similarity(normalized[i], normalized[j])
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 4: group indices by their Union-Find root.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Matched pairs per final root, used for the score in the merge log.
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))

    # Step 5: build one entity per cluster.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    for root, indices in clusters.items():
        members = [candidates[idx] for idx in indices]

        # The member with the highest confidence is canonical (first on ties).
        best_idx = max(indices, key=lambda idx: candidates[idx].confidence)
        best = candidates[best_idx]
        canonical_name = best.name
        canonical_norm = normalized[best_idx]

        if len(members) == 1:
            merged_attrs = best.attributes
            merged_confidence = best.confidence
            merged_chunks = list(best.source_chunk_indices)
            merged_from = list(best.merged_from) if best.merged_from else [best.name]
        else:
            merged_attrs = merge_entity_attributes([c.attributes for c in members])
            merged_confidence = max(c.confidence for c in members)

            # Order-preserving union of the source chunk indices.
            merged_chunks = []
            seen_chunks: set[int] = set()
            for c in members:
                for chunk_idx in c.source_chunk_indices:
                    if chunk_idx not in seen_chunks:
                        merged_chunks.append(chunk_idx)
                        seen_chunks.add(chunk_idx)

            # Order-preserving union of every original name in the cluster.
            merged_from = []
            seen_names: set[str] = set()
            for c in members:
                for nm in (c.merged_from if c.merged_from else [c.name]):
                    if nm not in seen_names:
                        merged_from.append(nm)
                        seen_names.add(nm)

            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": [
                        candidates[idx].name for idx in indices if idx != best_idx
                    ],
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        merged_entities.append(
            EntityCandidate(
                name=canonical_name,
                name_normalized=canonical_norm,
                # Take the type from the canonical candidate so name and type
                # always come from the same member.
                type_ref=best.type_ref,
                type_label=best.type_label,
                attributes=merged_attrs,
                confidence=merged_confidence,
                source_chunk_indices=merged_chunks,
                merged_from=merged_from,
            )
        )

        # NOTE(review): both maps are keyed by name only, so two clusters of
        # different types with the same name overwrite each other here.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
a = _make("Google", type_ref="organization") + b = _make("Google LLC", type_ref="organization") + result = deduplicate_entities([a, b]) + assert result.total_after == 1 + assert len(result.entities) == 1 + + +def test_ip_matching_exacto(): + """192.168.1.1 y 192.168.1.1 se mergean por matching exacto.""" + a = _make("192.168.1.1", type_ref="ip", confidence=0.8) + b = _make("192.168.1.1", type_ref="ip", confidence=0.9) + result = deduplicate_entities([a, b]) + assert result.total_after == 1 + + +def test_same_name_different_type_no_merge(): + """John Smith (person) y John Smith (organization) NO se mergean.""" + a = _make("John Smith", type_ref="person") + b = _make("John Smith", type_ref="organization") + result = deduplicate_entities([a, b], same_type_only=True) + assert result.total_after == 2 + + +def test_clusters_transitivos(): + """Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster.""" + a = _make("Alice Johnson", type_ref="person") + b = _make("Alice Johnso", type_ref="person") # muy similar a A + c = _make("Alice Johns", type_ref="person") # muy similar a B + result = deduplicate_entities([a, b, c], name_threshold=0.80) + assert result.total_after == 1 + + +def test_sin_duplicados_sin_cambios(): + """Entidades sin duplicados pasan sin modificacion.""" + a = _make("Alice Smith", type_ref="person") + b = _make("Bob Jones", type_ref="person") + c = _make("Charlie Brown", type_ref="person") + result = deduplicate_entities([a, b, c]) + assert result.total_before == 3 + assert result.total_after == 3 + assert len(result.merge_log) == 0 + + +def test_confidence_y_atributos_merge_correctos(): + """Confidence toma el max del cluster; atributos se fusionan.""" + a = _make("John Smith", type_ref="person", confidence=0.7, role="CEO") + b = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme") + result = deduplicate_entities([a, b]) + assert result.total_after == 1 + entity = result.entities[0] + # confidence = max(0.7, 0.95) + assert 
entity.confidence == 0.95 + # atributos de ambos candidatos presentes + assert "role" in entity.attributes + assert "company" in entity.attributes + + +def test_lista_vacia(): + """Lista vacia retorna resultado vacio.""" + result = deduplicate_entities([]) + assert result.total_before == 0 + assert result.total_after == 0 + assert result.entities == [] + assert result.merge_log == [] + + +def test_name_to_id_resolucion(): + """name_to_id contiene todos los nombres originales del cluster.""" + a = _make("John Smith", type_ref="person") + b = _make("Smith, John", type_ref="person") + result = deduplicate_entities([a, b]) + # Ambos nombres deben apuntar al mismo ID + ids = list(result.entity_id_map.values()) + assert len(ids) == 1 + ent_id = ids[0] + # name_to_id debe tener entradas para los nombres originales + assert any(v == ent_id for v in result.name_to_id.values()) + assert len(result.name_to_id) >= 2 diff --git a/python/functions/datascience/deduplicate_relations.md b/python/functions/datascience/deduplicate_relations.md new file mode 100644 index 00000000..71a6cc60 --- /dev/null +++ b/python/functions/datascience/deduplicate_relations.md @@ -0,0 +1,81 @@ +--- +name: deduplicate_relations +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def deduplicate_relations(relations: list[RelationCandidate], entity_id_map: dict[str, str]) -> list[RelationCandidate]" +description: "Deduplica relaciones candidatas resolviendo from_name/to_name a entity IDs finales via entity_id_map. Descarta self-loops y relaciones sin match. Mergea duplicados (mismo from_id, to_id, relation_type) concatenando descripciones unicas y tomando max confidence." 
+tags: [datascience, extraction, knowledge-graph, nlp, deduplication, fuzzy-match, fuzzygraph] +uses_functions: + - levenshtein_distance_py_cybersecurity +uses_types: + - relation_candidate_py_datascience +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "dos relaciones identicas se colapsan en una" + - "relacion con nombre mergeado se resuelve al id correcto" + - "self loop se descarta" + - "nombre no mapeado sin fuzzy match se descarta" + - "relaciones distintas se mantienen" + - "merge descripcion concatena unicas" + - "lista vacia retorna lista vacia" + - "fuzzy match resuelve nombre cercano" +test_file_path: "python/functions/datascience/deduplicate_relations_test.py" +file_path: "python/functions/datascience/deduplicate_relations.py" +--- + +## Ejemplo + +```python +from python.types.datascience.relation_candidate import RelationCandidate +from python.functions.datascience.deduplicate_relations import deduplicate_relations + +# entity_id_map producido por deduplicate_entities +entity_id_map = { + "john smith": "entity_001", + "smith, john": "entity_001", # alias mergeado + "acme corp": "entity_002", +} + +relations = [ + RelationCandidate(from_name="John Smith", to_name="Acme Corp", + relation_type="works_at", description="John es CEO", + confidence=0.9, source_chunk_index=0), + RelationCandidate(from_name="Smith, John", to_name="Acme Corp", + relation_type="works_at", description="CEO de Acme", + confidence=0.7, source_chunk_index=2), +] + +result = deduplicate_relations(relations, entity_id_map) +# → 1 RelationCandidate con from_id="entity_001", to_id="entity_002", +# confidence=0.9, description="John es CEO; CEO de Acme" +``` + +## Notas + +La funcion es pura: no hace I/O, no tiene efectos secundarios. El logging es +de nivel DEBUG/WARNING — en produccion configurar el logger de la aplicacion. + +**Resolucion de nombres:** +- Lookup exacto primero (lowercase strip del nombre contra las claves del mapa). 
+- Si no hay match exacto, fuzzy match con Levenshtein (threshold=3 ediciones). +- Si sigue sin match, la relacion se descarta con `logger.warning`. + +**Self-loops:** relaciones donde `from_id == to_id` siempre se descartan. + +**Merge:** cuando varias relaciones comparten `(from_id, to_id, relation_type)`: +- `confidence`: max del grupo. +- `description`: union de descripciones unicas (no duplicadas), separadas por `'; '`. +- `from_name` / `to_name` / `source_chunk_index`: del primer candidato del grupo. + +**Integracion con fuzzygraph:** +Esta funcion es el paso 4 del pipeline de extraccion. Recibe el output de +`extract_relations_llm` (relaciones crudas con nombres de texto) y el +`entity_id_map` producido por `deduplicate_entities`. Produce la lista final +de relaciones para `ExtractionResult`. diff --git a/python/functions/datascience/deduplicate_relations.py b/python/functions/datascience/deduplicate_relations.py new file mode 100644 index 00000000..1257494c --- /dev/null +++ b/python/functions/datascience/deduplicate_relations.py @@ -0,0 +1,189 @@ +"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados.""" + +import logging +import os +import sys + +logger = logging.getLogger(__name__) + +# --- Importar levenshtein_distance desde cybersecurity --- +# Soporta dos contextos: +# 1. Ejecutado desde python/functions/datascience/ (pytest local) +# 2. 
# NOTE: `levenshtein_distance` is resolved below so that this module works
# both when run from this directory (local pytest) and from the registry
# root (fn run). The pure-Python version is the fallback.
def _levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between two strings."""
    if len(a) < len(b):
        # Keep `b` as the shorter string so the rows stay small.
        return _levenshtein_distance(b, a)
    if len(b) == 0:
        return len(a)
    prev_row = list(range(len(b) + 1))
    for i, ca in enumerate(a):
        curr_row = [i + 1]
        for j, cb in enumerate(b):
            cost = 0 if ca == cb else 1
            curr_row.append(
                min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
            )
        prev_row = curr_row
    return prev_row[-1]


try:
    _here = os.path.dirname(os.path.abspath(__file__))
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    from cybersecurity import levenshtein_distance as _lev
except ImportError:
    _lev = None  # type: ignore

levenshtein_distance = _lev if _lev is not None else _levenshtein_distance


def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
    """Resolve ``name`` against the map keys by closest edit distance.

    The edit budget for each key is ``threshold`` capped at one edit per
    three characters of the longer string. A fixed budget of three edits
    would let an empty or very short name match unrelated keys (for example
    "nsa" is exactly three edits away from "fbi").

    Args:
        name: Name to resolve, already lower-cased and stripped.
        entity_id_map: Mapping of normalized name -> entity id.
        threshold: Maximum edit distance accepted for long names.

    Returns:
        The entity id of the closest acceptable key (the first one on ties),
        or '' when no key is close enough.
    """
    if not name:
        return ""

    best_id = ""
    best_dist = threshold + 1
    for key, entity_id in entity_id_map.items():
        allowed = min(threshold, max(len(name), len(key)) // 3)
        # The length gap is a lower bound on the edit distance.
        if abs(len(name) - len(key)) > allowed:
            continue
        dist = levenshtein_distance(name, key)
        if dist <= allowed and dist < best_dist:
            best_dist = dist
            best_id = entity_id
    return best_id


def _resolve_entity_id(raw_name: str, entity_id_map: dict[str, str]) -> str:
    """Return the entity id for ``raw_name``: exact lookup first, then fuzzy."""
    key = raw_name.lower().strip()
    return entity_id_map.get(key, "") or _fuzzy_resolve(key, entity_id_map)


def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Deduplicate candidate relations by resolving names to entity ids.

    Steps:
        1. Resolve ``from_name`` and ``to_name`` through ``entity_id_map``
           (exact lookup on the lower-cased, stripped name, then a fuzzy
           match). A relation with an unresolved end is dropped with a
           warning.
        2. Drop self-loops (``from_id == to_id``).
        3. Merge relations sharing ``(from_id, to_id, relation_type)``:
           the highest confidence wins and the unique descriptions are
           joined with ``'; '``. The names and ``source_chunk_index`` come
           from the first relation of the group.

    Args:
        relations: RelationCandidate objects carrying the original names.
        entity_id_map: Mapping of normalized name -> entity id.

    Returns:
        Deduplicated list of RelationCandidate with ``from_id`` and
        ``to_id`` resolved.
    """
    if not relations:
        return []

    # Import the type lazily: works from this directory and from the root.
    try:
        _types_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..", "..", "..", "python", "types", "datascience",
        )
        if _types_path not in sys.path:
            sys.path.insert(0, _types_path)
        from relation_candidate import RelationCandidate
    except ImportError:
        from python.types.datascience.relation_candidate import RelationCandidate  # type: ignore

    resolved: list = []

    for rel in relations:
        from_id = _resolve_entity_id(rel.from_name, entity_id_map)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue

        to_id = _resolve_entity_id(rel.to_name, entity_id_map)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue

        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue

        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )

    # Group by (from_id, to_id, relation_type), keeping first-seen order.
    groups: dict[tuple, list] = {}
    for rel in resolved:
        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)

    result: list = []
    for (from_id, to_id, rel_type), group in groups.items():
        if len(group) == 1:
            result.append(group[0])
            continue

        seen_desc: set[str] = set()
        descriptions: list[str] = []
        for r in group:
            if r.description and r.description not in seen_desc:
                descriptions.append(r.description)
                seen_desc.add(r.description)

        first = group[0]
        result.append(
            RelationCandidate(
                from_name=first.from_name,
                to_name=first.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(descriptions),
                confidence=max(r.confidence for r in group),
                source_chunk_index=first.source_chunk_index,
            )
        )

    return result
"python", "types", "datascience") +if _types_path not in sys.path: + sys.path.insert(0, _types_path) + +from relation_candidate import RelationCandidate +from deduplicate_relations import deduplicate_relations + + +def _make_rel( + from_name: str, + to_name: str, + relation_type: str = "works_at", + description: str = "", + confidence: float = 0.8, + source_chunk_index: int = 0, +) -> RelationCandidate: + return RelationCandidate( + from_name=from_name, + to_name=to_name, + relation_type=relation_type, + description=description, + confidence=confidence, + source_chunk_index=source_chunk_index, + ) + + +# entity_id_map tipico: claves en lowercase normalizado +_ENTITY_MAP: dict[str, str] = { + "john smith": "entity_001", + "acme corp": "entity_002", + "jane doe": "entity_003", + "google": "entity_004", +} + + +def test_dos_relaciones_identicas_se_colapsan_en_una(): + """2 relaciones identicas (from, to, type) → 1.""" + rels = [ + _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9), + _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.7), + ] + result = deduplicate_relations(rels, _ENTITY_MAP) + assert len(result) == 1 + assert result[0].from_id == "entity_001" + assert result[0].to_id == "entity_002" + assert result[0].confidence == 0.9 # max + + +def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto(): + """Relacion con nombre mergeado → se resuelve al ID correcto.""" + # entity_id_map incluye "smith, john" como alias de entity_001 + merged_map = {**_ENTITY_MAP, "smith, john": "entity_001"} + rels = [_make_rel("Smith, John", "Acme Corp")] + result = deduplicate_relations(rels, merged_map) + assert len(result) == 1 + assert result[0].from_id == "entity_001" + assert result[0].to_id == "entity_002" + + +def test_self_loop_se_descarta(): + """Self-loop (from_id == to_id) → descartado.""" + rels = [_make_rel("John Smith", "John Smith", relation_type="knows")] + result = deduplicate_relations(rels, 
_ENTITY_MAP) + assert len(result) == 0 + + +def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta(): + """Relacion con nombre no mapeado y sin fuzzy match → descartada.""" + rels = [_make_rel("Unknown Entity XYZ", "Acme Corp")] + result = deduplicate_relations(rels, _ENTITY_MAP) + assert len(result) == 0 + + +def test_relaciones_distintas_se_mantienen(): + """Relaciones con (from, to, type) distintos → todas se mantienen.""" + rels = [ + _make_rel("John Smith", "Acme Corp", relation_type="works_at"), + _make_rel("Jane Doe", "Acme Corp", relation_type="works_at"), + _make_rel("John Smith", "Google", relation_type="invested_in"), + ] + result = deduplicate_relations(rels, _ENTITY_MAP) + assert len(result) == 3 + + +def test_merge_descripcion_concatena_unicas(): + """Merge de relaciones: descripciones unicas se concatenan.""" + rels = [ + _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9), + _make_rel("John Smith", "Acme Corp", description="Acme fue fundada por John", confidence=0.7), + _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.6), + ] + result = deduplicate_relations(rels, _ENTITY_MAP) + assert len(result) == 1 + assert "John es CEO" in result[0].description + assert "Acme fue fundada por John" in result[0].description + # La descripcion duplicada ("John es CEO") no aparece dos veces + assert result[0].description.count("John es CEO") == 1 + assert result[0].confidence == 0.9 + + +def test_lista_vacia_retorna_lista_vacia(): + """Lista vacia de relaciones → lista vacia.""" + result = deduplicate_relations([], _ENTITY_MAP) + assert result == [] + + +def test_fuzzy_match_resuelve_nombre_cercano(): + """Nombre con typo pequeño → fuzzy match lo resuelve.""" + # "john smit" tiene distancia 1 de "john smith" + rels = [_make_rel("John Smit", "Acme Corp")] + result = deduplicate_relations(rels, _ENTITY_MAP) + assert len(result) == 1 + assert result[0].from_id == "entity_001" diff --git 
a/python/functions/datascience/detect_drift.md b/python/functions/datascience/detect_drift.md new file mode 100644 index 00000000..384bae78 --- /dev/null +++ b/python/functions/datascience/detect_drift.md @@ -0,0 +1,56 @@ +--- +name: detect_drift +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def detect_drift(history: list[dict], current: dict, fields: list[str], threshold: float = 2.0) -> list[dict]" +description: "Detecta drift estadistico comparando metricas de la ejecucion actual contra el historial usando z-score. Si |z| > threshold, el campo ha drifteado. Util para monitorizar executions en operations.db." +tags: [drift, statistics, z-score, monitoring, executions, operations, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [math] +tested: true +tests: + - "campo con drift claro (z > threshold)" + - "campo estable (z < threshold)" + - "historial con un solo punto → std=0, no puede calcular → drifted=False con nota" + - "historial vacio → todos drifted=False" + - "threshold custom" +test_file_path: "python/functions/datascience/detect_drift_test.py" +file_path: "python/functions/datascience/detect_drift.py" +--- + +## Ejemplo + +```python +history = [ + {"records_out": 100, "duration_ms": 500}, + {"records_out": 105, "duration_ms": 480}, + {"records_out": 98, "duration_ms": 510}, +] +current = {"records_out": 50, "duration_ms": 2000} + +results = detect_drift(history, current, ["records_out", "duration_ms"]) +# [ +# {"field": "records_out", "current": 50, "mean": 101.0, "std": 2.9, "z_score": -17.3, "drifted": True}, +# {"field": "duration_ms", "current": 2000, "mean": 496.7, "std": 12.5, "z_score": 120.5, "drifted": True}, +# ] +``` + +## Notas + +Funcion pura. Solo stdlib (`math`). + +El z-score usa desviacion estandar poblacional (dividir por N, no N-1) para ser consistente con historial de cualquier tamanio. 
"""detect_drift — flag statistical drift by z-scoring current metrics against history."""

import math


def detect_drift(
    history: list[dict],
    current: dict,
    fields: list[str],
    threshold: float = 2.0,
) -> list[dict]:
    """Detect statistical drift of the current metrics relative to a history.

    For every requested field the historical values are summarised by their
    mean and *population* standard deviation (divide by N, not N-1), and the
    current value is z-scored against them. A field has drifted when
    ``abs(z_score) > threshold``.

    Args:
        history: Historical metric dicts. Entries that lack the field, or hold
            ``None`` for it, are skipped for that field.
        current: Metrics of the current run. A field that is missing or
            ``None`` is treated as ``0.0`` (missing and ``None`` are handled
            alike, mirroring how ``history`` is filtered).
        fields: Names of the numeric fields to analyse.
        threshold: Absolute z-score above which a field counts as drifted.

    Returns:
        One dict per field, in the order of ``fields``, with the keys
        ``field``, ``current``, ``mean``, ``std``, ``z_score`` and ``drifted``.
        With fewer than two usable historical points, or when all historical
        values are identical, ``std`` is ``0.0``, ``z_score`` is ``0.0`` and
        ``drifted`` is ``False`` because no spread can be estimated.

    Raises:
        TypeError, ValueError: If a non-``None`` value cannot be converted to
            ``float``.
    """
    results = []

    for field in fields:
        values = [
            float(h[field])
            for h in history
            if field in h and h[field] is not None
        ]

        # A present-but-None metric used to crash float(); treat it like a
        # missing one so history and current are handled symmetrically.
        raw_current = current.get(field)
        current_val = 0.0 if raw_current is None else float(raw_current)

        n = len(values)
        mean = sum(values) / n if n else 0.0

        # A single point carries no variability information, so std stays 0.
        if n >= 2:
            std = math.sqrt(sum((v - mean) ** 2 for v in values) / n)
        else:
            std = 0.0

        if std == 0.0:
            # No measurable spread: the z-score is undefined, report no drift.
            z_score = 0.0
            drifted = False
        else:
            z_score = (current_val - mean) / std
            drifted = abs(z_score) > threshold

        results.append({
            "field": field,
            "current": current_val,
            "mean": mean,
            "std": std,
            "z_score": z_score,
            "drifted": drifted,
        })

    return results
{"val": 102.0}, + {"val": 98.0}, + {"val": 101.0}, + ] + current = {"val": 100.5} # dentro del rango normal + results = detect_drift(history, current, ["val"]) + assert len(results) == 1 + r = results[0] + assert r["drifted"] is False + assert abs(r["z_score"]) < 2.0 + + +def test_historial_con_un_solo_punto_std_0_drifted_False_con_nota(): + history = [{"val": 100.0}] + current = {"val": 999.0} + results = detect_drift(history, current, ["val"]) + assert len(results) == 1 + r = results[0] + assert r["std"] == 0.0 + assert r["z_score"] == 0.0 + assert r["drifted"] is False + assert r["mean"] == 100.0 + + +def test_historial_vacio_todos_drifted_False(): + history = [] + current = {"records_out": 50, "duration_ms": 2000} + results = detect_drift(history, current, ["records_out", "duration_ms"]) + assert len(results) == 2 + for r in results: + assert r["drifted"] is False + assert r["z_score"] == 0.0 + assert r["mean"] == 0.0 + + +def test_threshold_custom(): + history = [ + {"val": 100.0}, + {"val": 100.0}, + {"val": 110.0}, + {"val": 90.0}, + ] + # std ~ 7.07, mean = 100 + current = {"val": 115.0} # z ~ 2.12 + + # threshold default 2.0 -> drifted + results = detect_drift(history, current, ["val"], threshold=2.0) + assert results[0]["drifted"] is True + + # threshold 3.0 -> no drifted + results2 = detect_drift(history, current, ["val"], threshold=3.0) + assert results2[0]["drifted"] is False + + +if __name__ == "__main__": + test_campo_con_drift_claro_z_mayor_threshold() + test_campo_estable_z_menor_threshold() + test_historial_con_un_solo_punto_std_0_drifted_False_con_nota() + test_historial_vacio_todos_drifted_False() + test_threshold_custom() + print("All tests passed.") diff --git a/python/functions/datascience/diff_entities.md b/python/functions/datascience/diff_entities.md new file mode 100644 index 00000000..75f7cf44 --- /dev/null +++ b/python/functions/datascience/diff_entities.md @@ -0,0 +1,58 @@ +--- +name: diff_entities +kind: function +lang: py +domain: 
datascience +version: "1.0.0" +purity: pure +signature: "def diff_entities(before: list[dict], after: list[dict], key: str = 'id', ignore_fields: list[str] | None = None, compare_fields: list[str] | None = None) -> dict" +description: "Compara dos snapshots de entities y devuelve diferencias campo a campo. Detecta añadidas, eliminadas, modificadas e inalteradas. Ignora created_at y updated_at por defecto." +tags: [diff, entities, snapshot, operations, comparison, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "entity añadida" + - "entity eliminada" + - "entity modificada con detalle de campos" + - "entities identicas → unchanged" + - "ignore_fields funciona" + - "compare_fields filtra correctamente" + - "lista vacia vs lista con datos" +test_file_path: "python/functions/datascience/diff_entities_test.py" +file_path: "python/functions/datascience/diff_entities.py" +--- + +## Ejemplo + +```python +before = [ + {"id": "1", "name": "Alice", "status": "active", "updated_at": "2024-01-01"}, + {"id": "2", "name": "Bob", "status": "active", "updated_at": "2024-01-01"}, +] +after = [ + {"id": "1", "name": "Alice", "status": "inactive", "updated_at": "2024-01-02"}, + {"id": "3", "name": "Carol", "status": "active", "updated_at": "2024-01-02"}, +] + +result = diff_entities(before, after) +# result["added"] -> [{"id": "3", "name": "Carol", ...}] +# result["removed"] -> [{"id": "2", "name": "Bob", ...}] +# result["modified"] -> [{"key": "1", "changes": {"status": {"old": "active", "new": "inactive"}}}] +# result["unchanged"] -> 0 +# result["summary"] -> "1 added, 1 removed, 1 modified, 0 unchanged" +``` + +## Notas + +Funcion pura. No hace I/O — toma listas de dicts ya cargadas en memoria. + +El campo `key` debe existir en todas las entities; las que no lo tengan se ignoran silenciosamente. + +Si `compare_fields` se da, tiene prioridad sobre `ignore_fields`. 
Esto permite comparar solo un subconjunto especifico de campos sin preocuparse por los campos temporales. + +El orden de `added` y `removed` no esta garantizado (depende del orden de iteracion de sets). diff --git a/python/functions/datascience/diff_entities.py b/python/functions/datascience/diff_entities.py new file mode 100644 index 00000000..dda79726 --- /dev/null +++ b/python/functions/datascience/diff_entities.py @@ -0,0 +1,77 @@ +"""diff_entities — compara dos snapshots de entities detectando cambios campo a campo.""" + + +def diff_entities( + before: list[dict], + after: list[dict], + key: str = "id", + ignore_fields: list[str] | None = None, + compare_fields: list[str] | None = None, +) -> dict: + """Compara dos snapshots de entities y devuelve diferencias campo a campo. + + Detecta entities añadidas, eliminadas, modificadas e inalteradas. + Ignora campos de metadata temporal por defecto (created_at, updated_at). + + Args: + before: Lista de entities del snapshot anterior. + after: Lista de entities del snapshot posterior. + key: Campo que identifica unicamente cada entity. Default "id". + ignore_fields: Campos a excluir de la comparacion. + Default ["created_at", "updated_at"]. + compare_fields: Si se da, solo compara estos campos (tiene prioridad + sobre ignore_fields). + + Returns: + Dict con keys: added, removed, modified, unchanged, summary. + modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}. 
+ """ + if ignore_fields is None: + ignore_fields = ["created_at", "updated_at"] + + before_map = {str(e[key]): e for e in before if key in e} + after_map = {str(e[key]): e for e in after if key in e} + + before_keys = set(before_map.keys()) + after_keys = set(after_map.keys()) + + added = [after_map[k] for k in after_keys - before_keys] + removed = [before_map[k] for k in before_keys - after_keys] + + modified = [] + unchanged = 0 + + for k in before_keys & after_keys: + b = before_map[k] + a = after_map[k] + + if compare_fields is not None: + fields_to_check = compare_fields + else: + all_fields = set(b.keys()) | set(a.keys()) + fields_to_check = [f for f in all_fields if f not in ignore_fields and f != key] + + changes = {} + for field in fields_to_check: + old_val = b.get(field) + new_val = a.get(field) + if old_val != new_val: + changes[field] = {"old": old_val, "new": new_val} + + if changes: + modified.append({"key": k, "changes": changes}) + else: + unchanged += 1 + + n_added = len(added) + n_removed = len(removed) + n_modified = len(modified) + summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged" + + return { + "added": added, + "removed": removed, + "modified": modified, + "unchanged": unchanged, + "summary": summary, + } diff --git a/python/functions/datascience/diff_entities_test.py b/python/functions/datascience/diff_entities_test.py new file mode 100644 index 00000000..6753cebe --- /dev/null +++ b/python/functions/datascience/diff_entities_test.py @@ -0,0 +1,111 @@ +"""Tests para diff_entities.""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(__file__)) +from diff_entities import diff_entities + + +def test_entity_anadida(): + before = [{"id": "1", "name": "Alice"}] + after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}] + result = diff_entities(before, after) + assert len(result["added"]) == 1 + assert result["added"][0]["id"] == "2" + assert result["removed"] == [] + assert 
result["modified"] == [] + assert result["unchanged"] == 1 + assert "1 added" in result["summary"] + + +def test_entity_eliminada(): + before = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}] + after = [{"id": "1", "name": "Alice"}] + result = diff_entities(before, after) + assert result["added"] == [] + assert len(result["removed"]) == 1 + assert result["removed"][0]["id"] == "2" + assert result["unchanged"] == 1 + assert "1 removed" in result["summary"] + + +def test_entity_modificada_con_detalle_de_campos(): + before = [{"id": "1", "name": "Alice", "status": "active"}] + after = [{"id": "1", "name": "Alice", "status": "inactive"}] + result = diff_entities(before, after) + assert result["added"] == [] + assert result["removed"] == [] + assert len(result["modified"]) == 1 + mod = result["modified"][0] + assert mod["key"] == "1" + assert "status" in mod["changes"] + assert mod["changes"]["status"]["old"] == "active" + assert mod["changes"]["status"]["new"] == "inactive" + assert result["unchanged"] == 0 + + +def test_entities_identicas_unchanged(): + before = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}] + after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}] + result = diff_entities(before, after) + assert result["added"] == [] + assert result["removed"] == [] + assert result["modified"] == [] + assert result["unchanged"] == 2 + assert "2 unchanged" in result["summary"] + + +def test_ignore_fields_funciona(): + before = [{"id": "1", "name": "Alice", "updated_at": "2024-01-01", "created_at": "2023-01-01"}] + after = [{"id": "1", "name": "Alice", "updated_at": "2024-06-01", "created_at": "2023-01-01"}] + result = diff_entities(before, after) + # updated_at se ignora por defecto -> unchanged + assert result["unchanged"] == 1 + assert result["modified"] == [] + + # Si no ignoramos updated_at, debe detectar el cambio + result2 = diff_entities(before, after, ignore_fields=[]) + assert len(result2["modified"]) == 1 + assert 
"updated_at" in result2["modified"][0]["changes"] + + +def test_compare_fields_filtra_correctamente(): + before = [{"id": "1", "name": "Alice", "status": "active", "score": 10}] + after = [{"id": "1", "name": "Bob", "status": "inactive", "score": 10}] + # Solo comparar score -> no hay cambio en score, unchanged + result = diff_entities(before, after, compare_fields=["score"]) + assert result["unchanged"] == 1 + assert result["modified"] == [] + + # Solo comparar name -> detecta cambio + result2 = diff_entities(before, after, compare_fields=["name"]) + assert len(result2["modified"]) == 1 + assert "name" in result2["modified"][0]["changes"] + assert "status" not in result2["modified"][0]["changes"] + + +def test_lista_vacia_vs_lista_con_datos(): + before = [] + after = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}] + result = diff_entities(before, after) + assert len(result["added"]) == 2 + assert result["removed"] == [] + assert result["unchanged"] == 0 + + # Invertido + result2 = diff_entities(after, before) + assert result2["added"] == [] + assert len(result2["removed"]) == 2 + assert result2["unchanged"] == 0 + + +if __name__ == "__main__": + test_entity_anadida() + test_entity_eliminada() + test_entity_modificada_con_detalle_de_campos() + test_entities_identicas_unchanged() + test_ignore_fields_funciona() + test_compare_fields_filtra_correctamente() + test_lista_vacia_vs_lista_con_datos() + print("All tests passed.") diff --git a/python/functions/datascience/diff_relations.md b/python/functions/datascience/diff_relations.md new file mode 100644 index 00000000..dafa8f02 --- /dev/null +++ b/python/functions/datascience/diff_relations.md @@ -0,0 +1,52 @@ +--- +name: diff_relations +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def diff_relations(before: list[dict], after: list[dict], key: tuple[str, str, str] = ('source_id', 'target_id', 'relation_type'), ignore_fields: list[str] | None = None, 
compare_fields: list[str] | None = None) -> dict" +description: "Compara relaciones entre dos snapshots usando key compuesta (source_id, target_id, relation_type). Detecta relaciones añadidas, eliminadas y modificadas con detalle campo a campo." +tags: [diff, relations, graph, snapshot, operations, comparison, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "relacion añadida" + - "relacion eliminada" + - "relacion con metadata modificada (mismo source/target/type, distinto weight)" + - "key compuesta funciona correctamente" +test_file_path: "python/functions/datascience/diff_relations_test.py" +file_path: "python/functions/datascience/diff_relations.py" +--- + +## Ejemplo + +```python +before = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}, + {"source_id": "B", "target_id": "C", "relation_type": "owns", "weight": 0.5}, +] +after = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 2.0}, + {"source_id": "C", "target_id": "D", "relation_type": "knows", "weight": 1.0}, +] + +result = diff_relations(before, after) +# result["added"] -> [{"source_id": "C", "target_id": "D", ...}] +# result["removed"] -> [{"source_id": "B", "target_id": "C", ...}] +# result["modified"] -> [{"key": "A|B|knows", "changes": {"weight": {"old": 1.0, "new": 2.0}}}] +# result["unchanged"] -> 0 +``` + +## Notas + +La key compuesta se serializa como `source_id|target_id|relation_type`. Si alguno de los campos clave no existe en la relacion, se usa string vacio. + +Misma semantica que `diff_entities_py_datascience` pero adaptada para relaciones donde no hay un ID unico — la identidad se define por los tres campos de la key. + +Complemento natural de `diff_entities_py_datascience` para comparar grafos completos entre ejecuciones de pipelines. 
diff --git a/python/functions/datascience/diff_relations.py b/python/functions/datascience/diff_relations.py new file mode 100644 index 00000000..7e920a01 --- /dev/null +++ b/python/functions/datascience/diff_relations.py @@ -0,0 +1,82 @@ +"""diff_relations — compara dos snapshots de relaciones con key compuesta.""" + + +def diff_relations( + before: list[dict], + after: list[dict], + key: tuple[str, str, str] = ("source_id", "target_id", "relation_type"), + ignore_fields: list[str] | None = None, + compare_fields: list[str] | None = None, +) -> dict: + """Compara relaciones entre dos snapshots usando key compuesta. + + Las relaciones se identifican por (source_id, target_id, relation_type) + porque no tienen un ID unico propio. Detecta relaciones añadidas, + eliminadas y modificadas (mismo source/target/type, distinta metadata). + + Args: + before: Lista de relaciones del snapshot anterior. + after: Lista de relaciones del snapshot posterior. + key: Tupla de campos que forman la key compuesta. + Default ("source_id", "target_id", "relation_type"). + ignore_fields: Campos a excluir de la comparacion. + Default ["created_at", "updated_at"]. + compare_fields: Si se da, solo compara estos campos. + + Returns: + Dict con keys: added, removed, modified, unchanged, summary. + modified contiene lista de {"key": str, "changes": {"field": {"old": ..., "new": ...}}}. 
+ """ + if ignore_fields is None: + ignore_fields = ["created_at", "updated_at"] + + def make_key(rel: dict) -> str: + return "|".join(str(rel.get(k, "")) for k in key) + + before_map = {make_key(r): r for r in before} + after_map = {make_key(r): r for r in after} + + before_keys = set(before_map.keys()) + after_keys = set(after_map.keys()) + + added = [after_map[k] for k in after_keys - before_keys] + removed = [before_map[k] for k in before_keys - after_keys] + + modified = [] + unchanged = 0 + + for k in before_keys & after_keys: + b = before_map[k] + a = after_map[k] + + if compare_fields is not None: + fields_to_check = compare_fields + else: + all_fields = set(b.keys()) | set(a.keys()) + key_set = set(key) + fields_to_check = [f for f in all_fields if f not in ignore_fields and f not in key_set] + + changes = {} + for field in fields_to_check: + old_val = b.get(field) + new_val = a.get(field) + if old_val != new_val: + changes[field] = {"old": old_val, "new": new_val} + + if changes: + modified.append({"key": k, "changes": changes}) + else: + unchanged += 1 + + n_added = len(added) + n_removed = len(removed) + n_modified = len(modified) + summary = f"{n_added} added, {n_removed} removed, {n_modified} modified, {unchanged} unchanged" + + return { + "added": added, + "removed": removed, + "modified": modified, + "unchanged": unchanged, + "summary": summary, + } diff --git a/python/functions/datascience/diff_relations_test.py b/python/functions/datascience/diff_relations_test.py new file mode 100644 index 00000000..1ee8540f --- /dev/null +++ b/python/functions/datascience/diff_relations_test.py @@ -0,0 +1,78 @@ +"""Tests para diff_relations.""" + +import sys +import os + +sys.path.insert(0, os.path.dirname(__file__)) +from diff_relations import diff_relations + + +def test_relacion_anadida(): + before = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}] + after = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", 
"weight": 1.0}, + {"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5}, + ] + result = diff_relations(before, after) + assert len(result["added"]) == 1 + assert result["added"][0]["source_id"] == "C" + assert result["removed"] == [] + assert result["unchanged"] == 1 + assert "1 added" in result["summary"] + + +def test_relacion_eliminada(): + before = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}, + {"source_id": "C", "target_id": "D", "relation_type": "owns", "weight": 0.5}, + ] + after = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}] + result = diff_relations(before, after) + assert result["added"] == [] + assert len(result["removed"]) == 1 + assert result["removed"][0]["source_id"] == "C" + assert result["unchanged"] == 1 + assert "1 removed" in result["summary"] + + +def test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight(): + before = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}] + after = [{"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 5.0}] + result = diff_relations(before, after) + assert result["added"] == [] + assert result["removed"] == [] + assert len(result["modified"]) == 1 + mod = result["modified"][0] + assert mod["key"] == "A|B|knows" + assert "weight" in mod["changes"] + assert mod["changes"]["weight"]["old"] == 1.0 + assert mod["changes"]["weight"]["new"] == 5.0 + assert result["unchanged"] == 0 + + +def test_key_compuesta_funciona_correctamente(): + # Misma pareja A->B pero diferente tipo de relacion -> dos relaciones distintas + before = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}, + {"source_id": "A", "target_id": "B", "relation_type": "owns", "weight": 0.5}, + ] + after = [ + {"source_id": "A", "target_id": "B", "relation_type": "knows", "weight": 1.0}, + {"source_id": "A", "target_id": "B", "relation_type": "trusts", "weight": 
0.8}, + ] + result = diff_relations(before, after) + # owns eliminada, trusts añadida, knows sin cambios + assert len(result["added"]) == 1 + assert result["added"][0]["relation_type"] == "trusts" + assert len(result["removed"]) == 1 + assert result["removed"][0]["relation_type"] == "owns" + assert result["unchanged"] == 1 + assert result["modified"] == [] + + +if __name__ == "__main__": + test_relacion_anadida() + test_relacion_eliminada() + test_relacion_con_metadata_modificada_mismo_source_target_type_distinto_weight() + test_key_compuesta_funciona_correctamente() + print("All tests passed.") diff --git a/python/functions/datascience/estimate_hawkes.md b/python/functions/datascience/estimate_hawkes.md new file mode 100644 index 00000000..dc23eccf --- /dev/null +++ b/python/functions/datascience/estimate_hawkes.md @@ -0,0 +1,36 @@ +--- +name: estimate_hawkes +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def estimate_hawkes(arrivals: list[int], max_lag: int = 30) -> dict" +description: "Estima parámetros de un proceso Hawkes (alpha, beta, branching_ratio) desde la autocorrelación de arrivals ajustando una exponencial decreciente sobre la ACF." +tags: [estimation, hawkes, stochastic-process, microstructure, timeseries] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [numpy, scipy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/datascience/datascience.py" +--- + +## Ejemplo + +```python +arrivals = [0, 1, 3, 2, 0, 1, 4, 2, 1, 0] * 10 +result = estimate_hawkes(arrivals, max_lag=10) +# {'alpha': 0.312, 'beta': 0.874, 'branching_ratio': 0.357, 'acf': [...]} +``` + +## Notas + +Ajusta la función `a * exp(-b * lag)` sobre los lags 1..max_lag de la ACF usando `curve_fit` de scipy. +Si el primer lag de la ACF es <= 0.01 (sin autocorrelación), retorna alpha=0, beta=1, branching_ratio=0. 
+El branching_ratio = alpha/beta; si se acerca a 1, el proceso es explosivo. +Función pura: requiere numpy y scipy instalados. diff --git a/python/functions/datascience/estimate_pareto_alpha.md b/python/functions/datascience/estimate_pareto_alpha.md new file mode 100644 index 00000000..af16ae87 --- /dev/null +++ b/python/functions/datascience/estimate_pareto_alpha.md @@ -0,0 +1,38 @@ +--- +name: estimate_pareto_alpha +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def estimate_pareto_alpha(values: list[float], x_min_percentile: float = 90.0) -> dict" +description: "Estima el exponente alpha de una distribución Pareto via MLE. Alpha bajo indica cola más pesada y mayor frecuencia de valores extremos." +tags: [estimation, pareto, power-law, heavy-tail, statistics] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/datascience/datascience.py" +--- + +## Ejemplo + +```python +import numpy as np +# Simular datos con cola pesada +values = list(np.random.pareto(2.0, 1000) + 1) +result = estimate_pareto_alpha(values, x_min_percentile=90.0) +# {'alpha': ~2.0, 'x_min': ..., 'n_tail': 100} +``` + +## Notas + +Usa el estimador MLE de Hill: α = n / Σ ln(xᵢ / x_min). +x_min se determina como el percentil indicado de los valores positivos. +Retorna alpha=0 si hay menos de 10 valores positivos o la cola tiene menos de 2 elementos. +Función pura: requiere numpy instalado. 
diff --git a/python/functions/datascience/extract_entities_llm.md b/python/functions/datascience/extract_entities_llm.md new file mode 100644 index 00000000..7c8f9c98 --- /dev/null +++ b/python/functions/datascience/extract_entities_llm.md @@ -0,0 +1,87 @@ +--- +name: extract_entities_llm +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def extract_entities_llm(text: str, entity_schema: list[dict], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list[EntityCandidate]" +description: "Extrae entidades de un chunk de texto usando un LLM inyectado. Construye el system prompt con el schema, llama al LLM y valida la respuesta retornando EntityCandidate. JSON invalido o type_ref fuera del schema se descartan con warning." +tags: [llm, extraction, entity, nlp, osint, graph, fuzzygraph, datascience, prompt] +uses_functions: [] +uses_types: [entity_candidate_py_datascience] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [warnings, typing.Callable] +tested: true +tests: + - "texto con entidades claras retorna EntityCandidate" + - "texto sin entidades retorna lista vacia" + - "llm retorna json mal formado retorna lista vacia con warning" + - "type_ref invalido en respuesta se descarta con warning" + - "confidence se propaga correctamente" + - "schema vacio lanza ValueError" +test_file_path: "python/functions/datascience/extract_entities_llm_test.py" +file_path: "python/functions/datascience/extract_entities_llm.py" +--- + +## Ejemplo + +```python +import json +from extract_entities_llm import extract_entities_llm + +# LLM stub para tests — en produccion usar litellm o similar +def mock_llm(messages: list[dict]) -> dict: + return { + "entities": [ + { + "name": "John Smith", + "type_ref": "osint_person_go_cybersecurity", + "attributes": {"full_name": "John Smith", "nationality": "US"}, + "confidence": 0.95, + }, + { + "name": "evil-corp.com", + 
"type_ref": "osint_domain_go_cybersecurity", + "attributes": {"fqdn": "evil-corp.com"}, + "confidence": 0.88, + }, + ] + } + +schema = [ + { + "type_ref": "osint_person_go_cybersecurity", + "label": "Person", + "metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"], + }, + { + "type_ref": "osint_domain_go_cybersecurity", + "label": "Domain", + "metadata_fields": ["fqdn", "registrar", "created_date"], + }, +] + +text = "John Smith, a US citizen, was linked to the domain evil-corp.com." +candidates = extract_entities_llm(text, schema, mock_llm) +# [EntityCandidate(name='John Smith', type_ref='osint_person_go_cybersecurity', confidence=0.95), +# EntityCandidate(name='evil-corp.com', type_ref='osint_domain_go_cybersecurity', confidence=0.88)] +``` + +## Notas + +**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe mensajes en formato OpenAI (`[{"role": "system", "content": "..."}, ...]`) y retorna un `dict` con la respuesta ya parseada como JSON. Esto desacopla la funcion de cualquier cliente especifico — puede usarse con OpenAI, Anthropic via litellm, o cualquier mock. + +**Validacion de type_ref:** Solo se aceptan entidades cuyo `type_ref` aparece en el `entity_schema`. Entidades con type_ref desconocido se descartan con `warnings.warn` (no lanzan excepcion) para ser resiliente ante alucinaciones del LLM. + +**Manejo de JSON invalido:** Si `llm_chat_json` lanza una excepcion o retorna un dict sin la clave `entities`, se retorna lista vacia y se emite un warning. El llamador puede decidir si reintentar. + +**Confidence clamping:** El valor de confidence se clampea al rango [0.0, 1.0] automaticamente. + +**Atributos null:** Los atributos con valor `None` se filtran del dict de atributos para mantener el output limpio. + +**source_chunk_indices:** Esta funcion no setea `source_chunk_indices` — ese campo lo llena el pipeline exterior que conoce el indice del chunk actual. + +Esta funcion es el bloque atomico de extraccion. 
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
    """Build the system prompt that tells the LLM which entity types to extract.

    Args:
        entity_schema: Schema entries. Each dict may carry ``label``,
            ``type_ref`` and ``metadata_fields``; missing keys fall back to
            ``"Unknown"``, ``""`` and ``[]`` respectively.
        language_instruction: Sentence appended as the last rule of the
            prompt, e.g. ``"Respond in English."``.

    Returns:
        The prompt as a single newline-joined string.
    """
    lines = [
        "You are an entity extraction expert. Given text, extract all entities",
        "matching these types. For each entity, provide: name, type_ref,",
        "attributes (matching the metadata_fields for that type), and a",
        "confidence score (0.0-1.0).",
        "",
        "Entity types:",
    ]

    for schema_entry in entity_schema:
        label = schema_entry.get("label", "Unknown")
        type_ref = schema_entry.get("type_ref", "")
        metadata_fields = schema_entry.get("metadata_fields", [])
        lines.append(f"- {label} (type_ref: {type_ref})")
        if metadata_fields:
            # str() keeps the join from failing on non-string field names.
            lines.append(f"  fields: {', '.join(str(f) for f in metadata_fields)}")

    lines += [
        "",
        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
        "",
        "Rules:",
        "- Only extract entities explicitly mentioned in the text",
        "- Use the exact type_ref from the schema",
        "- Leave unknown attributes as null",
        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
        f"- {language_instruction}",
    ]

    return "\n".join(lines)


def _clamp_confidence(value: object) -> float:
    """Coerce an LLM-provided confidence into a float within [0.0, 1.0].

    Anything that is not a real number (including booleans and NaN) maps to
    0.0; infinities are clamped to the nearest bound.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0.0
    if value != value:
        # NaN is the only value unequal to itself. Without this guard
        # min(1.0, nan) evaluates to 1.0 and NaN would become full confidence.
        return 0.0
    return float(max(0.0, min(1.0, value)))


def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from one text chunk using an injected LLM callable.

    Builds a system prompt from the entity schema, calls the LLM and validates
    every returned item before turning it into an ``EntityCandidate``. The
    function is deliberately tolerant of malformed LLM output: bad responses
    produce a warning and an empty list, bad items are dropped individually.

    Args:
        text: Text chunk to analyse; sent verbatim as the user message.
        entity_schema: List of entity types. Each entry is a dict with the
            keys ``type_ref`` and ``label`` and optionally ``metadata_fields``.
        llm_chat_json: Callable receiving OpenAI-style messages
            (``[{"role": "system", "content": "..."}, ...]``) and returning the
            parsed JSON response as a dict.
        language_instruction: Language rule appended to the prompt.

    Returns:
        The validated candidates. Empty when the LLM call raises, when the
        response is not a dict holding an ``entities`` list, or when no item
        survives validation.

    Raises:
        ValueError: If ``entity_schema`` is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # One mapping serves both as the set of accepted type_refs and as the
    # label lookup.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    messages = [
        {"role": "system", "content": _build_system_prompt(entity_schema, language_instruction)},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:  # boundary with an injected, arbitrary client
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # A non-dict response (None, list, str) or a missing key is treated the
    # same as a malformed 'entities' value: warn and return nothing.
    raw_entities = response.get("entities") if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        name = item.get("name", "")
        if not isinstance(name, str) or not name.strip():
            continue

        type_ref = item.get("type_ref", "")
        # The isinstance check runs first so an unhashable type_ref (e.g. a
        # list) is rejected instead of raising TypeError on the lookup.
        if not isinstance(type_ref, str) or type_ref not in type_ref_to_label:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop null attributes so the candidate only carries known values.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=_clamp_confidence(item.get("confidence", 0.0)),
            )
        )

    return candidates
--- /dev/null +++ b/python/functions/datascience/extract_entities_llm_test.py @@ -0,0 +1,164 @@ +"""Tests para extract_entities_llm.""" + +import warnings +import sys +import os +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..")) + +from python.functions.datascience.extract_entities_llm import extract_entities_llm +from python.types.datascience.entity_candidate import EntityCandidate + +SCHEMA = [ + { + "type_ref": "osint_person_go_cybersecurity", + "label": "Person", + "metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"], + }, + { + "type_ref": "osint_domain_go_cybersecurity", + "label": "Domain", + "metadata_fields": ["fqdn", "registrar", "created_date"], + }, +] + + +def make_llm(response: dict): + """Crea un stub de LLM que retorna la respuesta dada.""" + def _llm(messages: list[dict]) -> dict: + return response + return _llm + + +def test_texto_con_entidades_claras_retorna_entity_candidate(): + """texto con entidades claras retorna EntityCandidate""" + llm = make_llm({ + "entities": [ + { + "name": "John Smith", + "type_ref": "osint_person_go_cybersecurity", + "attributes": {"full_name": "John Smith", "nationality": "US"}, + "confidence": 0.95, + }, + { + "name": "evil-corp.com", + "type_ref": "osint_domain_go_cybersecurity", + "attributes": {"fqdn": "evil-corp.com"}, + "confidence": 0.88, + }, + ] + }) + + result = extract_entities_llm( + "John Smith, US citizen, linked to evil-corp.com.", SCHEMA, llm + ) + + assert len(result) == 2 + + person = next(e for e in result if e.name == "John Smith") + assert person.type_ref == "osint_person_go_cybersecurity" + assert person.type_label == "Person" + assert person.attributes["full_name"] == "John Smith" + assert person.confidence == 0.95 + + domain = next(e for e in result if e.name == "evil-corp.com") + assert domain.type_ref == "osint_domain_go_cybersecurity" + assert domain.type_label == "Domain" + assert domain.attributes["fqdn"] == 
"evil-corp.com" + assert domain.confidence == 0.88 + + +def test_texto_sin_entidades_retorna_lista_vacia(): + """texto sin entidades retorna lista vacia""" + llm = make_llm({"entities": []}) + + result = extract_entities_llm( + "The sky is blue and the grass is green.", SCHEMA, llm + ) + + assert result == [] + + +def test_llm_retorna_json_mal_formado_retorna_lista_vacia_con_warning(): + """llm retorna json mal formado retorna lista vacia con warning""" + def bad_llm(messages: list[dict]) -> dict: + raise ValueError("JSON decode error") + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = extract_entities_llm("Some text with entities.", SCHEMA, bad_llm) + + assert result == [] + assert len(caught) == 1 + assert "error llamando al LLM" in str(caught[0].message) + + +def test_type_ref_invalido_en_respuesta_se_descarta_con_warning(): + """type_ref invalido en respuesta se descarta con warning""" + llm = make_llm({ + "entities": [ + { + "name": "Valid Person", + "type_ref": "osint_person_go_cybersecurity", + "attributes": {}, + "confidence": 0.9, + }, + { + "name": "Unknown Thing", + "type_ref": "nonexistent_type_ref", + "attributes": {}, + "confidence": 0.8, + }, + ] + }) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = extract_entities_llm("Text with entities.", SCHEMA, llm) + + assert len(result) == 1 + assert result[0].name == "Valid Person" + assert any("nonexistent_type_ref" in str(w.message) for w in caught) + + +def test_confidence_se_propaga_correctamente(): + """confidence se propaga correctamente""" + llm = make_llm({ + "entities": [ + { + "name": "Implied Person", + "type_ref": "osint_person_go_cybersecurity", + "attributes": {}, + "confidence": 0.7, + }, + { + "name": "Weakly Implied Domain", + "type_ref": "osint_domain_go_cybersecurity", + "attributes": {}, + "confidence": 0.5, + }, + { + "name": "Explicit Entity", + "type_ref": 
"osint_person_go_cybersecurity", + "attributes": {}, + "confidence": 1.0, + }, + ] + }) + + result = extract_entities_llm("Some text.", SCHEMA, llm) + + assert len(result) == 3 + confidences = {e.name: e.confidence for e in result} + assert confidences["Implied Person"] == 0.7 + assert confidences["Weakly Implied Domain"] == 0.5 + assert confidences["Explicit Entity"] == 1.0 + + +def test_schema_vacio_lanza_value_error(): + """schema vacio lanza ValueError""" + llm = make_llm({"entities": []}) + + with pytest.raises(ValueError, match="entity_schema no puede estar vacio"): + extract_entities_llm("Some text.", [], llm) diff --git a/python/functions/datascience/extract_relations_llm.md b/python/functions/datascience/extract_relations_llm.md new file mode 100644 index 00000000..83a4b803 --- /dev/null +++ b/python/functions/datascience/extract_relations_llm.md @@ -0,0 +1,75 @@ +--- +name: extract_relations_llm +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def extract_relations_llm(text: str, entities: list, relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = 'Respond in English.') -> list" +description: "Extrae relaciones entre entidades de un chunk de texto usando un LLM inyectado. Valida que from_name y to_name correspondan a entidades existentes, y usa 'related_to' como fallback para tipos de relacion no permitidos." 
+tags: [extraction, relation, llm, knowledge-graph, nlp, datascience, fuzzygraph, graph] +uses_functions: [] +uses_types: + - entity_candidate_py_datascience + - relation_candidate_py_datascience +returns: + - relation_candidate_py_datascience +returns_optional: false +error_type: "error_go_core" +imports: [logging, sys, os, typing] +tested: true +tests: + - "texto con dos entidades relacionadas" + - "texto con entidades pero sin relacion" + - "menos de dos entidades retorna lista vacia" + - "llm inventa entidad que no existe se descarta" +test_file_path: "python/functions/datascience/extract_relations_llm_test.py" +file_path: "python/functions/datascience/extract_relations_llm.py" +--- + +## Ejemplo + +```python +from extract_relations_llm import extract_relations_llm +from python.types.datascience.entity_candidate import EntityCandidate + +# Stub de llm_chat_json (en produccion usar llm_completion_retry o similar) +def my_llm(messages: list[dict]) -> dict: + # Llamar al LLM real aqui + return {"relations": [...]} + +entities = [ + EntityCandidate(name="Acme Corp", type_label="Organization", confidence=0.95), + EntityCandidate(name="John Smith", type_label="Person", confidence=0.9), +] + +relation_types = ["employs", "funds", "owns", "communicates_with", "related_to"] + +relations = extract_relations_llm( + text="Acme Corp employs John Smith as CEO and funds his research.", + entities=entities, + relation_types=relation_types, + llm_chat_json=my_llm, +) + +for rel in relations: + print(f"{rel.from_name} --[{rel.relation_type}]--> {rel.to_name} ({rel.confidence:.2f})") +# Acme Corp --[employs]--> John Smith (0.90) +# Acme Corp --[funds]--> John Smith (0.85) +``` + +## Notas + +**Inyeccion de dependencia del LLM:** `llm_chat_json` recibe una lista de mensajes en formato OpenAI (`[{"role": "system", "content": ...}, {"role": "user", "content": ...}]`) y retorna un dict con la clave `"relations"`. Esto desacopla la funcion de cualquier proveedor de LLM concreto. 
+ +**Validacion de entidades:** Solo se aceptan relaciones donde `from_name` y `to_name` aparecen exactamente en los nombres de las entidades proporcionadas. Relaciones con nombres inventados por el LLM se descartan silenciosamente (con debug log). + +**Fallback de tipo:** Si el LLM propone un `relation_type` que no esta en la lista permitida, se reemplaza por `"related_to"`. Si `"related_to"` tampoco esta en la lista, se incluye igualmente como catch-all seguro. + +**Menos de 2 entidades:** La funcion retorna `[]` inmediatamente sin llamar al LLM, ya que no puede haber relaciones con menos de 2 participantes. + +**Error handling:** Si `llm_chat_json` lanza una excepcion, se captura con warning y retorna `[]`. Si la respuesta no contiene la clave `"relations"` o no es una lista, idem. + +**Confianza:** Los valores de confianza del LLM se clampean al rango `[0.0, 1.0]`. Valores no numericos se convierten a `0.0`. + +Disenado para fuzzygraph — se compone con `extract_entities_llm` (paso anterior) y `deduplicate_relations` (paso siguiente en el pipeline de extraccion). 
logger = logging.getLogger(__name__)


def _clamp_confidence(value: object) -> float:
    """Coerce an LLM-provided confidence into a float within [0.0, 1.0].

    Anything that is not a real number (including booleans and NaN) maps to
    0.0; infinities are clamped to the nearest bound.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0.0
    if value != value:
        # NaN is the only value unequal to itself. Without this guard
        # min(1.0, nan) evaluates to 1.0 and NaN would become full confidence.
        return 0.0
    return float(max(0.0, min(1.0, value)))


def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between already-extracted entities using an injected LLM.

    The LLM is shown the chunk plus the entity list and asked for relations
    between pairs of them. Relations whose ``from_name`` or ``to_name`` is not
    exactly one of the given entity names are dropped. A relation type outside
    ``relation_types`` is replaced by ``"related_to"``.

    Args:
        text: Text chunk (the same one the entities were extracted from).
        entities: Entities extracted from the chunk; only ``name``,
            ``type_label`` and ``type_ref`` are read.
        relation_types: Allowed relation types, e.g. ``["funds", "employs"]``.
        llm_chat_json: Callable receiving a list of message dicts (``role`` and
            ``content``) and returning the parsed JSON response as a dict.
        language_instruction: Language rule appended to the prompt.

    Returns:
        The validated relations. Empty when fewer than two entities are given
        (the LLM is not called), when the LLM call raises, when the response
        is not a dict holding a ``relations`` list, or when nothing validates.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    relation_types_str = ", ".join(relation_types)

    # Built line by line so the whitespace of the prompt is explicit.
    system_prompt = "\n".join(
        [
            "You are a relation extraction expert. Given text and a list of entities already "
            "extracted, identify relationships between them.",
            "",
            "Entities found in this text:",
            entity_lines,
            "",
            f"Allowed relation types: {relation_types_str}",
            "",
            'Output JSON: {"relations": [',
            '  {"from_name": "Entity A", "to_name": "Entity B",',
            '   "relation_type": "employs", "description": "...", "confidence": 0.8}',
            "]}",
            "",
            "Rules:",
            "- Only extract relations explicitly stated or strongly implied in the text",
            "- from_name and to_name must match entity names exactly as listed above",
            "- relation_type must be one of the allowed types",
            "- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied",
            "- Do not invent entities not in the list above",
            f"- {language_instruction}",
        ]
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:  # boundary with an injected, arbitrary client
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # A non-dict response (None, list, str) or a missing key is treated the
    # same as a malformed 'relations' value: warn and return nothing.
    raw_relations = response.get("relations") if isinstance(response, dict) else None
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # isinstance runs first so unhashable values (e.g. a list) are
        # rejected instead of raising TypeError on the set lookup.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            # Catch-all used even when "related_to" is not in relation_types.
            relation_type = "related_to"

        # The LLM may send null or a non-string; always hand over text.
        description = item.get("description", "")
        description = "" if description is None else str(description)

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=_clamp_confidence(item.get("confidence", 0.0)),
            )
        )

    return results
entities = [_make_entity("Acme Corp", "Organization"), _make_entity("John Smith", "Person")] + relation_types = ["employs", "funds", "related_to"] + + llm_response = { + "relations": [ + { + "from_name": "Acme Corp", + "to_name": "John Smith", + "relation_type": "employs", + "description": "Acme Corp employs John Smith as CEO", + "confidence": 0.9, + } + ] + } + + result = extract_relations_llm( + text="Acme Corp employs John Smith as CEO.", + entities=entities, + relation_types=relation_types, + llm_chat_json=_make_llm(llm_response), + ) + + assert len(result) == 1 + rel = result[0] + assert rel.from_name == "Acme Corp" + assert rel.to_name == "John Smith" + assert rel.relation_type == "employs" + assert rel.confidence == 0.9 + assert "CEO" in rel.description + + +def test_texto_con_entidades_pero_sin_relacion(): + entities = [_make_entity("Alice", "Person"), _make_entity("Bob", "Person")] + relation_types = ["funds", "employs"] + + llm_response = {"relations": []} + + result = extract_relations_llm( + text="Alice and Bob both attended the conference.", + entities=entities, + relation_types=relation_types, + llm_chat_json=_make_llm(llm_response), + ) + + assert result == [] + + +def test_menos_de_dos_entidades_retorna_lista_vacia(): + entities = [_make_entity("Solo Corp", "Organization")] + relation_types = ["employs", "funds"] + + # El LLM nunca deberia ser llamado, pero si lo fuera retornaria relaciones + llm_response = { + "relations": [ + {"from_name": "Solo Corp", "to_name": "Nobody", "relation_type": "employs", "confidence": 0.9} + ] + } + + result = extract_relations_llm( + text="Solo Corp is a company.", + entities=entities, + relation_types=relation_types, + llm_chat_json=_make_llm(llm_response), + ) + + assert result == [] + + +def test_llm_inventa_entidad_que_no_existe_se_descarta(): + entities = [_make_entity("Alice", "Person"), _make_entity("Bob", "Person")] + relation_types = ["funds", "employs", "related_to"] + + llm_response = { + "relations": [ + 
# Valida — Alice y Bob existen + { + "from_name": "Alice", + "to_name": "Bob", + "relation_type": "funds", + "description": "Alice funds Bob", + "confidence": 0.8, + }, + # Invalida — "Charlie" no esta en entities + { + "from_name": "Alice", + "to_name": "Charlie", + "relation_type": "employs", + "description": "Alice employs Charlie", + "confidence": 0.7, + }, + # Invalida — "Unknown Corp" no esta en entities + { + "from_name": "Unknown Corp", + "to_name": "Bob", + "relation_type": "related_to", + "description": "...", + "confidence": 0.6, + }, + ] + } + + result = extract_relations_llm( + text="Alice funds Bob. Alice also employs Charlie from Unknown Corp.", + entities=entities, + relation_types=relation_types, + llm_chat_json=_make_llm(llm_response), + ) + + # Solo la primera relacion es valida + assert len(result) == 1 + assert result[0].from_name == "Alice" + assert result[0].to_name == "Bob" + assert result[0].relation_type == "funds" diff --git a/python/functions/datascience/hotness_score.md b/python/functions/datascience/hotness_score.md new file mode 100644 index 00000000..8c94f314 --- /dev/null +++ b/python/functions/datascience/hotness_score.md @@ -0,0 +1,72 @@ +--- +name: hotness_score +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def hotness_score(active_count: int, updated_at: datetime | None, now: datetime | None = None, half_life_days: float = 7.0) -> float" +description: "Calcula un score de hotness combinando frecuencia de acceso y recencia temporal. Util para ranking de resultados, memoria hot/cold y cache eviction." 
+tags: [ranking, decay, recency, frequency, scoring, cache, memory, datascience] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [math, datetime] +tested: true +tests: + - "active_count=0, updated_at reciente" + - "active_count=100, updated_at reciente (score alto)" + - "active_count=100, updated_at hace 30 dias (score bajo)" + - "updated_at=None (retorna 0.0)" + - "now explicito (determinista para tests)" + - "half_life_days custom" +test_file_path: "python/functions/datascience/hotness_score_test.py" +file_path: "python/functions/datascience/hotness_score.py" +--- + +## Ejemplo + +```python +from datetime import datetime, timedelta, timezone +from datascience.hotness_score import hotness_score + +now = datetime.now(timezone.utc) + +# Item reciente con muchos accesos -> score alto +score = hotness_score(active_count=150, updated_at=now - timedelta(hours=2), now=now) +# score > 0.95 + +# Item antiguo aunque muy accedido -> score bajo +score = hotness_score(active_count=150, updated_at=now - timedelta(days=30), now=now) +# score ~ 0.05 + +# Item sin fecha -> siempre 0 +score = hotness_score(active_count=999, updated_at=None) +# score == 0.0 +``` + +## Notas + +Formula: `score = sigmoid(log1p(active_count)) * exp(-ln(2)/half_life_days * age_days)` + +**Componente de frecuencia** — `sigmoid(log1p(count))` mapea enteros no negativos al rango `[0.5, 1.0)`: +- count=0 -> 0.5 +- count=10 -> ~0.92 +- count=100 -> ~0.99 + +**Componente de recencia** — decaimiento exponencial con vida media configurable: +- `half_life_days=7` (default): score se reduce a la mitad cada 7 dias +- `half_life_days=1`: decaimiento agresivo (util para feeds en tiempo real) +- `half_life_days=365`: decaimiento lento (util para contenido evergreen) + +**Propiedades del score:** +- `updated_at=None` -> 0.0 siempre (item sin fecha no tiene hotness) +- `active_count=0, reciente` -> ~0.5 (neutro pero fresco) +- `active_count alto, reciente` -> ~1.0 
(muy caliente) +- `active_count alto, antiguo` -> ~0.0 (frio a pesar de popularidad pasada) + +Timestamps sin timezone se interpretan como UTC. Pasar `now` explicitamente garantiza determinismo en tests y reproducibilidad en pipelines batch. + +Fuente conceptual: openviking/retrieve/memory_lifecycle.py (AGPL-3.0). Reimplementado desde cero con formula equivalente. diff --git a/python/functions/datascience/hotness_score.py b/python/functions/datascience/hotness_score.py new file mode 100644 index 00000000..c7d45690 --- /dev/null +++ b/python/functions/datascience/hotness_score.py @@ -0,0 +1,49 @@ +"""Hotness score — combining access frequency and recency decay.""" + +import math +from datetime import datetime, timezone + + +def hotness_score( + active_count: int, + updated_at: datetime | None, + now: datetime | None = None, + half_life_days: float = 7.0, +) -> float: + """Calcula un score de hotness combinando frecuencia de acceso y recencia. + + Formula: sigmoid(log1p(active_count)) * exp_decay(age_days, half_life_days) + + El componente de frecuencia mapea conteos enteros al rango (0, 1) via sigmoid(log1p). + El componente de recencia decae exponencialmente con vida media configurable. + + Args: + active_count: Numero de accesos o activaciones. Debe ser >= 0. + updated_at: Timestamp de la ultima actualizacion. None retorna 0.0. + now: Momento de referencia para calcular la edad. Si es None usa datetime.now(UTC). + half_life_days: Dias para que la recencia se reduzca a la mitad. Default 7. + + Returns: + float en [0.0, 1.0]. Valores mas cercanos a 1.0 indican mayor hotness. 
+ """ + if updated_at is None: + return 0.0 + + # Componente de frecuencia: sigmoid(log1p(count)) mapea 0..inf -> (0.5, 1.0) + freq = 1.0 / (1.0 + math.exp(-math.log1p(active_count))) + + # Componente de recencia: decaimiento exponencial + if now is None: + now = datetime.now(timezone.utc) + + # Normalizar ambos timestamps a UTC para comparacion segura + if updated_at.tzinfo is None: + updated_at = updated_at.replace(tzinfo=timezone.utc) + if now.tzinfo is None: + now = now.replace(tzinfo=timezone.utc) + + age_days = max((now - updated_at).total_seconds() / 86400.0, 0.0) + decay_rate = math.log(2) / half_life_days + recency = math.exp(-decay_rate * age_days) + + return freq * recency diff --git a/python/functions/datascience/hotness_score_test.py b/python/functions/datascience/hotness_score_test.py new file mode 100644 index 00000000..58854e17 --- /dev/null +++ b/python/functions/datascience/hotness_score_test.py @@ -0,0 +1,61 @@ +"""Tests para hotness_score.""" + +import math +from datetime import datetime, timedelta, timezone + +from hotness_score import hotness_score + +NOW = datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc) + + +def test_active_count_zero_updated_at_reciente(): + """active_count=0, updated_at reciente.""" + updated_at = NOW - timedelta(hours=1) + score = hotness_score(0, updated_at, now=NOW) + # freq = sigmoid(log1p(0)) = sigmoid(0) = 0.5 + # recency ~ 1.0 (casi nuevo) + assert 0.45 < score < 0.55, f"Expected ~0.5, got {score}" + + +def test_active_count_alto_updated_at_reciente(): + """active_count=100, updated_at reciente (score alto).""" + updated_at = NOW - timedelta(hours=1) + score = hotness_score(100, updated_at, now=NOW) + # freq = sigmoid(log1p(100)) = sigmoid(4.615) ~ 0.99 + # recency ~ 1.0 + assert score > 0.95, f"Expected > 0.95, got {score}" + + +def test_active_count_alto_updated_at_hace_30_dias(): + """active_count=100, updated_at hace 30 dias (score bajo).""" + updated_at = NOW - timedelta(days=30) + score = hotness_score(100, 
updated_at, now=NOW) + # recency = exp(-ln2/7 * 30) = exp(-2.97) ~ 0.051 + # score ~ 0.99 * 0.051 ~ 0.05 + assert score < 0.1, f"Expected < 0.1, got {score}" + + +def test_updated_at_none_retorna_cero(): + """updated_at=None (retorna 0.0).""" + score = hotness_score(100, None, now=NOW) + assert score == 0.0, f"Expected 0.0, got {score}" + + +def test_now_explicito(): + """now explicito (determinista para tests).""" + updated_at = NOW - timedelta(days=7) + score = hotness_score(50, updated_at, now=NOW) + # recency = exp(-ln2/7 * 7) = 0.5 + # freq = sigmoid(log1p(50)) ~ sigmoid(3.93) ~ 0.981 + expected = (1.0 / (1.0 + math.exp(-math.log1p(50)))) * 0.5 + assert abs(score - expected) < 1e-9, f"Expected {expected}, got {score}" + + +def test_half_life_days_custom(): + """half_life_days custom.""" + updated_at = NOW - timedelta(days=1) + # Con half_life=1 dia, despues de 1 dia recency = 0.5 + score = hotness_score(50, updated_at, now=NOW, half_life_days=1.0) + freq = 1.0 / (1.0 + math.exp(-math.log1p(50))) + expected = freq * 0.5 + assert abs(score - expected) < 1e-6, f"Expected {expected}, got {score}" diff --git a/python/functions/datascience/melt.md b/python/functions/datascience/melt.md new file mode 100644 index 00000000..5e3961a6 --- /dev/null +++ b/python/functions/datascience/melt.md @@ -0,0 +1,40 @@ +--- +name: melt +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def melt(rows: list[dict], id_vars: list[str], value_vars: list[str] | None = None, var_name: str = 'variable', value_name: str = 'value') -> list[dict]" +description: "Inversa de pivot. Convierte columnas en filas (formato largo). Cada combinacion de id_vars + value_var genera una fila. Si value_vars es None, derrite todas las columnas no-id." 
+tags: [datascience, tabular, melt, unpivot, transform, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: + - "Melt basico" + - "Multiples id_vars" + - "value_vars None derrite todas las columnas no-id" + - "Fila con campo faltante en value_vars" +test_file_path: "python/functions/datascience/melt_test.py" +file_path: "python/functions/datascience/melt.py" +--- + +## Ejemplo + +```python +rows = [{"region": "US", "q1": 10, "q2": 20}] +melt(rows, id_vars=["region"], value_vars=["q1", "q2"]) +# [{"region": "US", "variable": "q1", "value": 10}, +# {"region": "US", "variable": "q2", "value": 20}] +``` + +## Notas + +Funcion pura sin dependencias externas. +Si un campo de value_vars no existe en la fila, su valor sera None. +El parametro value_vars=None es util cuando se desconoce el schema exacto. diff --git a/python/functions/datascience/melt.py b/python/functions/datascience/melt.py new file mode 100644 index 00000000..465e4df4 --- /dev/null +++ b/python/functions/datascience/melt.py @@ -0,0 +1,40 @@ +"""Melt (unpivot) para datos tabulares list[dict].""" + + +def melt( + rows: list[dict], + id_vars: list[str], + value_vars: list[str] | None = None, + var_name: str = "variable", + value_name: str = "value", +) -> list[dict]: + """Convierte columnas en filas (formato largo). Inversa de pivot. + + Cada combinacion de id_vars + value_var genera una fila nueva. + Si value_vars es None, se usan todas las columnas que no esten en id_vars. + + Args: + rows: Lista de dicts en formato ancho. + id_vars: Columnas que se mantienen como identificadores en cada fila. + value_vars: Columnas a convertir en filas. None = todas las no-id. + var_name: Nombre de la columna que contendra los nombres de variables. + value_name: Nombre de la columna que contendra los valores. + + Returns: + Lista de dicts en formato largo con una fila por combinacion id+variable. 
+ """ + result = [] + for row in rows: + # Determinar que columnas derretir + if value_vars is None: + vars_to_melt = [k for k in row if k not in id_vars] + else: + vars_to_melt = value_vars + + for var in vars_to_melt: + new_row: dict = {k: row.get(k) for k in id_vars} + new_row[var_name] = var + new_row[value_name] = row.get(var) + result.append(new_row) + + return result diff --git a/python/functions/datascience/melt_test.py b/python/functions/datascience/melt_test.py new file mode 100644 index 00000000..c200c601 --- /dev/null +++ b/python/functions/datascience/melt_test.py @@ -0,0 +1,49 @@ +"""Tests para melt.""" + +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) + +from melt import melt + + +def test_melt_basico(): + """Melt basico.""" + rows = [{"region": "US", "q1": 10, "q2": 20}] + result = melt(rows, id_vars=["region"], value_vars=["q1", "q2"]) + assert len(result) == 2 + assert result[0] == {"region": "US", "variable": "q1", "value": 10} + assert result[1] == {"region": "US", "variable": "q2", "value": 20} + + +def test_melt_multiples_id_vars(): + """Multiples id_vars.""" + rows = [{"region": "US", "year": 2023, "q1": 10, "q2": 20}] + result = melt(rows, id_vars=["region", "year"], value_vars=["q1", "q2"]) + assert len(result) == 2 + assert result[0]["region"] == "US" + assert result[0]["year"] == 2023 + assert result[0]["variable"] == "q1" + assert result[0]["value"] == 10 + assert result[1]["variable"] == "q2" + assert result[1]["value"] == 20 + + +def test_melt_value_vars_none_derrite_todas_las_columnas_no_id(): + """value_vars None derrite todas las columnas no-id.""" + rows = [{"id": 1, "a": 10, "b": 20, "c": 30}] + result = melt(rows, id_vars=["id"]) + assert len(result) == 3 + vars_found = {r["variable"] for r in result} + assert vars_found == {"a", "b", "c"} + values_found = {r["value"] for r in result} + assert values_found == {10, 20, 30} + + +def test_melt_fila_con_campo_faltante_en_value_vars(): + """Fila con campo 
faltante en value_vars.""" + rows = [{"region": "US", "q1": 10}] # q2 no existe + result = melt(rows, id_vars=["region"], value_vars=["q1", "q2"]) + assert len(result) == 2 + q2_row = next(r for r in result if r["variable"] == "q2") + assert q2_row["value"] is None diff --git a/python/functions/datascience/merge_graphs.md b/python/functions/datascience/merge_graphs.md new file mode 100644 index 00000000..0bca1a69 --- /dev/null +++ b/python/functions/datascience/merge_graphs.md @@ -0,0 +1,68 @@ +--- +name: merge_graphs +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def merge_graphs(graphs: list[dict], entity_key: str = 'name', similarity_threshold: float = 0.85) -> dict" +description: "Mergea multiples grafos de conocimiento en uno deduplicando entities por similitud de nombre (Levenshtein normalizado). Relaciones se re-apuntan a las entities canonicas. Atributos se combinan por union." +tags: [graph, merge, deduplication, knowledge-graph, levenshtein, similarity, datascience] +uses_functions: [levenshtein_distance_py_cybersecurity] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [sys, os] +tested: true +tests: + - "dos grafos con entity duplicada → merge" + - "entities similares pero bajo threshold → no merge" + - "relaciones re-apuntadas correctamente" + - "merge log registra cada merge" + - "tres grafos → merge transitivo" + - "grafos sin overlap → concatenacion simple" +test_file_path: "python/functions/datascience/merge_graphs_test.py" +file_path: "python/functions/datascience/merge_graphs.py" +--- + +## Ejemplo + +```python +g1 = { + "entities": [ + {"id": "1", "name": "Alice Corp", "type": "company"}, + {"id": "2", "name": "Bob", "type": "person"}, + ], + "relations": [ + {"source_id": "2", "target_id": "1", "relation_type": "works_at"}, + ], +} +g2 = { + "entities": [ + {"id": "3", "name": "Alice Corp.", "type": "company", "country": "US"}, + ], + "relations": [], +} + +result = 
merge_graphs([g1, g2], similarity_threshold=0.85) +# result["entities"] -> 2 entities (Alice Corp. mergeada, Bob) +# result["merge_log"] -> [{"merged": ["1", "3"], "into": "3", "similarity": 0.9091}] +# "Alice Corp" mergeada en "Alice Corp." (la entity con mas campos no-null es la canonica) porque similitud >= 0.85 +``` + +## Notas + +Funcion pura. Reutiliza `levenshtein_distance_py_cybersecurity` para calcular similitud normalizada entre nombres. + +**Algoritmo de merge transitivo**: si A~B y B~C, entonces A, B, C se mergean en uno solo. Se implementa via union-find (path compression simple). + +**Eleccion de canonical**: la entity con mas campos no-null gana. En caso de empate, la primera encontrada en el par. + +**Conflictos de atributos**: si ambas entities tienen un campo con valor, el canonical conserva el suyo (primero gana). Solo se copian campos que el canonical no tiene o tiene null. + +**Deduplicacion de relaciones**: por (source_id, target_id, relation_type). Si dos relaciones son identicas tras re-apuntar los IDs, se conserva la primera encontrada. + +**Complejidad**: O(n^2) en numero de entities por la comparacion de pares. Adecuado para grafos de knowledge tipicos (< 10K entities). Para grafos muy grandes, usar indexado por prefijo antes de comparar. + +**Importacion**: intenta importar `levenshtein_distance` desde el paquete `cybersecurity` del registry. Si no esta disponible, usa una reimplementacion inline equivalente. 
"""merge_graphs: merge several knowledge graphs, deduplicating entities by name similarity."""

import sys
import os

# Try the shared levenshtein_distance from the sibling "cybersecurity" directory first.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "cybersecurity"))
try:
    from cybersecurity import levenshtein_distance
except ImportError:
    # Fallback: inline implementation used when that module cannot be imported.
    def levenshtein_distance(a: str, b: str) -> int:
        """Return the Levenshtein edit distance between two strings."""
        if len(a) < len(b):
            return levenshtein_distance(b, a)
        if len(b) == 0:
            return len(a)
        prev_row = list(range(len(b) + 1))
        for i, ca in enumerate(a):
            curr_row = [i + 1]
            for j, cb in enumerate(b):
                cost = 0 if ca == cb else 1
                curr_row.append(
                    min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
                )
            prev_row = curr_row
        return prev_row[-1]


def _name_similarity(a: str, b: str) -> float:
    """Return the case-insensitive normalized Levenshtein similarity in [0, 1].

    Two empty strings are considered identical (1.0).
    """
    max_len = max(len(a), len(b))
    if max_len == 0:
        return 1.0
    dist = levenshtein_distance(a.lower(), b.lower())
    return 1.0 - dist / max_len


def _count_non_null_fields(entity: dict) -> int:
    """Count the fields of *entity* whose value is not ``None``."""
    return sum(1 for v in entity.values() if v is not None)


def _merge_two_entities(canonical: dict, other: dict) -> dict:
    """Return a new dict with the union of both entities' fields.

    On conflict the canonical entity wins: a field is copied from *other*
    only when the canonical entity lacks it or holds ``None``.
    """
    merged = dict(canonical)
    for k, v in other.items():
        if k not in merged or merged[k] is None:
            merged[k] = v
    return merged


def _entity_name(entity: dict, entity_key: str) -> str:
    """Return the entity's comparable name, or "" when it is missing or ``None``."""
    value = entity.get(entity_key)
    return "" if value is None else str(value)


def merge_graphs(
    graphs: list[dict],
    entity_key: str = "name",
    similarity_threshold: float = 0.85,
) -> dict:
    """Merge several knowledge graphs into one, deduplicating similar entities.

    Algorithm:
        1. Collect the entities and relations of every graph.
        2. Compare every pair of original entities; when the similarity of
           their names is >= the threshold, union their clusters. Because the
           comparison uses the original names (not the current cluster
           representative), merging is transitive: A~B and B~C puts A, B and
           C in one cluster.
        3. The cluster representative ("canonical") is the entity with more
           non-None fields; on a tie the first one of the pair wins. Fields
           are combined with the canonical entity winning conflicts.
        4. Relations are re-pointed to canonical ids and deduplicated by
           (source_id, target_id, relation_type), keeping the first one seen.
        5. Every union is recorded in ``merge_log``.

    Entities whose ``entity_key`` is missing, ``None`` or empty are never
    merge candidates; otherwise all nameless entities would collapse into one.

    Args:
        graphs: Graph dicts with optional keys "entities" (list[dict]) and
            "relations" (list[dict]). Entities without an "id" are ignored.
        entity_key: Field compared to decide whether two entities match.
        similarity_threshold: Minimum normalized Levenshtein similarity, in
            [0, 1], for two entities to be merged.

    Returns:
        Dict with keys "entities" (canonical entities, in order of first
        appearance), "relations" and "merge_log". Each log entry has the
        keys "merged" ([absorbed_id, canonical_id]), "into" and "similarity".
    """
    all_entities: list[dict] = []
    all_relations: list[dict] = []
    for graph in graphs:
        all_entities.extend(graph.get("entities", []))
        all_relations.extend(graph.get("relations", []))

    # Keyed by id; dict order gives a stable, duplicate-free list of ids.
    entity_by_id: dict = {e["id"]: e for e in all_entities if "id" in e}
    entity_ids = list(entity_by_id)
    # id_map: entity id -> parent id (a root maps to itself).
    id_map: dict = {eid: eid for eid in entity_ids}
    # Names are captured before any merge so each original pair is compared.
    names = {eid: _entity_name(entity_by_id[eid], entity_key) for eid in entity_ids}

    merge_log: list[dict] = []

    def find_canonical(eid):
        """Return the root id of *eid*'s cluster, compressing the path walked."""
        path = []
        while id_map.get(eid, eid) != eid:
            path.append(eid)
            eid = id_map[eid]
        for visited in path:
            id_map[visited] = eid
        return eid

    # Pairwise comparison is O(n^2) in the number of entities.
    for i, first_id in enumerate(entity_ids):
        name_i = names[first_id]
        if not name_i:
            continue
        for second_id in entity_ids[i + 1:]:
            name_j = names[second_id]
            if not name_j:
                continue

            root_i = find_canonical(first_id)
            root_j = find_canonical(second_id)
            if root_i == root_j:
                continue  # already in the same cluster

            sim = _name_similarity(name_i, name_j)
            if sim < similarity_threshold:
                continue

            # Canonical = cluster with more non-None fields (first wins ties).
            fields_i = _count_non_null_fields(entity_by_id[root_i])
            fields_j = _count_non_null_fields(entity_by_id[root_j])
            if fields_i >= fields_j:
                canonical_id, other_id = root_i, root_j
            else:
                canonical_id, other_id = root_j, root_i

            entity_by_id[canonical_id] = _merge_two_entities(
                entity_by_id[canonical_id], entity_by_id[other_id]
            )
            id_map[other_id] = canonical_id

            merge_log.append({
                "merged": [other_id, canonical_id],
                "into": canonical_id,
                "similarity": round(sim, 4),
            })

    # Iterating the ordered id list (not a set) keeps the output deterministic.
    final_entities = [
        entity_by_id[eid] for eid in entity_ids if find_canonical(eid) == eid
    ]

    # Re-point relations to canonical ids and keep the first of each duplicate.
    unique_relations: dict = {}
    for rel in all_relations:
        new_rel = dict(rel)
        if "source_id" in new_rel:
            new_rel["source_id"] = find_canonical(new_rel["source_id"])
        if "target_id" in new_rel:
            new_rel["target_id"] = find_canonical(new_rel["target_id"])

        rel_key = (
            new_rel.get("source_id", ""),
            new_rel.get("target_id", ""),
            new_rel.get("relation_type", ""),
        )
        if rel_key not in unique_relations:
            unique_relations[rel_key] = new_rel

    return {
        "entities": final_entities,
        "relations": list(unique_relations.values()),
        "merge_log": merge_log,
    }
def test_dos_grafos_con_entity_duplicada_merge():
    """Two graphs holding an identically named entity collapse into one entity."""
    g1 = {
        "entities": [{"id": "1", "name": "Alice Corp", "type": "company"}],
        "relations": [],
    }
    g2 = {
        "entities": [{"id": "2", "name": "Alice Corp", "type": "company", "country": "US"}],
        "relations": [],
    }
    result = merge_graphs([g1, g2], similarity_threshold=0.95)
    # Identical names -> similarity 1.0 -> the two entities must merge.
    assert len(result["entities"]) == 1
    assert len(result["merge_log"]) == 1
    merged = result["entities"][0]
    # Both conditions must hold. The previous `or` made the check pass even
    # when the merge dropped "country", because the name always matches.
    assert merged.get("name") == "Alice Corp"
    assert merged.get("country") == "US"
"name": "OpenAI"}], + "relations": [], + } + g2 = { + "entities": [{"id": "2", "name": "OpenAI"}], + "relations": [], + } + result = merge_graphs([g1, g2], similarity_threshold=0.9) + assert len(result["merge_log"]) == 1 + log = result["merge_log"][0] + assert "merged" in log + assert "into" in log + assert "similarity" in log + assert log["similarity"] == 1.0 + + +def test_tres_grafos_merge_transitivo(): + # A~B y B~C -> A, B, C deben mergearse en uno + g1 = {"entities": [{"id": "1", "name": "Acme Corp"}], "relations": []} + g2 = {"entities": [{"id": "2", "name": "Acme Corp"}], "relations": []} + g3 = {"entities": [{"id": "3", "name": "Acme Corp"}], "relations": []} + result = merge_graphs([g1, g2, g3], similarity_threshold=0.9) + assert len(result["entities"]) == 1 + + +def test_grafos_sin_overlap_concatenacion_simple(): + g1 = { + "entities": [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}], + "relations": [{"source_id": "1", "target_id": "2", "relation_type": "knows"}], + } + g2 = { + "entities": [{"id": "3", "name": "Carol"}, {"id": "4", "name": "Dave"}], + "relations": [{"source_id": "3", "target_id": "4", "relation_type": "knows"}], + } + result = merge_graphs([g1, g2], similarity_threshold=0.85) + # Ninguna entity similar -> concatenacion directa + assert len(result["entities"]) == 4 + assert len(result["relations"]) == 2 + assert len(result["merge_log"]) == 0 + + +if __name__ == "__main__": + test_dos_grafos_con_entity_duplicada_merge() + test_entities_similares_pero_bajo_threshold_no_merge() + test_relaciones_re_apuntadas_correctamente() + test_merge_log_registra_cada_merge() + test_tres_grafos_merge_transitivo() + test_grafos_sin_overlap_concatenacion_simple() + print("All tests passed.") diff --git a/python/functions/datascience/pivot.md b/python/functions/datascience/pivot.md new file mode 100644 index 00000000..06edb060 --- /dev/null +++ b/python/functions/datascience/pivot.md @@ -0,0 +1,44 @@ +--- +name: pivot +kind: function +lang: py 
def pivot(
    rows: list[dict],
    index: str,
    columns: str,
    values: str,
    agg: str = "sum",
) -> list[dict]:
    """Turn long-format rows into a wide pivot table.

    Rows are grouped by the value of ``index``. Every distinct value found
    under ``columns`` becomes an output column, and the ``values`` entries
    that fall into the same (index, column) cell are combined with ``agg``.

    Args:
        rows: Long-format records.
        index: Key whose distinct values become the output rows.
        columns: Key whose distinct values become the output columns.
        values: Key holding the values to aggregate. ``None`` (or missing)
            values are skipped.
        agg: Aggregation name: sum, count, mean, min, max, first or last.

    Returns:
        One dict per distinct index value, in order of first appearance, with
        the ``index`` key plus one key per distinct column value (also in
        order of first appearance). Cells without data are filled with 0 when
        every collected value is an int/float (or when there is no data at
        all), otherwise with ``None``.

    Raises:
        ValueError: If ``agg`` is not a supported aggregation. This is
            checked up front, so it is raised even when ``rows`` is empty.
    """
    aggregators = {
        "sum": sum,
        "count": len,
        "mean": lambda vals: sum(vals) / len(vals),
        "min": min,
        "max": max,
        "first": lambda vals: vals[0],
        "last": lambda vals: vals[-1],
    }
    if agg not in aggregators:
        raise ValueError(f"Funcion de agregacion no soportada: {agg}")
    aggregate = aggregators[agg]

    # dict.fromkeys keeps the first occurrence of each value, in input order.
    index_order = list(dict.fromkeys(row.get(index) for row in rows))
    col_values = list(dict.fromkeys(row.get(columns) for row in rows))

    # cells[(index_value, column_value)] -> values collected for that cell.
    cells: dict[tuple, list] = {}
    for row in rows:
        val = row.get(values)
        if val is not None:
            cells.setdefault((row.get(index), row.get(columns)), []).append(val)

    # The fill value depends on whether all collected data is numeric;
    # all() over no data is True, so an empty table fills with 0.
    is_numeric = all(
        isinstance(v, (int, float)) for vals in cells.values() for v in vals
    )
    fill = 0 if is_numeric else None

    result = []
    for idx in index_order:
        record: dict = {index: idx}
        for col in col_values:
            vals = cells.get((idx, col))
            record[col] = aggregate(vals) if vals else fill
        result.append(record)

    return result
+ result = pivot(rows, index="region", columns="product", values="sales") + us = next(r for r in result if r["region"] == "US") + eu = next(r for r in result if r["region"] == "EU") + assert us["B"] == 0 + assert eu["A"] == 0 + + +def test_pivot_una_sola_fila(): + """Una sola fila.""" + rows = [{"region": "US", "product": "A", "sales": 42}] + result = pivot(rows, index="region", columns="product", values="sales") + assert len(result) == 1 + assert result[0]["region"] == "US" + assert result[0]["A"] == 42 + + +def test_pivot_multiples_valores_por_celda_requieren_agregacion(): + """Multiples valores por celda requieren agregacion.""" + rows = [ + {"region": "US", "product": "A", "sales": 10}, + {"region": "US", "product": "A", "sales": 30}, + ] + result_sum = pivot(rows, index="region", columns="product", values="sales", agg="sum") + assert result_sum[0]["A"] == 40 + + result_min = pivot(rows, index="region", columns="product", values="sales", agg="min") + assert result_min[0]["A"] == 10 + + result_max = pivot(rows, index="region", columns="product", values="sales", agg="max") + assert result_max[0]["A"] == 30 diff --git a/python/functions/finance/avellaneda_stoikov_quotes.md b/python/functions/finance/avellaneda_stoikov_quotes.md new file mode 100644 index 00000000..8add2c9e --- /dev/null +++ b/python/functions/finance/avellaneda_stoikov_quotes.md @@ -0,0 +1,48 @@ +--- +name: avellaneda_stoikov_quotes +kind: function +lang: py +domain: finance +version: "1.0.0" +purity: pure +signature: "avellaneda_stoikov_quotes(mid_price: float, inventory: float, gamma: float, sigma: float, spread_base: float, n_levels: int, qty_base: float) -> list[dict]" +description: "Genera ordenes de market maker usando el modelo Avellaneda-Stoikov. Calcula precio de reserva y half spread optimos segun inventario y volatilidad." 
def generate_gbm_prices(
    initial_price: float,
    n_ticks: int,
    sigma: float,
    mu: float = 0.0,
    jump_intensity: float = 0.0,
    jump_size_std: float = 0.05,
    seed: int = 42,
) -> list:
    """Generate a price path with Geometric Brownian Motion plus optional jumps.

    Each step applies
        S(t+1) = S(t) * exp((mu - sigma^2/2)*dt + sigma*sqrt(dt)*Z + J*N)
    with dt = 1 per tick, Z ~ N(0, 1), N ~ Bernoulli(jump_intensity) and
    J ~ N(0, jump_size_std).

    Args:
        initial_price: First price of the series.
        n_ticks: Number of prices to return (including the initial one).
        sigma: Per-tick volatility.
        mu: Per-tick drift.
        jump_intensity: Probability of a jump on each tick; 0 disables jumps.
        jump_size_std: Standard deviation of the (log) jump size.
        seed: Seed for the NumPy random generator; the same seed yields the
            same series.

    Returns:
        List of ``n_ticks`` prices starting at ``initial_price``; generated
        prices are plain Python floats. An empty list when ``n_ticks <= 0``
        (previously this raised IndexError).
    """
    import numpy as np

    if n_ticks <= 0:
        return []

    rng = np.random.default_rng(seed)
    dt = 1.0
    # Loop-invariant parts of the log-return.
    drift = (mu - 0.5 * sigma**2) * dt
    diffusion = sigma * np.sqrt(dt)

    prices = [initial_price]
    for _ in range(n_ticks - 1):
        # Draw order (normal, then uniform, then jump size) is kept fixed so
        # a given seed keeps producing the same path.
        z = rng.standard_normal()
        jump = 0.0
        if jump_intensity > 0 and rng.random() < jump_intensity:
            jump = rng.normal(0, jump_size_std)
        prices.append(float(prices[-1] * np.exp(drift + diffusion * z + jump)))
    return prices
def hawkes_intensity(
    base_rate: float,
    hawkes_alpha: float,
    hawkes_beta: float,
    event_times: list,
    current_time: float,
) -> float:
    """Evaluate the Hawkes-process intensity lambda(t) at ``current_time``.

    lambda(t) = base_rate + sum(hawkes_alpha * exp(-hawkes_beta * (t - ti)))
    over the past events ti. Only events strictly before ``current_time``
    contribute, and the result is floored at 0.0.

    Args:
        base_rate: Baseline intensity.
        hawkes_alpha: Intensity added by each event at the moment it occurs.
        hawkes_beta: Exponential decay rate of that contribution.
        event_times: Times of the recorded events.
        current_time: Time at which the intensity is evaluated.

    Returns:
        The non-negative intensity at ``current_time``.
    """
    import numpy as np

    excitation = 0
    for event_time in event_times:
        if event_time < current_time:
            elapsed = current_time - event_time
            excitation = excitation + hawkes_alpha * np.exp(-hawkes_beta * elapsed)

    intensity = base_rate + excitation
    return max(0.0, intensity)
+tags: [simulation, gbm, price, montecarlo, finance, stochastic] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/finance/finance.py" +--- + +## Ejemplo + +```python +prices = generate_gbm_prices( + initial_price=100.0, + n_ticks=1000, + sigma=0.02, + mu=0.0, + jump_intensity=0.01, + jump_size_std=0.05, + seed=42, +) +# prices[0] == 100.0 +# len(prices) == 1000 +``` + +## Notas + +Funcion pura — el seed fija el resultado deterministicamente. +`jump_intensity=0.0` desactiva los saltos (GBM puro). +`dt=1.0` por tick (tiempo discreto). Para tiempo continuo, ajustar sigma y mu en consecuencia. +Requiere numpy para la generacion de numeros aleatorios y el calculo de exp. diff --git a/python/functions/finance/generate_taker_order.md b/python/functions/finance/generate_taker_order.md new file mode 100644 index 00000000..0280a503 --- /dev/null +++ b/python/functions/finance/generate_taker_order.md @@ -0,0 +1,41 @@ +--- +name: generate_taker_order +kind: function +lang: py +domain: finance +version: "1.0.0" +purity: pure +signature: "generate_taker_order(alpha: float, size_min: float, size_max: float, buy_prob: float, seed: int | None) -> dict" +description: "Genera una market order de taker con tamano distribuido segun power-law (Pareto). Alpha bajo produce ordenes mas grandes (ballenas)." +tags: [simulation, taker, power-law, montecarlo, finance, order-book] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/finance/finance.py" +--- + +## Ejemplo + +```python +order = generate_taker_order( + alpha=2.0, + size_min=1.0, + size_max=100.0, + buy_prob=0.5, + seed=42, +) +# {'side': 'buy', 'qty': 3.7} +``` + +## Notas + +Funcion pura cuando se fija seed. Con seed=None el resultado es no deterministico. 
+La distribucion Pareto con alpha=2 modela bien la distribucion empirica de tamaños de ordenes en mercados reales. +`size_max` actua como techo (clipping) para evitar ordenes extremas. +Retorna dict con keys: `side` ('buy' o 'sell') y `qty` (float redondeado a 1 decimal). diff --git a/python/functions/finance/hawkes_intensity.md b/python/functions/finance/hawkes_intensity.md new file mode 100644 index 00000000..5c7c6717 --- /dev/null +++ b/python/functions/finance/hawkes_intensity.md @@ -0,0 +1,43 @@ +--- +name: hawkes_intensity +kind: function +lang: py +domain: finance +version: "1.0.0" +purity: pure +signature: "hawkes_intensity(base_rate: float, hawkes_alpha: float, hawkes_beta: float, event_times: list[float], current_time: float) -> float" +description: "Calcula la intensidad lambda(t) de un proceso de Hawkes en el tiempo actual. Modela la autocorrelacion temporal de eventos de mercado (rafagas de ordenes)." +tags: [simulation, hawkes, stochastic-process, montecarlo, finance, point-process] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/finance/finance.py" +--- + +## Ejemplo + +```python +intensity = hawkes_intensity( + base_rate=1.0, + hawkes_alpha=0.8, + hawkes_beta=2.0, + event_times=[0.5, 1.2, 1.8], + current_time=2.5, +) +# Intensidad > base_rate por excitacion de eventos pasados +``` + +## Notas + +Funcion pura — determinista dado el mismo historial de eventos. +`hawkes_alpha` controla la magnitud del salto de intensidad por evento. +`hawkes_beta` controla la velocidad de decaimiento (mayor beta = decaimiento mas rapido). +La condicion de estabilidad del proceso es hawkes_alpha < hawkes_beta. +Eventos con ti >= current_time se ignoran automaticamente. +Retorna max(0.0, ...) para garantizar intensidad no negativa. 
diff --git a/python/functions/pipelines/extraction_pipeline.md b/python/functions/pipelines/extraction_pipeline.md new file mode 100644 index 00000000..81e5149b --- /dev/null +++ b/python/functions/pipelines/extraction_pipeline.md @@ -0,0 +1,123 @@ +--- +name: extraction_pipeline +kind: pipeline +lang: py +domain: pipelines +version: "1.0.0" +purity: impure +signature: "def extraction_pipeline(file_path: str, entity_presets: list[dict], relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], chunk_size: int = 500, chunk_overlap: int = 50, confidence_threshold: float = 0.5, dedup_threshold: float = 0.85, on_progress: Callable[[str, float], None] | None = None) -> ExtractionResult" +description: "Pipeline completa de extraccion de entidades y relaciones desde un documento. Orquesta extract_text_from_file -> preprocess_text -> split_text_into_chunks -> extract_entities_llm por chunk -> deduplicate_entities -> extract_relations_llm por chunk -> deduplicate_relations." +tags: [pipeline, extraction, entities, relations, llm, nlp, fuzzygraph, datascience] +uses_functions: + - extract_text_from_file_py_core + - preprocess_text_py_core + - split_text_into_chunks_py_core + - build_entity_schema_prompt_py_datascience + - build_relation_schema_prompt_py_datascience + - extract_entities_llm_py_datascience + - extract_relations_llm_py_datascience + - deduplicate_entities_py_datascience + - deduplicate_relations_py_datascience +uses_types: + - entity_candidate_py_datascience + - extraction_result_py_datascience + - extraction_stats_py_datascience + - relation_candidate_py_datascience +returns: + - extraction_result_py_datascience +returns_optional: false +error_type: "error_go_core" +imports: + - time + - warnings + - typing.Callable +tested: true +tests: + - "documento con entidades y relaciones retorna ExtractionResult completo" + - "documento vacio retorna ExtractionResult con listas vacias" + - "documento sin entidades detectables retorna listas vacias" + - 
"archivo no encontrado lanza FileNotFoundError" + - "entity presets vacio lanza ValueError" + - "progress callback se invoca durante la ejecucion" + - "stats se rellenan correctamente con conteos y tiempo" +test_file_path: "python/functions/pipelines/extraction_pipeline_test.py" +file_path: "python/functions/pipelines/extraction_pipeline.py" +--- + +## Ejemplo + +```python +from python.functions.pipelines.extraction_pipeline import extraction_pipeline + +entity_presets = [ + { + "type_ref": "osint_person_go_cybersecurity", + "label": "Person", + "metadata_fields": ["full_name", "alias", "nationality"], + }, + { + "type_ref": "osint_domain_go_cybersecurity", + "label": "Domain", + "metadata_fields": ["fqdn", "registrar"], + }, +] + +relation_types = ["operates", "owns", "funds", "communicates_with", "related_to"] + +# Inyectar un cliente LLM real +def llm_chat_json(messages): + # llamada al proveedor LLM elegido + ... + +result = extraction_pipeline( + file_path="report.pdf", + entity_presets=entity_presets, + relation_types=relation_types, + llm_chat_json=llm_chat_json, + chunk_size=500, + chunk_overlap=50, + confidence_threshold=0.5, + dedup_threshold=0.85, + on_progress=lambda msg, pct: print(f"[{pct:.0%}] {msg}"), +) + +print(f"Entities: {len(result.entities)}, Relations: {len(result.relations)}") +print(f"Stats: {result.stats}") + +# Integrar con fuzzygraph / operations.db +for entity in result.entities: + db.add_entity( + name=entity.name, + type_ref=entity.type_ref, + metadata=entity.attributes, + ) + +for relation in result.relations: + db.add_relation( + name=relation.relation_type, + from_entity=relation.from_id, + to_entity=relation.to_id, + ) +``` + +## Algoritmo + +1. **Extract:** `extract_text_from_file(file_path)` — texto crudo desde PDF, TXT, Markdown +2. **Preprocess:** `preprocess_text(text)` — normaliza espacios, caracteres especiales +3. **Split:** `split_text_into_chunks(text, chunk_size, chunk_overlap)` — divide en ventanas solapadas +4. 
"""Pipeline for extracting entities and relations from a document."""

from __future__ import annotations

import sys
import os
import time
import warnings
from typing import Callable

# Allow execution both from the registry root and from this file's directory.
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)

from python.functions.core.extract_text_from_file import extract_text_from_file
from python.functions.core.core import preprocess_text
from python.functions.core.split_text_into_chunks import split_text_into_chunks
from python.functions.datascience.build_entity_schema_prompt import build_entity_schema_prompt
from python.functions.datascience.build_relation_schema_prompt import build_relation_schema_prompt
from python.functions.datascience.extract_entities_llm import extract_entities_llm
from python.functions.datascience.extract_relations_llm import extract_relations_llm
from python.functions.datascience.deduplicate_entities import deduplicate_entities
from python.functions.datascience.deduplicate_relations import deduplicate_relations
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.extraction_result import ExtractionResult
from python.types.datascience.extraction_stats import ExtractionStats


def _extract_entities_by_chunk(
    chunks: list,
    entity_presets: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    report: Callable[[str, float], None],
) -> list[EntityCandidate]:
    """Run entity extraction on every chunk (progress band 0-40%).

    A chunk whose LLM call raises is reported through ``warnings.warn`` and
    contributes no candidates. Every returned candidate has the index of the
    chunk it came from recorded in ``source_chunk_indices``.
    """
    total = len(chunks)
    collected: list[EntityCandidate] = []
    for index, chunk in enumerate(chunks):
        report(f"Extracting entities from chunk {index + 1}/{total}", (index / total) * 0.4)
        try:
            candidates = extract_entities_llm(
                text=chunk,
                entity_schema=entity_presets,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            # Per-chunk robustness: one bad chunk must not abort the document.
            warnings.warn(
                f"extraction_pipeline: error en extract_entities_llm chunk {index}: {exc}"
            )
            candidates = []

        for candidate in candidates:
            if index not in candidate.source_chunk_indices:
                candidate.source_chunk_indices.append(index)
            collected.append(candidate)
    return collected


def _extract_relations_by_chunk(
    chunks: list,
    final_entities: list,
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    report: Callable[[str, float], None],
) -> list:
    """Run relation extraction on every chunk (progress band 40-80%).

    For each chunk the entities tagged with that chunk index are used; when
    none are tagged, all deduplicated entities are used instead. Chunks left
    with fewer than two entities are skipped. A chunk whose LLM call raises
    is reported through ``warnings.warn`` and contributes no relations.
    """
    total = len(chunks)
    collected = []
    for index, chunk in enumerate(chunks):
        report("Extracting relations...", 0.4 + (index / total) * 0.4)

        chunk_entities = [
            entity for entity in final_entities if index in entity.source_chunk_indices
        ]
        if not chunk_entities:
            chunk_entities = final_entities
        if len(chunk_entities) < 2:
            # A relation needs two endpoints; nothing to ask the LLM.
            continue

        try:
            chunk_relations = extract_relations_llm(
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_relations_llm chunk {index}: {exc}"
            )
            chunk_relations = []

        for relation in chunk_relations:
            relation.source_chunk_index = index
        collected.extend(chunk_relations)
    return collected


def extraction_pipeline(
    file_path: str,
    entity_presets: list[dict],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    confidence_threshold: float = 0.5,
    dedup_threshold: float = 0.85,
    on_progress: Callable[[str, float], None] | None = None,
) -> ExtractionResult:
    """Extract deduplicated entities and relations from a document.

    Orchestrates extract_text_from_file -> preprocess_text ->
    split_text_into_chunks -> extract_entities_llm per chunk -> confidence
    filter -> deduplicate_entities -> extract_relations_llm per chunk ->
    deduplicate_relations.

    Args:
        file_path: Path of the file to process.
        entity_presets: Entity schema passed to ``extract_entities_llm``;
            a list of dicts such as
            ``{"type_ref": ..., "label": ..., "metadata_fields": [...]}``.
        relation_types: Relation types passed to ``extract_relations_llm``.
        llm_chat_json: Injected LLM client; forwarded untouched to the
            extraction functions.
        chunk_size: Forwarded to ``split_text_into_chunks``.
        chunk_overlap: Forwarded to ``split_text_into_chunks`` as ``overlap``.
        confidence_threshold: Candidates with a lower ``confidence`` are
            dropped before deduplication.
        dedup_threshold: Forwarded to ``deduplicate_entities`` as
            ``name_threshold``.
        on_progress: Optional ``(message, fraction)`` callback. Fractions run
            0-0.4 during entity extraction, 0.4-0.8 during relation
            extraction, 0.8 at relation deduplication and 1.0 when done.
            A callback that raises is reported with a warning and otherwise
            ignored.

    Returns:
        ExtractionResult with the deduplicated entities and relations plus
        the stats collected along the way. If the text yields no chunks, the
        result has empty lists.

    Raises:
        ValueError: If ``entity_presets`` is empty (checked first).
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not entity_presets:
        raise ValueError("entity_presets no puede estar vacio")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")

    def _progress(msg: str, pct: float) -> None:
        # Progress reporting is best-effort: a faulty callback must never
        # abort the extraction, but the failure should not vanish silently.
        if on_progress is None:
            return
        try:
            on_progress(msg, pct)
        except Exception as exc:
            warnings.warn(f"extraction_pipeline: error en on_progress: {exc}")

    start_time = time.monotonic()
    stats = ExtractionStats()

    # Step 1: extract text. A reader failure degrades to an empty document.
    _progress("Extracting text from file...", 0.0)
    try:
        raw_text = extract_text_from_file(file_path)
    except Exception as exc:
        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
        raw_text = ""

    # Step 2: preprocess.
    clean_text = preprocess_text(raw_text)
    stats.total_chars = len(clean_text)

    # Step 3: split into chunks.
    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
    n = len(chunks)
    stats.total_chunks = n

    if n == 0:
        stats.processing_time_seconds = time.monotonic() - start_time
        return ExtractionResult(entities=[], relations=[], stats=stats)

    # Step 4: extract entity candidates chunk by chunk.
    all_raw_entities = _extract_entities_by_chunk(
        chunks, entity_presets, llm_chat_json, _progress
    )

    # Step 5: keep only candidates that reach the confidence threshold.
    filtered_entities = [
        e for e in all_raw_entities if e.confidence >= confidence_threshold
    ]
    stats.raw_entities_count = len(filtered_entities)

    for ent in filtered_entities:
        stats.entity_types_found[ent.type_ref] = (
            stats.entity_types_found.get(ent.type_ref, 0) + 1
        )

    # Step 6: deduplicate entities.
    _progress("Deduplicating entities...", 0.4)
    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)

    stats.final_entities_count = dedup_result.total_after
    stats.entities_merged = dedup_result.total_before - dedup_result.total_after

    final_entities = dedup_result.entities
    entity_id_map = dedup_result.name_to_id  # original name -> entity id

    # Step 7: extract relation candidates chunk by chunk.
    all_raw_relations = _extract_relations_by_chunk(
        chunks, final_entities, relation_types, llm_chat_json, _progress
    )
    stats.raw_relations_count = len(all_raw_relations)

    for rel in all_raw_relations:
        stats.relation_types_found[rel.relation_type] = (
            stats.relation_types_found.get(rel.relation_type, 0) + 1
        )

    # Step 8: deduplicate relations.
    _progress("Deduplicating relations...", 0.8)
    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)

    stats.final_relations_count = len(final_relations)
    stats.relations_merged = stats.raw_relations_count - len(final_relations)
    stats.processing_time_seconds = time.monotonic() - start_time

    _progress("Done", 1.0)

    return ExtractionResult(
        entities=final_entities,
        relations=final_relations,
        stats=stats,
    )
"""Tests for extraction_pipeline."""

from __future__ import annotations

import os
import sys
import tempfile

_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)

from python.functions.pipelines.extraction_pipeline import extraction_pipeline


# -- LLM stubs ----------------------------------------------------------------

def _llm_with_entities(messages: list[dict]) -> dict:
    """LLM stub: fixed entities for entity prompts, one fixed relation otherwise.

    The kind of call is decided by looking for "entity"/"entities" in the
    content of the first message.
    """
    first_content = (messages[0]["content"] if messages else "").lower()
    is_entity_call = "entity" in first_content or "entities" in first_content
    if not is_entity_call:
        # Relation extraction call.
        return {
            "relations": [
                {
                    "from_name": "John Smith",
                    "to_name": "evil-corp.com",
                    "relation_type": "operates",
                    "description": "John Smith operates evil-corp.com",
                    "confidence": 0.8,
                }
            ]
        }
    return {
        "entities": [
            {
                "name": "John Smith",
                "type_ref": "osint_person_go_cybersecurity",
                "attributes": {"full_name": "John Smith", "nationality": "US"},
                "confidence": 0.95,
            },
            {
                "name": "evil-corp.com",
                "type_ref": "osint_domain_go_cybersecurity",
                "attributes": {"fqdn": "evil-corp.com"},
                "confidence": 0.88,
            },
        ]
    }


def _llm_empty(messages: list[dict]) -> dict:
    """LLM stub that always answers with an empty entity or relation list."""
    first_content = messages[0]["content"] if messages else ""
    key = "entities" if "entit" in first_content.lower() else "relations"
    return {key: []}


ENTITY_PRESETS = [
    {
        "type_ref": "osint_person_go_cybersecurity",
        "label": "Person",
        "metadata_fields": ["full_name", "alias", "nationality"],
    },
    {
        "type_ref": "osint_domain_go_cybersecurity",
        "label": "Domain",
        "metadata_fields": ["fqdn", "registrar"],
    },
]

RELATION_TYPES = ["operates", "owns", "funds", "communicates_with", "related_to"]


def _write_temp_text(text: str) -> str:
    """Write *text* to a new UTF-8 ``.txt`` temp file and return its path.

    The file is created with ``delete=False``; the caller removes it.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(text)
        return handle.name


# -- Tests --------------------------------------------------------------------

def test_documento_con_entidades_y_relaciones():
    """A document with entities and relations yields a populated result."""
    # Registry test name:
    # "documento con entidades y relaciones retorna ExtractionResult completo"
    tmp_path = _write_temp_text(
        "John Smith, a US national, operates the domain evil-corp.com. "
        "He was identified as the main administrator of the infrastructure."
    )
    try:
        result = extraction_pipeline(
            file_path=tmp_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_with_entities,
            chunk_size=500,
            chunk_overlap=50,
            confidence_threshold=0.5,
            dedup_threshold=0.85,
        )
        assert result is not None
        assert len(result.entities) >= 1
        assert result.stats.total_chunks >= 1
        assert result.stats.total_chars > 0
    finally:
        os.unlink(tmp_path)


def test_documento_vacio():
    """An empty document yields empty entity and relation lists."""
    # Registry test name: "documento vacio retorna ExtractionResult con listas vacias"
    tmp_path = _write_temp_text("")
    try:
        result = extraction_pipeline(
            file_path=tmp_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_empty,
        )
        assert result is not None
        assert result.entities == []
        assert result.relations == []
        assert result.stats.total_chunks == 0
    finally:
        os.unlink(tmp_path)


def test_documento_sin_entidades_detectables():
    """A document where the LLM finds nothing yields empty lists."""
    # Registry test name: "documento sin entidades detectables retorna listas vacias"
    tmp_path = _write_temp_text(
        "The weather is nice today. The sun shines brightly over the mountains."
    )
    try:
        result = extraction_pipeline(
            file_path=tmp_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_empty,
            confidence_threshold=0.5,
        )
        assert result is not None
        assert result.entities == []
        assert result.relations == []
        assert result.stats.raw_entities_count == 0
    finally:
        os.unlink(tmp_path)


def test_archivo_no_encontrado_lanza_filenotfounderror():
    """A missing input file raises FileNotFoundError."""
    # Registry test name: "archivo no encontrado lanza FileNotFoundError"
    import pytest

    with pytest.raises(FileNotFoundError):
        extraction_pipeline(
            file_path="/tmp/no_existe_para_test_extraccion_pipeline.txt",
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_empty,
        )


def test_entity_presets_vacio_lanza_valueerror():
    """An empty entity preset list raises ValueError."""
    # Registry test name: "entity presets vacio lanza ValueError"
    import pytest

    tmp_path = _write_temp_text("some text")
    try:
        with pytest.raises(ValueError):
            extraction_pipeline(
                file_path=tmp_path,
                entity_presets=[],
                relation_types=RELATION_TYPES,
                llm_chat_json=_llm_empty,
            )
    finally:
        os.unlink(tmp_path)


def test_progress_callback_se_invoca():
    """The progress callback is invoked while the pipeline runs."""
    # Registry test name: "progress callback se invoca durante la ejecucion"
    calls: list[tuple[str, float]] = []

    def _on_progress(msg: str, pct: float) -> None:
        calls.append((msg, pct))

    tmp_path = _write_temp_text("John Smith operates evil-corp.com.")
    try:
        extraction_pipeline(
            file_path=tmp_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_with_entities,
            on_progress=_on_progress,
        )
        assert len(calls) > 0
        messages = [call[0] for call in calls]
        assert any("Extracting" in m or "Done" in m or "Dedup" in m for m in messages)
    finally:
        os.unlink(tmp_path)


def test_stats_se_rellenan_correctamente():
    """Stats carry character and chunk counts and a non-negative duration."""
    # Registry test name: "stats se rellenan correctamente con conteos y tiempo"
    tmp_path = _write_temp_text(
        "John Smith, a US national, operates the domain evil-corp.com."
    )
    try:
        result = extraction_pipeline(
            file_path=tmp_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_with_entities,
        )
        assert result.stats.total_chars > 0
        assert result.stats.total_chunks >= 1
        assert result.stats.processing_time_seconds >= 0.0
    finally:
        os.unlink(tmp_path)
+tags: [montecarlo, simulation, market, launcher, finance, microstructure] +uses_functions: + - run_market_sim_py_pipelines +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/pipelines/monte_carlo_market.py" +--- + +## Ejemplo + +```bash +# 10 simulaciones con sigma y gamma variables +python python/functions/pipelines/monte_carlo_market.py -n 10 +``` + +```python +from monte_carlo_market import monte_carlo_market + +results = monte_carlo_market( + n_simulations=50, + base_params={'n_ticks': 300, 'n_makers': 3}, + vary_params={ + 'sigma': (0.005, 0.05), + 'gamma': (0.01, 1.0), + 'hawkes_alpha': (0.1, 0.9), + }, + seed_start=42, +) +# Cada resultado tiene: sim_id, seed, sigma, gamma, hawkes_alpha, +# total_trades, mean_spread, std_spread, mean_trades_per_tick, +# price_return, maker_total_pnl, realized_vol +``` + +## Flujo + +1. Para cada simulación i en range(n_simulations): + - Tomar `base_params` + `seed = seed_start + i` + - Samplear `vary_params` uniformemente con rng derivado de `seed_start` + - Llamar `run_market_sim(**params)` + - Calcular métricas resumen sobre el resultado +2. Reportar progreso cada 10% de simulaciones +3. Retornar lista de dicts con params usados + métricas + +## Métricas por simulación + +| Campo | Descripción | +|---|---| +| `total_trades` | Número total de trades en la simulación | +| `mean_spread` | Spread bid-ask medio | +| `std_spread` | Desviación estándar del spread | +| `mean_trades_per_tick` | Intensidad media del flujo de órdenes | +| `price_return` | Retorno % del precio fundamental | +| `maker_total_pnl` | PnL agregado de todos los makers | +| `realized_vol` | Volatilidad realizada de los trade prices (si hay trades) | + +## Notas + +`vary_params` acepta cualquier parámetro válido de `run_market_sim` como clave, con valor `(min, max)`. 
"""Run N market simulations with varied parameters for Monte Carlo analysis."""

import sys
import os
import json


def monte_carlo_market(
    n_simulations: int = 100,
    base_params: dict | None = None,
    vary_params: dict | None = None,
    seed_start: int = 0,
) -> list[dict]:
    """Run ``n_simulations`` market simulations while varying parameters.

    Each simulation ``i`` calls ``run_market_sim`` with ``base_params``,
    ``seed = seed_start + i`` and one fresh draw per entry of ``vary_params``
    (which overrides ``base_params`` and ``seed`` on a name clash). All draws
    come from a single generator seeded with ``seed_start``.

    Args:
        n_simulations: Number of simulations to run.
        base_params: Fixed keyword arguments for ``run_market_sim``.
        vary_params: Mapping ``param_name -> (min, max)``. Parameters whose
            ``run_market_sim`` default is an ``int`` are drawn uniformly from
            the integers in ``[ceil(min), floor(max)]``; every other
            parameter is drawn uniformly from ``[min, max)`` and rounded to
            6 decimals.
        seed_start: Seed of the sampling generator and of the first
            simulation.

    Returns:
        One dict per simulation with ``sim_id``, ``seed``, the varied
        parameter values and the summary metrics ``total_trades``,
        ``mean_spread``, ``std_spread``, ``mean_trades_per_tick``,
        ``price_return`` (percent) and ``maker_total_pnl``. ``realized_vol``
        (standard deviation of log returns of positive trade prices) is
        present only when more than one log return is available.

    Side effects:
        Prints a progress line roughly every 10% of the simulations and adds
        the registry's function directories to ``sys.path``.
    """
    import inspect
    import math

    import numpy as np

    registry_root = os.environ.get('FN_REGISTRY_ROOT', os.path.expanduser('~/fn_registry'))
    functions_dir = os.path.join(registry_root, 'python', 'functions')
    # Register each directory once; inserting on every call made sys.path
    # grow by two entries per invocation.
    for path in (functions_dir, os.path.join(functions_dir, 'pipelines')):
        if path not in sys.path:
            sys.path.insert(0, path)
    from run_market_sim import run_market_sim

    if base_params is None:
        base_params = {}
    if vary_params is None:
        vary_params = {}

    # Parameters such as n_ticks or n_makers must stay integers: a float from
    # rng.uniform breaks range(n_ticks) / [0.0] * n_makers in the simulator.
    integer_params = {
        name
        for name, param in inspect.signature(run_market_sim).parameters.items()
        if isinstance(param.default, int) and not isinstance(param.default, bool)
    }

    rng = np.random.default_rng(seed_start)
    results = []
    report_every = max(1, n_simulations // 10)

    for i in range(n_simulations):
        params = dict(base_params)
        params['seed'] = seed_start + i

        # Draw the varied parameters for this simulation.
        varied = {}
        for pname, (pmin, pmax) in vary_params.items():
            if pname in integer_params:
                value = int(rng.integers(math.ceil(pmin), math.floor(pmax), endpoint=True))
            else:
                value = round(rng.uniform(pmin, pmax), 6)
            params[pname] = value
            varied[pname] = value

        sim = run_market_sim(**params)

        # Summary metrics.
        spreads = sim['spreads']
        trade_prices = sim['trade_prices']
        n_per_tick = sim['n_trades_per_tick']
        fundamentals = sim['fundamental_prices']

        result = {
            'sim_id': i,
            'seed': params['seed'],
            **varied,
            'total_trades': sim['total_trades'],
            'mean_spread': round(np.mean(spreads), 6) if spreads else 0,
            'std_spread': round(np.std(spreads), 6) if spreads else 0,
            'mean_trades_per_tick': round(np.mean(n_per_tick), 2),
            'price_return': round((fundamentals[-1] / fundamentals[0] - 1) * 100, 4),
            'maker_total_pnl': round(sum(sim['maker_pnls']), 2),
        }

        if trade_prices:
            tp = np.array(trade_prices)
            # Non-positive prices are dropped before taking logs.
            log_ret = np.diff(np.log(tp[tp > 0]))
            if len(log_ret) > 1:
                result['realized_vol'] = round(float(np.std(log_ret)), 6)

        results.append(result)

        if (i + 1) % report_every == 0:
            print(f'  {i+1}/{n_simulations} simulaciones completadas')

    return results


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', type=int, default=10)
    args = parser.parse_args()

    results = monte_carlo_market(
        n_simulations=args.n,
        base_params={'n_ticks': 200},
        vary_params={'sigma': (0.005, 0.05), 'gamma': (0.01, 1.0)},
    )
    # With -n 0 there is no last result to show.
    if results:
        print(json.dumps(results[-1], indent=2))
    print(f'\n{len(results)} simulaciones completadas')
Retorna trades, spreads, midprices y PnL de makers." +tags: [simulation, market, matching-engine, montecarlo, launcher, finance, microstructure] +uses_functions: + - generate_gbm_prices_py_finance + - avellaneda_stoikov_quotes_py_finance +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [numpy] +tested: false +tests: [] +test_file_path: "" +file_path: "python/functions/pipelines/run_market_sim.py" +--- + +## Ejemplo + +```bash +python python/functions/pipelines/run_market_sim.py +# { +# "total_trades": 1234, +# "mean_spread": 0.4821, +# "maker_pnls": [12.5, -3.2, 8.1, 5.6, -1.4] +# } +``` + +```python +from run_market_sim import run_market_sim + +result = run_market_sim( + initial_price=100.0, + n_ticks=200, + sigma=0.01, + n_makers=3, + seed=0, +) +print(result['total_trades']) +print(result['maker_pnls']) +``` + +## Flujo + +1. `generate_gbm_prices` — genera la serie de precios fundamentales con GBM + saltos +2. Loop por ticks: + - Cada maker coloca quotes via `avellaneda_stoikov_quotes` + - Takers llegan según Poisson con intensidad modulada por excitación Hawkes + - Tamaños de taker siguen distribución Pareto (power-law) + - Matching FIFO sobre el order book simplificado + - Excitación Hawkes decae exponencialmente entre ticks +3. Mark-to-market final de inventarios de makers + +## Notas + +Los parámetros Hawkes (`hawkes_alpha`, `hawkes_beta`) controlan la autocorrelación del flujo de órdenes. +`branching_ratio = hawkes_alpha / hawkes_beta`; si > 1, el proceso es explosivo. Esa razón corresponde al proceso en tiempo continuo; en esta implementación discreta la excitación se multiplica por `exp(-hawkes_beta)` en cada tick y suma `hawkes_alpha` por cada fill, por lo que (asumiendo aproximadamente un fill por taker) la condición práctica de estabilidad es `hawkes_alpha < 1 - exp(-hawkes_beta)`, más estricta que `hawkes_alpha < hawkes_beta`. +El matching es simplificado: no hay cancelaciones intra-tick, el book se reconstituye en cada tick. 
"""Run a complete market simulation with a FIFO matching engine."""

import sys
import os
import json


def run_market_sim(
    initial_price: float = 100.0,
    n_ticks: int = 500,
    sigma: float = 0.02,
    mu: float = 0.0,
    jump_intensity: float = 0.02,
    jump_size_std: float = 0.05,
    n_makers: int = 5,
    maker_spread: float = 0.5,
    gamma: float = 0.1,
    maker_levels: int = 3,
    maker_qty: float = 10.0,
    n_takers_lambda: float = 2.0,
    taker_size_alpha: float = 2.0,
    taker_size_min: float = 1.0,
    taker_size_max: float = 100.0,
    hawkes_alpha: float = 0.5,
    hawkes_beta: float = 1.0,
    seed: int = 42,
) -> dict:
    """Simulate a market of quoting makers and randomly arriving takers.

    Per tick: every maker quotes around the fundamental price through
    ``avellaneda_stoikov_quotes``, a Poisson number of takers arrives, and
    each taker order walks the opposite side of the book from the best price.
    The book is rebuilt from scratch on every tick.

    Args:
        initial_price, n_ticks, sigma, mu, jump_intensity, jump_size_std:
            Forwarded (together with ``seed``) to ``generate_gbm_prices``,
            which supplies the fundamental price series. ``n_ticks`` is also
            the number of simulated ticks.
        n_makers: Number of makers quoting each tick.
        maker_spread, gamma, maker_levels, maker_qty: Forwarded, together
            with ``sigma`` and the maker's inventory, to
            ``avellaneda_stoikov_quotes``.
        n_takers_lambda: Base Poisson rate of taker arrivals per tick; the
            effective rate is ``max(0.1, n_takers_lambda + excitation)``.
        taker_size_alpha, taker_size_min, taker_size_max: Taker size is
            ``(pareto(taker_size_alpha) + 1) * taker_size_min``, rounded to
            one decimal and capped at ``taker_size_max``.
        hawkes_alpha: Excitation added per fill executed in a tick.
        hawkes_beta: Excitation is multiplied by ``exp(-hawkes_beta)`` at the
            end of every tick.
        seed: Seeds the local random generator and is passed to
            ``generate_gbm_prices``.

    Returns:
        Dict with:
            - ``trade_prices``, ``trade_times``, ``trade_sizes``: one entry
              per fill (time is the tick index).
            - ``spreads``, ``midprices``: best-quote series per tick; when a
              side of the book is empty the spread is recorded as 0.0 and
              the midprice as the fundamental price.
            - ``n_trades_per_tick``: number of fills per tick.
            - ``fundamental_prices``: the series returned by
              ``generate_gbm_prices``, unchanged.
            - ``maker_pnls``: cash plus inventory marked at the last
              fundamental price, rounded to 2 decimals, one per maker.
            - ``total_trades``: total number of fills.
    """
    import numpy as np

    # Make the registry's function packages importable. Register the
    # directory only once: this function is called in tight Monte Carlo
    # loops and an unconditional insert grows sys.path on every call.
    functions_dir = os.path.join(
        os.environ.get('FN_REGISTRY_ROOT', os.path.expanduser('~/fn_registry')),
        'python',
        'functions',
    )
    if functions_dir not in sys.path:
        sys.path.insert(0, functions_dir)
    from finance.finance import generate_gbm_prices, avellaneda_stoikov_quotes

    rng = np.random.default_rng(seed)

    # Fundamental price path.
    fund_prices = generate_gbm_prices(initial_price, n_ticks, sigma, mu, jump_intensity, jump_size_std, seed)

    trade_prices, trade_times, trade_sizes = [], [], []
    spreads, midprices = [], []
    n_trades_per_tick = []
    maker_inventories = [0.0] * n_makers
    maker_pnls = [0.0] * n_makers
    hawkes_excitation = 0.0
    excitation_decay = np.exp(-hawkes_beta)  # loop-invariant per-tick decay

    for t in range(n_ticks):
        mid = fund_prices[t]

        # Makers place orders; book entries are (price, qty, maker_idx).
        all_bids = []
        all_asks = []
        for m in range(n_makers):
            noise = rng.uniform(-0.05, 0.05)
            quotes = avellaneda_stoikov_quotes(
                mid + noise, maker_inventories[m], gamma, sigma, maker_spread, maker_levels, maker_qty
            )
            for q in quotes:
                entry = (q['price'], q['qty'], m)
                if q['side'] == 'buy':
                    all_bids.append(entry)
                else:
                    all_asks.append(entry)

        # Stable sorts: entries at the same price keep insertion order (FIFO).
        all_bids.sort(key=lambda x: -x[0])  # best bid first
        all_asks.sort(key=lambda x: x[0])   # best ask first

        # Record the state of the book before takers arrive.
        if all_bids and all_asks:
            spreads.append(all_asks[0][0] - all_bids[0][0])
            midprices.append((all_bids[0][0] + all_asks[0][0]) / 2)
        else:
            spreads.append(0.0)
            midprices.append(mid)

        # Taker arrivals: Poisson rate raised by the current excitation.
        lam = max(0.1, n_takers_lambda + hawkes_excitation)
        n_takers = rng.poisson(lam)
        tick_trades = 0

        for _ in range(n_takers):
            side = 'buy' if rng.random() < 0.5 else 'sell'
            raw_size = (rng.pareto(taker_size_alpha) + 1) * taker_size_min
            qty_remaining = min(round(raw_size, 1), taker_size_max)

            # A buyer lifts asks, a seller hits bids.
            book = list(all_asks) if side == 'buy' else list(all_bids)

            for level, (price, available, maker_idx) in enumerate(book):
                if qty_remaining <= 0:
                    break
                fill = min(qty_remaining, available)
                trade_prices.append(price)
                trade_times.append(t)
                trade_sizes.append(fill)
                tick_trades += 1
                qty_remaining -= fill

                if side == 'buy':
                    # Maker sold: inventory down, cash up.
                    maker_inventories[maker_idx] -= fill
                    maker_pnls[maker_idx] += price * fill
                else:
                    # Maker bought: inventory up, cash down.
                    maker_inventories[maker_idx] += fill
                    maker_pnls[maker_idx] -= price * fill

                book[level] = (price, available - fill, maker_idx)

            # Drop exhausted entries so later takers see the remaining depth.
            remaining = [(p, q, m) for p, q, m in book if q > 0]
            if side == 'buy':
                all_asks = remaining
            else:
                all_bids = remaining

        hawkes_excitation *= excitation_decay
        hawkes_excitation += hawkes_alpha * tick_trades
        n_trades_per_tick.append(tick_trades)

    # Mark remaining inventory to market at the last fundamental price.
    final_price = fund_prices[-1]
    for m in range(n_makers):
        maker_pnls[m] += maker_inventories[m] * final_price

    return {
        'trade_prices': trade_prices,
        'trade_times': trade_times,
        'trade_sizes': trade_sizes,
        'spreads': spreads,
        'midprices': midprices,
        'n_trades_per_tick': n_trades_per_tick,
        'fundamental_prices': fund_prices,
        'maker_pnls': [round(p, 2) for p in maker_pnls],
        'total_trades': len(trade_prices),
    }


if __name__ == '__main__':
    result = run_market_sim()
    print(json.dumps({
        'total_trades': result['total_trades'],
        'mean_spread': round(sum(result['spreads']) / len(result['spreads']), 4),
        'maker_pnls': result['maker_pnls'],
    }, indent=2))