chore: initial sync

This commit is contained in:
fn-registry agent
2026-04-28 22:13:08 +02:00
commit 40bea81603
30 changed files with 6675 additions and 0 deletions
+40
View File
@@ -0,0 +1,40 @@
# JUPYTER HABILITADO EN ESTE ANALISIS
## Reglas OBLIGATORIAS para Claude
### 1. CODIGO INMUTABLE — NUNCA MODIFICAR CELDAS EXISTENTES
- **PROHIBIDO** usar NotebookEdit para reemplazar celdas existentes
- **SIEMPRE** anadir celdas NUEVAS al final del notebook
- Si hay un error en una celda, crear celda nueva con la correccion
- El historial de trabajo debe quedar intacto para trazabilidad
### 2. PROGRAMACION FUNCIONAL OBLIGATORIA
- **Funciones puras**: sin efectos secundarios, mismo input -> mismo output
- **Inmutabilidad**: nunca mutar datos, crear copias transformadas
- **Composicion**: funciones pequenas que se combinan
- Preferir: `map`, `filter`, `reduce`, list comprehensions
- Evitar: loops con mutacion, `global`, modificar argumentos in-place
### 3. SIEMPRE usar MCP jupyter para ejecutar codigo Python
- Las ejecuciones se ven en tiempo real en Jupyter Lab del usuario
- Compartimos variables y estado del kernel
- **NUNCA usar bash para ejecutar Python en este analisis**
### 4. Verificar Jupyter activo ANTES de ejecutar
- Si no esta activo: pedir al usuario que ejecute `./run-jupyter-lab.sh`
### 5. Gestion de notebooks
- Notebooks en la carpeta `notebooks/` o subcarpetas
- Si un notebook tiene >50 celdas, crear uno nuevo
- Nombrar descriptivamente: `01_exploracion.ipynb`, `02_limpieza.ipynb`
### 6. Gestion de Python
- **SIEMPRE usar `uv`** para gestionar dependencias
- Anadir paquetes con `uv add nombre_paquete`
### 7. Acceso al fn_registry
- `FN_REGISTRY_ROOT` apunta a la raiz del registry
- Para importar funciones Python: `sys.path.insert(0, os.path.join(os.environ["FN_REGISTRY_ROOT"], "python", "functions"))`
- Para consultar registry.db: usar la CLI `sqlite3` o el modulo de Python (`import sqlite3`), siempre con la ruta `$FN_REGISTRY_ROOT/registry.db`
+12
View File
@@ -0,0 +1,12 @@
.venv/
.mcp.json
.jupyter-port
.jupyter/
.jupyter_ystore.db
.ipython/
__pycache__/
*.pyc
.ipynb_checkpoints/
bin/
data/
.DS_Store
+1
View File
@@ -0,0 +1 @@
3.13
View File
+540
View File
@@ -0,0 +1,540 @@
"""Extracción de grafo ontológico desde un documento.
Uso: python extract.py <archivo> [workers]   (workers: numero de hilos en paralelo, por defecto 4)
python extract.py data/condiciones-generales-bizum.pdf
Optimizaciones vs extraction_pipeline:
- 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos)
- Chunks de 2000 chars
- Paralelizado con ThreadPoolExecutor
"""
import sys
import os
import json
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib"))
from extract_text_from_file import extract_text_from_file
from core_functions import preprocess_text, extract_json_from_llm
from split_text_into_chunks import split_text_into_chunks
from deduplicate_entities import deduplicate_entities
from deduplicate_relations import deduplicate_relations
from entity_candidate import EntityCandidate
from relation_candidate import RelationCandidate
from render_sigma_html import render_sigma_html
# ── Presets ────────────────────────────────────────────────────────────────────
# Built-in entity types for OSINT-style documents. Each preset is a dict with:
#   type_ref        - identifier the LLM must echo back for an entity
#   label           - human-readable name shown in the prompt
#   metadata_fields - attribute names listed in the prompt for that type
OSINT_PRESETS = [
    {"type_ref": "person", "label": "Person",
     "metadata_fields": ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]},
    {"type_ref": "organization", "label": "Organization",
     "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"]},
    {"type_ref": "location", "label": "Location",
     "metadata_fields": ["lat", "lon", "address", "country", "city"]},
    {"type_ref": "event", "label": "Event",
     "metadata_fields": ["event_type", "date", "location", "description", "severity"]},
    {"type_ref": "email", "label": "Email",
     "metadata_fields": ["address", "provider", "verified", "breached"]},
    {"type_ref": "domain", "label": "Domain",
     "metadata_fields": ["fqdn", "registrar", "created_date", "expires_date"]},
    {"type_ref": "ip_address", "label": "IP Address",
     "metadata_fields": ["ip", "asn", "country", "isp", "geolocation"]},
    {"type_ref": "phone", "label": "Phone",
     "metadata_fields": ["number", "country_code", "carrier", "phone_type"]},
    {"type_ref": "social_media", "label": "Social Media Account",
     "metadata_fields": ["platform", "username", "url", "followers", "verified"]},
    {"type_ref": "document", "label": "Document",
     "metadata_fields": ["title", "format", "classification", "source"]},
    {"type_ref": "crypto_wallet", "label": "Crypto Wallet",
     "metadata_fields": ["address", "blockchain", "balance"]},
    {"type_ref": "malware", "label": "Malware",
     "metadata_fields": ["family", "hash_sha256", "threat_level"]},
    {"type_ref": "vulnerability", "label": "Vulnerability",
     "metadata_fields": ["cve_id", "cvss", "affected_product", "exploited"]},
]
# Domain-agnostic fallback types, same dict shape as OSINT_PRESETS. Their
# type_refs are the ones listed in GENERIC_TYPE_REFS further down the file.
GENERIC_PRESETS = [
    {"type_ref": "concept", "label": "Concept",
     "metadata_fields": ["name", "category", "definition"]},
    {"type_ref": "url", "label": "URL/Link",
     "metadata_fields": ["url", "domain", "context"]},
    {"type_ref": "date_reference", "label": "Date/Time",
     "metadata_fields": ["date", "precision", "context"]},
    {"type_ref": "quantity", "label": "Quantity/Amount",
     "metadata_fields": ["value", "unit", "context"]},
    {"type_ref": "coordinates", "label": "Coordinates",
     "metadata_fields": ["lat", "lon", "label"]},
    {"type_ref": "text_fragment", "label": "Key Text Fragment",
     "metadata_fields": ["text", "category", "relevance"]},
]
# ── Custom presets (acumulativo, pensado para promoción al registry) ───────────
# Location of the accumulated custom presets, next to this script under data/.
CUSTOM_PRESETS_PATH = os.path.join(os.path.dirname(__file__), "data", "custom_presets.json")


def load_custom_presets() -> list[dict]:
    """Load the custom presets stored in ``data/custom_presets.json``.

    Returns:
        The list stored under the top-level ``"presets"`` key, or an empty
        list when the file does not exist or has no such key.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(CUSTOM_PRESETS_PATH):
        return []
    # The file is written with ensure_ascii=False, so it may contain raw
    # non-ASCII characters; read it as UTF-8 instead of the locale default.
    with open(CUSTOM_PRESETS_PATH, encoding="utf-8") as f:
        data = json.load(f)
    return data.get("presets", [])
def save_custom_presets(presets: list[dict]) -> None:
    """Write the custom presets to ``data/custom_presets.json``.

    The parent directory is created if missing and the file is overwritten.
    The layout is meant to ease later promotion to the registry::

        {
          "presets": [
            {
              "type_ref": "snake_case_id",
              "label": "Human Label",
              "metadata_fields": ["field1", "field2"],
              "reason": "why this type exists",
              "source_doc": "document where it was first discovered",
              "promoted": false   (true once registered in the registry)
            }
          ]
        }

    Args:
        presets: Full list of preset dicts to persist.
    """
    os.makedirs(os.path.dirname(CUSTOM_PRESETS_PATH), exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be explicit; the locale default may be unable to encode them.
    with open(CUSTOM_PRESETS_PATH, "w", encoding="utf-8") as f:
        json.dump({"presets": presets}, f, ensure_ascii=False, indent=2)
def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]:
    """Append newly suggested types to the stored custom presets.

    Suggestions with an empty ``type_ref`` or one that is already stored (or
    already seen earlier in ``suggested``) are ignored. The presets file is
    rewritten only when at least one preset was added.

    Args:
        suggested: Suggested type dicts as returned by the LLM.
        source_doc: Name of the document recorded as the origin of new presets.

    Returns:
        The preset dicts that were actually added, in input order.
    """
    current = load_custom_presets()
    known_refs = {preset["type_ref"] for preset in current}
    newly_added = []
    for suggestion in suggested:
        type_ref = suggestion.get("type_ref", "")
        if type_ref and type_ref not in known_refs:
            known_refs.add(type_ref)
            entry = {
                "type_ref": type_ref,
                "label": suggestion.get("label", type_ref),
                "metadata_fields": suggestion.get("metadata_fields", []),
                "reason": suggestion.get("reason", ""),
                "source_doc": source_doc,
                "promoted": False,
            }
            current.append(entry)
            newly_added.append(entry)
    if newly_added:
        save_custom_presets(current)
    return newly_added
# Relation labels offered to the LLM in the extraction prompt. "related_to" is
# also the default process_chunk uses when a relation carries no type.
RELATION_TYPES = [
    "employs", "works_for", "founded", "owns", "controls",
    "member_of", "affiliated_with", "collaborates_with",
    "communicates_with", "sent_to", "received_from",
    "located_in", "headquartered_in", "traveled_to", "operates_in",
    "participated_in", "caused", "occurred_at", "occurred_on",
    "mentions", "references", "describes", "authored", "published",
    "funds", "transacted_with", "invested_in",
    "hosts", "resolves_to", "exploits", "targets",
    "related_to", "part_of", "instance_of", "has_attribute",
]
# ── LLM wrapper ───────────────────────────────────────────────────────────────
def claude_haiku_json(messages: list[dict]) -> dict:
    """Send chat-style messages to the ``claude`` CLI and parse its JSON answer.

    Only ``system`` and ``user`` messages are forwarded; each becomes a tagged
    section of a single prompt string passed as a command-line argument.

    Args:
        messages: Dicts with ``role`` and ``content`` keys.

    Returns:
        Whatever ``extract_json_from_llm`` parses out of the ``result`` field
        of the CLI's JSON envelope.

    Raises:
        RuntimeError: If the CLI exits with a non-zero status.
        subprocess.TimeoutExpired: If the CLI runs longer than 120 seconds.
    """
    role_tags = {"system": "[SYSTEM]", "user": "[USER]"}
    sections = [
        f"{role_tags[message['role']]}\n{message['content']}"
        for message in messages
        if message["role"] in role_tags
    ]
    prompt = "\n\n".join(sections)
    command = ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        raise RuntimeError(f"claude -p failed: {completed.stderr[:200]}")
    envelope = json.loads(completed.stdout)
    return extract_json_from_llm(envelope.get("result", ""))
# ── Unified prompt ─────────────────────────────────────────────────────────────
def build_unified_prompt(presets, rel_types):
    """Build the system prompt for single-pass entity + relation extraction.

    Args:
        presets: Entity type dicts with ``label``, ``type_ref`` and optionally
            ``metadata_fields``.
        rel_types: Relation type names to offer to the model.

    Returns:
        The prompt text: role statement, entity types, relation types, the
        expected JSON output format and the extraction rules.
    """
    entity_schema = "\n".join(
        f"- {preset['label']} (type_ref: {preset['type_ref']}): "
        f"[{', '.join(preset.get('metadata_fields', []))}]"
        for preset in presets
    )
    intro = (
        "You are an entity and relation extraction expert. "
        "Given text, extract ALL entities and relations in a single pass."
    )
    output_format = "\n".join([
        'OUTPUT FORMAT (strict JSON):',
        '{',
        ' "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],',
        ' "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],',
        ' "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]',
        '}',
    ])
    rules = "\n".join([
        "RULES:",
        "- Extract ALL entities explicitly mentioned",
        "- Use exact type_ref from schema. Unknown attributes = null",
        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied",
        "- Relations: from_name/to_name MUST match entity names exactly",
        "- suggested_types: for important entities that do NOT fit any type, suggest a new type. "
        "Use those suggested type_refs for those entities in the entities array.",
        '- If no new types needed: "suggested_types": []',
        "- Respond in the same language as the text for descriptions",
    ])
    relation_schema = ", ".join(rel_types)
    return (
        f"{intro}\n\n"
        f"ENTITY TYPES:\n{entity_schema}\n\n"
        f"RELATION TYPES: {relation_schema}\n\n"
        f"{output_format}\n\n"
        f"{rules}"
    )
# ── Process one chunk ──────────────────────────────────────────────────────────
def _clean_text(value) -> str:
    """Return ``value`` stripped when it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _coerce_confidence(value, default: float = 0.5) -> float:
    """Convert an LLM-provided confidence to float, falling back to ``default``."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str):
    """Extract entities, relations and suggested types from one text chunk.

    The LLM output is untrusted: anything that is not shaped as expected
    (non-dict payload, non-list sections, null names, non-numeric confidence)
    is skipped or defaulted instead of raising inside the worker thread.

    Args:
        chunk_idx: Index of the chunk, recorded on every candidate produced.
        chunk_text: Text of the chunk, sent as the user message.
        system_prompt: Extraction instructions, sent as the system message.

    Returns:
        A tuple ``(entities, relations, suggested)`` holding lists of
        ``EntityCandidate``, ``RelationCandidate`` and suggested-type dicts.
        All three are empty when the LLM call fails or returns a non-dict.
    """
    try:
        resp = claude_haiku_json([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk_text},
        ])
    except Exception as e:
        print(f" [WARN] chunk {chunk_idx}: {e}")
        return [], [], []
    if not isinstance(resp, dict):
        # The parser may hand back a list or scalar when the model ignores the schema.
        print(f" [WARN] chunk {chunk_idx}: unexpected payload type {type(resp).__name__}")
        return [], [], []

    def _section(key):
        # A missing or null section behaves like an empty one.
        value = resp.get(key)
        return value if isinstance(value, list) else []

    entities = []
    for ent in _section("entities"):
        if not isinstance(ent, dict):
            continue
        name = _clean_text(ent.get("name"))
        if not name:
            continue
        attributes = ent.get("attributes")
        entities.append(EntityCandidate(
            name=name,
            type_ref=ent.get("type_ref") or "concept",
            attributes=attributes if isinstance(attributes, dict) else {},
            confidence=_coerce_confidence(ent.get("confidence", 0.5)),
            source_chunk_indices=[chunk_idx],
        ))

    relations = []
    for rel in _section("relations"):
        if not isinstance(rel, dict):
            continue
        fn = _clean_text(rel.get("from_name"))
        tn = _clean_text(rel.get("to_name"))
        if not fn or not tn:
            continue
        relations.append(RelationCandidate(
            from_name=fn,
            to_name=tn,
            relation_type=rel.get("relation_type") or "related_to",
            confidence=_coerce_confidence(rel.get("confidence", 0.5)),
            description=rel.get("description") or "",
            source_chunk_index=chunk_idx,
        ))

    # Downstream code calls .get() on every suggestion, so keep dicts only.
    suggested = [s for s in _section("suggested_types") if isinstance(s, dict)]
    return entities, relations, suggested
# ── Sigma conversion ───────────────────────────────────────────────────────────
# Node colour per entity type_ref; unknown types fall back to grey in to_sigma.
TYPE_COLORS = {
    "person": "#e74c3c",
    "organization": "#3498db",
    "location": "#2ecc71",
    "event": "#f39c12",
    "email": "#9b59b6",
    "domain": "#1abc9c",
    "ip_address": "#e67e22",
    "phone": "#95a5a6",
    "social_media": "#e91e63",
    "document": "#607d8b",
    "crypto_wallet": "#ff9800",
    "malware": "#f44336",
    "vulnerability": "#ff5722",
    "concept": "#00bcd4",
    "url": "#8bc34a",
    "date_reference": "#cddc39",
    "quantity": "#ffc107",
    "coordinates": "#4caf50",
    "text_fragment": "#78909c",
}


def to_sigma(entities, relations, entity_id_map):
    """Convert entities and relations into a ``{"nodes", "edges"}`` graph dict.

    Args:
        entities: Objects exposing ``name``, ``type_ref`` and ``attributes``.
        relations: Objects exposing ``from_id``, ``to_id``, ``from_name``,
            ``to_name`` and ``relation_type``.
        entity_id_map: Mapping from entity name (or its lowercased, stripped
            form) to node id; entities not found are keyed by their own name.

    Returns:
        A dict with ``nodes`` (one per distinct node id, sized by degree and
        coloured by type) and ``edges`` (deduplicated by source, target and
        relation type; self-loops and edges to unknown nodes are dropped).
    """
    # Attribute names that must not be copied onto a node because sigma.js
    # gives them rendering meaning ('type' selects the node render program).
    reserved = {"type", "hidden", "x", "y"}

    name_to_uuid = {}
    for e in entities:
        name_to_uuid[e.name] = entity_id_map.get(
            e.name, entity_id_map.get(e.name.lower().strip(), e.name)
        )

    # Degree counts every relation endpoint, including ones later filtered out.
    degree = {}
    for r in relations:
        fid = r.from_id or r.from_name
        tid = r.to_id or r.to_name
        degree[fid] = degree.get(fid, 0) + 1
        degree[tid] = degree.get(tid, 0) + 1

    nodes = []
    seen_uuids = set()
    for e in entities:
        node_id = name_to_uuid.get(e.name, e.name)
        if node_id in seen_uuids:
            continue
        seen_uuids.add(node_id)
        attrs = {
            k: str(v)
            for k, v in (e.attributes or {}).items()
            if v is not None and k not in reserved
        }
        nodes.append({
            "key": node_id,
            "attributes": {
                # Entity attributes go first so the computed display fields
                # below always win over a same-named attribute such as a
                # stringified "size" or a "label" metadata field.
                **attrs,
                "label": e.name,
                "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"),
                "size": 4 + min(degree.get(node_id, 0) * 2, 20),
                "entity_type": e.type_ref,
            },
        })

    node_keys = {n["key"] for n in nodes}
    edges = []
    seen_edges = set()
    for i, r in enumerate(relations):
        fid = r.from_id or r.from_name
        tid = r.to_id or r.to_name
        if fid not in node_keys or tid not in node_keys or fid == tid:
            continue
        edge_key = (fid, tid, r.relation_type)
        if edge_key in seen_edges:
            continue
        seen_edges.add(edge_key)
        edges.append({
            "key": f"e{i}",
            "source": fid,
            "target": tid,
            "attributes": {"label": r.relation_type},
        })
    return {"nodes": nodes, "edges": edges}
# ── Reclasificación de entidades genéricas ─────────────────────────────────────
# type_refs considered too generic; entities carrying them are candidates
# for reclassification into newly discovered types.
GENERIC_TYPE_REFS = {"concept", "text_fragment", "url", "date_reference", "quantity", "coordinates"}


def reclassify_generic_entities(entities, new_presets, workers=4):
    """Reclassify generic entities in place using newly discovered types.

    Instead of re-processing chunks, the generic entities are sent to the LLM
    in batches of 30 together with the new presets. Proposals are treated as
    untrusted: an index is applied only if it belongs to the batch that was
    sent, so non-generic entities can never be rewritten by a bad answer.

    Args:
        entities: Objects with mutable ``type_ref`` and ``attributes``.
        new_presets: Preset dicts (``label``, ``type_ref``, optional
            ``metadata_fields``) describing the new types.
        workers: Maximum number of concurrent LLM calls.

    Returns:
        Number of proposals applied; 0 when there is nothing to reclassify.
    """
    generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS]
    if not generic or not new_presets:
        return 0

    type_lines = []
    for p in new_presets:
        fields = ", ".join(p.get("metadata_fields", []))
        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")
    system = (
        "You reclassify entities into more specific types. "
        "For each entity, decide if it fits one of the NEW types below better than its current generic type. "
        "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n"
        "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n"
        'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n'
        "Only include entities that should change. Omit those that should stay as-is."
    )

    # Batches of 30 entities keep each request within the model's context.
    batch_size = 30

    def _reclassify_batch(batch):
        items = [{"index": idx, "name": e.name, "current_type": e.type_ref,
                  "attributes": e.attributes} for idx, e in batch]
        try:
            resp = claude_haiku_json([
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(items, ensure_ascii=False)},
            ])
        except Exception as exc:
            # Best effort: a failed batch leaves its entities untouched, but
            # the failure is reported instead of silently dropped.
            print(f" [WARN] reclassification batch failed: {exc}")
            return []
        proposals = resp.get("reclassified") if isinstance(resp, dict) else None
        return proposals if isinstance(proposals, list) else []

    batches = [generic[i:i + batch_size] for i in range(0, len(generic), batch_size)]
    total_changed = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_reclassify_batch, b): b for b in batches}
        for future in as_completed(futures):
            sent_indices = {idx for idx, _ in futures[future]}
            for item in future.result():
                if not isinstance(item, dict):
                    continue
                idx = item.get("index")
                new_ref = item.get("type_ref", "")
                if isinstance(idx, bool) or not isinstance(idx, int) or idx not in sent_indices:
                    continue
                if not new_ref or not isinstance(new_ref, str):
                    continue
                entity = entities[idx]
                entity.type_ref = new_ref
                new_attrs = item.get("attributes")
                if isinstance(new_attrs, dict) and new_attrs:
                    if entity.attributes is None:
                        entity.attributes = {}
                    entity.attributes.update(new_attrs)
                total_changed += 1
    return total_changed
# ── Main ───────────────────────────────────────────────────────────────────────
def main():
    """Run the extraction pipeline for the file named on the command line.

    ``sys.argv[1]`` is the input file (relative paths are resolved against
    this script's directory) and the optional ``sys.argv[2]`` is the number
    of worker threads (default 4). The steps are: text extraction, chunking,
    parallel LLM extraction, deduplication, registration of suggested types
    with reclassification of generic entities, and finally writing
    ``data/ontology_graph.html`` and ``data/extraction_result.json``.
    Exits with status 1 when no file argument is given.
    """
    if len(sys.argv) < 2:
        print("Uso: python extract.py <archivo>")
        sys.exit(1)
    file_path = sys.argv[1]
    if not os.path.isabs(file_path):
        file_path = os.path.join(os.path.dirname(__file__), file_path)
    workers = int(sys.argv[2]) if len(sys.argv) > 2 else 4
    print(f"=== Ontology Graph Extraction ===")
    print(f"File: {file_path}")
    print(f"Workers: {workers}")
    start = time.monotonic()

    # 1. Extract and preprocess the text
    print("\n[1/5] Extracting text...")
    raw = extract_text_from_file(file_path)
    text = preprocess_text(raw)
    print(f" {len(text)} chars")

    # 2. Chunking
    print("[2/5] Chunking...")
    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
    print(f" {len(chunks)} chunks")

    # 3. Parallel extraction
    custom = load_custom_presets()
    # Only non-promoted custom presets are used (promoted ones are expected
    # to be provided by the registry).
    active_custom = [p for p in custom if not p.get("promoted", False)]
    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
    print(f" Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)
    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
    all_entities = []
    all_relations = []
    all_suggested = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(process_chunk, i, chunk, system_prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                ents, rels, sugg = future.result()
            except Exception as exc:
                # One malformed chunk must not discard the work of the others.
                print(f" [WARN] chunk {idx+1}/{len(chunks)} failed: {exc}")
                continue
            all_entities.extend(ents)
            all_relations.extend(rels)
            all_suggested.extend(sugg)
            print(f" chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations" +
                  (f", {len(sugg)} new types" if sugg else ""))

    # 4. Deduplication
    print(f"\n[4/5] Deduplicating...")
    print(f" Raw: {len(all_entities)} entities, {len(all_relations)} relations")
    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
    final_entities = dedup.entities
    entity_id_map = dedup.name_to_id
    final_relations = deduplicate_relations(all_relations, entity_id_map)
    print(f" Final: {len(final_entities)} entities, {len(final_relations)} relations")
    print(f" Merged: {dedup.total_before - dedup.total_after} entities, "
          f"{len(all_relations) - len(final_relations)} relations")

    # Register suggested types in custom_presets.json
    unique_suggested = []
    if all_suggested:
        seen = set()
        for s in all_suggested:
            if not isinstance(s, dict):
                continue
            key = s.get("type_ref", "")
            if key and key not in seen:
                seen.add(key)
                unique_suggested.append(s)
        source_doc = os.path.basename(file_path)
        added = merge_suggested_into_custom(unique_suggested, source_doc)
        total_custom = len(load_custom_presets())
        if added:
            print(f"\n New types registered ({len(added)}):")
            for p in added:
                print(f" + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
                print(f" Reason: {p['reason']}")
            print(f" Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")
            # Reclassify generic entities with the types just discovered
            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
            if n_generic > 0:
                print(f"\n Reclassifying {n_generic} generic entities with new types...")
                changed = reclassify_generic_entities(final_entities, added, workers=workers)
                print(f" Reclassified: {changed}/{n_generic}")
        else:
            print(f"\n {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")

    # Per-type statistics
    type_counts = {}
    for e in final_entities:
        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
    print(f"\n Entity types:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")
    rel_counts = {}
    for r in final_relations:
        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
    print(f" Relation types:")
    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    # 5. Visualisation
    print(f"\n[5/5] Generating graph...")
    graph = to_sigma(final_entities, final_relations, entity_id_map)
    out_dir = os.path.join(os.path.dirname(__file__), "data")
    # data/ may not exist on a fresh checkout, so create it before writing.
    os.makedirs(out_dir, exist_ok=True)
    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
    print(f" {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
    print(f" HTML: file://{html_path}")

    # Save the intermediate JSON. ensure_ascii=False writes raw non-ASCII
    # characters, so the file encoding is pinned to UTF-8.
    json_path = os.path.join(out_dir, "extraction_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "entities": [{"name": e.name, "type_ref": e.type_ref,
                          "confidence": e.confidence, "attributes": e.attributes}
                         for e in final_entities],
            "relations": [{"from": r.from_name, "to": r.to_name,
                           "type": r.relation_type, "confidence": r.confidence,
                           "description": r.description}
                          for r in final_relations],
            "suggested_types": [dict(s) for s in unique_suggested],
        }, f, ensure_ascii=False, indent=2)
    print(f" JSON: {json_path}")
    elapsed = time.monotonic() - start
    print(f"\nDone in {elapsed:.1f}s")


if __name__ == "__main__":
    main()
View File
+43
View File
@@ -0,0 +1,43 @@
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the available entity types as a system-prompt section.

    Each preset becomes a numbered entry with its label and type_ref,
    followed by an attributes line when it declares metadata fields.
    Entries are separated by a blank line.

    Args:
        entity_presets: Preset dicts with ``label``, ``type_ref`` and
            optionally ``metadata_fields``, e.g.
            ``[{"type_ref": "osint_person_go_cybersecurity",
            "label": "Person", "metadata_fields": ["full_name", "alias"]}]``.

    Returns:
        The formatted section, or an empty string when no presets are given.
    """
    if not entity_presets:
        return ""
    entries = []
    for number, preset in enumerate(entity_presets, start=1):
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        entry = f"{number}. {label} (type_ref: {type_ref})"
        fields = preset.get("metadata_fields", [])
        if fields:
            entry += "\n" + f" Attributes: {', '.join(fields)}"
        entries.append(entry)
    return "Entity types available for extraction:\n\n" + "\n\n".join(entries)
+22
View File
@@ -0,0 +1,22 @@
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the allowed relation types as a system-prompt section.

    Args:
        relation_types: Relation type names, e.g.
            ``["funds", "employs", "communicates_with"]``.

    Returns:
        A header line followed by the comma-separated types, or an empty
        string when the list is empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
+814
View File
@@ -0,0 +1,814 @@
"""Core functional programming utilities — pure functions for list/collection operations."""
import hashlib
import re
from functools import reduce as _reduce
from typing import Any, Callable, Dict, List, Optional, Tuple
def filter_list(xs: list, pred: Callable) -> list:
    """Return a new list with the elements of ``xs`` for which ``pred`` is truthy."""
    kept = []
    for item in xs:
        if pred(item):
            kept.append(item)
    return kept
def map_list(xs: list, fn: Callable) -> list:
    """Return a new list holding ``fn(x)`` for every ``x`` in ``xs``, in order."""
    mapped = []
    for item in xs:
        mapped.append(fn(item))
    return mapped
def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
    """Fold ``xs`` left-to-right into one value: ``acc = fn(acc, x)`` from ``initial``."""
    acc = initial
    for item in xs:
        acc = fn(acc, item)
    return acc
def flat_map(xs: list, fn: Callable) -> list:
    """Apply ``fn`` to each element and concatenate the resulting iterables."""
    return [out for item in xs for out in fn(item)]
def flatten(xss: list) -> list:
    """Concatenate the inner iterables of ``xss`` one level deep."""
    return [item for inner in xss for item in inner]
def chunk(xs: list, size: int) -> list:
    """Split ``xs`` into consecutive slices of ``size`` elements.

    The last slice may be shorter. A non-positive ``size`` yields ``[]``.
    """
    if size <= 0:
        return []
    pieces = []
    start = 0
    total = len(xs)
    while start < total:
        pieces.append(xs[start:start + size])
        start += size
    return pieces
def take(xs: list, n: int) -> list:
    """Return the slice of ``xs`` up to (not including) position ``n``."""
    head = xs[0:n]
    return head
def drop(xs: list, n: int) -> list:
    """Return the slice of ``xs`` starting at position ``n``."""
    tail = xs[n:len(xs)]
    return tail
def unique(xs: list) -> list:
    """Remove duplicates from ``xs`` while preserving first-seen order.

    Elements are compared by equality. Hashable elements are tracked in a
    set; unhashable ones (lists, dicts, ...) fall back to a linear scan
    instead of raising ``TypeError``.

    Args:
        xs: Input list; it is not modified.

    Returns:
        A new list containing the first occurrence of each distinct element.
    """
    seen_hashable = set()
    seen_unhashable = []
    result = []
    for x in xs:
        try:
            if x in seen_hashable:
                continue
            seen_hashable.add(x)
        except TypeError:
            # Unhashable element: deduplicate by equality against prior ones.
            if x in seen_unhashable:
                continue
            seen_unhashable.append(x)
        result.append(x)
    return result
def group_by(xs: list, key_fn: Callable) -> Dict:
    """Bucket elements by ``key_fn(x)``; returns a dict of key -> list of elements.

    Keys appear in first-seen order and each bucket keeps input order.
    """
    buckets: Dict = {}
    for item in xs:
        buckets.setdefault(key_fn(item), []).append(item)
    return buckets
def partition(xs: list, pred: Callable) -> Tuple[list, list]:
    """Split ``xs`` into ``(matches, non_matches)`` according to ``pred``."""
    accepted: list = []
    rejected: list = []
    for item in xs:
        target = accepted if pred(item) else rejected
        target.append(item)
    return accepted, rejected
def find(xs: list, pred: Callable) -> Any:
    """Return the first element satisfying ``pred``, or ``None`` if there is none."""
    return next((item for item in xs if pred(item)), None)
def find_index(xs: list, pred: Callable) -> int:
    """Return the index of the first element satisfying ``pred``, or -1 if none."""
    return next((pos for pos, item in enumerate(xs) if pred(item)), -1)
def zip_with(xs: list, ys: list, fn: Callable) -> list:
    """Combine ``xs`` and ``ys`` pairwise with ``fn``; stops at the shorter list."""
    combined = []
    for left, right in zip(xs, ys):
        combined.append(fn(left, right))
    return combined
def all_of(xs: list, pred: Callable) -> bool:
    """Return True when every element satisfies ``pred`` (True for an empty list)."""
    for item in xs:
        if not pred(item):
            return False
    return True
def any_of(xs: list, pred: Callable) -> bool:
    """Return True when at least one element satisfies ``pred`` (False if empty)."""
    for item in xs:
        if pred(item):
            return True
    return False
def pipe(value: Any, *fns: Callable) -> Any:
    """Thread ``value`` through ``fns`` left-to-right and return the final result."""
    return _reduce(lambda acc, step: step(acc), fns, value)
def compose(*fns: Callable) -> Callable:
    """Compose functions right-to-left, so ``compose(f, g)(x) == f(g(x))``.

    With no functions the result is the identity function.
    """
    def composed(x: Any) -> Any:
        value = x
        # The right-most function is applied first.
        for step in fns[::-1]:
            value = step(value)
        return value
    return composed
# ── Tree manipulation ────────────────────────────────────────────────────────
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a pre-order list of child-less nodes.

    Each dict contributes a deep copy of itself without its ``'nodes'`` key,
    followed by the flattened content of every key whose name contains
    ``'nodes'``. Lists are flattened element by element. The input is never
    modified.

    Args:
        structure: A node dict, a list of nodes, or anything else.

    Returns:
        The flat list of node copies; ``[]`` for non-dict/non-list input.
    """
    import copy
    if isinstance(structure, dict):
        # Copy only the node's own fields; deep-copying the whole subtree just
        # to drop 'nodes' afterwards repeats the work at every level.
        own_fields = {k: v for k, v in structure.items() if k != 'nodes'}
        flat = [copy.deepcopy(own_fields)]
        for key, value in structure.items():
            # Non-string keys cannot name a children container.
            if isinstance(key, str) and 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        flat = []
        for item in structure:
            flat.extend(flatten_tree(item))
        return flat
    return []
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a tree in depth-first pre-order.

    Unlike a copying flatten, the returned entries are the original node
    dicts, still carrying their ``'nodes'`` children.
    """
    if isinstance(structure, list):
        return [node for item in structure for node in tree_to_flat_list(item)]
    if not isinstance(structure, dict):
        return []
    descendants = tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
    return [structure, *descendants]
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect copies of the leaf nodes of a tree.

    A dict is a leaf when its ``'nodes'`` entry is missing or empty; leaves
    are deep-copied and returned without the ``'nodes'`` key. For non-leaf
    dicts, every key whose name contains ``'nodes'`` is searched.
    """
    import copy
    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]
    if not isinstance(structure, dict):
        return []
    if structure.get('nodes'):
        collected = []
        for key in list(structure.keys()):
            if 'nodes' in key:
                collected.extend(get_leaf_nodes(structure[key]))
        return collected
    leaf = copy.deepcopy(structure)
    leaf.pop('nodes', None)
    return [leaf]
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Assign sequential zero-padded 4-digit IDs to every node, in place.

    Numbering starts at ``node_id`` (so ``"0000"`` by default) and proceeds in
    depth-first pre-order through every key whose name contains ``'nodes'``.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for element in data:
            node_id = write_node_ids(element, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id
    data['node_id'] = str(node_id).zfill(4)
    next_id = node_id + 1
    child_keys = [key for key in data if 'nodes' in key]
    for key in child_keys:
        next_id = write_node_ids(data[key], next_id)
    return next_id
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items by their dotted ``structure`` code.

    An item with code ``'1.2.3'`` becomes a child of the item with code
    ``'1.2'`` when that parent appeared earlier; otherwise it is a root.
    Output nodes carry ``title``, ``start_index``, ``end_index`` and, only
    when non-empty, ``nodes``.
    """
    def _parent_code(code):
        if not code:
            return None
        head, separator, _last = str(code).rpartition('.')
        return head if separator else None

    def _prune(node):
        # Drop empty child lists so leaves have no 'nodes' key.
        if node['nodes']:
            for child in node['nodes']:
                _prune(child)
        else:
            del node['nodes']
        return node

    by_code = {}
    roots = []
    for item in data:
        code = item.get('structure')
        entry = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        by_code[code] = entry
        parent = _parent_code(code)
        if parent and parent in by_code:
            by_code[parent]['nodes'].append(entry)
        else:
            roots.append(entry)
    return [_prune(root) for root in roots]
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree without the given keys at any depth.

    Args:
        data: Nested structure of dicts and lists; other values pass through.
        fields: Keys to drop; defaults to ``['text']``.
    """
    excluded = ['text'] if fields is None else fields
    if isinstance(data, list):
        return [remove_tree_fields(element, excluded) for element in data]
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key in excluded:
                continue
            cleaned[key] = remove_tree_fields(value, excluded)
        return cleaned
    return data
def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Return a tree whose node dicts keep only the keys in ``order``, in that order.

    Children under ``'nodes'`` are formatted recursively and an empty
    ``'nodes'`` list is dropped. The input tree is left untouched: each node
    is shallow-copied before its children are replaced, in keeping with this
    module's pure-function contract.

    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Keys to keep, in output order. When empty or ``None`` the
            structure is returned unchanged.

    Returns:
        The reformatted structure (new dicts and lists).
    """
    if not order:
        return structure
    if isinstance(structure, dict):
        # Work on a shallow copy so the caller's node is not rewritten.
        node = dict(structure)
        if 'nodes' in node:
            node['nodes'] = format_tree_structure(node['nodes'], order)
            if not node.get('nodes'):
                node.pop('nodes', None)
        return {key: node[key] for key in order if key in node}
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    return structure
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index every node of a tree by its ``node_id``.

    Nodes are visited depth-first; nodes without a truthy ``node_id`` are
    skipped (their children are still visited). When an id repeats, the node
    visited last wins. Values are the original node dicts.
    """
    def _walk(nodes):
        for node in nodes:
            yield node
            children = node.get('nodes')
            if children:
                yield from _walk(children)

    return {node['node_id']: node for node in _walk(tree) if node.get('node_id')}
# ── Text / JSON extraction ───────────────────────────────────────────────────
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response.

    The payload is taken from a ```json fenced block when one is present
    (an unclosed fence runs to the end of the text), otherwise from the whole
    response. Parsing is attempted in increasingly lenient steps so that
    valid JSON is never altered by the repairs:

    1. the payload exactly as written;
    2. with bare ``None`` tokens replaced by ``null`` and whitespace
       (including newlines) collapsed to single spaces;
    3. additionally with trailing commas before ``]`` or ``}`` removed.

    Args:
        content: Raw LLM response text.

    Returns:
        The parsed JSON value, or ``{}`` when ``content`` is not a string or
        no attempt parses.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    fence_start = content.find(fence)
    if fence_start != -1:
        body_start = fence_start + len(fence)
        body_end = content.rfind("```")
        if body_end < body_start:
            # No closing fence: rfind hit the opening one, so take the rest.
            body_end = len(content)
        payload = content[body_start:body_end].strip()
    else:
        payload = content.strip()

    # Repairs rewrite text blindly, so they run only after a strict parse has
    # failed; otherwise a value like "None of the above" would be corrupted.
    normalized = ' '.join(re.sub(r'\bNone\b', 'null', payload).split())
    without_trailing_commas = re.sub(r',\s*([\]}])', r'\1', normalized)

    for candidate in (payload, normalized, without_trailing_commas):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            continue
    return {}
def parse_page_range(pages: str) -> List[int]:
    """Parse a page selection such as ``'5-7'``, ``'3,8'`` or ``'12'``.

    Comma-separated parts are either single page numbers or inclusive
    ``start-end`` ranges. Surrounding whitespace and empty parts (for example
    from a trailing comma) are ignored.

    Args:
        pages: The selection string.

    Returns:
        Sorted list of the distinct page numbers.

    Raises:
        ValueError: If a part is not an integer or range, or if a range has
            ``start > end``.
    """
    selected = set()
    for part in pages.split(','):
        part = part.strip()
        if not part:
            # Tolerate "1,2," and ", 3" instead of failing on int('').
            continue
        if '-' in part:
            first, last = part.split('-', 1)
            start, end = int(first.strip()), int(last.strip())
            if start > end:
                raise ValueError(f"Invalid range '{part}': start must be <= end")
            selected.update(range(start, end + 1))
        else:
            selected.add(int(part))
    return sorted(selected)
# ── Markdown parsing ─────────────────────────────────────────────────────────
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Find all ATX headers (h1-h6) in a markdown document.

    Lines inside ``` fenced code blocks are ignored. Matching is done on the
    whitespace-stripped line, so indented headers are recognised too.

    Returns:
        ``(headers, lines)``. ``headers`` holds one dict per header with
        ``title``, ``level`` (1-6) and 1-based ``line_num``; ``lines`` is the
        document split on newlines.
    """
    import re
    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False
    for line_num, raw_line in enumerate(lines, 1):
        text = raw_line.strip()
        if text.startswith('```'):
            # A fence line toggles code-block state and is never a header.
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = header_re.match(text)
        if found is None:
            continue
        hashes, title = found.groups()
        headers.append({'title': title.strip(), 'level': len(hashes), 'line_num': line_num})
    return headers, lines
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, document-ordered list of headers into a tree.

    A header becomes a child of the closest preceding header with a strictly
    smaller level. Nodes are numbered in input order with zero-padded four
    digit ids (``'0001'``, ``'0002'``, ...).

    Args:
        node_list: Dicts with keys ``title``, ``level`` and ``line_num``.

    Returns:
        List of root nodes. Every node has ``title``, ``node_id`` and
        ``line_num``; the ``nodes`` key is present only on nodes with children.
    """
    if not node_list:
        return []

    def prune_empty_children(siblings):
        # Leaves lose their empty 'nodes' list; the walk recurses elsewhere.
        for entry in siblings:
            children = entry['nodes']
            if children:
                prune_empty_children(children)
            else:
                del entry['nodes']
        return siblings

    roots = []
    ancestors = []  # (node, level) pairs along the currently open branch
    for position, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(position).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }
        # Close every open header that is not shallower than this one.
        while ancestors and ancestors[-1][1] >= level:
            ancestors.pop()
        target = ancestors[-1][0]['nodes'] if ancestors else roots
        target.append(entry)
        ancestors.append((entry, level))

    return prune_empty_children(roots)
# ── Pagination / chunking ────────────────────────────────────────────────────
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group consecutive pages into text chunks of bounded token size.

    When the whole document fits in ``max_tokens`` a single chunk is returned.
    Otherwise pages are accumulated until adding the next page would exceed a
    target size (the mean of ``max_tokens`` and the even share per expected
    chunk); the next chunk then restarts with the last ``overlap_pages`` pages
    of the previous one. A chunk may still exceed ``max_tokens`` when a single
    page, or the overlap plus one page, is larger than the limit.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to ``page_contents``.
        max_tokens: Token budget per chunk.
        overlap_pages: Number of trailing pages repeated at the start of the
            following chunk.

    Returns:
        List of chunk strings, each the concatenation of its pages. No empty
        chunk is emitted for an oversized first page.
    """
    import math

    total_tokens = sum(token_lengths)
    if total_tokens <= max_tokens:
        return ["".join(page_contents)]

    expected_parts = math.ceil(total_tokens / max_tokens)
    target_tokens = math.ceil(((total_tokens / expected_parts) + max_tokens) / 2)

    groups = []
    current_pages = []
    current_tokens = 0
    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        if current_tokens + page_tokens > target_tokens:
            # Nothing has been accumulated yet when the very first page is
            # already over target; flushing then would emit an empty chunk.
            if current_pages:
                groups.append(''.join(current_pages))
            overlap_start = max(i - overlap_pages, 0)
            current_pages = list(page_contents[overlap_start:i])
            current_tokens = sum(token_lengths[overlap_start:i])
        current_pages.append(page_content)
        current_tokens += page_tokens

    if current_pages:
        groups.append(''.join(current_pages))
    return groups
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Return the most frequent ``physical_index - page`` difference.

    Pairs that lack either key, or whose values cannot be subtracted, are
    ignored. When several differences are equally frequent, the one seen first
    wins.

    Args:
        pairs: Dicts carrying ``physical_index`` and ``page``.

    Returns:
        The modal offset, or 0 when no pair yields a difference.
    """
    unusable = object()

    def offset_of(pair):
        try:
            return pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            return unusable

    tally: Dict[int, int] = {}
    for offset in map(offset_of, pairs):
        if offset is unusable:
            continue
        tally[offset] = tally.get(offset, 0) + 1

    if not tally:
        return 0
    # max() keeps the first maximal key, i.e. the earliest-seen offset on ties.
    return max(tally, key=tally.get)
# ── Text preprocessing ───────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    """Normalize line endings, per-line whitespace and blank-line runs.

    Args:
        text: Raw text to normalize.

    Returns:
        Text with ``\\n`` line endings, every line stripped, at most one blank
        line between paragraphs, and no leading or trailing whitespace.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first, so whitespace-only lines become truly empty and
    # are seen by the blank-line collapse below.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to exactly 2 (one blank line)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def get_text_stats(text: str) -> dict:
    """Count the characters, lines and words of a text.

    Lines are delimited by ``'\\n'``, so empty text counts as one line and a
    trailing newline adds one. Words are whitespace-separated runs.

    Args:
        text: Text to measure.

    Returns:
        Dict with integer values under ``total_chars``, ``total_lines`` and
        ``total_words``.
    """
    line_total = len(text.split('\n'))
    word_total = len(text.split())
    char_total = len(text)
    return {
        'total_chars': char_total,
        'total_lines': line_total,
        'total_words': word_total,
    }
# ── Git URL parsing ──────────────────────────────────────────────────────────
# Hosts accepted by the git URL helpers when the caller supplies none.
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]


def _sanitize_git_segment(segment: str) -> str:
    """Drop one trailing ``.git`` and every character outside ``[A-Za-z0-9_-]``."""
    trimmed = segment[:-4] if segment.endswith(".git") else segment
    # With re.ASCII, \w is exactly [a-zA-Z0-9_].
    return re.sub(r"[^\w\-]", "", trimmed, flags=re.ASCII)
def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return its ``'org/repo'`` path component.

    Supports ``http://``, ``https://``, ``git://``, ``ssh://`` and the SSH
    shorthand ``git@host:path``. Host names are compared case-insensitively.
    Only the first two path segments are used; each is passed through
    ``_sanitize_git_segment``.

    Args:
        url: Repository URL in any supported format.
        known_hosts: Accepted host names. Defaults to ``_DEFAULT_GIT_HOSTS``.

    Returns:
        ``'org/repo'``, or ``None`` when the scheme is unsupported, the host is
        not accepted, the URL cannot be parsed, or fewer than two usable path
        segments remain.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    accepted = {h.lower() for h in hosts}
    url = url.strip()

    if url.startswith("git@"):
        # git@github.com:org/repo.git
        host, separator, path = url[len("git@"):].partition(":")
        if not separator or host.lower() not in accepted:
            return None
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
            host = parsed.hostname or ""
        except ValueError:
            # urlparse rejects e.g. unbalanced IPv6 brackets; treat as malformed.
            return None
        if host not in accepted:
            return None
        path = parsed.path
    else:
        return None

    segments = [s for s in path.split("/") if s]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    if not org or not repo:
        return None
    return f"{org}/{repo}"
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if ``url`` points at a clonable git repository.

    SSH shorthand (``git@host:path``), ``ssh://`` and ``git://`` URLs are
    accepted whenever the host is accepted. ``http(s)://`` URLs must have a
    path of exactly ``org/repo`` or ``org/repo/tree/<ref>``; anything deeper
    (issues, blobs, pull requests, ...) is rejected. Host names are compared
    case-insensitively, and URLs that ``urlparse`` cannot parse yield False.

    Args:
        url: URL to check.
        known_hosts: Accepted host names. Defaults to ``_DEFAULT_GIT_HOSTS``.

    Returns:
        True if ``url`` is a clonable repository URL on an accepted host.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    accepted = {h.lower() for h in hosts}
    url = url.strip()

    # SSH shorthand: always repo-level if the host matches
    if url.startswith("git@"):
        host, separator, _ = url[len("git@"):].partition(":")
        return bool(separator) and host.lower() in accepted

    if not url.startswith(("ssh://", "git://", "http://", "https://")):
        return False

    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        return False
    if host not in accepted:
        return False

    # git:// and ssh://: always repo-level once the host matches
    if url.startswith(("ssh://", "git://")):
        return True

    # http:// and https://: exactly org/repo or org/repo/tree/<ref>
    segments = [s for s in parsed.path.split("/") if s]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
def validate_git_ssh_uri(url: str) -> None:
    """Validate a git SSH URI of the form ``git@host:path``.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: If the URI does not start with ``git@``, has no ``:``, or
            has an empty host or an empty path around the first ``:``.
    """
    if not url.startswith("git@"):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
    host, separator, path = url[len("git@"):].partition(":")
    if not separator:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not host:
        # "git@:org/repo" names no server and cannot be a usable remote.
        raise ValueError(f"git SSH URI must have a non-empty host before ':', got: {url!r}")
    if not path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
# ---------------------------------------------------------------------------
# Markdown parsing utilities
# ---------------------------------------------------------------------------
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Split YAML frontmatter delimited by ``---`` lines off a markdown string.

    The opening ``---`` must be the very first line. LF and CRLF line endings
    are both accepted, delimiter lines may carry trailing spaces or tabs, and
    the closing ``---`` may be the last line of the input without a trailing
    newline.

    Args:
        content: Raw markdown, optionally starting with YAML frontmatter.

    Returns:
        Tuple ``(content_without_frontmatter, frontmatter_dict)``.
        ``frontmatter_dict`` is None when no frontmatter block is found, or
        when PyYAML parses the block to something other than a mapping.
    """
    pattern = re.compile(
        r'\A---[ \t]*\r?\n(.*?)^---[ \t]*(?:\r?\n|\Z)',
        re.DOTALL | re.MULTILINE,
    )
    match = pattern.match(content)
    if not match:
        return content, None

    raw = match.group(1)
    remaining = content[match.end():]
    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberate best effort: PyYAML missing or the block is not valid
        # YAML. Fall back to a flat "key: value" reader with string values.
        data = {}
        for line in raw.splitlines():
            key, separator, value = line.partition(':')
            if separator:
                data[key.strip()] = value.strip()
    return remaining, data
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Locate markdown headings (``#`` to ``######``) by character offset.

    Headings whose first character falls inside a triple-backtick span, an
    HTML comment, or a line matched by the indented-line pattern are skipped,
    as is any match immediately preceded by a backslash.

    Args:
        content: Markdown text to search.

    Returns:
        List of ``(start_pos, end_pos, title, level)`` tuples in document
        order; the offsets delimit the matched heading line within ``content``.
    """
    skip_patterns = (
        (r'```.*?```', re.DOTALL),        # code blocks (triple backtick)
        (r'<!--.*?-->', re.DOTALL),       # HTML comments
        (r'^( |\t).+$', re.MULTILINE),    # indented lines
    )
    excluded: List[Tuple[int, int]] = [
        found.span()
        for pattern, flags in skip_patterns
        for found in re.finditer(pattern, content, flags)
    ]

    headings: List[Tuple[int, int, str, int]] = []
    for found in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
        begin, finish = found.span()
        # Escaped heading marker (\#)
        if begin > 0 and content[begin - 1] == '\\':
            continue
        if any(low <= begin < high for low, high in excluded):
            continue
        hashes, title = found.groups()
        headings.append((begin, finish, title.strip(), len(hashes)))
    return headings
def estimate_token_count(content: str) -> int:
    """Approximate the token count of a text without running a tokenizer.

    Characters in the CJK ranges of the pattern below weigh 0.7 tokens each;
    every other non-whitespace character weighs 0.3. The weighted sum is
    truncated to an integer.

    Args:
        content: Text to estimate.

    Returns:
        Estimated token count.
    """
    cjk_class = r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]'
    # subn() removes the CJK characters and reports how many there were.
    remainder, cjk_total = re.subn(cjk_class, '', content)
    other_total = len(re.findall(r'\S', remainder))
    return int(cjk_total * 0.7 + other_total * 0.3)
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split large content into parts that respect token and character limits.

    Content is split on blank lines (``'\\n\\n'``) and paragraphs are packed
    greedily into parts, re-joined with ``'\\n\\n'``. The joiner is counted
    against ``max_chars``. A paragraph that exceeds either limit on its own is
    emitted separately: it is cut into ``max_chars``-sized slices, and any
    slice still estimated above ``max_tokens`` is halved until it fits or is a
    single character.

    Token sizes come from ``estimate_token_count``. The per-paragraph estimates
    are summed, so the estimate of a packed part is approximate.

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part; must be positive.

    Returns:
        List of string parts, in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        # A zero step already failed inside range(); a negative one silently
        # dropped oversized paragraphs.
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    separator = '\n\n'
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0

    def force_cut(paragraph: str) -> List[str]:
        # Slice by characters first, then halve slices that are still too
        # token-heavy (e.g. dense CJK text well below max_chars).
        queue = [paragraph[i:i + max_chars] for i in range(0, len(paragraph), max_chars)]
        queue.reverse()
        pieces: List[str] = []
        while queue:
            piece = queue.pop()
            if len(piece) > 1 and estimate_token_count(piece) > max_tokens:
                middle = len(piece) // 2
                queue.append(piece[middle:])
                queue.append(piece[:middle])
            else:
                pieces.append(piece)
        return pieces

    for para in content.split(separator):
        para_tokens = estimate_token_count(para)
        para_chars = len(para)

        # Single paragraph exceeds the limits: flush what is pending, then cut it
        if para_tokens > max_tokens or para_chars > max_chars:
            if pending:
                parts.append(separator.join(pending))
                pending, pending_tokens, pending_chars = [], 0, 0
            parts.extend(force_cut(para))
            continue

        joiner_chars = len(separator) if pending else 0
        # Would exceed the limits if appended: start a new part first
        if pending and (pending_tokens + para_tokens > max_tokens or
                        pending_chars + joiner_chars + para_chars > max_chars):
            parts.append(separator.join(pending))
            pending, pending_tokens, pending_chars = [], 0, 0
            joiner_chars = 0

        pending.append(para)
        pending_tokens += para_tokens
        pending_chars += joiner_chars + para_chars

    if pending:
        parts.append(separator.join(pending))
    return parts if parts else [content]
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text into a string that is safe to use in a file path.

    Keeps word characters, CJK characters, spaces and hyphens; spaces become
    underscores and leading/trailing underscores are removed. A result longer
    than ``max_length`` is truncated and ends in ``'_'`` plus the first 8 hex
    digits of the SHA-256 of the original text. When ``max_length`` is too
    small to hold that suffix, only the leading ``max_length`` digest
    characters are returned.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of a truncated result.

    Returns:
        Path-friendly string, or ``'section'`` when nothing usable remains
        after cleaning.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')
    if not cleaned:
        return 'section'
    if len(cleaned) <= max_length:
        return cleaned

    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    suffix = '_' + digest
    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would keep almost the whole string and blow
        # straight past max_length.
        return digest[:max(max_length, 0)]
    return cleaned[:keep] + suffix
+283
View File
@@ -0,0 +1,283 @@
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
from __future__ import annotations
import sys
import os
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from entity_candidate import EntityCandidate
from deduplication_result import DeduplicationResult
from normalize_entity_name import normalize_entity_name
from merge_entity_attributes import merge_entity_attributes
# ── Similitud helpers ──────────────────────────────────────────────────────────
def _levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between two strings."""
    if a == b:
        return 0
    if not a or not b:
        # One side is empty: the distance is the length of the other.
        return max(len(a), len(b))

    previous = list(range(len(b) + 1))
    for row, char_a in enumerate(a, start=1):
        current = [row]
        for col, char_b in enumerate(b, start=1):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (0 if char_a == char_b else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
    """Return the Jaccard similarity of two token lists, treated as sets.

    Two empty inputs count as identical (1.0).
    """
    left, right = set(tokens_a), set(tokens_b)
    if not (left or right):
        return 1.0
    combined = len(left | right)
    if not combined:
        return 0.0
    return len(left & right) / combined
def _name_similarity(a: str, b: str) -> float:
    """Score how similar two normalized names are, in the range 0.0 to 1.0.

    The base score is the larger of the length-normalized Levenshtein
    similarity and the Jaccard similarity over whitespace-separated tokens.
    A bonus of 0.3 is added when one name contains the other, and another 0.3
    when one name is the acronym of the other's tokens; the result is capped
    at 1.0 after each bonus.
    """
    if a == b:
        return 1.0

    words_a, words_b = a.split(), b.split()

    # Levenshtein similarity, normalized by the longer name
    longest = max(len(a), len(b))
    edit_similarity = (1.0 - (_levenshtein(a, b) / longest)) if longest else 1.0

    # Jaccard similarity over tokens
    token_similarity = _jaccard(words_a, words_b)

    result = max(edit_similarity, token_similarity)

    # Containment bonus: one name is a substring of the other
    if a in b or b in a:
        result = min(1.0, result + 0.3)

    # Acronym bonus: "FBI" ~ "Federal Bureau of Investigation"
    if _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a):
        result = min(1.0, result + 0.3)

    return result
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
    """Tell whether ``candidate`` equals the initials of ``tokens``.

    The comparison ignores case and skips empty tokens; an empty candidate or
    an empty token list never matches.
    """
    if not (candidate and tokens):
        return False
    first_letters = [word[0] for word in tokens if word]
    return "".join(first_letters).upper() == candidate.upper()
# Technical entity types whose names are compared for equality only.
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}


def _is_exact_type(entity_type: str) -> bool:
    """Return True when ``entity_type`` (case-insensitive) only allows exact matching."""
    lowered = entity_type.lower()
    return lowered in _EXACT_TYPES
# ── Union-Find ─────────────────────────────────────────────────────────────────
class _UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    ``find`` shortens paths by pointing each visited node at its grandparent;
    ``union`` attaches the lower-rank root under the higher-rank one.
    """

    def __init__(self, n: int) -> None:
        self._parent = list(range(n))
        self._rank = [0] * n

    def find(self, x: int) -> int:
        """Return the root of the set containing ``x``."""
        parent = self._parent
        while True:
            above = parent[x]
            if above == x:
                return x
            # Path halving: skip one level, then continue from the new parent.
            parent[x] = parent[above]
            x = parent[x]

    def union(self, x: int, y: int) -> None:
        """Merge the sets containing ``x`` and ``y``; no-op if already joined."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        rank = self._rank
        if rank[root_x] < rank[root_y]:
            root_x, root_y = root_y, root_x
        self._parent[root_y] = root_x
        if rank[root_x] == rank[root_y]:
            rank[root_x] += 1
# ── Implementacion principal ────────────────────────────────────────────────────
def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are normalized with ``normalize_entity_name`` and compared pairwise
    with ``_name_similarity``. Pairs scoring at least ``name_threshold`` are
    joined in a union-find structure, so clusters are transitive. Each cluster
    yields one canonical ``EntityCandidate`` that takes its name, type_ref and
    type_label from the highest-confidence member, with attributes merged
    through ``merge_entity_attributes``.

    A pair in which either side has a technical type (ip, email, domain,
    crypto_wallet, phone) is merged only when the normalized names are
    identical; ``name_threshold`` is ignored for such pairs.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: Minimum similarity (0-1) for two names to be
            considered the same entity.
        same_type_only: If True, only candidates with equal ``type_ref`` are
            compared.

    Returns:
        DeduplicationResult with the deduplicated entities, the resolution
        maps (``entity_id_map``: canonical normalized name -> id;
        ``name_to_id``: every original name plus the canonical normalized
        name -> id), the merge log and the before/after counts.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize every name once
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]

    # Step 2: union-find over all candidate indices
    uf = _UnionFind(n)

    # Step 3: pairwise comparison (restricted to equal types if same_type_only)
    merge_pairs: list[tuple[int, int, float]] = []
    for i in range(n):
        for j in range(i + 1, n):
            type_i = candidates[i].type_ref
            type_j = candidates[j].type_ref
            if same_type_only and type_i != type_j:
                continue

            ni, nj = normalized[i], normalized[j]

            # Technical identifiers must never fuzzy-match, whichever side of
            # the pair carries them (only relevant when types may differ).
            if _is_exact_type(type_i) or _is_exact_type(type_j):
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue

            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 4: group indices by union-find root
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Merged pairs per final root, used to build the merge log
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))

    # Step 5: one merged entity per cluster
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    for root, indices in clusters.items():
        cluster_candidates = [candidates[idx] for idx in indices]

        if len(cluster_candidates) == 1:
            representative = cluster_candidates[0]
            canonical_name = representative.name
            canonical_norm = normalized[indices[0]]
            merged_attrs = representative.attributes
            merged_confidence = representative.confidence
            merged_chunks = list(representative.source_chunk_indices)
            merged_from = (
                list(representative.merged_from)
                if representative.merged_from
                else [representative.name]
            )
        else:
            # The highest-confidence candidate is the canonical one
            representative = max(cluster_candidates, key=lambda c: c.confidence)
            canonical_name = representative.name
            canonical_norm = normalize_entity_name(
                representative.name, representative.type_ref
            )

            merged_attrs = merge_entity_attributes(
                [c.attributes for c in cluster_candidates]
            )
            merged_confidence = max(c.confidence for c in cluster_candidates)

            # Union of source chunks, first-seen order
            merged_chunks = []
            seen_chunks: set[int] = set()
            for c in cluster_candidates:
                for idx in c.source_chunk_indices:
                    if idx not in seen_chunks:
                        merged_chunks.append(idx)
                        seen_chunks.add(idx)

            # Union of original names, first-seen order
            merged_from = []
            seen_names: set[str] = set()
            for c in cluster_candidates:
                names_to_add = c.merged_from if c.merged_from else [c.name]
                for nm in names_to_add:
                    if nm not in seen_names:
                        merged_from.append(nm)
                        seen_names.add(nm)

            # Merge log entry
            other_names = [
                c.name for c in cluster_candidates if c is not representative
            ]
            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": other_names,
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        entity = EntityCandidate(
            name=canonical_name,
            name_normalized=canonical_norm,
            # Take the type from the same member that supplies the name, so
            # name, normalized name and type stay coherent in mixed clusters.
            type_ref=representative.type_ref,
            type_label=representative.type_label,
            attributes=merged_attrs,
            confidence=merged_confidence,
            source_chunk_indices=merged_chunks,
            merged_from=merged_from,
        )
        merged_entities.append(entity)

        # Populate the resolution maps
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
+189
View File
@@ -0,0 +1,189 @@
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
import logging
import os
import sys
logger = logging.getLogger(__name__)
# --- Importar levenshtein_distance desde cybersecurity ---
# Soporta dos contextos:
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
# 2. Ejecutado desde la raiz del registry (fn run)
def _levenshtein_distance(a: str, b: str) -> int:
    """Compute the Levenshtein edit distance between two strings."""
    # Work with the longer string on the outer loop, as the original did by
    # swapping its arguments.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    if len(shorter) == 0:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row, char_long in enumerate(longer):
        current = [row + 1]
        for col, char_short in enumerate(shorter):
            insertion = current[col] + 1
            deletion = previous[col + 1] + 1
            substitution = previous[col] + (0 if char_long == char_short else 1)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
# Prefer the levenshtein_distance implementation from the sibling
# "cybersecurity" directory; if that import fails, use the local
# _levenshtein_distance defined in this module.
try:
    _here = os.path.dirname(os.path.abspath(__file__))
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    # Make the sibling directory importable (module-level side effect on sys.path).
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    # NOTE(review): presumably resolves to a module or package named
    # "cybersecurity" reachable through sys.path; confirm the layout.
    from cybersecurity import levenshtein_distance as _lev
except ImportError:
    _lev = None  # type: ignore
# Public name used by the rest of this module, whichever implementation was found.
levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
    """Resolve a name against the map keys by closest edit distance.

    Every key of ``entity_id_map`` is compared with ``name`` using
    ``levenshtein_distance``. The id of the closest key is returned when its
    distance is at most ``threshold``; on equal distances the key that comes
    first in the map wins.

    Args:
        name: Name to resolve (callers pass it lowercased and stripped).
        entity_id_map: Mapping of normalized name -> entity id.
        threshold: Maximum edit distance accepted as a match (default 3).

    Returns:
        The entity id of the best match, or ``''`` when nothing is close enough.
    """
    scored = (
        (levenshtein_distance(name, key), entity_id)
        for key, entity_id in entity_id_map.items()
    )
    # min() keeps the first minimal element, matching a strict "<" scan.
    closest = min(scored, key=lambda pair: pair[0], default=None)
    if closest is None:
        return ""
    distance, entity_id = closest
    if distance < threshold + 1 and distance <= threshold:
        return entity_id
    return ""
def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Resolve relation endpoints to entity ids and collapse duplicates.

    Steps:
        1. For each RelationCandidate, look up ``from_name`` and ``to_name``
           (lowercased and stripped) in ``entity_id_map``; on a miss, try
           ``_fuzzy_resolve``. A relation with an unresolved endpoint is
           dropped with a warning.
        2. Self-loops (``from_id == to_id``) are dropped.
        3. Relations sharing ``(from_id, to_id, relation_type)`` are merged:
           unique non-empty descriptions are joined with ``'; '`` and the
           highest confidence is kept; names and source chunk index come from
           the first relation of the group.

    Args:
        relations: RelationCandidate objects carrying the original
            ``from_name`` / ``to_name``.
        entity_id_map: Mapping of normalized name -> entity id.

    Returns:
        Deduplicated list of RelationCandidate with ``from_id`` and ``to_id``
        filled in.
    """
    # Import the type; intended to work both from datascience/ and from the
    # registry root.
    try:
        _types_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..", "..", "..", "python", "types", "datascience",
        )
        if _types_path not in sys.path:
            sys.path.insert(0, _types_path)
        from relation_candidate import RelationCandidate
    except ImportError:
        from relation_candidate import RelationCandidate  # type: ignore

    def resolve(raw_name: str) -> str:
        # Exact lookup first, fuzzy match as the fallback.
        lookup_key = raw_name.lower().strip()
        return entity_id_map.get(lookup_key, "") or _fuzzy_resolve(lookup_key, entity_id_map)

    resolved: list = []
    for rel in relations:
        from_id = resolve(rel.from_name)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue

        to_id = resolve(rel.to_name)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue

        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue

        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )

    # Bucket by (from_id, to_id, relation_type), keeping first-seen order.
    groups: dict[tuple, list] = {}
    for rel in resolved:
        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)

    result: list = []
    for (from_id, to_id, rel_type), group in groups.items():
        first = group[0]
        if len(group) == 1:
            result.append(first)
            continue

        # dict.fromkeys de-duplicates while preserving first occurrence.
        descriptions = list(dict.fromkeys(r.description for r in group if r.description))
        result.append(
            RelationCandidate(
                from_name=first.from_name,
                to_name=first.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(descriptions),
                confidence=max(r.confidence for r in group),
                source_chunk_index=first.source_chunk_index,
            )
        )

    return result
+22
View File
@@ -0,0 +1,22 @@
"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
from dataclasses import dataclass, field
from entity_candidate import EntityCandidate
@dataclass
class DeduplicationResult:
    """Result of deduplicating entity candidates.

    ``name_to_id`` maps ALL original names (including the merged ones) to
    their final ID, so relations that use any variant of a name can be
    resolved.
    """

    # Entities that remain after deduplication.
    entities: list[EntityCandidate]
    # NOTE(review): presumably normalized canonical name -> entity ID; confirm
    # against the producer of this result.
    entity_id_map: dict[str, str]
    # Original (and merged) names -> final entity ID.
    name_to_id: dict[str, str]
    # One dict per merge performed; a fresh empty list by default.
    merge_log: list[dict] = field(default_factory=list)
    # Candidate count before and entity count after deduplication.
    total_before: int = 0
    total_after: int = 0
+34
View File
@@ -0,0 +1,34 @@
"""EntityCandidate — candidato de entidad extraido por el LLM."""
from dataclasses import dataclass, field
@dataclass
class EntityCandidate:
    """Entity candidate extracted by the LLM.

    It may come from a single chunk or be the result of merging several
    extractions. ``merged_from`` tracks the original names for debugging.
    """

    name: str
    name_normalized: str = ""
    type_ref: str = ""
    type_label: str = ""
    attributes: dict = field(default_factory=dict)
    confidence: float = 0.0
    source_chunk_indices: list[int] = field(default_factory=list)
    merged_from: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the candidate to a plain dict.

        Values are the attribute objects themselves, not copies.
        """
        exported = (
            "name",
            "name_normalized",
            "type_ref",
            "type_label",
            "attributes",
            "confidence",
            "source_chunk_indices",
            "merged_from",
        )
        return {key: getattr(self, key) for key in exported}
+145
View File
@@ -0,0 +1,145 @@
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
import sys
import os
import warnings
from typing import Callable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from entity_candidate import EntityCandidate
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
    """Assemble the system prompt used for entity extraction.

    The prompt consists of a fixed preamble, one bullet per schema entry
    (label and type_ref, plus a line listing its metadata fields when there
    are any), the expected JSON output shape, and the extraction rules ending
    with ``language_instruction``.

    Args:
        entity_schema: Dicts with optional keys ``label``, ``type_ref`` and
            ``metadata_fields``.
        language_instruction: Sentence appended as the last rule.

    Returns:
        The prompt as a single newline-joined string.
    """
    preamble = [
        "You are an entity extraction expert. Given text, extract all entities",
        "matching these types. For each entity, provide: name, type_ref,",
        "attributes (matching the metadata_fields for that type), and a",
        "confidence score (0.0-1.0).",
        "",
        "Entity types:",
    ]

    type_lines = []
    for entry in entity_schema:
        type_lines.append(
            f"- {entry.get('label', 'Unknown')} (type_ref: {entry.get('type_ref', '')})"
        )
        fields = entry.get("metadata_fields", [])
        if fields:
            type_lines.append(f" fields: {', '.join(fields)}")

    closing = [
        "",
        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
        "",
        "Rules:",
        "- Only extract entities explicitly mentioned in the text",
        "- Use the exact type_ref from the schema",
        "- Leave unknown attributes as null",
        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
        f"- {language_instruction}",
    ]
    return "\n".join(preamble + type_lines + closing)
def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity-type schema, calls the LLM and
    validates the response into a list of EntityCandidate. Items are skipped
    when they are not dicts, have a missing or non-string name, or carry a
    ``type_ref`` that is not in the schema (with a warning). Attributes whose
    value is null are dropped and confidence is clamped to [0.0, 1.0].

    Args:
        text: Text chunk to analyze.
        entity_schema: List of entity types. Each entry is a dict with the
            keys 'type_ref', 'label' and optionally 'metadata_fields', e.g.
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: Function that receives a list of OpenAI-style messages
            and returns a dict with the LLM's JSON response:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: Language instruction appended to the prompt.

    Returns:
        List of extracted EntityCandidate. Empty when the LLM call fails, the
        response is not a dict with an 'entities' list, or no valid entity is
        found.

    Raises:
        ValueError: If entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    valid_type_refs = {entry.get("type_ref", "") for entry in entity_schema}
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    system_prompt = _build_system_prompt(entity_schema, language_instruction)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        # Deliberate best effort: any LLM/transport failure yields no entities.
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # A JSON array, string or null is "valid JSON" too; only objects have .get.
    if not isinstance(response, dict):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no es un objeto JSON",
            stacklevel=2,
        )
        return []

    raw_entities = response.get("entities", [])
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        name = item.get("name", "")
        if not isinstance(name, str) or not name:
            continue

        type_ref = item.get("type_ref", "")
        try:
            known_type = type_ref in valid_type_refs
        except TypeError:
            # Unhashable value (e.g. a list) can never be a schema type_ref.
            known_type = False
        if not known_type:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the LLM left as null
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        if not isinstance(confidence, (int, float)):
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates
+141
View File
@@ -0,0 +1,141 @@
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
import logging
import sys
import os
from typing import Callable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
from entity_candidate import EntityCandidate
from relation_candidate import RelationCandidate
logger = logging.getLogger(__name__)


def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between already-extracted entities using an LLM.

    Given the original text chunk and the entities found in it, asks the LLM
    to identify relations between pairs of entities. Relations whose
    ``from_name`` or ``to_name`` do not match an existing entity name are
    discarded. Relation types outside ``relation_types`` are replaced by
    ``"related_to"``.

    Args:
        text: Text chunk (the same one used to extract the entities).
        entities: Entities already extracted from the chunk.
        relation_types: Allowed relation types, e.g. ["funds", "employs",
            "communicates_with", "owns", "related_to"].
        llm_chat_json: Injected function that receives a list of messages
            (dicts with "role" and "content") and returns a dict with the
            LLM's JSON response.
        language_instruction: Language instruction appended to the prompt.

    Returns:
        List of validated RelationCandidate objects. Empty when there are
        fewer than 2 entities, when the LLM call fails, when the response is
        malformed, or when the LLM finds no relations.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list shown to the model; falls back to type_ref, then "Entity".
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.
Entities found in this text:
{entity_lines}
Allowed relation types: {relation_types_str}
Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}
Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # The LLM callable is external: any failure degrades to "no relations".
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # A JSON response may legally be a list/string/None; only a dict is usable.
    if not isinstance(response, dict):
        logger.warning("extract_relations_llm: LLM response is not a JSON object")
        return []

    raw_relations = response.get("relations", [])
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both endpoints must be known entity names. The isinstance guard
        # keeps unhashable values (lists/dicts) from raising on the set lookup.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        # Accept only real numbers: bool is an int subclass and NaN would be
        # turned into 1.0 by the min/max clamp below, so both count as 0.0.
        confidence = item.get("confidence", 0.0)
        if (
            isinstance(confidence, bool)
            or not isinstance(confidence, (int, float))
            or confidence != confidence
        ):
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        # A null / non-string description is normalized to an empty string.
        description = item.get("description", "")
        if not isinstance(description, str):
            description = ""

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=confidence,
            )
        )

    return results
+92
View File
@@ -0,0 +1,92 @@
"""Extract plain text from PDF, Markdown, or TXT files."""
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}


def _detect_encoding(data: bytes) -> str:
    """Detect the encoding of raw bytes using several fallback strategies.

    Order: byte-order mark, strict UTF-8, charset_normalizer (if installed),
    chardet (if installed), and finally plain "utf-8" as a last resort (the
    caller is expected to decode with replacement if that fails).

    Args:
        data: Raw file contents.

    Returns:
        A codec name suitable for ``bytes.decode``.
    """
    import codecs

    # Strategy 0: an explicit BOM is authoritative. UTF-32 must be checked
    # before UTF-16 because the UTF-32-LE BOM starts with the UTF-16-LE BOM.
    # The "-sig"/"utf-16"/"utf-32" codecs consume the BOM while decoding.
    if data.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"
    if data.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        return "utf-32"
    if data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        return "utf-16"

    # Strategy 1: UTF-8
    try:
        data.decode("utf-8")
        return "utf-8"
    except UnicodeDecodeError:
        pass

    # Strategy 2: charset_normalizer (optional dependency)
    try:
        from charset_normalizer import from_bytes

        result = from_bytes(data).best()
        if result is not None and result.encoding:
            return result.encoding
    except ImportError:
        pass

    # Strategy 3: chardet (optional dependency)
    try:
        import chardet

        detected = chardet.detect(data)
        if detected and detected.get("encoding"):
            return detected["encoding"]
    except ImportError:
        pass

    # Last resort: UTF-8 (caller decodes with replacement)
    return "utf-8"


def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining them with double newlines; the document handle is always closed.
    For text-based files (.md, .markdown, .txt) reads the file with automatic
    encoding detection and falls back to UTF-8 with replacement characters.

    Args:
        file_path: Absolute or relative path to the file.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Extension matching is case-insensitive.
    _, ext = os.path.splitext(file_path.lower())
    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Unsupported file extension: '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e
        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the file handle even if a page fails to extract.
            doc.close()
        return "\n\n".join(pages)

    # Text-based formats: read bytes once, then decode with the detected codec.
    with open(file_path, "rb") as f:
        raw = f.read()
    encoding = _detect_encoding(raw)
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # Unknown codec name or a wrong guess: never fail, degrade gracefully.
        return raw.decode("utf-8", errors="replace")
+208
View File
@@ -0,0 +1,208 @@
"""Pipeline de extraccion de entidades y relaciones desde un documento."""
from __future__ import annotations
import sys
import os
import time
import warnings
from typing import Callable
# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
from extract_text_from_file import extract_text_from_file
from core_functions import preprocess_text
from split_text_into_chunks import split_text_into_chunks
from build_entity_schema_prompt import build_entity_schema_prompt
from build_relation_schema_prompt import build_relation_schema_prompt
from extract_entities_llm import extract_entities_llm
from extract_relations_llm import extract_relations_llm
from deduplicate_entities import deduplicate_entities
from deduplicate_relations import deduplicate_relations
from entity_candidate import EntityCandidate
from extraction_result import ExtractionResult
from extraction_stats import ExtractionStats
def extraction_pipeline(
    file_path: str,
    entity_presets: list[dict],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    confidence_threshold: float = 0.5,
    dedup_threshold: float = 0.85,
    on_progress: Callable[[str, float], None] | None = None,
) -> ExtractionResult:
    """Run the full entity/relation extraction pipeline on one document.

    Orchestrates extract_text_from_file -> preprocess_text ->
    split_text_into_chunks -> extract_entities_llm per chunk ->
    deduplicate_entities -> extract_relations_llm per chunk ->
    deduplicate_relations.

    Args:
        file_path: Path of the file to process (PDF, Markdown, TXT).
        entity_presets: List of dicts with type_ref, label and metadata_fields.
            Example: [{"type_ref": "osint_person_go_cybersecurity",
                       "label": "Person",
                       "metadata_fields": ["full_name", "nationality"]}]
        relation_types: Relation types allowed during extraction.
            Example: ["funds", "employs", "communicates_with", "owns"]
        llm_chat_json: Injected function that receives OpenAI-style messages
            and returns the already-parsed JSON response as a dict.
        chunk_size: Number of characters per chunk (default 500).
        chunk_overlap: Overlap between consecutive chunks (default 50).
        confidence_threshold: Minimum confidence for an entity candidate to
            be kept before deduplication (default 0.5).
        dedup_threshold: Minimum similarity score to merge entities
            (default 0.85).
        on_progress: Optional progress callback (message: str, pct: float in
            0-1). Entity extraction reports 0-0.4, entity deduplication 0.4,
            relation extraction 0.4-0.8, relation deduplication 0.8 and
            completion 1.0. Exceptions raised by the callback are ignored.

    Returns:
        ExtractionResult with deduplicated entities and relations plus the
        statistics of the run.

    Raises:
        FileNotFoundError: If file_path does not exist.
        ValueError: If entity_presets is empty.
    """
    if not entity_presets:
        raise ValueError("entity_presets no puede estar vacio")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")

    def _progress(msg: str, pct: float) -> None:
        # Progress reporting is best-effort: a faulty callback must never
        # abort the extraction itself.
        if on_progress is not None:
            try:
                on_progress(msg, pct)
            except Exception:
                pass

    start_time = time.monotonic()
    stats = ExtractionStats()

    # -- Step 1: extract text ------------------------------------------------
    _progress("Extracting text from file...", 0.0)
    try:
        raw_text = extract_text_from_file(file_path)
    except Exception as exc:
        # Degrade to an empty document instead of failing the whole run.
        warnings.warn(
            f"extraction_pipeline: error al extraer texto: {exc}", stacklevel=2
        )
        raw_text = ""

    # -- Step 2: preprocess --------------------------------------------------
    clean_text = preprocess_text(raw_text)
    stats.total_chars = len(clean_text)

    # -- Step 3: split into chunks -------------------------------------------
    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
    n = len(chunks)
    stats.total_chunks = n
    if n == 0:
        stats.processing_time_seconds = time.monotonic() - start_time
        # Report completion on the early exit too, so callers tracking
        # progress always observe the final 1.0.
        _progress("Done", 1.0)
        return ExtractionResult(entities=[], relations=[], stats=stats)

    # -- Step 4: extract entities per chunk ----------------------------------
    all_raw_entities: list[EntityCandidate] = []
    for i, chunk in enumerate(chunks):
        _progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
        try:
            candidates = extract_entities_llm(
                text=chunk,
                entity_schema=entity_presets,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}",
                stacklevel=2,
            )
            candidates = []
        for candidate in candidates:
            # Record the chunk this candidate came from.
            if i not in candidate.source_chunk_indices:
                candidate.source_chunk_indices.append(i)
            all_raw_entities.append(candidate)

    # -- Step 5: filter by confidence ----------------------------------------
    filtered_entities = [
        e for e in all_raw_entities if e.confidence >= confidence_threshold
    ]
    stats.raw_entities_count = len(filtered_entities)

    # Per-type counts of the entities that passed the filter.
    for ent in filtered_entities:
        stats.entity_types_found[ent.type_ref] = (
            stats.entity_types_found.get(ent.type_ref, 0) + 1
        )

    # -- Step 6: deduplicate entities ----------------------------------------
    _progress("Deduplicating entities...", 0.4)
    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
    stats.final_entities_count = dedup_result.total_after
    stats.entities_merged = dedup_result.total_before - dedup_result.total_after
    final_entities = dedup_result.entities
    entity_id_map = dedup_result.name_to_id  # original name -> entity id

    # -- Step 7: extract relations per chunk ---------------------------------
    all_raw_relations = []
    for i, chunk in enumerate(chunks):
        _progress(f"Extracting relations from chunk {i + 1}/{n}", 0.4 + (i / n) * 0.4)

        # Entities attributed to this chunk.
        chunk_entities = [
            e for e in final_entities if i in e.source_chunk_indices
        ]
        # If none is attributed to this specific chunk, fall back to all.
        if not chunk_entities:
            chunk_entities = final_entities
        # A relation needs at least two endpoints.
        if len(chunk_entities) < 2:
            continue

        try:
            chunk_relations = extract_relations_llm(
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}",
                stacklevel=2,
            )
            chunk_relations = []
        for rel in chunk_relations:
            rel.source_chunk_index = i
        all_raw_relations.extend(chunk_relations)

    stats.raw_relations_count = len(all_raw_relations)

    # Per-type counts of the raw relations.
    for rel in all_raw_relations:
        stats.relation_types_found[rel.relation_type] = (
            stats.relation_types_found.get(rel.relation_type, 0) + 1
        )

    # -- Step 8: deduplicate relations ---------------------------------------
    _progress("Deduplicating relations...", 0.8)
    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
    stats.final_relations_count = len(final_relations)
    stats.relations_merged = stats.raw_relations_count - len(final_relations)

    stats.processing_time_seconds = time.monotonic() - start_time
    _progress("Done", 1.0)

    return ExtractionResult(
        entities=final_entities,
        relations=final_relations,
        stats=stats,
    )
+20
View File
@@ -0,0 +1,20 @@
"""ExtractionResult — resultado final del pipeline de extraccion."""
from dataclasses import dataclass, field
from entity_candidate import EntityCandidate
from extraction_stats import ExtractionStats
from relation_candidate import RelationCandidate
@dataclass
class ExtractionResult:
    """Final result of the entity and relation extraction pipeline.

    Holds the deduplicated lists of entities and relations together with
    the statistics of the whole process.
    """

    # Deduplicated entities.
    entities: list[EntityCandidate]
    # Deduplicated relations.
    relations: list[RelationCandidate]
    # Run statistics; a fresh ExtractionStats is created when omitted.
    stats: ExtractionStats = field(default_factory=ExtractionStats)
+25
View File
@@ -0,0 +1,25 @@
"""ExtractionStats — estadisticas del proceso de extraccion."""
from dataclasses import dataclass, field
@dataclass
class ExtractionStats:
    """Statistics of the extraction process.

    Useful for reporting and debugging. Records counts before and after
    deduplication, processing time and the distribution of types found.
    """

    # Number of chunks the text was split into.
    total_chunks: int = 0
    # Length in characters of the preprocessed text.
    total_chars: int = 0
    # extraction_pipeline stores here the entity candidates that passed the
    # confidence filter, i.e. the count right before deduplication.
    raw_entities_count: int = 0
    # Entities remaining after deduplication.
    final_entities_count: int = 0
    # Difference between the entity counts before and after deduplication.
    entities_merged: int = 0
    # Relations collected from all chunks before deduplication.
    raw_relations_count: int = 0
    # Relations remaining after deduplication.
    final_relations_count: int = 0
    # extraction_pipeline computes this as raw_relations_count minus
    # final_relations_count.
    relations_merged: int = 0
    # NOTE(review): extraction_pipeline never writes this field; presumably
    # it is meant for relations dropped during deduplication - confirm.
    relations_discarded: int = 0
    # Count of entities per type_ref.
    entity_types_found: dict[str, int] = field(default_factory=dict)
    # Count of relations per relation_type.
    relation_types_found: dict[str, int] = field(default_factory=dict)
    # Elapsed time of the run in seconds.
    processing_time_seconds: float = 0.0
+78
View File
@@ -0,0 +1,78 @@
"""Combina atributos de multiples candidatos de la misma entidad."""
from __future__ import annotations
_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
_DATE_MIN_FIELDS = {"first_seen", "created_date"}
_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
_BOOL_FIELDS = {"verified", "exploited"}


def _longest_value(values: list):
    """Return the value whose string form is longest (first one wins ties)."""
    return max(values, key=lambda v: len(str(v)))


def _pick_extreme(values: list, chooser):
    """Apply min/max; fall back to the longest value if types are not comparable."""
    try:
        return chooser(values)
    except TypeError:
        return _longest_value(values)


def merge_entity_attributes(attr_list: list[dict]) -> dict:
    """Merge the attributes of several candidates of the same entity.

    For every field present in any candidate, collects all non-null values
    and resolves conflicts with a per-field heuristic:
    - Numeric (risk_score, balance, cvss): max
    - Min date (first_seen, created_date): min (oldest)
    - Max date (last_seen, expires_date): max (most recent)
    - Boolean (verified, exploited): logical OR
    - List (any value of type list): union without duplicates, in order of
      appearance; scalar values mixed with lists are added as single items
    - Anything else: the value with the longest string form, returned with
      its original type

    Keys appear in the result in order of first appearance, so the same
    input always produces the same output. The input dicts are not mutated.

    Args:
        attr_list: List of dicts with the attributes of each candidate.

    Returns:
        Dict with the merged attributes. A key whose values are all None maps
        to None.
    """
    if not attr_list:
        return {}

    # dict.fromkeys de-duplicates while keeping first-appearance order
    # (a set would make the output key order vary between runs).
    all_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)

    merged: dict = {}
    for key in all_keys:
        # Collect non-null values for this field.
        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]
        if not values:
            merged[key] = None
            continue

        # Single value or unanimous values: nothing to resolve.
        first = values[0]
        if all(v == first for v in values):
            merged[key] = first
            continue

        # Resolve the conflict according to the kind of field.
        if key in _NUMERIC_FIELDS or key in _DATE_MAX_FIELDS:
            merged[key] = _pick_extreme(values, max)
        elif key in _DATE_MIN_FIELDS:
            merged[key] = _pick_extreme(values, min)
        elif key in _BOOL_FIELDS:
            merged[key] = any(values)
        elif any(isinstance(v, list) for v in values):
            # Order-preserving union. A list (not a set) tracks seen items so
            # unhashable elements such as dicts are supported.
            union: list = []
            for value in values:
                for item in value if isinstance(value, list) else [value]:
                    if item not in union:
                        union.append(item)
            merged[key] = union
        else:
            merged[key] = _longest_value(values)

    return merged
+81
View File
@@ -0,0 +1,81 @@
"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
import re
_TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
)
_LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
)
_MULTI_SPACE = re.compile(r"\s+")


def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    Applies different rules depending on the entity type:
    - ip / email: lower case
    - domain: lower case, trailing dots and a leading "www." removed
    - crypto_wallet: only surrounding whitespace removed (case is kept)
    - phone: only digits and "+" are kept
    - person: leading title removed, "Last, First" reordered to "First Last",
      whitespace collapsed, title case
    - organization: one trailing legal suffix removed (plus the comma that
      preceded it), whitespace collapsed, title case; a name that consists
      only of a suffix word is kept instead of becoming empty
    - default: lower case, stripped, whitespace collapsed

    Args:
        name: Entity name to normalize.
        entity_type: Entity type (ip, email, domain, crypto_wallet, phone,
            person, organization). Empty means default.

    Returns:
        The normalized name.
    """
    name = name.strip()
    et = entity_type.lower().strip()

    if et in ("ip", "email"):
        return name.lower()

    if et == "domain":
        result = name.lower().rstrip(".")
        if result.startswith("www."):
            result = result[4:]
        return result

    if et == "crypto_wallet":
        # Wallet addresses can be case-sensitive: strip only.
        return name

    if et == "phone":
        # Keep only digits and the plus sign.
        return re.sub(r"[^\d+]", "", name)

    if et == "person":
        # Remove a leading title.
        result = _TITLES.sub("", name).strip()
        # Detect the "Last, First" format.
        if "," in result:
            last, first = (part.strip() for part in result.split(",", 1))
            # The title may follow the comma ("Smith, Dr. John"); after the
            # swap it is at the start, so strip it again.
            result = _TITLES.sub("", f"{first} {last}")
        result = _MULTI_SPACE.sub(" ", result).strip()
        return result.title()

    if et == "organization":
        # Drop the suffix and the comma that introduced it, so that
        # "Acme, Inc." and "Acme Inc." normalize to the same value.
        result = _LEGAL_SUFFIXES.sub("", name).strip().rstrip(",").rstrip()
        if not result:
            # The whole name was a suffix word (e.g. "Group"): keep it rather
            # than collapsing every such name to the empty string.
            result = name
        result = _MULTI_SPACE.sub(" ", result).strip()
        # Title case for consistency.
        return result.title()

    # Default: lower, strip, collapse whitespace.
    return _MULTI_SPACE.sub(" ", name.lower()).strip()
+35
View File
@@ -0,0 +1,35 @@
"""RelationCandidate — candidato de relacion extraido por el LLM."""
from dataclasses import dataclass
@dataclass
class RelationCandidate:
    """A relation between two entities, as proposed by the LLM.

    `from_name` and `to_name` hold the raw names taken from the text.
    `from_id` and `to_id` start empty and are filled in during the
    deduplication phase, once the names are resolved against the final
    EntityCandidate objects.
    """

    from_name: str
    to_name: str
    from_id: str = ""
    to_id: str = ""
    relation_type: str = ""
    description: str = ""
    confidence: float = 0.0
    source_chunk_index: int = -1

    def to_dict(self) -> dict:
        """Return the candidate's fields as a plain dictionary."""
        exported = (
            "from_name",
            "to_name",
            "from_id",
            "to_id",
            "relation_type",
            "description",
            "confidence",
            "source_chunk_index",
        )
        return {attr: getattr(self, attr) for attr in exported}
+234
View File
@@ -0,0 +1,234 @@
"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
import json
import os
# Standalone page template. Filled with str.format, so every literal brace in
# the CSS/JS is doubled; the only placeholders are {title} and {json_data}.
_HTML_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>{title}</title>
<script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
<style>
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
#container {{ width: 100vw; height: 100vh; }}
#panel {{
position: absolute; top: 12px; right: 12px;
background: rgba(10, 10, 30, 0.88);
border: 1px solid rgba(255,255,255,0.12);
padding: 16px; border-radius: 10px;
z-index: 10; min-width: 200px; max-width: 260px;
backdrop-filter: blur(6px);
}}
#panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
#stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
#filters {{ display: flex; flex-direction: column; gap: 6px; }}
.filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
.filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
.color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
#tooltip {{
position: absolute; display: none;
background: rgba(5, 5, 20, 0.95);
border: 1px solid rgba(255,255,255,0.15);
padding: 10px 14px; border-radius: 8px;
pointer-events: none; z-index: 20;
max-width: 300px; font-size: 12px; line-height: 1.6;
}}
#tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
#tooltip .tt-row {{ display: flex; gap: 6px; }}
#tooltip .tt-key {{ color: #888; min-width: 80px; }}
#tooltip .tt-val {{ color: #eee; word-break: break-all; }}
</style>
</head>
<body>
<div id="container"></div>
<div id="panel">
<h3>{title}</h3>
<div id="stats"></div>
<div id="filters"></div>
</div>
<div id="tooltip"></div>
<script>
(function () {{
const graphData = {json_data};
// ── Build graphology graph ──────────────────────────────────────────────
const Graph = graphology.Graph || graphology;
const g = new Graph({{ multi: true, type: 'directed' }});
// Assign random initial positions
graphData.nodes.forEach(function (n) {{
g.addNode(n.key, Object.assign({{
x: (Math.random() - 0.5) * 10,
y: (Math.random() - 0.5) * 10,
}}, n.attributes));
}});
graphData.edges.forEach(function (e) {{
try {{
g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
}} catch (err) {{
// skip duplicate edge keys gracefully
}}
}});
// ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
const FA2 = graphologyLibrary.layoutForceAtlas2;
FA2.assign(g, {{
iterations: 500,
settings: {{
gravity: 1,
scalingRatio: 2,
slowDown: 5,
barnesHutOptimize: g.order > 300,
}},
}});
// ── Sigma renderer ──────────────────────────────────────────────────────
const renderer = new Sigma(g, document.getElementById('container'), {{
renderEdgeLabels: false,
defaultEdgeColor: '#444',
defaultNodeColor: '#95a5a6',
labelColor: {{ color: '#ccc' }},
labelSize: 11,
edgeReducer: function (edge, data) {{
return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
}},
}});
// ── Stats panel ─────────────────────────────────────────────────────────
document.getElementById('stats').textContent =
graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';
// ── Filter panel by node type ───────────────────────────────────────────
const typeColors = {{}};
graphData.nodes.forEach(function (n) {{
const t = n.attributes.entity_type || 'unknown';
typeColors[t] = n.attributes.color || '#95a5a6';
}});
const hiddenTypes = new Set();
const filtersDiv = document.getElementById('filters');
Object.keys(typeColors).sort().forEach(function (type) {{
const color = typeColors[type];
const label = document.createElement('label');
label.className = 'filter-item';
const cb = document.createElement('input');
cb.type = 'checkbox';
cb.checked = true;
cb.addEventListener('change', function () {{
if (cb.checked) hiddenTypes.delete(type);
else hiddenTypes.add(type);
renderer.refresh();
}});
const dot = document.createElement('span');
dot.className = 'color-dot';
dot.style.background = color;
label.appendChild(cb);
label.appendChild(dot);
label.appendChild(document.createTextNode(type));
filtersDiv.appendChild(label);
}});
// Node reducer applies type filter
renderer.setSetting('nodeReducer', function (node, data) {{
if (hiddenTypes.has(data.entity_type || 'unknown')) return Object.assign({{}}, data, {{ hidden: true }});
return data;
}});
// ── Tooltip on hover ────────────────────────────────────────────────────
const tooltip = document.getElementById('tooltip');
renderer.on('enterNode', function (ref) {{
const nodeAttrs = g.getNodeAttributes(ref.node);
const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);
let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';
Object.keys(nodeAttrs).sort().forEach(function (k) {{
if (!reserved.has(k) && !['status', 'domain', 'type', 'label'].includes(k)) {{
html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
}}
}});
tooltip.innerHTML = html;
tooltip.style.display = 'block';
}});
renderer.on('leaveNode', function () {{
tooltip.style.display = 'none';
}});
document.getElementById('container').addEventListener('mousemove', function (e) {{
tooltip.style.left = (e.clientX + 16) + 'px';
tooltip.style.top = (e.clientY + 16) + 'px';
}});
function escHtml(str) {{
return String(str)
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;');
}}
}})();
</script>
</body>
</html>
"""


def render_sigma_html(
    graph_data: dict,
    output_path: str,
    title: str = "OSINT Graph",
) -> str:
    """Write a standalone sigma.js HTML page that visualizes the graph.

    Embeds ``graph_data`` as a JavaScript object literal in the page, which
    lays it out with ForceAtlas2 (500 synchronous iterations) and renders it
    with sigma.js v2.4. The page has a dark theme, a filter panel by node
    type and a hover tooltip with node metadata.

    Graph content and title are treated as untrusted: every "<" in the
    embedded JSON is written as the \\u003c escape so a value such as
    "</script>" cannot close the script element, and the title is
    HTML-escaped before insertion into <title> and <h3>.

    Args:
        graph_data: Dict with 'nodes' and 'edges' keys in graphology/sigma
            format.
        output_path: Path of the HTML file to write. Missing parent
            directories are created.
        title: Graph title shown in the panel and in the browser tab.

    Returns:
        Absolute path of the written HTML file.

    Raises:
        Exception: If the file cannot be written to output_path.
    """
    # Local import: the local name `html` is avoided on purpose so it cannot
    # be confused with the stdlib module.
    from html import escape as _escape_html

    # "<" only ever occurs inside JSON strings, where \u003c is an equivalent
    # escape in both JSON and JavaScript. U+2028/U+2029 are escaped as well
    # because older JavaScript engines treat them as line terminators inside
    # string literals.
    json_data = (
        json.dumps(graph_data, ensure_ascii=False)
        .replace("<", "\\u003c")
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )
    document = _HTML_TEMPLATE.format(
        title=_escape_html(str(title)),
        json_data=json_data,
    )

    abs_path = os.path.abspath(output_path)
    os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
    try:
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(document)
    except OSError as exc:
        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc
    return abs_path
+66
View File
@@ -0,0 +1,66 @@
"""Split text into overlapping chunks with sentence-boundary awareness."""
def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each window is at most ``chunk_size`` characters. When the window does
    not reach the end of the text, it is cut right after the last sentence
    separator found inside it, provided that cut lies beyond the first 30%
    of the window; otherwise it is cut at ``chunk_size``. Chunks are stripped
    of surrounding whitespace and empty chunks are dropped.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk in characters. Must be > 0.
        overlap: Number of characters shared by consecutive chunks. Values
            that would keep the window from advancing (negative, or at least
            as large as the chunk just emitted) are treated as no overlap.

    Returns:
        List of chunks. Empty if the text is empty or only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive and ``text`` is not
            empty.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    # Sentence separators. The first three are the CJK full stop, exclamation
    # mark and question mark, written as escapes so they survive re-encoding.
    # An empty separator would match at the very end of every window and
    # silently disable boundary detection, so empty ones are skipped below.
    separators = (
        "\u3002", "\uff01", "\uff1f",
        ".\n", "!\n", "?\n", "\n\n",
        ". ", "! ", "? ",
    )

    chunks: list[str] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + chunk_size

        if end < text_len:
            # Only accept a cut located after the first 30% of the window.
            min_pos = start + int(chunk_size * 0.30)
            best_end = None
            for sep in separators:
                if not sep:
                    continue
                # Last occurrence of the separator fully inside text[start:end].
                pos = text.rfind(sep, start, end)
                if pos == -1:
                    continue
                cut = pos + len(sep)
                # Keep the latest valid cut among all separators.
                if cut > min_pos and (best_end is None or cut > best_end):
                    best_end = cut
            if best_end is not None:
                end = best_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # The window reached the end of the text: stop here, otherwise the
        # overlap step could emit a tail already contained in this chunk.
        if end >= text_len:
            break

        # Step back by the overlap, but always move forward: without this
        # guard an overlap >= the emitted chunk length would loop forever.
        next_start = end - overlap
        if next_start <= start or next_start > end:
            next_start = end
        start = next_start

    return chunks
+6
View File
@@ -0,0 +1,6 @@
def main() -> None:
    """Print a greeting that identifies the ontology-graph project."""
    print("Hello from ontology-graph!")


# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
+935
View File
@@ -0,0 +1,935 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ontology Graph Extraction\n",
"\n",
"Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
"- LLM: `claude -p --model haiku`\n",
"- Tipos: OSINT del registry + genéricos (concept, url, date, quantity, text_fragment, coordinates)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m 4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 10\u001b[39m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"ROOT = '/home/lucas/fn_registry'\n",
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
"sys.path.insert(0, ROOT)\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('Registry root:', ROOT)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'FN_REGISTRY_ROOT'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m 4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
"\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
"\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"ROOT = os.environ['FN_REGISTRY_ROOT']\n",
"sys.path.insert(0, ROOT)\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('Registry root:', ROOT)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LLM wrapper: claude -p + haiku"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def claude_haiku_json(messages: list[dict]) -> dict:\n",
" \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n",
" # Construir prompt desde messages\n",
" parts = []\n",
" for msg in messages:\n",
" role = msg['role']\n",
" content = msg['content']\n",
" if role == 'system':\n",
" parts.append(f\"[SYSTEM]\\n{content}\")\n",
" elif role == 'user':\n",
" parts.append(f\"[USER]\\n{content}\")\n",
" prompt = \"\\n\\n\".join(parts)\n",
" \n",
" result = subprocess.run(\n",
" ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
" capture_output=True, text=True, timeout=120\n",
" )\n",
" \n",
" if result.returncode != 0:\n",
" raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n",
" \n",
" # Extraer el campo 'result' del JSON envelope de claude\n",
" envelope = json.loads(result.stdout)\n",
" raw_text = envelope.get('result', '')\n",
" \n",
" # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n",
" return extract_json_from_llm(raw_text)\n",
"\n",
"# Test rapido\n",
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
"print('LLM wrapper OK:', test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Entity presets: OSINT + genéricos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- Presets OSINT (del registry) ---\n",
"OSINT_PRESETS = [\n",
" {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
" \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
" {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
" \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
" {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
" \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
" {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
" \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
" {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
" \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
" {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
" \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
" {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
" \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
" {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
" \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
" {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
" \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
" {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
" \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
" {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
" \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
" {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
" \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
" {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
" \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
"]\n",
"\n",
"# --- Presets genéricos (sin tipo Go, inline) ---\n",
"GENERIC_PRESETS = [\n",
" {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
" \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
" {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
" \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
" {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
" \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
" {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
" \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
" {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
" \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
" {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
" \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
"]\n",
"\n",
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
"print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Relation types"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"RELATION_TYPES = [\n",
" # Personas / orgs\n",
" \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
" \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
" # Comunicacion\n",
" \"communicates_with\", \"sent_to\", \"received_from\",\n",
" # Ubicacion\n",
" \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
" # Eventos\n",
" \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
" # Documentos / conceptos\n",
" \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
" # Financiero\n",
" \"funds\", \"transacted_with\", \"invested_in\",\n",
" # Tecnico\n",
" \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
" # Generico\n",
" \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
"]\n",
"\n",
"print(f'{len(RELATION_TYPES)} relation types')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extraer documento\n",
"\n",
"Pon tu documento en `data/` y cambia el path."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf') # <-- cambiar\n",
"\n",
"# Progreso visible\n",
"def on_progress(msg, pct):\n",
" print(f' [{pct*100:5.1f}%] {msg}')\n",
"\n",
"result = extraction_pipeline(\n",
" file_path=DOC_PATH,\n",
" entity_presets=ALL_PRESETS,\n",
" relation_types=RELATION_TYPES,\n",
" llm_chat_json=claude_haiku_json,\n",
" chunk_size=800,\n",
" chunk_overlap=100,\n",
" confidence_threshold=0.5,\n",
" dedup_threshold=0.85,\n",
" on_progress=on_progress,\n",
")\n",
"\n",
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
"print(f'Relations: {result.stats.final_relations_count}')\n",
"print(f'Chunks: {result.stats.total_chunks}')\n",
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
"print(f'Entity types: {result.stats.entity_types_found}')\n",
"print(f'Relation types: {result.stats.relation_types_found}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explorar resultados"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Entities\n",
"ent_rows = []\n",
"for e in result.entities:\n",
" ent_rows.append({\n",
" 'id': e.id,\n",
" 'name': e.name,\n",
" 'type': e.type_ref,\n",
" 'confidence': e.confidence,\n",
" 'attributes': e.attributes,\n",
" })\n",
"df_entities = pd.DataFrame(ent_rows)\n",
"print(f'=== Entities ({len(df_entities)}) ===')\n",
"df_entities.sort_values('type')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Relations\n",
"rel_rows = []\n",
"for r in result.relations:\n",
" rel_rows.append({\n",
" 'from_name': r.from_name,\n",
" 'relation': r.relation_type,\n",
" 'to_name': r.to_name,\n",
" 'confidence': r.confidence,\n",
" 'description': r.description,\n",
" })\n",
"df_relations = pd.DataFrame(rel_rows)\n",
"print(f'=== Relations ({len(df_relations)}) ===')\n",
"df_relations.sort_values('relation')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualizar grafo con sigma.js"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Colores por tipo de entidad\n",
"TYPE_COLORS = {\n",
" 'osint_person_go_cybersecurity': '#e74c3c',\n",
" 'osint_organization_go_cybersecurity': '#3498db',\n",
" 'osint_location_go_cybersecurity': '#2ecc71',\n",
" 'osint_event_go_cybersecurity': '#f39c12',\n",
" 'osint_email_go_cybersecurity': '#9b59b6',\n",
" 'osint_domain_go_cybersecurity': '#1abc9c',\n",
" 'osint_ip_address_go_cybersecurity': '#e67e22',\n",
" 'osint_phone_go_cybersecurity': '#95a5a6',\n",
" 'osint_social_media_go_cybersecurity': '#e91e63',\n",
" 'osint_document_go_cybersecurity': '#607d8b',\n",
" 'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
" 'osint_malware_go_cybersecurity': '#f44336',\n",
" 'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
" 'concept': '#00bcd4',\n",
" 'url': '#8bc34a',\n",
" 'date_reference': '#cddc39',\n",
" 'quantity': '#ffc107',\n",
" 'coordinates': '#4caf50',\n",
" 'text_fragment': '#78909c',\n",
"}\n",
"DEFAULT_COLOR = '#aaaaaa'\n",
"\n",
"def extraction_to_sigma(result) -> dict:\n",
" \"\"\"Convierte ExtractionResult a formato sigma.js/graphology.\"\"\"\n",
" # Contar degree para tamaño de nodo\n",
" degree = {}\n",
" for r in result.relations:\n",
" from_id = r.from_id or r.from_name\n",
" to_id = r.to_id or r.to_name\n",
" degree[from_id] = degree.get(from_id, 0) + 1\n",
" degree[to_id] = degree.get(to_id, 0) + 1\n",
"\n",
" nodes = []\n",
" for e in result.entities:\n",
" eid = e.id or e.name\n",
" nodes.append({\n",
" 'key': eid,\n",
" 'attributes': {\n",
" 'label': e.name,\n",
" 'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
" 'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
" 'type': e.type_ref,\n",
" **{k: str(v) for k, v in (e.attributes or {}).items() if v is not None},\n",
" }\n",
" })\n",
"\n",
" edges = []\n",
" node_keys = {n['key'] for n in nodes}\n",
" for i, r in enumerate(result.relations):\n",
" from_id = r.from_id or r.from_name\n",
" to_id = r.to_id or r.to_name\n",
" if from_id in node_keys and to_id in node_keys:\n",
" edges.append({\n",
" 'key': f'e{i}',\n",
" 'source': from_id,\n",
" 'target': to_id,\n",
" 'attributes': {\n",
" 'label': r.relation_type,\n",
" 'type': r.relation_type,\n",
" }\n",
" })\n",
"\n",
" return {'nodes': nodes, 'edges': edges}\n",
"\n",
"graph_data = extraction_to_sigma(result)\n",
"print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
"html_path = render_sigma_html(\n",
" graph_data=graph_data,\n",
" output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
" title='Ontology Graph',\n",
")\n",
"print(f'Graph saved: {html_path}')\n",
"print(f'Open in browser: file://{html_path}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Auto-discovery de nuevos tipos\n",
"\n",
"Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
" \"\"\"Pide a haiku que sugiera tipos nuevos basandose en entidades de baja confianza o genericas.\"\"\"\n",
" # Recopilar entidades clasificadas como concept/text_fragment (genéricos fallback)\n",
" generic_entities = [\n",
" {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
" for e in result.entities\n",
" if e.type_ref in ('concept', 'text_fragment', 'related_to')\n",
" ]\n",
" \n",
" if not generic_entities:\n",
" print('No hay entidades genéricas — los presets cubren todo.')\n",
" return []\n",
"\n",
" existing_labels = [p['label'] for p in existing_presets]\n",
" \n",
" prompt_msg = [\n",
" {'role': 'system', 'content': (\n",
" 'You analyze entities extracted from a document and suggest new entity type presets. '\n",
" 'Existing types: ' + ', '.join(existing_labels) + '. '\n",
" 'For entities that dont fit existing types, suggest new type presets. '\n",
" 'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
" '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
" 'Only suggest types that are genuinely different from existing ones. '\n",
" 'Return {\"new_presets\": []} if no new types are needed.'\n",
" )},\n",
" {'role': 'user', 'content': (\n",
" 'These entities were classified as generic (concept/text_fragment) '\n",
" 'because they didnt fit existing types:\\n\\n'\n",
" + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n",
" )}\n",
" ]\n",
" \n",
" resp = claude_haiku_json(prompt_msg)\n",
" new_presets = resp.get('new_presets', [])\n",
" \n",
" if new_presets:\n",
" print(f'Discovered {len(new_presets)} new types:')\n",
" for p in new_presets:\n",
" print(f\" - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
" else:\n",
" print('No new types needed.')\n",
" \n",
" return new_presets\n",
"\n",
"new_types = discover_new_types(result, ALL_PRESETS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n",
"if new_types:\n",
" EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
" print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
" \n",
" result = extraction_pipeline(\n",
" file_path=DOC_PATH,\n",
" entity_presets=EXPANDED_PRESETS,\n",
" relation_types=RELATION_TYPES,\n",
" llm_chat_json=claude_haiku_json,\n",
" chunk_size=800,\n",
" chunk_overlap=100,\n",
" confidence_threshold=0.5,\n",
" dedup_threshold=0.85,\n",
" on_progress=on_progress,\n",
" )\n",
" \n",
" print(f'\\nEntities: {result.stats.final_entities_count}')\n",
" print(f'Relations: {result.stats.final_relations_count}')\n",
" \n",
" # Re-generar grafo\n",
" graph_data = extraction_to_sigma(result)\n",
" html_path = render_sigma_html(\n",
" graph_data=graph_data,\n",
" output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
" title='Ontology Graph (expanded)',\n",
" )\n",
" print(f'Updated graph: file://{html_path}')\n",
"else:\n",
" print('No re-extraction needed.')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m 8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 12\u001b[39m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"from pathlib import Path\n",
"\n",
"ROOT = '/home/lucas/fn_registry'\n",
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
"sys.path.insert(0, ROOT)\n",
"sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('OK: imports loaded')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"imports OK\n"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"# Añadir lib/ al path\n",
"sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
"\n",
"from core_functions import extract_json_from_llm\n",
"from extraction_pipeline import extraction_pipeline\n",
"from render_sigma_html import render_sigma_html\n",
"\n",
"print('imports OK')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLM wrapper OK: {'ok': True}\n"
]
}
],
"source": [
"def claude_haiku_json(messages: list[dict]) -> dict:\n",
" \"\"\"Wrapper: messages OpenAI-style -> claude -p --model haiku -> dict.\"\"\"\n",
" parts = []\n",
" for msg in messages:\n",
" role = msg['role']\n",
" content = msg['content']\n",
" if role == 'system':\n",
" parts.append(f'[SYSTEM]\\n{content}')\n",
" elif role == 'user':\n",
" parts.append(f'[USER]\\n{content}')\n",
" prompt = '\\n\\n'.join(parts)\n",
" \n",
" result = subprocess.run(\n",
" ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
" capture_output=True, text=True, timeout=120\n",
" )\n",
" if result.returncode != 0:\n",
" raise RuntimeError(f'claude -p failed: {result.stderr}')\n",
" \n",
" envelope = json.loads(result.stdout)\n",
" raw_text = envelope.get('result', '')\n",
" return extract_json_from_llm(raw_text)\n",
"\n",
"# Test\n",
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
"print('LLM wrapper OK:', test)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"19 presets, 35 relation types\n"
]
}
],
"source": [
"OSINT_PRESETS = [\n",
" {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
" 'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
" {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
" 'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
" {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
" 'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
" {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
" 'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
" {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
" 'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
" {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
" 'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
" {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
" 'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
" {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
" 'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
" {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
" 'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
" {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
" 'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
" {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
" 'metadata_fields': ['address', 'blockchain', 'balance']},\n",
" {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
" 'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
" {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
" 'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
"]\n",
"\n",
"GENERIC_PRESETS = [\n",
" {'type_ref': 'concept', 'label': 'Concept',\n",
" 'metadata_fields': ['name', 'category', 'definition']},\n",
" {'type_ref': 'url', 'label': 'URL/Link',\n",
" 'metadata_fields': ['url', 'domain', 'context']},\n",
" {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
" 'metadata_fields': ['date', 'precision', 'context']},\n",
" {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
" 'metadata_fields': ['value', 'unit', 'context']},\n",
" {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
" 'metadata_fields': ['lat', 'lon', 'label']},\n",
" {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
" 'metadata_fields': ['text', 'category', 'relevance']},\n",
"]\n",
"\n",
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
"\n",
"RELATION_TYPES = [\n",
" 'employs', 'works_for', 'founded', 'owns', 'controls',\n",
" 'member_of', 'affiliated_with', 'collaborates_with',\n",
" 'communicates_with', 'sent_to', 'received_from',\n",
" 'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
" 'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
" 'mentions', 'references', 'describes', 'authored', 'published',\n",
" 'funds', 'transacted_with', 'invested_in',\n",
" 'hosts', 'resolves_to', 'exploits', 'targets',\n",
" 'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
"]\n",
"\n",
"print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.0%] Extracting text from file...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.0%] Extracting entities from chunk 1/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.7%] Extracting entities from chunk 2/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 1.5%] Extracting entities from chunk 3/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 2.2%] Extracting entities from chunk 4/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 3.0%] Extracting entities from chunk 5/54\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
" candidates = extract_entities_llm(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 3.7%] Extracting entities from chunk 6/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 4.4%] Extracting entities from chunk 7/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 5.2%] Extracting entities from chunk 8/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 5.9%] Extracting entities from chunk 9/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 6.7%] Extracting entities from chunk 10/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 7.4%] Extracting entities from chunk 11/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 8.1%] Extracting entities from chunk 12/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 8.9%] Extracting entities from chunk 13/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 9.6%] Extracting entities from chunk 14/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 10.4%] Extracting entities from chunk 15/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 11.1%] Extracting entities from chunk 16/54\n"
]
}
],
"source": [
"DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
"\n",
"def on_progress(msg, pct):\n",
" print(f' [{pct*100:5.1f}%] {msg}')\n",
"\n",
"result = extraction_pipeline(\n",
" file_path=DOC_PATH,\n",
" entity_presets=ALL_PRESETS,\n",
" relation_types=RELATION_TYPES,\n",
" llm_chat_json=claude_haiku_json,\n",
" chunk_size=800,\n",
" chunk_overlap=100,\n",
" confidence_threshold=0.5,\n",
" dedup_threshold=0.85,\n",
" on_progress=on_progress,\n",
")\n",
"\n",
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
"print(f'Relations: {result.stats.final_relations_count}')\n",
"print(f'Chunks: {result.stats.total_chunks}')\n",
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
"print(f'Entity types: {result.stats.entity_types_found}')\n",
"print(f'Relation types: {result.stats.relation_types_found}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline optimizado\n",
"\n",
"- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
"- Chunks de 2000 chars\n",
"- Paralelizado con ThreadPoolExecutor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"from extract_text_from_file import extract_text_from_file\n",
"from core_functions import preprocess_text\n",
"from split_text_into_chunks import split_text_into_chunks\n",
"from deduplicate_entities import deduplicate_entities\n",
"from deduplicate_relations import deduplicate_relations\n",
"from entity_candidate import EntityCandidate\n",
"from relation_candidate import RelationCandidate\n",
"\n",
"def build_unified_prompt(entity_presets, relation_types):\n",
" \"\"\"System prompt que pide entities + relations + tipos nuevos en 1 sola llamada.\"\"\"\n",
" type_lines = []\n",
" for p in entity_presets:\n",
" fields = ', '.join(p.get('metadata_fields', []))\n",
" type_lines.append(f\"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]\")\n",
"\n",
" return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
"\n",
"ENTITY TYPES:\n",
"{chr(10).join(type_lines)}\n",
"\n",
"RELATION TYPES: {', '.join(relation_types)}\n",
"\n",
"OUTPUT FORMAT (strict JSON):\n",
"{{\n",
" \"entities\": [\n",
" {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
" ],\n",
" \"relations\": [\n",
" {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
" ],\n",
" \"suggested_types\": [\n",
" {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
" ]\n",
"}}\n",
"\n",
"RULES:\n",
"- Extract ALL entities explicitly mentioned in the text\n",
"- Use exact type_ref from the schema. Leave unknown attributes as null\n",
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
"- Relations: from_name and to_name MUST match extracted entity names exactly\n",
"- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
"- If no suggested types are needed, return \"suggested_types\": []\n",
"- Respond in the same language as the text for descriptions'''\n",
"\n",
"UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
"print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
+15
View File
@@ -0,0 +1,15 @@
[project]
name = "ontology-graph"
version = "0.1.0"
description = "Entity and relation extraction from documents into an ontology graph, explored in Jupyter notebooks"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"jupyter>=1.1.1",
"jupyter-collaboration>=4.3.0",
"jupyter-mcp-server>=0.4.0",
"jupyterlab>=4.5.6",
"matplotlib>=3.10.8",
"numpy>=2.4.4",
"pandas>=3.0.2",
]
View File
+45
View File
@@ -0,0 +1,45 @@
#!/bin/bash
# Jupyter Lab — modo colaborativo con autodeteccion de puerto
# Generado por write_jupyter_launcher (fn_registry)
find_free_port() {
    # Probe ports 8888-8899 in ascending order and print the first one that
    # neither `ss` nor `lsof` reports as in use. Prints 8888 if none qualifies.
    for ((port = 8888; port <= 8899; port++)); do
        # ss lists a listening TCP socket on this port -> try the next one.
        if ss -tln 2>/dev/null | grep -q ":${port} "; then
            continue
        fi
        # lsof reports something using this port -> try the next one.
        if lsof -i:"$port" >/dev/null 2>&1; then
            continue
        fi
        echo "$port"
        return
    done
    echo 8888
}
# Port comes from the first CLI argument; otherwise pick one automatically.
PORT=${1:-$(find_free_port)}

# Work from the script's own directory. Abort if it cannot be entered, so the
# port file and the Jupyter root never end up in an unrelated directory.
cd "$(dirname "$0")" || exit 1

# Persist the chosen port next to the script.
echo "$PORT" > .jupyter-port

# Activate the local virtualenv when present; continue without it otherwise.
source .venv/bin/activate 2>/dev/null || true

# --collaborative needs the jupyter_collaboration package to be importable.
if ! python -c "import jupyter_collaboration" 2>/dev/null; then
    echo "ERROR: jupyter-collaboration no esta instalado"
    echo "Instala con: uv add jupyter-collaboration"
    exit 1
fi

echo "════════════════════════════════════════════════"
echo "  Jupyter Lab + Colaboracion en puerto $PORT"
echo "════════════════════════════════════════════════"
echo ""
echo "  Abre: http://localhost:$PORT"
echo "  Ctrl+C para detener"
echo ""

# SECURITY NOTE(review): the flags below turn off token and password
# authentication, disable XSRF checks and accept requests from any origin.
# Anything able to reach this port (including web pages open in a local
# browser) can then drive the server and execute code. They are kept as-is
# because the surrounding tooling appears to rely on unauthenticated access —
# confirm, and never expose this port beyond localhost.
jupyter lab \
    --port="$PORT" \
    --no-browser \
    --ServerApp.token='' \
    --ServerApp.password='' \
    --ServerApp.disable_check_xsrf=True \
    --ServerApp.allow_origin='*' \
    --ServerApp.root_dir="$(pwd)" \
    --collaborative
Generated
+2549
View File
File diff suppressed because it is too large Load Diff