chore: initial sync
This commit is contained in:
@@ -0,0 +1,40 @@
|
|||||||
|
# JUPYTER HABILITADO EN ESTE ANALISIS
|
||||||
|
|
||||||
|
## Reglas OBLIGATORIAS para Claude
|
||||||
|
|
||||||
|
### 1. CODIGO INMUTABLE — NUNCA MODIFICAR CELDAS EXISTENTES
|
||||||
|
- **PROHIBIDO** usar NotebookEdit para reemplazar celdas existentes
|
||||||
|
- **SIEMPRE** anadir celdas NUEVAS al final del notebook
|
||||||
|
- Si hay un error en una celda, crear celda nueva con la correccion
|
||||||
|
- El historial de trabajo debe quedar intacto para trazabilidad
|
||||||
|
|
||||||
|
### 2. PROGRAMACION FUNCIONAL OBLIGATORIA
|
||||||
|
- **Funciones puras**: sin efectos secundarios, mismo input -> mismo output
|
||||||
|
- **Inmutabilidad**: nunca mutar datos, crear copias transformadas
|
||||||
|
- **Composicion**: funciones pequenas que se combinan
|
||||||
|
- Preferir: `map`, `filter`, `reduce`, list comprehensions
|
||||||
|
- Evitar: loops con mutacion, `global`, modificar argumentos in-place
|
||||||
|
|
||||||
|
### 3. SIEMPRE usar MCP jupyter para ejecutar codigo Python
|
||||||
|
- Las ejecuciones se ven en tiempo real en Jupyter Lab del usuario
|
||||||
|
- Compartimos variables y estado del kernel
|
||||||
|
- **NUNCA usar bash para ejecutar Python en este analisis**
|
||||||
|
|
||||||
|
### 4. Verificar Jupyter activo ANTES de ejecutar
|
||||||
|
- Si no esta activo: pedir al usuario que ejecute `./run-jupyter-lab.sh`
|
||||||
|
|
||||||
|
### 5. Gestion de notebooks
|
||||||
|
- Notebooks en la carpeta `notebooks/` o subcarpetas
|
||||||
|
- Si un notebook tiene >50 celdas, crear uno nuevo
|
||||||
|
- Nombrar descriptivamente: `01_exploracion.ipynb`, `02_limpieza.ipynb`
|
||||||
|
|
||||||
|
### 6. Gestion de Python
|
||||||
|
- **SIEMPRE usar `uv`** para gestionar dependencias
|
||||||
|
- Anadir paquetes con `uv add nombre_paquete`
|
||||||
|
|
||||||
|
### 7. Acceso al fn_registry
|
||||||
|
- `FN_REGISTRY_ROOT` apunta a la raiz del registry
|
||||||
|
- Para importar funciones Python: `sys.path.insert(0, os.path.join(os.environ["FN_REGISTRY_ROOT"], "python", "functions"))`
|
||||||
|
- Para consultar registry.db: usar el CLI `sqlite3` desde la terminal o `import sqlite3` desde Python, apuntando a la ruta `$FN_REGISTRY_ROOT/registry.db`
|
||||||
|
|
||||||
|
|
||||||
+12
@@ -0,0 +1,12 @@
|
|||||||
|
.venv/
|
||||||
|
.mcp.json
|
||||||
|
.jupyter-port
|
||||||
|
.jupyter/
|
||||||
|
.jupyter_ystore.db
|
||||||
|
.ipython/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
bin/
|
||||||
|
data/
|
||||||
|
.DS_Store
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
3.13
|
||||||
+540
@@ -0,0 +1,540 @@
|
|||||||
|
"""Extracción de grafo ontológico desde un documento.
|
||||||
|
|
||||||
|
Uso: python extract.py <archivo> [workers]
|
||||||
|
python extract.py data/condiciones-generales-bizum.pdf
|
||||||
|
|
||||||
|
Optimizaciones vs extraction_pipeline:
|
||||||
|
- 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos)
|
||||||
|
- Chunks de 2000 chars
|
||||||
|
- Paralelizado con ThreadPoolExecutor
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib"))
|
||||||
|
|
||||||
|
from extract_text_from_file import extract_text_from_file
|
||||||
|
from core_functions import preprocess_text, extract_json_from_llm
|
||||||
|
from split_text_into_chunks import split_text_into_chunks
|
||||||
|
from deduplicate_entities import deduplicate_entities
|
||||||
|
from deduplicate_relations import deduplicate_relations
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
from relation_candidate import RelationCandidate
|
||||||
|
from render_sigma_html import render_sigma_html
|
||||||
|
|
||||||
|
# ── Presets ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Built-in OSINT entity presets. Each row is (type_ref, label, metadata_fields)
# and is expanded into the dict shape read by the prompt builders.
OSINT_PRESETS = [
    {"type_ref": type_ref, "label": label, "metadata_fields": fields}
    for type_ref, label, fields in (
        ("person", "Person",
         ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]),
        ("organization", "Organization",
         ["legal_name", "country", "sector", "founded", "risk_score"]),
        ("location", "Location",
         ["lat", "lon", "address", "country", "city"]),
        ("event", "Event",
         ["event_type", "date", "location", "description", "severity"]),
        ("email", "Email",
         ["address", "provider", "verified", "breached"]),
        ("domain", "Domain",
         ["fqdn", "registrar", "created_date", "expires_date"]),
        ("ip_address", "IP Address",
         ["ip", "asn", "country", "isp", "geolocation"]),
        ("phone", "Phone",
         ["number", "country_code", "carrier", "phone_type"]),
        ("social_media", "Social Media Account",
         ["platform", "username", "url", "followers", "verified"]),
        ("document", "Document",
         ["title", "format", "classification", "source"]),
        ("crypto_wallet", "Crypto Wallet",
         ["address", "blockchain", "balance"]),
        ("malware", "Malware",
         ["family", "hash_sha256", "threat_level"]),
        ("vulnerability", "Vulnerability",
         ["cve_id", "cvss", "affected_product", "exploited"]),
    )
]
|
||||||
|
|
||||||
|
# Generic fallback entity presets, expanded from
# (type_ref, label, metadata_fields) rows into preset dicts.
GENERIC_PRESETS = [
    {"type_ref": type_ref, "label": label, "metadata_fields": fields}
    for type_ref, label, fields in (
        ("concept", "Concept", ["name", "category", "definition"]),
        ("url", "URL/Link", ["url", "domain", "context"]),
        ("date_reference", "Date/Time", ["date", "precision", "context"]),
        ("quantity", "Quantity/Amount", ["value", "unit", "context"]),
        ("coordinates", "Coordinates", ["lat", "lon", "label"]),
        ("text_fragment", "Key Text Fragment", ["text", "category", "relevance"]),
    )
]
|
||||||
|
|
||||||
|
# ── Custom presets (acumulativo, pensado para promoción al registry) ───────────
|
||||||
|
|
||||||
|
# Location of the accumulated custom presets file: data/custom_presets.json
# inside the directory that contains this module.
CUSTOM_PRESETS_PATH = os.path.join(
    os.path.dirname(__file__),
    *("data", "custom_presets.json"),
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_custom_presets() -> list[dict]:
    """Return the custom presets stored in data/custom_presets.json.

    Returns:
        The value of the top-level "presets" key of the stored JSON object,
        or an empty list when the file does not exist or the key is absent.
    """
    if os.path.exists(CUSTOM_PRESETS_PATH):
        with open(CUSTOM_PRESETS_PATH) as handle:
            stored = json.load(handle)
        return stored.get("presets", [])
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def save_custom_presets(presets: list[dict]) -> None:
    """Write the custom presets to data/custom_presets.json.

    The parent directory is created when missing. The file layout is meant
    to ease later promotion of a type to the registry:

        {
          "presets": [
            {
              "type_ref": "snake_case_id",
              "label": "Human Label",
              "metadata_fields": ["field1", "field2"],
              "reason": "why this type exists",
              "source_doc": "document where it was first discovered",
              "promoted": false   (true once registered in the registry)
            }
          ]
        }

    Args:
        presets: Full list of preset dicts to persist (replaces the file).
    """
    target_dir = os.path.dirname(CUSTOM_PRESETS_PATH)
    os.makedirs(target_dir, exist_ok=True)

    payload = {"presets": presets}
    with open(CUSTOM_PRESETS_PATH, "w") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]:
    """Merge suggested types into the stored custom presets, deduplicating by type_ref.

    Args:
        suggested: Suggested type dicts; entries with an empty or missing
            "type_ref", or one that is already stored (or already seen earlier
            in this call), are ignored.
        source_doc: Name recorded as "source_doc" on every preset added.

    Returns:
        The presets that were newly added. The presets file is rewritten only
        when at least one preset was added.
    """
    stored = load_custom_presets()
    known_refs = {preset["type_ref"] for preset in stored}

    new_presets = []
    for candidate in suggested:
        type_ref = candidate.get("type_ref", "")
        if type_ref and type_ref not in known_refs:
            known_refs.add(type_ref)
            new_presets.append({
                "type_ref": type_ref,
                "label": candidate.get("label", type_ref),
                "metadata_fields": candidate.get("metadata_fields", []),
                "reason": candidate.get("reason", ""),
                "source_doc": source_doc,
                "promoted": False,
            })

    if new_presets:
        save_custom_presets(stored + new_presets)

    return new_presets
|
||||||
|
|
||||||
|
|
||||||
|
# Relation types offered to the LLM, grouped by theme. The groups are
# unpacked into one flat list, keeping the original order.
RELATION_TYPES = [
    # ownership / employment
    *("employs", "works_for", "founded", "owns", "controls"),
    # membership / collaboration
    *("member_of", "affiliated_with", "collaborates_with"),
    # communication
    *("communicates_with", "sent_to", "received_from"),
    # geography
    *("located_in", "headquartered_in", "traveled_to", "operates_in"),
    # events
    *("participated_in", "caused", "occurred_at", "occurred_on"),
    # documents
    *("mentions", "references", "describes", "authored", "published"),
    # finance
    *("funds", "transacted_with", "invested_in"),
    # technical
    *("hosts", "resolves_to", "exploits", "targets"),
    # generic
    *("related_to", "part_of", "instance_of", "has_attribute"),
]
|
||||||
|
|
||||||
|
# ── LLM wrapper ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def claude_haiku_json(messages: list[dict]) -> dict:
    """Send chat-style messages to the `claude` CLI and return the parsed reply.

    Messages with role "system" or "user" are rendered as "[SYSTEM]" /
    "[USER]" sections joined by blank lines; any other role is dropped.
    The prompt is passed as the last argument to
    `claude -p --model haiku --output-format json` with a 120 s timeout.

    Args:
        messages: Dicts with "role" and "content" keys.

    Returns:
        Whatever `extract_json_from_llm` produces from the "result" field of
        the CLI's JSON envelope (presumably a dict - confirm against helper).

    Raises:
        RuntimeError: If the CLI exits with a non-zero status.
    """
    role_tags = {"system": "[SYSTEM]", "user": "[USER]"}
    prompt = "\n\n".join(
        f"{role_tags[message['role']]}\n{message['content']}"
        for message in messages
        if message["role"] in role_tags
    )

    command = ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        raise RuntimeError(f"claude -p failed: {completed.stderr[:200]}")

    envelope = json.loads(completed.stdout)
    return extract_json_from_llm(envelope.get("result", ""))
|
||||||
|
|
||||||
|
# ── Unified prompt ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def build_unified_prompt(presets, rel_types):
    """Build the system prompt for single-pass entity + relation extraction.

    Args:
        presets: Entity type dicts with "label" and "type_ref" keys and an
            optional "metadata_fields" list.
        rel_types: Iterable of relation type names.

    Returns:
        The prompt text: introduction, entity type list, relation type list,
        expected JSON output shape and extraction rules, separated by blank
        lines.
    """
    schema_block = "\n".join(
        f"- {preset['label']} (type_ref: {preset['type_ref']}): "
        f"[{', '.join(preset.get('metadata_fields', []))}]"
        for preset in presets
    )

    intro = (
        "You are an entity and relation extraction expert. "
        "Given text, extract ALL entities and relations in a single pass."
    )

    output_format = "\n".join([
        'OUTPUT FORMAT (strict JSON):',
        '{',
        ' "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],',
        ' "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],',
        ' "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]',
        '}',
    ])

    rules = "\n".join([
        "RULES:",
        "- Extract ALL entities explicitly mentioned",
        "- Use exact type_ref from schema. Unknown attributes = null",
        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied",
        "- Relations: from_name/to_name MUST match entity names exactly",
        "- suggested_types: for important entities that do NOT fit any type, suggest a new type. "
        "Use those suggested type_refs for those entities in the entities array.",
        '- If no new types needed: "suggested_types": []',
        "- Respond in the same language as the text for descriptions",
    ])

    sections = [
        intro,
        "ENTITY TYPES:\n" + schema_block,
        "RELATION TYPES: " + ", ".join(rel_types),
        output_format,
        rules,
    ]
    return "\n\n".join(sections)
|
||||||
|
|
||||||
|
# ── Process one chunk ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _coerce_confidence(value, default: float = 0.5) -> float:
    """Convert an LLM-provided confidence to float; use `default` if missing or malformed."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str):
    """Extract entities, relations and suggested types from one text chunk.

    The chunk is sent to the LLM in a single call. The reply is untrusted:
    missing keys, nulls and entries of the wrong shape are skipped or
    replaced by defaults, so one malformed chunk cannot abort the run.

    Args:
        chunk_idx: Index of the chunk, recorded on every candidate.
        chunk_text: Text of the chunk (sent as the user message).
        system_prompt: Extraction instructions (sent as the system message).

    Returns:
        A tuple ``(entities, relations, suggested)`` of EntityCandidate
        objects, RelationCandidate objects and suggested-type dicts. All
        three are empty when the LLM call fails or returns a non-dict.
    """
    try:
        resp = claude_haiku_json([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk_text},
        ])
    except Exception as e:
        # Best effort: a failed chunk is reported and skipped.
        print(f" [WARN] chunk {chunk_idx}: {e}")
        return [], [], []

    if not isinstance(resp, dict):
        print(f" [WARN] chunk {chunk_idx}: unexpected response type {type(resp).__name__}")
        return [], [], []

    # `or []` also covers explicit nulls such as "entities": null.
    raw_entities = resp.get("entities") or []
    raw_relations = resp.get("relations") or []
    suggested = [s for s in (resp.get("suggested_types") or []) if isinstance(s, dict)]

    entities = []
    for ent in raw_entities:
        if not isinstance(ent, dict):
            continue
        name = str(ent.get("name") or "").strip()
        if not name:
            continue
        entities.append(EntityCandidate(
            name=name,
            type_ref=ent.get("type_ref") or "concept",
            # Never store None: later stages call .update()/.items() on it.
            attributes=ent.get("attributes") or {},
            confidence=_coerce_confidence(ent.get("confidence", 0.5)),
            source_chunk_indices=[chunk_idx],
        ))

    relations = []
    for rel in raw_relations:
        if not isinstance(rel, dict):
            continue
        from_name = str(rel.get("from_name") or "").strip()
        to_name = str(rel.get("to_name") or "").strip()
        if not from_name or not to_name:
            continue
        relations.append(RelationCandidate(
            from_name=from_name,
            to_name=to_name,
            relation_type=rel.get("relation_type") or "related_to",
            confidence=_coerce_confidence(rel.get("confidence", 0.5)),
            description=rel.get("description") or "",
            source_chunk_index=chunk_idx,
        ))

    return entities, relations, suggested
|
||||||
|
|
||||||
|
# ── Sigma conversion ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Node color (hex) for each entity type_ref in the rendered graph.
TYPE_COLORS = dict(
    person="#e74c3c",
    organization="#3498db",
    location="#2ecc71",
    event="#f39c12",
    email="#9b59b6",
    domain="#1abc9c",
    ip_address="#e67e22",
    phone="#95a5a6",
    social_media="#e91e63",
    document="#607d8b",
    crypto_wallet="#ff9800",
    malware="#f44336",
    vulnerability="#ff5722",
    concept="#00bcd4",
    url="#8bc34a",
    date_reference="#cddc39",
    quantity="#ffc107",
    coordinates="#4caf50",
    text_fragment="#78909c",
)
|
||||||
|
|
||||||
|
def to_sigma(entities, relations, entity_id_map):
    """Convert entities and relations into a ``{"nodes", "edges"}`` graph dict.

    Args:
        entities: Objects exposing ``name``, ``type_ref`` and ``attributes``.
        relations: Objects exposing ``from_id``/``to_id`` (``from_name``/
            ``to_name`` are used when the id is falsy) and ``relation_type``.
        entity_id_map: Mapping from entity name to node key. A name is looked
            up verbatim, then lower-cased and stripped; when neither form is
            present the name itself becomes the node key.

    Returns:
        A dict with:
          * ``nodes``: one entry per distinct node key, carrying ``label``,
            ``color``, ``size`` (4 plus twice the node degree, the degree part
            capped at 20), ``entity_type`` and the entity's own non-null
            attributes as strings.
          * ``edges``: one entry per distinct ``(source, target,
            relation_type)`` whose endpoints are both nodes and differ.
    """
    # Keys the entity's own attributes must never set: the first four are
    # reserved by sigma.js, the rest are the core node attributes set below.
    # Without this an extracted attribute called "label" or "size" (the
    # "coordinates" preset has a "label" field) would overwrite the node's
    # display label or numeric size.
    protected = {"type", "hidden", "x", "y", "label", "color", "size", "entity_type"}

    name_to_key = {}
    for e in entities:
        name_to_key[e.name] = entity_id_map.get(
            e.name, entity_id_map.get(e.name.lower().strip(), e.name)
        )

    # Node degree, counted over every relation endpoint.
    degree = {}
    for r in relations:
        for endpoint in (r.from_id or r.from_name, r.to_id or r.to_name):
            degree[endpoint] = degree.get(endpoint, 0) + 1

    nodes = []
    seen_keys = set()
    for e in entities:
        key = name_to_key[e.name]
        if key in seen_keys:
            continue  # several entities share a node key: keep the first
        seen_keys.add(key)
        attrs = {
            k: str(v)
            for k, v in (e.attributes or {}).items()
            if v is not None and k not in protected
        }
        nodes.append({
            "key": key,
            "attributes": {
                "label": e.name,
                "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"),
                "size": 4 + min(degree.get(key, 0) * 2, 20),
                "entity_type": e.type_ref,
                **attrs,
            },
        })

    edges = []
    seen_edges = set()
    for i, r in enumerate(relations):
        source = r.from_id or r.from_name
        target = r.to_id or r.to_name
        if source not in seen_keys or target not in seen_keys or source == target:
            continue
        edge_id = (source, target, r.relation_type)
        if edge_id in seen_edges:
            continue
        seen_edges.add(edge_id)
        edges.append({
            "key": f"e{i}",  # index in `relations`, so keys may have gaps
            "source": source,
            "target": target,
            "attributes": {"label": r.relation_type},
        })

    return {"nodes": nodes, "edges": edges}
|
||||||
|
|
||||||
|
# ── Reclasificación de entidades genéricas ─────────────────────────────────────
|
||||||
|
|
||||||
|
# type_refs treated as "generic": entities of these types are candidates
# for reclassification into newly discovered types.
GENERIC_TYPE_REFS = set((
    "concept",
    "text_fragment",
    "url",
    "date_reference",
    "quantity",
    "coordinates",
))
|
||||||
|
|
||||||
|
|
||||||
|
def reclassify_generic_entities(entities, new_presets, workers=4):
    """Reclassify generic entities into newly discovered types, in place.

    Instead of re-processing chunks, the generic entities (those whose
    ``type_ref`` is in GENERIC_TYPE_REFS) are sent to the LLM in batches of
    30 together with the new presets. The reply is untrusted, so a
    reclassification is applied only when it targets one of the generic
    entities that was sent and names one of the new ``type_ref`` values;
    each entity is changed at most once.

    Args:
        entities: Entity objects with mutable ``type_ref`` and ``attributes``;
            modified in place.
        new_presets: Preset dicts ("label", "type_ref", optional
            "metadata_fields") describing the new types.
        workers: Maximum number of concurrent LLM calls (values below 1 are
            treated as 1).

    Returns:
        Number of entities whose type was changed.
    """
    if not new_presets:
        return 0
    generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS]
    if not generic:
        return 0

    allowed_refs = {p["type_ref"] for p in new_presets}
    pending = {i for i, _ in generic}  # indices that may still be reclassified

    type_lines = [
        f"- {p['label']} (type_ref: {p['type_ref']}): [{', '.join(p.get('metadata_fields', []))}]"
        for p in new_presets
    ]

    system = (
        "You reclassify entities into more specific types. "
        "For each entity, decide if it fits one of the NEW types below better than its current generic type. "
        "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n"
        "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n"
        'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n'
        "Only include entities that should change. Omit those that should stay as-is."
    )

    # Batches of 30 entities keep each request within the model's context.
    batch_size = 30

    def _reclassify_batch(batch):
        items = [{"index": idx, "name": e.name, "current_type": e.type_ref,
                  "attributes": e.attributes} for idx, e in batch]
        try:
            resp = claude_haiku_json([
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(items, ensure_ascii=False)},
            ])
        except Exception as e:
            # Best effort, but report the failure instead of hiding it.
            print(f" [WARN] reclassify batch: {e}")
            return []
        found = resp.get("reclassified") if isinstance(resp, dict) else None
        return found if isinstance(found, list) else []

    batches = [generic[i:i + batch_size] for i in range(0, len(generic), batch_size)]

    total_changed = 0
    with ThreadPoolExecutor(max_workers=max(1, workers)) as pool:
        futures = [pool.submit(_reclassify_batch, b) for b in batches]
        for future in as_completed(futures):
            for item in future.result():
                if not isinstance(item, dict):
                    continue
                idx = item.get("index")
                new_ref = item.get("type_ref")
                if isinstance(idx, bool) or not isinstance(idx, int) or idx not in pending:
                    continue
                if not isinstance(new_ref, str) or new_ref not in allowed_refs:
                    continue
                pending.discard(idx)

                entity = entities[idx]
                entity.type_ref = new_ref
                new_attrs = item.get("attributes")
                if isinstance(new_attrs, dict) and new_attrs:
                    if entity.attributes is None:
                        entity.attributes = dict(new_attrs)
                    else:
                        entity.attributes.update(new_attrs)
                total_changed += 1

    return total_changed
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
    """Run the extraction pipeline for the file named on the command line.

    Command line: ``extract.py <file> [workers]``. A relative ``<file>`` is
    resolved against this script's directory; ``workers`` defaults to 4 and
    must be a positive integer.

    Steps: extract and preprocess the text, split it into chunks, extract
    entities and relations per chunk in a thread pool, deduplicate, register
    newly suggested types (reclassifying generic entities against them), then
    write ``data/ontology_graph.html`` and ``data/extraction_result.json``.
    """
    if len(sys.argv) < 2:
        print("Uso: python extract.py <archivo>")
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.isabs(file_path):
        file_path = os.path.join(os.path.dirname(__file__), file_path)

    # Reject a bad worker count here rather than failing with a traceback
    # from int() or ThreadPoolExecutor later on.
    workers = 4
    if len(sys.argv) > 2:
        try:
            workers = int(sys.argv[2])
        except ValueError:
            workers = 0
        if workers < 1:
            print(f"workers must be a positive integer, got {sys.argv[2]!r}")
            sys.exit(1)

    print("=== Ontology Graph Extraction ===")
    print(f"File: {file_path}")
    print(f"Workers: {workers}")
    start = time.monotonic()

    # 1. Extract and preprocess the text
    print("\n[1/5] Extracting text...")
    raw = extract_text_from_file(file_path)
    text = preprocess_text(raw)
    print(f" {len(text)} chars")

    # 2. Chunking
    print("[2/5] Chunking...")
    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
    print(f" {len(chunks)} chunks")

    # 3. Parallel extraction
    custom = load_custom_presets()
    # Only non-promoted custom presets are used (promoted ones are expected
    # to be available from the registry).
    active_custom = [p for p in custom if not p.get("promoted", False)]
    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
    print(f" Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)

    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
    all_entities = []
    all_relations = []
    all_suggested = []

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(process_chunk, i, chunk, system_prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in as_completed(futures):
            idx = futures[future]
            ents, rels, sugg = future.result()
            all_entities.extend(ents)
            all_relations.extend(rels)
            all_suggested.extend(sugg)
            print(f" chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations" +
                  (f", {len(sugg)} new types" if sugg else ""))

    # 4. Deduplication
    print("\n[4/5] Deduplicating...")
    print(f" Raw: {len(all_entities)} entities, {len(all_relations)} relations")

    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
    final_entities = dedup.entities
    entity_id_map = dedup.name_to_id

    final_relations = deduplicate_relations(all_relations, entity_id_map)

    print(f" Final: {len(final_entities)} entities, {len(final_relations)} relations")
    print(f" Merged: {dedup.total_before - dedup.total_after} entities, "
          f"{len(all_relations) - len(final_relations)} relations")

    # Register suggested types in custom_presets.json
    unique_suggested = []
    if all_suggested:
        seen = set()
        for s in all_suggested:
            if not isinstance(s, dict):
                continue  # malformed LLM suggestion
            key = s.get("type_ref", "")
            if key and key not in seen:
                seen.add(key)
                unique_suggested.append(s)

        source_doc = os.path.basename(file_path)
        added = merge_suggested_into_custom(unique_suggested, source_doc)
        total_custom = len(load_custom_presets())

        if added:
            print(f"\n New types registered ({len(added)}):")
            for p in added:
                print(f" + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
                print(f" Reason: {p['reason']}")
            print(f" Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")

            # Reclassify generic entities using the types just discovered
            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
            if n_generic > 0:
                print(f"\n Reclassifying {n_generic} generic entities with new types...")
                changed = reclassify_generic_entities(final_entities, added, workers=workers)
                print(f" Reclassified: {changed}/{n_generic}")
        else:
            print(f"\n {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")

    # Per-type statistics
    type_counts = {}
    for e in final_entities:
        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
    print("\n Entity types:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    rel_counts = {}
    for r in final_relations:
        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
    print(" Relation types:")
    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    # 5. Visualization
    print("\n[5/5] Generating graph...")
    graph = to_sigma(final_entities, final_relations, entity_id_map)
    out_dir = os.path.join(os.path.dirname(__file__), "data")
    # data/ is git-ignored and may not exist on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
    print(f" {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
    print(f" HTML: file://{html_path}")

    # Save the intermediate JSON. It is written with ensure_ascii=False, so
    # the encoding is pinned to UTF-8 instead of the locale default.
    json_path = os.path.join(out_dir, "extraction_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "entities": [{"name": e.name, "type_ref": e.type_ref,
                          "confidence": e.confidence, "attributes": e.attributes}
                         for e in final_entities],
            "relations": [{"from": r.from_name, "to": r.to_name,
                           "type": r.relation_type, "confidence": r.confidence,
                           "description": r.description}
                          for r in final_relations],
            "suggested_types": [dict(s) for s in unique_suggested],
        }, f, ensure_ascii=False, indent=2)
    print(f" JSON: {json_path}")

    elapsed = time.monotonic() - start
    print(f"\nDone in {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
|
||||||
|
|
||||||
|
|
||||||
|
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the system-prompt section listing the extractable entity types.

    Each preset becomes a numbered entry (starting at 1) showing its label and
    type_ref, followed by an "Attributes:" line when it has metadata fields.
    Entries are separated by blank lines.

    Args:
        entity_presets: Preset dicts with 'label', 'type_ref' and optionally
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity",
              "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]

    Returns:
        The formatted prompt section, or an empty string when the preset list
        is empty.
    """
    if not entity_presets:
        return ""

    def _describe(position: int, preset: dict) -> str:
        # One entry: header line plus the optional attributes line.
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        fields = preset.get("metadata_fields", [])
        header = f"{position}. {label} (type_ref: {type_ref})"
        if not fields:
            return header
        return header + "\n" + f" Attributes: {', '.join(fields)}"

    entries = [_describe(n, p) for n, p in enumerate(entity_presets, start=1)]
    return "\n\n".join(["Entity types available for extraction:", *entries])
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
|
||||||
|
|
||||||
|
|
||||||
|
def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the system-prompt section listing the allowed relation types.

    Args:
        relation_types: Allowed relation type names, e.g.
            ["funds", "employs", "communicates_with"].

    Returns:
        A heading line followed by the comma-separated types, or an empty
        string when the list is empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
|
||||||
@@ -0,0 +1,814 @@
|
|||||||
|
"""Core functional programming utilities — pure functions for list/collection operations."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import re
|
||||||
|
from functools import reduce as _reduce
|
||||||
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def filter_list(xs: list, pred: Callable) -> list:
    """Return a new list with the items of ``xs`` for which ``pred`` is truthy."""
    kept = []
    for item in xs:
        if pred(item):
            kept.append(item)
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def map_list(xs: list, fn: Callable) -> list:
    """Return a new list with ``fn`` applied to every item of ``xs``."""
    return list(map(fn, xs))
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
    """Fold ``xs`` left to right into one value; ``fn(acc, x)`` returns the next acc."""
    acc = initial
    for item in xs:
        acc = fn(acc, item)
    return acc
|
||||||
|
|
||||||
|
|
||||||
|
def flat_map(xs: list, fn: Callable) -> list:
    """Apply ``fn`` to each item and concatenate the resulting iterables (one level)."""
    return [out for item in xs for out in fn(item)]
|
||||||
|
|
||||||
|
|
||||||
|
def flatten(xss: list) -> list:
    """Concatenate the inner lists of ``xss`` into one list (one level only)."""
    return [item for inner in xss for item in inner]
|
||||||
|
|
||||||
|
|
||||||
|
def chunk(xs: list, size: int) -> list:
    """Split ``xs`` into consecutive slices of ``size`` items.

    The last slice may be shorter. A non-positive ``size`` yields ``[]``.
    """
    if size > 0:
        starts = range(0, len(xs), size)
        return [xs[start : start + size] for start in starts]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def take(xs: list, n: int) -> list:
    """Return the first ``n`` items of ``xs`` (plain slice semantics)."""
    head = xs[0:n]
    return head
|
||||||
|
|
||||||
|
|
||||||
|
def drop(xs: list, n: int) -> list:
    """Return ``xs`` without its first ``n`` items (plain slice semantics)."""
    tail = xs[slice(n, None)]
    return tail
|
||||||
|
|
||||||
|
|
||||||
|
def unique(xs: list) -> list:
    """Remove duplicates, keeping the first occurrence of each item in order.

    Items are compared by hash and equality, so every element must be hashable.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(xs))
|
||||||
|
|
||||||
|
|
||||||
|
def group_by(xs: list, key_fn: Callable) -> Dict:
    """Bucket the elements of ``xs`` by the key that ``key_fn`` returns.

    Args:
        xs: Elements to group, visited in order.
        key_fn: Called once per element; its result is the dictionary key.

    Returns:
        Dict mapping each key to the list of elements that produced it. Keys
        appear in first-seen order and each list keeps input order.
    """
    buckets: Dict = {}
    for item in xs:
        buckets.setdefault(key_fn(item), []).append(item)
    return buckets
|
||||||
|
|
||||||
|
|
||||||
|
def partition(xs: list, pred: Callable) -> Tuple[list, list]:
    """Split ``xs`` into the elements that satisfy ``pred`` and those that do not.

    Args:
        xs: Elements to split, visited in order.
        pred: Called once per element; a truthy result selects the first list.

    Returns:
        Tuple ``(matches, non_matches)``; both lists preserve input order.
    """
    accepted: list = []
    rejected: list = []
    for item in xs:
        bucket = accepted if pred(item) else rejected
        bucket.append(item)
    return accepted, rejected
|
||||||
|
|
||||||
|
|
||||||
|
def find(xs: list, pred: Callable) -> Any:
    """Return the first element of ``xs`` for which ``pred`` is truthy.

    Args:
        xs: Elements to search, visited in order.
        pred: Called per element until one matches.

    Returns:
        The first matching element, or ``None`` when nothing matches.
    """
    return next((item for item in xs if pred(item)), None)
|
||||||
|
|
||||||
|
|
||||||
|
def find_index(xs: list, pred: Callable) -> int:
    """Return the position of the first element for which ``pred`` is truthy.

    Args:
        xs: Elements to search, visited in order.
        pred: Called per element until one matches.

    Returns:
        Zero-based index of the first match, or ``-1`` when nothing matches.
    """
    return next((pos for pos, item in enumerate(xs) if pred(item)), -1)
|
||||||
|
|
||||||
|
|
||||||
|
def zip_with(xs: list, ys: list, fn: Callable) -> list:
    """Combine ``xs`` and ``ys`` pairwise with ``fn``.

    Args:
        xs: Left-hand operands.
        ys: Right-hand operands.
        fn: Called as ``fn(x, y)`` for each aligned pair.

    Returns:
        List of ``fn`` results; its length is that of the shorter input.
    """
    return list(map(fn, xs, ys))
|
||||||
|
|
||||||
|
|
||||||
|
def all_of(xs: list, pred: Callable) -> bool:
    """Report whether ``pred`` is truthy for every element of ``xs``.

    Evaluation stops at the first element that fails. An empty ``xs`` yields
    ``True``.
    """
    for item in xs:
        if not pred(item):
            return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def any_of(xs: list, pred: Callable) -> bool:
    """Report whether ``pred`` is truthy for at least one element of ``xs``.

    Evaluation stops at the first element that matches. An empty ``xs`` yields
    ``False``.
    """
    for item in xs:
        if pred(item):
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def pipe(value: Any, *fns: Callable) -> Any:
    """Thread ``value`` through ``fns`` from left to right.

    Args:
        value: Initial input.
        *fns: One-argument callables; each receives the previous result.

    Returns:
        The output of the last callable, or ``value`` itself when no callables
        are given.
    """
    for step in fns:
        value = step(value)
    return value
|
||||||
|
|
||||||
|
|
||||||
|
def compose(*fns: Callable) -> Callable:
    """Build a function that applies ``fns`` from right to left.

    ``compose(f, g)(x)`` equals ``f(g(x))``. With no functions the returned
    callable hands its argument back unchanged.
    """
    # Reverse once up front so each call walks the steps in application order.
    ordered = fns[::-1]

    def composed(x: Any) -> Any:
        for step in ordered:
            x = step(x)
        return x

    return composed
|
||||||
|
|
||||||
|
|
||||||
|
# ── Tree manipulation ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a depth-first list of detached nodes.

    Each dict contributes one entry: a deep copy of itself without its
    ``'nodes'`` key. Recursion then follows every string key whose name
    contains ``'nodes'``. Lists are flattened element by element.

    Only the non-``'nodes'`` fields are deep-copied, so the children are not
    copied once per ancestor just to be thrown away.

    Args:
        structure: A node dict, a list of nodes, or anything else.

    Returns:
        List of copied node dicts in DFS order; ``[]`` for any input that is
        neither a dict nor a list. The input is never modified.
    """
    import copy

    if isinstance(structure, dict):
        node = {
            key: copy.deepcopy(value)
            for key, value in structure.items()
            if key != 'nodes'
        }
        flat = [node]
        for key, value in structure.items():
            # Non-string keys cannot name a child collection; skip them rather
            # than fail on the substring test.
            if isinstance(key, str) and 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        return [node for item in structure for node in flatten_tree(item)]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a hierarchical tree in depth-first order.

    The returned entries are the original dict objects, not copies, and each
    keeps its ``'nodes'`` key. Only the ``'nodes'`` key is followed.

    Args:
        structure: A node dict, a list of nodes, or anything else.

    Returns:
        Nodes in DFS pre-order; ``[]`` for any input that is neither a dict
        nor a list.
    """
    if isinstance(structure, list):
        return [node for item in structure for node in tree_to_flat_list(item)]
    if not isinstance(structure, dict):
        return []
    children = tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
    return [structure, *children]
|
||||||
|
|
||||||
|
|
||||||
|
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect the leaf nodes of a hierarchical tree.

    A dict counts as a leaf when its ``'nodes'`` entry is missing or falsy.
    Leaves are returned as deep copies with any ``'nodes'`` key removed. For a
    non-leaf dict, recursion follows every key whose name contains
    ``'nodes'``.

    Args:
        structure: A node dict, a list of nodes, or anything else.

    Returns:
        Copied leaf dicts in depth-first order; ``[]`` for any input that is
        neither a dict nor a list.
    """
    import copy

    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]
    if not isinstance(structure, dict):
        return []
    if structure.get('nodes'):
        return [
            leaf
            for key, child in structure.items()
            if 'nodes' in key
            for leaf in get_leaf_nodes(child)
        ]
    leaf = copy.deepcopy(structure)
    leaf.pop('nodes', None)
    return [leaf]
|
||||||
|
|
||||||
|
|
||||||
|
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Number every dict in a tree with a zero-padded ``'node_id'`` string.

    The tree is modified in place. Each dict receives the current counter
    formatted to four digits (so the default start produces ``'0000'``), then
    numbering continues depth-first into every key whose name contains
    ``'nodes'``.

    Args:
        data: A node dict, a list of nodes, or anything else (ignored).
        node_id: Counter value to assign to the first dict encountered.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for element in data:
            node_id = write_node_ids(element, node_id)
        return node_id
    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        # Iterate over a snapshot of the keys: 'node_id' was just inserted.
        for key in tuple(data):
            if 'nodes' in key:
                node_id = write_node_ids(data[key], node_id)
    return node_id
|
||||||
|
|
||||||
|
|
||||||
|
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of sections using their dotted ``'structure'`` codes.

    Each item becomes a node with ``title``, ``start_index`` and ``end_index``
    (read with ``dict.get``). A node is attached under the node whose code is
    its own code minus the last dotted segment, provided that parent appeared
    earlier in ``data``; otherwise it becomes a root. Nodes without children
    carry no ``'nodes'`` key.

    Args:
        data: Items in document order, e.g. codes ``'1'``, ``'1.1'``, ``'2'``.

    Returns:
        List of root nodes with children nested under ``'nodes'``.
    """
    def parent_code(code):
        # '1.2.3' -> '1.2'; codes with no dot (or no code at all) have no parent.
        if not code:
            return None
        head, dot, _ = str(code).rpartition('.')
        return head if dot else None

    def prune(node):
        # Drop the 'nodes' key from childless nodes, recursively.
        children = node['nodes']
        if children:
            for child in children:
                prune(child)
        else:
            del node['nodes']
        return node

    by_code = {}
    roots = []
    for entry in data:
        code = entry.get('structure')
        node = {
            'title': entry.get('title'),
            'start_index': entry.get('start_index'),
            'end_index': entry.get('end_index'),
            'nodes': [],
        }
        by_code[code] = node
        parent = parent_code(code)
        if parent and parent in by_code:
            siblings = by_code[parent]['nodes']
        else:
            siblings = roots
        siblings.append(node)

    return [prune(root) for root in roots]
|
||||||
|
|
||||||
|
|
||||||
|
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the named keys removed.

    Args:
        data: Nested structure of dicts and lists; other values are returned
            unchanged.
        fields: Keys to drop from every dict at every depth. Defaults to
            ``['text']`` when omitted.

    Returns:
        New dicts and lists mirroring ``data`` without the dropped keys. The
        input is not modified.
    """
    drop_keys = ['text'] if fields is None else fields
    if isinstance(data, list):
        return [remove_tree_fields(element, drop_keys) for element in data]
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key in drop_keys:
                continue
            cleaned[key] = remove_tree_fields(value, drop_keys)
        return cleaned
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def format_tree_structure(structure: Any, order: Optional[List[str]] = None) -> Any:
    """Rebuild each node of a tree with its keys in the given order.

    Keys not listed in ``order`` are omitted from the result. Children under
    ``'nodes'`` are formatted recursively, and a ``'nodes'`` entry that ends
    up empty is left out. The input tree is not modified: new dicts and lists
    are built instead of editing the caller's nodes.

    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Key names in the desired output order. When empty or ``None``
            the input is returned as is.

    Returns:
        The reordered copy, or ``structure`` itself when ``order`` is falsy or
        ``structure`` is neither a dict nor a list.
    """
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    if not isinstance(structure, dict):
        return structure

    children = (
        format_tree_structure(structure['nodes'], order)
        if 'nodes' in structure
        else None
    )
    formatted = {}
    for key in order:
        if key not in structure:
            continue
        if key == 'nodes':
            # Childless nodes carry no 'nodes' key in the output.
            if children:
                formatted[key] = children
        else:
            formatted[key] = structure[key]
    return formatted
|
||||||
|
|
||||||
|
|
||||||
|
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index every node of a tree by its ``'node_id'``.

    Nodes are visited depth-first through their ``'nodes'`` lists. Nodes with
    a missing or falsy ``'node_id'`` are skipped but their children are still
    visited. The mapping values are the original node dicts, not copies.

    Args:
        tree: List of root node dicts.

    Returns:
        Dict from ``node_id`` to node.
    """
    def walk(level):
        for node in level:
            if node.get('node_id'):
                yield node['node_id'], node
            if node.get('nodes'):
                yield from walk(node['nodes'])

    return dict(walk(tree))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Text / JSON extraction ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response.

    If the text contains a ```` ```json ```` fence, only the fenced payload is
    parsed (up to the last closing fence, or to the end of the text when the
    fence is never closed). Parsing is attempted in three increasingly lenient
    passes, so valid JSON is never rewritten:

    1. the payload exactly as written;
    2. with ``None`` replaced by ``null`` and all whitespace runs (including
       newlines) collapsed to single spaces;
    3. additionally with trailing commas before ``]`` or ``}`` removed.

    Passes 2 and 3 are textual repairs and may also touch the inside of string
    values; they only run when stricter parsing has already failed.

    Args:
        content: Raw response text.

    Returns:
        The parsed JSON value, or ``{}`` when nothing could be parsed.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    fence_start = content.find(fence)
    if fence_start != -1:
        payload_start = fence_start + len(fence)
        payload_end = content.rfind("```")
        if payload_end < payload_start:
            # Only the opening fence exists; take everything after it.
            payload_end = len(content)
        raw = content[payload_start:payload_end].strip()
    else:
        raw = content.strip()

    normalized = ' '.join(raw.replace('None', 'null').split())
    without_trailing_commas = re.sub(r',\s*([\]}])', r'\1', normalized)

    for candidate in (raw, normalized, without_trailing_commas):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            # JSONDecodeError is a ValueError; try the next, more lenient form.
            continue
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page_range(pages: str) -> List[int]:
    """Parse a page specification into a sorted list of unique page numbers.

    The specification is a comma-separated list of single pages (``'12'``)
    and inclusive ranges (``'5-7'``). Whitespace around parts is ignored and
    empty parts (for example from a trailing comma) are skipped.

    Args:
        pages: Specification such as ``'5-7'``, ``'3,8'`` or ``'1-3, 9'``.

    Returns:
        Sorted page numbers without duplicates; ``[]`` when no part is given.

    Raises:
        ValueError: If a part is not an integer or a range has start > end.
    """
    collected = set()
    for raw_part in pages.split(','):
        part = raw_part.strip()
        if not part:
            continue
        if '-' in part:
            low_text, _, high_text = part.partition('-')
            start, end = int(low_text.strip()), int(high_text.strip())
            if start > end:
                raise ValueError(f"Invalid range '{part}': start must be <= end")
            collected.update(range(start, end + 1))
        else:
            collected.add(int(part))
    return sorted(collected)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Markdown parsing ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect the ATX headers (``#`` to ``######``) of a markdown document.

    Lines are examined after stripping surrounding whitespace. A line starting
    with three backticks toggles code-fence state, and headers inside a fence
    are ignored. A header needs whitespace between the hashes and the title.

    Args:
        markdown_content: Full markdown text.

    Returns:
        Tuple ``(headers, lines)``: ``headers`` is a list of dicts with
        ``title``, ``level`` (number of hashes) and 1-based ``line_num``;
        ``lines`` is the text split on ``'\\n'``.
    """
    import re

    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False

    for number, raw_line in enumerate(lines, 1):
        text = raw_line.strip()
        if text.startswith('```'):
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = header_re.match(text)
        if found is None:
            continue
        hashes, heading = found.groups()
        headers.append({'title': heading.strip(), 'level': len(hashes), 'line_num': number})

    return headers, lines
|
||||||
|
|
||||||
|
|
||||||
|
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers according to their levels.

    Each header becomes a node with ``title``, a sequential zero-padded
    ``node_id`` (starting at ``'0001'``) and ``line_num``. A header is
    attached to the closest preceding header with a strictly smaller level;
    without one it becomes a root. Childless nodes carry no ``'nodes'`` key.

    Args:
        node_list: Dicts with ``title``, ``level`` and ``line_num`` in
            document order.

    Returns:
        List of root nodes with children nested under ``'nodes'``.
    """
    if not node_list:
        return []

    roots = []
    open_path = []  # (node, level) chain of ancestors for the next header

    for sequence, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(sequence).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }

        # Close every open header at the same or a deeper level.
        while open_path and open_path[-1][1] >= level:
            open_path.pop()

        siblings = open_path[-1][0]['nodes'] if open_path else roots
        siblings.append(entry)
        open_path.append((entry, level))

    def prune(level_nodes):
        for item in level_nodes:
            if item['nodes']:
                prune(item['nodes'])
            else:
                del item['nodes']
        return level_nodes

    return prune(roots)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Pagination / chunking ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group pages into text chunks under a token budget, with page overlap.

    When everything fits in ``max_tokens`` a single chunk is returned.
    Otherwise pages are accumulated until adding the next one would exceed a
    working budget, which is the mean of ``max_tokens`` and the even share of
    the total across the expected number of chunks. Each new chunk starts with
    the last ``overlap_pages`` pages of the previous one.

    A chunk is only closed when it already holds at least one page, so a first
    page that exceeds the working budget on its own no longer produces an
    empty leading chunk.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, aligned with
            ``page_contents``.
        max_tokens: Token limit used to derive the working budget.
        overlap_pages: Number of trailing pages repeated at the start of the
            next chunk.

    Returns:
        List of chunk strings, each the concatenation of its pages.
    """
    import math

    num_tokens = sum(token_lengths)
    if num_tokens <= max_tokens:
        return ["".join(page_contents)]

    expected_parts = math.ceil(num_tokens / max_tokens)
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)

    subsets = []
    current_subset = []
    current_token_count = 0

    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        if current_subset and current_token_count + page_tokens > avg_tokens:
            subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])

        current_subset.append(page_content)
        current_token_count += page_tokens

    if current_subset:
        subsets.append(''.join(current_subset))

    return subsets
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Find the most common offset between physical indices and page numbers.

    For each pair the offset is ``pair['physical_index'] - pair['page']``.
    Pairs that lack a key or hold values that cannot be subtracted are
    skipped. When several offsets are equally frequent, the one seen first
    wins.

    Args:
        pairs: Reference dicts with ``physical_index`` and ``page`` entries.

    Returns:
        The most frequent offset, or ``0`` when no pair is usable.
    """
    tally: Dict[int, int] = {}
    for pair in pairs:
        try:
            delta = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        tally[delta] = tally.get(delta, 0) + 1

    if not tally:
        return 0
    return max(tally, key=tally.get)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Text preprocessing ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.

    Lines are stripped before blank-line runs are collapsed, so lines that
    contain only whitespace count as blank and cannot leave more than one
    empty line behind.

    Args:
        text: Raw text to normalize.

    Returns:
        Text with ``\\n`` line endings, no leading or trailing whitespace on
        any line, at most one consecutive empty line, and no leading or
        trailing whitespace overall.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first so whitespace-only lines become truly empty.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to at most 2.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_stats(text: str) -> dict:
    """Compute basic size statistics for a text.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with ``total_chars`` (length of the string), ``total_lines``
        (number of ``'\\n'``-separated segments, so an empty string counts as
        one line) and ``total_words`` (whitespace-separated tokens).
    """
    line_count = len(text.split('\n'))
    word_count = len(text.split())
    return {
        'total_chars': len(text),
        'total_lines': line_count,
        'total_words': word_count,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── Git URL parsing ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Hostnames accepted by the git URL helpers when the caller passes no known_hosts.
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_git_segment(segment: str) -> str:
    """Clean one path segment of a repository URL.

    A single trailing ``.git`` is removed, then every character outside
    ``[a-zA-Z0-9_-]`` is dropped.

    Args:
        segment: Raw org or repo segment.

    Returns:
        The cleaned segment; it may be empty.
    """
    stem = segment[:-4] if segment.endswith(".git") else segment
    return re.sub(r"[^a-zA-Z0-9_\-]", "", stem)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Extract the ``'org/repo'`` component from a code-hosting URL.

    Accepted forms are SSH shorthand (``git@host:org/repo``) and URLs starting
    with ``http://``, ``https://``, ``git://`` or ``ssh://``. Only the first
    two non-empty path segments are used, each cleaned with
    ``_sanitize_git_segment``.

    Args:
        url: Repository URL in any supported format; surrounding whitespace is
            ignored.
        known_hosts: Accepted hostnames. Defaults to ``_DEFAULT_GIT_HOSTS``.

    Returns:
        ``'org/repo'``, or ``None`` when the scheme is unsupported, the host
        is not accepted, fewer than two path segments exist, or a segment is
        empty after cleaning.
    """
    from urllib.parse import urlparse

    def org_repo(path: str) -> Optional[str]:
        # Shared tail of both URL forms: first two path segments -> 'org/repo'.
        parts = [piece for piece in path.split("/") if piece]
        if len(parts) < 2:
            return None
        org = _sanitize_git_segment(parts[0])
        repo = _sanitize_git_segment(parts[1])
        return f"{org}/{repo}" if org and repo else None

    hosts = _DEFAULT_GIT_HOSTS if known_hosts is None else known_hosts
    url = url.strip()

    if url.startswith("git@"):
        # git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon or host not in hosts:
            return None
        return org_repo(path)

    if url.startswith(("http://", "https://", "git://", "ssh://")):
        parsed = urlparse(url)
        if (parsed.hostname or "") not in hosts:
            return None
        return org_repo(parsed.path)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Report whether ``url`` points at a repository on an accepted host.

    SSH shorthand (``git@host:...``), ``ssh://`` and ``git://`` URLs only
    need an accepted host. ``http://`` and ``https://`` URLs must also have a
    path of exactly ``org/repo`` or ``org/repo/tree/<ref>``; other paths
    (issues, blobs, pull requests, ...) are rejected.

    Args:
        url: URL to check; surrounding whitespace is ignored.
        known_hosts: Accepted hostnames. Defaults to ``_DEFAULT_GIT_HOSTS``.

    Returns:
        ``True`` when the URL qualifies as a repository URL, else ``False``.
    """
    from urllib.parse import urlparse

    hosts = _DEFAULT_GIT_HOSTS if known_hosts is None else known_hosts
    url = url.strip()

    # SSH shorthand: repo-level whenever a ':' is present and the host matches.
    if url.startswith("git@"):
        host, colon, _ = url[len("git@"):].partition(":")
        return bool(colon) and host in hosts

    # ssh:// and git:// are always repo-level when the host matches.
    if url.startswith(("ssh://", "git://")):
        return (urlparse(url).hostname or "") in hosts

    # http(s): the path shape decides.
    if url.startswith(("http://", "https://")):
        parsed = urlparse(url)
        if (parsed.hostname or "") not in hosts:
            return False
        parts = [piece for piece in parsed.path.split("/") if piece]
        return len(parts) == 2 or (len(parts) == 4 and parts[2] == "tree")

    return False
|
||||||
|
|
||||||
|
|
||||||
|
def validate_git_ssh_uri(url: str) -> None:
    """Check that ``url`` has the SSH shorthand shape ``git@host:path``.

    The checks run in order: the ``git@`` prefix, a ``':'`` after it, and a
    non-empty path after the first ``':'``. The host part is not inspected.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: If any check fails; the message names the failed rule and
            includes the offending value.
    """
    prefix = "git@"
    if not url.startswith(prefix):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
    _host, colon, path = url[len(prefix):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Markdown parsing utilities
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Split leading ``---``-delimited frontmatter from a markdown string.

    The frontmatter must start on the first line and be closed by a ``---``
    line followed by a newline. It is parsed with ``yaml.safe_load``; a result
    that is not a mapping is reported as ``None``. If importing or running the
    YAML parser raises, a minimal fallback splits each line containing ``':'``
    at the first colon into a stripped key and value.

    Args:
        content: Raw markdown string, optionally starting with frontmatter.

    Returns:
        Tuple ``(remaining_content, frontmatter)``. When no frontmatter block
        is found, the content is returned unchanged with ``None``.
    """
    found = re.match(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if found is None:
        return content, None

    raw = found.group(1)
    body = content[found.end():]

    try:
        import yaml  # type: ignore
        meta = yaml.safe_load(raw)
        if not isinstance(meta, dict):
            meta = None
    except Exception:
        # Fallback: simple key: value parser (no yaml dependency)
        meta = {
            key.strip(): value.strip()
            for key, _, value in (
                line.partition(':') for line in raw.splitlines() if ':' in line
            )
        }

    return body, meta
|
||||||
|
|
||||||
|
|
||||||
|
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (``#`` to ``######``) outside excluded regions.

    A heading is a line that begins with one to six ``#`` characters, followed
    by whitespace on the same line and a non-empty title. Headings whose first
    character lies inside a triple-backtick block or an HTML comment are
    skipped. Escaped (``\\#``) and indented lines are never headings because
    they do not begin with ``#``.

    The separator between the hashes and the title may not span a newline, so
    a bare ``#`` line cannot adopt the following line as its title.

    Args:
        content: Markdown text to search.

    Returns:
        List of ``(start_pos, end_pos, title, level)`` tuples in document
        order. Positions are character offsets into ``content`` and ``level``
        is the number of ``#`` characters.
    """
    # Spans of fenced code blocks and HTML comments, both possibly multi-line.
    excluded: List[Tuple[int, int]] = [
        (m.start(), m.end())
        for pattern in (r'```.*?```', r'<!--.*?-->')
        for m in re.finditer(pattern, content, re.DOTALL)
    ]

    def is_excluded(pos: int) -> bool:
        return any(start <= pos < end for start, end in excluded)

    results: List[Tuple[int, int, str, int]] = []
    # [^\S\n]+ is "whitespace except newline": keeps the match on one line.
    for m in re.finditer(r'^(#{1,6})[^\S\n]+(.+)$', content, re.MULTILINE):
        if is_excluded(m.start()):
            continue
        results.append((m.start(), m.end(), m.group(2).strip(), len(m.group(1))))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_token_count(content: str) -> int:
    """Estimate the token count of a text without a tokenizer.

    Characters in the ranges U+4E00-U+9FFF, U+3040-U+30FF and U+AC00-U+D7AF
    count as 0.7 tokens each. Every other non-whitespace character counts as
    0.3 tokens. Whitespace is ignored.

    Args:
        content: Text to estimate.

    Returns:
        The weighted sum truncated to an integer.
    """
    cjk_class = r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]'
    # subn removes the CJK characters and reports how many there were.
    remainder, cjk_count = re.subn(cjk_class, '', content)
    other_count = len(re.findall(r'\S', remainder))
    return int(cjk_count * 0.7 + other_count * 0.3)
|
||||||
|
|
||||||
|
|
||||||
|
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split content into parts that respect token and character limits.

    The text is split on blank lines (``'\\n\\n'``) and paragraphs are packed
    greedily: a part is closed as soon as adding the next paragraph would
    exceed either limit. A paragraph that exceeds a limit on its own is cut
    into consecutive slices of ``max_chars`` characters, each emitted as its
    own part. Token counts come from ``estimate_token_count``.

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per packed part.
        max_chars: Maximum characters per packed part, and the slice size for
            oversized paragraphs.

    Returns:
        List of parts; packed paragraphs are rejoined with ``'\\n\\n'``. When
        nothing is produced the original content is returned as the only part.
    """
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0

    for para in content.split('\n\n'):
        para_tokens = estimate_token_count(para)
        para_chars = len(para)

        oversized = para_tokens > max_tokens or para_chars > max_chars
        overflow = (
            pending_tokens + para_tokens > max_tokens
            or pending_chars + para_chars > max_chars
        )

        # Either condition closes the part collected so far.
        if oversized or overflow:
            if pending:
                parts.append('\n\n'.join(pending))
                pending = []
            pending_tokens = 0
            pending_chars = 0

        if oversized:
            # Single paragraph exceeds limits: force-cut it by characters.
            parts.extend(
                para[start:start + max_chars]
                for start in range(0, para_chars, max_chars)
            )
            continue

        pending.append(para)
        pending_tokens += para_tokens
        pending_chars += para_chars

    if pending:
        parts.append('\n\n'.join(pending))
    return parts if parts else [content]
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a string that is safe to use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens; spaces become
    underscores and leading/trailing underscores are trimmed. A result longer
    than ``max_length`` is truncated and ends with ``'_'`` plus the first
    eight hex digits of the SHA-256 of the original text, so different long
    inputs stay distinguishable.

    When ``max_length`` is too small to hold any prefix plus the 9-character
    suffix, only the last ``max_length`` characters of the suffix are
    returned, so the limit is still respected. For a ``max_length`` of zero or
    less nothing can fit and the bare 8-digit hash is returned.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Path-friendly string, or ``'section'`` when nothing usable remains.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')

    if not cleaned:
        return 'section'

    if len(cleaned) <= max_length:
        return cleaned

    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
    keep = max_length - len(suffix)
    if keep <= 0:
        # No room for a readable prefix: return as much of the hash as fits.
        return suffix[-max_length:] if max_length > 0 else suffix[1:]
    return cleaned[:keep] + suffix
|
||||||
@@ -0,0 +1,283 @@
|
|||||||
|
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||||
|
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
from deduplication_result import DeduplicationResult
|
||||||
|
from normalize_entity_name import normalize_entity_name
|
||||||
|
from merge_entity_attributes import merge_entity_attributes
|
||||||
|
|
||||||
|
|
||||||
|
# ── Similitud helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between two strings.

    Insertions, deletions and substitutions each cost 1. The table is computed
    row by row, keeping only the previous row in memory.
    """
    if a == b:
        return 0
    if not a or not b:
        # One side is empty: the distance is the length of the other side.
        return max(len(a), len(b))

    previous = list(range(len(b) + 1))
    for row, char_a in enumerate(a, start=1):
        current = [row]
        for col, char_b in enumerate(b, start=1):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (0 if char_a == char_b else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
    """Return the Jaccard similarity of two token collections.

    Duplicates are ignored. The result is the size of the intersection divided
    by the size of the union, and ``1.0`` when both collections are empty.
    """
    left, right = set(tokens_a), set(tokens_b)
    combined = left | right
    if not combined:
        return 1.0
    return len(left & right) / len(combined)
|
||||||
|
|
||||||
|
|
||||||
|
def _name_similarity(a: str, b: str) -> float:
    """Score how similar two normalized names are, from 0 to 1.

    The base score is the larger of the Levenshtein similarity
    (``1 - distance / longer length``) and the Jaccard similarity of the
    whitespace-separated tokens. Two bonuses of 0.3 each are then added,
    capping at 1.0 after each: one when either name is a substring of the
    other, and one when either name is the acronym of the other's tokens.
    Identical names score exactly 1.0.
    """
    if a == b:
        return 1.0

    # Levenshtein similarity, normalized by the longer name.
    longest = max(len(a), len(b))
    if longest:
        edit_score = 1.0 - (_levenshtein(a, b) / longest)
    else:
        edit_score = 1.0

    # Jaccard similarity over tokens.
    words_a = a.split()
    words_b = b.split()
    score = max(edit_score, _jaccard(words_a, words_b))

    # Containment bonus: one name contains the other.
    if a in b or b in a:
        score = min(1.0, score + 0.3)

    # Acronym bonus: one name is formed by the initials of the other's tokens.
    if _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a):
        score = min(1.0, score + 0.3)

    return score
|
||||||
|
|
||||||
|
|
||||||
|
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
    """Check whether ``candidate`` spells the initials of ``tokens``.

    The comparison ignores case and uses the first character of every
    non-empty token, in order. An empty candidate or token list is never an
    acronym.
    """
    if not (candidate and tokens):
        return False
    initials = "".join(word[0] for word in tokens if word)
    return initials.upper() == candidate.upper()
|
||||||
|
|
||||||
|
|
||||||
|
# Lowercase entity type names for which _is_exact_type returns True.
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_exact_type(entity_type: str) -> bool:
    """Report whether ``entity_type`` is a technical type matched only exactly.

    The type name is lowercased and looked up in ``_EXACT_TYPES``.
    """
    kind = entity_type.lower()
    return kind in _EXACT_TYPES
|
||||||
|
|
||||||
|
|
||||||
|
# ── Union-Find ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class _UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    Every element starts in its own set. ``find`` shortens paths by pointing
    each visited node at its grandparent (path halving). ``union`` attaches
    the root of lower rank under the root of higher rank.
    """

    def __init__(self, n: int) -> None:
        """Create ``n`` singleton sets."""
        self._parent = list(range(n))
        self._rank = [0] * n

    def find(self, x: int) -> int:
        """Return the representative (root) of the set containing ``x``."""
        parent = self._parent
        while parent[x] != x:
            grandparent = parent[parent[x]]
            parent[x] = grandparent
            x = grandparent
        return x

    def union(self, x: int, y: int) -> None:
        """Merge the sets containing ``x`` and ``y``; no-op if already joined."""
        keep, absorb = self.find(x), self.find(y)
        if keep == absorb:
            return
        rank = self._rank
        if rank[keep] < rank[absorb]:
            keep, absorb = absorb, keep
        self._parent[absorb] = keep
        # Equal ranks: the merged tree is one level taller.
        if rank[keep] == rank[absorb]:
            rank[keep] += 1
|
||||||
|
|
||||||
|
|
||||||
|
# ── Implementacion principal ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are normalized, compared pairwise and linked in a Union-Find so
    that matches are transitive. Each resulting cluster yields one
    canonical entity (the member with the highest confidence) carrying the
    merged attributes, chunk indices and original names of all members.

    For technical types (ip, email, domain, crypto_wallet, phone) only an
    exact match of the normalized names is accepted, regardless of
    ``name_threshold``. When ``same_type_only`` is False this applies as
    soon as either side of a pair has a technical type.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: Minimum similarity score (0-1) for two names to be
            treated as the same entity.
        same_type_only: If True, only candidates sharing the same
            ``type_ref`` are compared.

    Returns:
        DeduplicationResult with the deduplicated entities, the resolution
        maps and the merge log.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize every name once.
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]

    # Step 2: pairwise comparison; matches are linked in the Union-Find.
    uf = _UnionFind(n)
    merge_pairs: list[tuple[int, int, float]] = []

    for i in range(n):
        type_i = candidates[i].type_ref
        for j in range(i + 1, n):
            type_j = candidates[j].type_ref
            if same_type_only and type_i != type_j:
                continue

            ni, nj = normalized[i], normalized[j]

            # Technical identifiers never fuzzy-match. Both sides are
            # checked because with same_type_only=False the types may differ.
            if _is_exact_type(type_i) or _is_exact_type(type_j):
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue

            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 3: group candidate indices by their Union-Find root.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Matched pairs per root, used to report the best score in the log.
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))

    # Step 4: build one canonical entity per cluster.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    for root, indices in clusters.items():
        members = [candidates[idx] for idx in indices]

        # The highest-confidence member is canonical (first one wins ties).
        best_idx = max(indices, key=lambda idx: candidates[idx].confidence)
        best = candidates[best_idx]
        canonical_name = best.name
        canonical_norm = normalized[best_idx]

        if len(members) == 1:
            merged_attrs = best.attributes
            merged_confidence = best.confidence
            merged_chunks = list(best.source_chunk_indices)
            merged_from = list(best.merged_from) if best.merged_from else [best.name]
        else:
            merged_attrs = merge_entity_attributes([c.attributes for c in members])
            merged_confidence = max(c.confidence for c in members)

            # Order-preserving union of chunk indices and original names.
            merged_chunks = list(
                dict.fromkeys(
                    chunk_idx
                    for c in members
                    for chunk_idx in c.source_chunk_indices
                )
            )
            merged_from = list(
                dict.fromkeys(
                    nm for c in members for nm in (c.merged_from or [c.name])
                )
            )

            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": [c.name for c in members if c is not best],
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        merged_entities.append(
            EntityCandidate(
                name=canonical_name,
                name_normalized=canonical_norm,
                # Type comes from the canonical member so that it agrees
                # with the canonical name and its normalization.
                type_ref=best.type_ref,
                type_label=best.type_label,
                attributes=merged_attrs,
                confidence=merged_confidence,
                source_chunk_indices=merged_chunks,
                merged_from=merged_from,
            )
        )

        # Populate the resolution maps.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# --- Importar levenshtein_distance desde cybersecurity ---
|
||||||
|
# Soporta dos contextos:
|
||||||
|
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
|
||||||
|
# 2. Ejecutado desde la raiz del registry (fn run)
|
||||||
|
def _levenshtein_distance(a: str, b: str) -> int:
|
||||||
|
"""Calcula la distancia de edicion de Levenshtein entre dos strings."""
|
||||||
|
if len(a) < len(b):
|
||||||
|
return _levenshtein_distance(b, a)
|
||||||
|
if len(b) == 0:
|
||||||
|
return len(a)
|
||||||
|
prev_row = list(range(len(b) + 1))
|
||||||
|
for i, ca in enumerate(a):
|
||||||
|
curr_row = [i + 1]
|
||||||
|
for j, cb in enumerate(b):
|
||||||
|
cost = 0 if ca == cb else 1
|
||||||
|
curr_row.append(
|
||||||
|
min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
|
||||||
|
)
|
||||||
|
prev_row = curr_row
|
||||||
|
return prev_row[-1]
|
||||||
|
|
||||||
|
|
||||||
|
# Try to import a shared levenshtein_distance from the sibling
# "cybersecurity" location; if that import fails, _lev stays None and the
# local pure-Python implementation is used instead.
try:
    _here = os.path.dirname(os.path.abspath(__file__))
    # NOTE(review): this puts ../cybersecurity itself on sys.path, while the
    # import below looks up a top-level name "cybersecurity" — confirm which
    # directory layout is intended.
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    from cybersecurity import levenshtein_distance as _lev
except ImportError:
    _lev = None  # type: ignore

# Name used by the rest of this module: the imported object when the import
# succeeded, otherwise the local fallback.
levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
|
||||||
|
|
||||||
|
|
||||||
|
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
|
||||||
|
"""Intenta resolver un nombre contra las claves del mapa por fuzzy match.
|
||||||
|
|
||||||
|
Recorre todas las claves de entity_id_map y busca la mas cercana segun
|
||||||
|
distancia de Levenshtein. Retorna el entity_id si la distancia es <=
|
||||||
|
threshold, o '' si no hay match aceptable.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: nombre a resolver (ya en lowercase strip).
|
||||||
|
entity_id_map: mapa nombre_normalizado -> entity_id.
|
||||||
|
threshold: distancia maxima de edicion para considerar match (default 3).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
entity_id del mejor match o '' si no hay match.
|
||||||
|
"""
|
||||||
|
best_id = ""
|
||||||
|
best_dist = threshold + 1
|
||||||
|
for key, entity_id in entity_id_map.items():
|
||||||
|
dist = levenshtein_distance(name, key)
|
||||||
|
if dist < best_dist:
|
||||||
|
best_dist = dist
|
||||||
|
best_id = entity_id
|
||||||
|
return best_id if best_dist <= threshold else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_entity_id(
    name: str,
    entity_id_map: dict[str, str],
    folded_map: dict[str, str],
    cache: dict[str, str],
) -> str:
    """Resolve one relation endpoint name to an entity id ('' if unresolved)."""
    key = name.lower().strip()
    if key in cache:
        return cache[key]

    # Exact lookup first (original keys, then case-folded keys), then fuzzy.
    entity_id = entity_id_map.get(key, "") or folded_map.get(key, "")
    if not entity_id:
        entity_id = _fuzzy_resolve(key, folded_map)

    cache[key] = entity_id
    return entity_id


def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Deduplicate candidate relations, resolving names to final entity ids.

    Algorithm:
        1. For each RelationCandidate, resolve from_name and to_name to an
           entity id via ``entity_id_map``: exact lookup first (ignoring
           case and surrounding whitespace on both the name and the map
           keys), then fuzzy match via ``_fuzzy_resolve``. Relations with
           an unresolved endpoint are dropped with a warning.
        2. Drop self-loops (from_id == to_id).
        3. Collapse duplicates sharing (from_id, to_id, relation_type):
           - description: unique descriptions joined with '; '
           - confidence: maximum of the group
        4. Return the resulting RelationCandidate list with from_id and
           to_id filled in.

    Args:
        relations: RelationCandidate objects with the original
            from_name/to_name.
        entity_id_map: Mapping of normalized name -> entity id (output of
            deduplicate_entities).

    Returns:
        Deduplicated list of RelationCandidate with from_id and to_id set.

    Raises:
        ImportError: If the ``relation_candidate`` module cannot be imported.
    """
    # Make the types directory importable, then load the type. An
    # ImportError propagates to the caller.
    types_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "..", "..", "python", "types", "datascience",
    )
    if types_path not in sys.path:
        sys.path.insert(0, types_path)
    from relation_candidate import RelationCandidate

    # Case-folded view of the map so lookups really ignore case; the first
    # key wins if several keys fold to the same string.
    folded_map: dict[str, str] = {}
    for map_key, entity_id in entity_id_map.items():
        folded_map.setdefault(map_key.lower().strip(), entity_id)

    # Names repeat across relations; resolve each distinct name only once.
    cache: dict[str, str] = {}

    resolved: list = []
    for rel in relations:
        from_id = _resolve_entity_id(rel.from_name, entity_id_map, folded_map, cache)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue

        to_id = _resolve_entity_id(rel.to_name, entity_id_map, folded_map, cache)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue

        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue

        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )

    # Group by (from_id, to_id, relation_type), keeping first-seen order.
    groups: dict[tuple, list] = {}
    for rel in resolved:
        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)

    result: list = []
    for (from_id, to_id, rel_type), group in groups.items():
        if len(group) == 1:
            result.append(group[0])
            continue

        # Merge: max confidence + order-preserving unique descriptions.
        descriptions = list(
            dict.fromkeys(r.description for r in group if r.description)
        )
        first = group[0]
        result.append(
            RelationCandidate(
                from_name=first.from_name,
                to_name=first.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(descriptions),
                confidence=max(r.confidence for r in group),
                source_chunk_index=first.source_chunk_index,
            )
        )

    return result
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DeduplicationResult:
    """Result of the entity deduplication process.

    `name_to_id` maps ALL original names (including the merged ones) to
    their final ID, which makes it possible to resolve relations that use
    any variant of a name.
    """

    # Deduplicated entities.
    entities: list[EntityCandidate]
    # Name -> entity id map (presumably keyed by normalized canonical name;
    # confirm against the producer).
    entity_id_map: dict[str, str]
    # Original name -> final entity id.
    name_to_id: dict[str, str]
    # One dict per merge performed.
    merge_log: list[dict] = field(default_factory=list)
    # Entity counts before and after deduplication.
    total_before: int = 0
    total_after: int = 0
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
"""EntityCandidate — candidato de entidad extraido por el LLM."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EntityCandidate:
    """Entity candidate extracted by the LLM.

    It may come from a single chunk or be the result of merging several
    extractions. `merged_from` keeps the original names for debugging.
    """

    name: str
    name_normalized: str = ""
    type_ref: str = ""
    type_label: str = ""
    attributes: dict = field(default_factory=dict)
    confidence: float = 0.0
    source_chunk_indices: list[int] = field(default_factory=list)
    merged_from: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the candidate's fields as a plain dictionary."""
        keys = (
            "name",
            "name_normalized",
            "type_ref",
            "type_label",
            "attributes",
            "confidence",
            "source_chunk_indices",
            "merged_from",
        )
        return {key: getattr(self, key) for key in keys}
|
||||||
@@ -0,0 +1,145 @@
|
|||||||
|
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||||
|
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
|
||||||
|
|
||||||
|
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
|
||||||
|
"""Construye el system prompt para extraccion de entidades."""
|
||||||
|
lines = [
|
||||||
|
"You are an entity extraction expert. Given text, extract all entities",
|
||||||
|
"matching these types. For each entity, provide: name, type_ref,",
|
||||||
|
"attributes (matching the metadata_fields for that type), and a",
|
||||||
|
"confidence score (0.0-1.0).",
|
||||||
|
"",
|
||||||
|
"Entity types:",
|
||||||
|
]
|
||||||
|
|
||||||
|
for schema_entry in entity_schema:
|
||||||
|
label = schema_entry.get("label", "Unknown")
|
||||||
|
type_ref = schema_entry.get("type_ref", "")
|
||||||
|
metadata_fields = schema_entry.get("metadata_fields", [])
|
||||||
|
lines.append(f"- {label} (type_ref: {type_ref})")
|
||||||
|
if metadata_fields:
|
||||||
|
lines.append(f" fields: {', '.join(metadata_fields)}")
|
||||||
|
|
||||||
|
lines += [
|
||||||
|
"",
|
||||||
|
'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
|
||||||
|
"",
|
||||||
|
"Rules:",
|
||||||
|
"- Only extract entities explicitly mentioned in the text",
|
||||||
|
"- Use the exact type_ref from the schema",
|
||||||
|
"- Leave unknown attributes as null",
|
||||||
|
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
|
||||||
|
f"- {language_instruction}",
|
||||||
|
]
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity-type schema, calls the LLM and
    validates the response, returning a list of EntityCandidate. Malformed
    items (non-dict items, missing/blank/non-string names, type_refs not in
    the schema) are skipped instead of raising.

    Args:
        text: Text chunk to analyze.
        entity_schema: List of types with metadata fields. Each entry is a
            dict with the keys 'type_ref', 'label' and optionally
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: Function that receives a list of OpenAI-style
            messages and returns a dict with the LLM's JSON response:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: Language instruction for the LLM. Defaults to
            "Respond in English."

    Returns:
        List of extracted EntityCandidate. Empty if the LLM call fails, the
        response is not a dict with an 'entities' list, or no entity is
        found.

    Raises:
        ValueError: If entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # type_ref -> label; its keys double as the set of valid type_refs.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    messages = [
        {
            "role": "system",
            "content": _build_system_prompt(entity_schema, language_instruction),
        },
        {"role": "user", "content": text},
    ]

    # Deliberately broad: any provider failure degrades to "no entities".
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # The LLM may return something other than a JSON object.
    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        name = item.get("name", "")
        if not isinstance(name, str) or not name.strip():
            continue

        type_ref = item.get("type_ref", "")
        # The isinstance check also protects the lookup from unhashable values.
        if not isinstance(type_ref, str) or type_ref not in type_ref_to_label:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the LLM left as null.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        # Non-numeric or NaN confidence counts as 0.0 (NaN != NaN).
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates
|
||||||
@@ -0,0 +1,141 @@
|
|||||||
|
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
|
||||||
|
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
from relation_candidate import RelationCandidate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between entities of a text chunk using an LLM.

    Given the original text and the entities already extracted from it,
    asks the LLM for relations between pairs of entities. Relations whose
    from_name or to_name does not match an existing entity name are
    dropped. Relation types outside ``relation_types`` are replaced by
    "related_to". Malformed items never raise; they are skipped.

    Args:
        text: Text chunk (the same one used to extract the entities).
        entities: Entities already extracted from the chunk.
        relation_types: Allowed relation types, e.g. ["funds", "employs",
            "communicates_with", "owns", "related_to"].
        llm_chat_json: Injected function that receives a list of messages
            (dicts with "role" and "content") and returns a dict with the
            LLM's JSON response.
        language_instruction: Language instruction for the LLM.

    Returns:
        List of validated RelationCandidate. Empty if there are fewer than
        2 entities, the LLM call fails, the response is malformed, or no
        relation is found.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list and allowed types as shown to the LLM.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # Deliberately broad: any provider failure degrades to "no relations".
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # The LLM may return something other than a JSON object.
    raw_relations = response.get("relations", []) if isinstance(response, dict) else None
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both endpoints must be known entity names. The isinstance checks
        # also protect the set lookups from unhashable values.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        confidence = item.get("confidence", 0.0)
        # Non-numeric or NaN confidence counts as 0.0 (NaN != NaN).
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        # A null or non-string description becomes "".
        description = item.get("description", "")
        if not isinstance(description, str):
            description = ""

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=confidence,
            )
        )

    return results
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
"""Extract plain text from PDF, Markdown, or TXT files."""
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_encoding(data: bytes) -> str:
|
||||||
|
"""Detect encoding of raw bytes using multiple fallback strategies."""
|
||||||
|
# Strategy 1: UTF-8
|
||||||
|
try:
|
||||||
|
data.decode("utf-8")
|
||||||
|
return "utf-8"
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Strategy 2: charset_normalizer
|
||||||
|
try:
|
||||||
|
from charset_normalizer import from_bytes
|
||||||
|
|
||||||
|
result = from_bytes(data).best()
|
||||||
|
if result is not None and result.encoding:
|
||||||
|
return result.encoding
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Strategy 3: chardet
|
||||||
|
try:
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
detected = chardet.detect(data)
|
||||||
|
if detected and detected.get("encoding"):
|
||||||
|
return detected["encoding"]
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Last resort: UTF-8 with replacement
|
||||||
|
return "utf-8"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract the text of each page,
    joining pages with double newlines; the document is closed once the
    pages have been read. For text-based files (.md, .markdown, .txt) the
    raw bytes are decoded with automatic encoding detection, falling back
    to UTF-8 with replacement characters if decoding fails.

    Args:
        file_path: Absolute or relative path to the file.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Extension matching is case-insensitive.
    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        # The context manager closes the document (and its file handle)
        # even if text extraction raises.
        with fitz.open(file_path) as doc:
            pages = [page.get_text() for page in doc]
        return "\n\n".join(pages)

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: never fail on decoding.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
|
||||||
@@ -0,0 +1,208 @@
|
|||||||
|
"""Pipeline de extraccion de entidades y relaciones desde un documento."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import warnings
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
|
||||||
|
|
||||||
|
from extract_text_from_file import extract_text_from_file
|
||||||
|
from core_functions import preprocess_text
|
||||||
|
from split_text_into_chunks import split_text_into_chunks
|
||||||
|
from build_entity_schema_prompt import build_entity_schema_prompt
|
||||||
|
from build_relation_schema_prompt import build_relation_schema_prompt
|
||||||
|
from extract_entities_llm import extract_entities_llm
|
||||||
|
from extract_relations_llm import extract_relations_llm
|
||||||
|
from deduplicate_entities import deduplicate_entities
|
||||||
|
from deduplicate_relations import deduplicate_relations
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
from extraction_result import ExtractionResult
|
||||||
|
from extraction_stats import ExtractionStats
|
||||||
|
|
||||||
|
|
||||||
|
def extraction_pipeline(
    file_path: str,
    entity_presets: list[dict],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    confidence_threshold: float = 0.5,
    dedup_threshold: float = 0.85,
    on_progress: Callable[[str, float], None] | None = None,
) -> ExtractionResult:
    """Run the full entity and relation extraction pipeline over a document.

    Orchestrates extract_text_from_file -> preprocess_text ->
    split_text_into_chunks -> extract_entities_llm per chunk ->
    deduplicate_entities -> extract_relations_llm per chunk ->
    deduplicate_relations.

    Failures while extracting the text, or in any per-chunk LLM call, are
    reported through ``warnings.warn`` and treated as empty output instead of
    aborting the run. Exceptions raised by ``on_progress`` are swallowed.

    Args:
        file_path: Path of the file to process (PDF, Markdown or TXT according
            to the original notes).
        entity_presets: List of dicts with type_ref, label and metadata_fields.
            Example: [{"type_ref": "osint_person_go_cybersecurity",
                       "label": "Person",
                       "metadata_fields": ["full_name", "nationality"]}]
        relation_types: Relation types allowed during extraction.
            Example: ["funds", "employs", "communicates_with", "owns"]
        llm_chat_json: Injected function that receives OpenAI-style messages
            and returns the already parsed JSON response as a dict, so the
            pipeline is not coupled to any provider.
        chunk_size: Chunk size forwarded to split_text_into_chunks (default 500).
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            split_text_into_chunks (default 50).
        confidence_threshold: Minimum confidence a candidate entity needs to be
            kept before deduplication (default 0.5).
        dedup_threshold: Value forwarded to deduplicate_entities as
            ``name_threshold`` (default 0.85).
        on_progress: Optional progress callback ``(message, pct)`` with pct in
            0-1. Entity extraction reports 0-0.4, entity deduplication 0.4,
            relation extraction 0.4-0.8, relation deduplication 0.8 and
            completion 1.0.

    Returns:
        ExtractionResult with the deduplicated entities and relations plus the
        statistics of the run. When the document yields no chunks the result
        has empty lists.

    Raises:
        FileNotFoundError: If file_path does not exist.
        ValueError: If entity_presets is empty.
    """
    if not entity_presets:
        raise ValueError("entity_presets no puede estar vacio")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")

    def _progress(msg: str, pct: float) -> None:
        # Best-effort notification: a failing callback must not break the run.
        if on_progress is not None:
            try:
                on_progress(msg, pct)
            except Exception:
                pass

    start_time = time.monotonic()
    stats = ExtractionStats()

    # ── Step 1: Extract text ───────────────────────────────────────────────────
    _progress("Extracting text from file...", 0.0)
    try:
        raw_text = extract_text_from_file(file_path)
    except Exception as exc:
        # Any extraction failure degrades to an empty document.
        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
        raw_text = ""

    # ── Step 2: Preprocess ─────────────────────────────────────────────────────
    clean_text = preprocess_text(raw_text)
    stats.total_chars = len(clean_text)

    # ── Step 3: Split into chunks ──────────────────────────────────────────────
    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
    n = len(chunks)
    stats.total_chunks = n

    if n == 0:
        # Nothing to process: return early with empty results.
        stats.processing_time_seconds = time.monotonic() - start_time
        return ExtractionResult(entities=[], relations=[], stats=stats)

    # ── Step 4: Extract entities per chunk ─────────────────────────────────────
    all_raw_entities: list[EntityCandidate] = []

    for i, chunk in enumerate(chunks):
        _progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
        try:
            candidates = extract_entities_llm(
                text=chunk,
                entity_schema=entity_presets,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
            )
            candidates = []

        for candidate in candidates:
            # Record the chunk of origin (mutates the candidate in place).
            if i not in candidate.source_chunk_indices:
                candidate.source_chunk_indices.append(i)
            all_raw_entities.append(candidate)

    # ── Step 5: Filter by confidence ───────────────────────────────────────────
    filtered_entities = [
        e for e in all_raw_entities if e.confidence >= confidence_threshold
    ]
    # Note: the "raw" count is taken after the confidence filter.
    stats.raw_entities_count = len(filtered_entities)

    # Update the per-type entity counters
    for ent in filtered_entities:
        stats.entity_types_found[ent.type_ref] = (
            stats.entity_types_found.get(ent.type_ref, 0) + 1
        )

    # ── Step 6: Deduplicate entities ───────────────────────────────────────────
    _progress("Deduplicating entities...", 0.4)
    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)

    stats.final_entities_count = dedup_result.total_after
    stats.entities_merged = dedup_result.total_before - dedup_result.total_after

    final_entities = dedup_result.entities
    entity_id_map = dedup_result.name_to_id  # original_name -> entity_id (per the original author's note)

    # ── Step 7: Extract relations per chunk ────────────────────────────────────
    all_raw_relations = []

    for i, chunk in enumerate(chunks):
        _progress(f"Extracting relations...", 0.4 + (i / n) * 0.4)

        # Entities whose source chunks include this one
        chunk_entities = [
            e for e in final_entities if i in e.source_chunk_indices
        ]
        # If this chunk has no entities of its own, fall back to all of them
        if not chunk_entities:
            chunk_entities = final_entities

        # Fewer than two entities: skip the LLM call for this chunk.
        if len(chunk_entities) < 2:
            continue

        try:
            chunk_relations = extract_relations_llm(
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
            )
            chunk_relations = []

        for rel in chunk_relations:
            rel.source_chunk_index = i
        all_raw_relations.extend(chunk_relations)

    stats.raw_relations_count = len(all_raw_relations)

    # Update the per-type relation counters
    for rel in all_raw_relations:
        stats.relation_types_found[rel.relation_type] = (
            stats.relation_types_found.get(rel.relation_type, 0) + 1
        )

    # ── Step 8: Deduplicate relations ──────────────────────────────────────────
    _progress("Deduplicating relations...", 0.8)
    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)

    stats.final_relations_count = len(final_relations)
    # Difference between raw and final counts; covers everything dropped by
    # deduplicate_relations, whatever the reason.
    stats.relations_merged = stats.raw_relations_count - len(final_relations)
    stats.processing_time_seconds = time.monotonic() - start_time

    _progress("Done", 1.0)

    return ExtractionResult(
        entities=final_entities,
        relations=final_relations,
        stats=stats,
    )
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
"""ExtractionResult — resultado final del pipeline de extraccion."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from entity_candidate import EntityCandidate
|
||||||
|
from extraction_stats import ExtractionStats
|
||||||
|
from relation_candidate import RelationCandidate
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionResult:
    """Final result of the entity and relation extraction pipeline.

    Bundles the entity and relation lists returned by the pipeline
    (deduplicated, per the original notes) with the statistics of the
    whole run.
    """

    # Entities produced by the run.
    entities: list[EntityCandidate]
    # Relations produced by the run.
    relations: list[RelationCandidate]
    # Run statistics; each instance gets its own fresh ExtractionStats by default.
    stats: ExtractionStats = field(default_factory=ExtractionStats)
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
"""ExtractionStats — estadisticas del proceso de extraccion."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionStats:
    """Statistics of an extraction run.

    Intended for reporting and debugging. Holds counts before and after
    deduplication, the processing time and the distribution of the types
    that were found. Every field has a zero/empty default.
    """

    # Number of chunks and characters of the processed text.
    total_chunks: int = 0
    total_chars: int = 0
    # Entity counts before and after deduplication, and how many were merged.
    raw_entities_count: int = 0
    final_entities_count: int = 0
    entities_merged: int = 0
    # Relation counts before and after deduplication, and how many were merged.
    raw_relations_count: int = 0
    final_relations_count: int = 0
    relations_merged: int = 0
    # NOTE(review): presumably relations dropped for reasons other than
    # merging — confirm against the code that sets it.
    relations_discarded: int = 0
    # Occurrence counters keyed by type name; default_factory gives every
    # instance its own dict.
    entity_types_found: dict[str, int] = field(default_factory=dict)
    relation_types_found: dict[str, int] = field(default_factory=dict)
    # Duration of the run in seconds.
    processing_time_seconds: float = 0.0
|
||||||
@@ -0,0 +1,78 @@
|
|||||||
|
"""Combina atributos de multiples candidatos de la misma entidad."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Field names that select a specific conflict-resolution rule.
_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
_DATE_MIN_FIELDS = {"first_seen", "created_date"}
_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
_BOOL_FIELDS = {"verified", "exploited"}


def _union_preserving_order(values: list) -> list:
    """Flatten *values* into one list without duplicates, in order of appearance."""
    union: list = []
    for value in values:
        # A scalar mixed in with lists counts as a single item; iterating it
        # would split strings into characters or fail on numbers.
        items = value if isinstance(value, list) else [value]
        for item in items:
            # Linear membership test on purpose: items may be unhashable.
            if item not in union:
                union.append(item)
    return union


def merge_entity_attributes(attr_list: list[dict]) -> dict:
    """Merge the attributes of several candidates of the same entity.

    For every field present in any candidate, the non-null values are
    collected and conflicts are resolved by field kind:
      - Numeric (risk_score, balance, cvss): max
      - Earliest date (first_seen, created_date): min
      - Latest date (last_seen, expires_date): max
      - Boolean (verified, exploited): logical OR
      - List (any candidate holds a list): union without duplicates, in
        order of appearance; scalar values count as single items
      - Anything else: the value with the longest string representation,
        returned with its original type (first one wins on ties)

    A field whose values are all None maps to None. Keys appear in the
    result in order of first appearance across the candidates.

    Args:
        attr_list: List of dicts with the attributes of each candidate.

    Returns:
        Dict with the merged attributes; empty if attr_list is empty.
    """
    if not attr_list:
        return {}

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # output order is deterministic (a set would not be).
    all_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)

    merged: dict = {}

    for key in all_keys:
        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]

        if not values:
            merged[key] = None
            continue

        first = values[0]
        # No conflict: a single value, or every candidate agrees.
        if len(values) == 1 or all(v == first for v in values):
            merged[key] = first
            continue

        if key in _NUMERIC_FIELDS or key in _DATE_MAX_FIELDS:
            merged[key] = max(values)
        elif key in _DATE_MIN_FIELDS:
            merged[key] = min(values)
        elif key in _BOOL_FIELDS:
            merged[key] = any(values)
        elif any(isinstance(v, list) for v in values):
            merged[key] = _union_preserving_order(values)
        else:
            # Rank by string length but keep the original object, so numbers
            # and other non-string values are not silently stringified.
            merged[key] = max(values, key=lambda v: len(str(v)))

    return merged
|
||||||
@@ -0,0 +1,81 @@
|
|||||||
|
"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# Honorific or rank at the very start of a name, with optional trailing dot.
_TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
)

# Legal form or generic corporate word at the very end of a name.
_LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
)

_MULTI_SPACE = re.compile(r"\s+")


def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    The rule depends on the entity type (matched case-insensitively):
      - ip / email: lower-cased
      - domain: lower-cased, trailing dots and a leading "www." removed
      - crypto_wallet: only stripped, because addresses can be case-sensitive
      - phone: only digits and "+" are kept
      - person: one leading title removed, "Last, First" reordered to
        "First Last", whitespace collapsed, title-cased
      - organization: one trailing legal suffix (and a comma left dangling
        before it) removed, whitespace collapsed, title-cased; a name that
        consists only of a suffix is kept as is instead of becoming empty
      - anything else: lower-cased with whitespace collapsed

    Args:
        name: Entity name to normalize.
        entity_type: Entity type (ip, email, domain, crypto_wallet, phone,
            person, organization). Empty selects the default rule.

    Returns:
        The normalized name.
    """
    name = name.strip()
    et = entity_type.lower().strip()

    if et in ("ip", "email"):
        return name.lower()

    if et == "domain":
        result = name.lower().rstrip(".")
        if result.startswith("www."):
            result = result[4:]
        return result

    if et == "crypto_wallet":
        # Bitcoin addresses are case-sensitive, so only surrounding space goes.
        return name

    if et == "phone":
        # Keep only digits and the plus sign.
        return re.sub(r"[^\d+]", "", name)

    if et == "person":
        result = _TITLES.sub("", name).strip()
        if "," in result:
            # "Last, First" -> "First Last"
            last, first = (part.strip() for part in result.split(",", 1))
            # The title may follow the comma ("Smith, Dr. John").
            first = _TITLES.sub("", first).strip()
            result = f"{first} {last}"
        return _MULTI_SPACE.sub(" ", result).strip().title()

    if et == "organization":
        # "Acme, Inc." must not leave a dangling comma behind.
        stripped = _LEGAL_SUFFIXES.sub("", name).strip().rstrip(",").strip()
        # A name made only of a suffix word ("Group") must not collapse to "".
        result = stripped or name
        return _MULTI_SPACE.sub(" ", result).strip().title()

    # Default: lower-case and collapse whitespace.
    return _MULTI_SPACE.sub(" ", name.lower()).strip()
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
"""RelationCandidate — candidato de relacion extraido por el LLM."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RelationCandidate:
    """Candidate relation between two entities, as extracted by the LLM.

    `from_name` and `to_name` carry the raw names found in the text.
    `from_id` and `to_id` start empty and are expected to be filled in
    during the deduplication phase, once the names are resolved against
    the final EntityCandidate objects.
    """

    from_name: str
    to_name: str
    from_id: str = ""
    to_id: str = ""
    relation_type: str = ""
    description: str = ""
    confidence: float = 0.0
    source_chunk_index: int = -1

    def to_dict(self) -> dict:
        """Return the candidate's fields as a plain dictionary."""
        exported = (
            "from_name",
            "to_name",
            "from_id",
            "to_id",
            "relation_type",
            "description",
            "confidence",
            "source_chunk_index",
        )
        return {attr: getattr(self, attr) for attr in exported}
|
||||||
@@ -0,0 +1,234 @@
|
|||||||
|
"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Standalone page template. It is filled with str.format, so every literal
# CSS/JS brace is doubled; the only placeholders are {title} and {json_data}.
_HTML_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>{title}</title>
<script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
<style>
  * {{ box-sizing: border-box; margin: 0; padding: 0; }}
  body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
  #container {{ width: 100vw; height: 100vh; }}
  #panel {{
    position: absolute; top: 12px; right: 12px;
    background: rgba(10, 10, 30, 0.88);
    border: 1px solid rgba(255,255,255,0.12);
    padding: 16px; border-radius: 10px;
    z-index: 10; min-width: 200px; max-width: 260px;
    backdrop-filter: blur(6px);
  }}
  #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
  #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
  #filters {{ display: flex; flex-direction: column; gap: 6px; }}
  .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
  .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
  .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
  #tooltip {{
    position: absolute; display: none;
    background: rgba(5, 5, 20, 0.95);
    border: 1px solid rgba(255,255,255,0.15);
    padding: 10px 14px; border-radius: 8px;
    pointer-events: none; z-index: 20;
    max-width: 300px; font-size: 12px; line-height: 1.6;
  }}
  #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
  #tooltip .tt-row {{ display: flex; gap: 6px; }}
  #tooltip .tt-key {{ color: #888; min-width: 80px; }}
  #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
</style>
</head>
<body>
<div id="container"></div>
<div id="panel">
  <h3>{title}</h3>
  <div id="stats"></div>
  <div id="filters"></div>
</div>
<div id="tooltip"></div>

<script>
(function () {{
  const graphData = {json_data};

  // ── Build graphology graph ──────────────────────────────────────────────
  const Graph = graphology.Graph || graphology;
  const g = new Graph({{ multi: true, type: 'directed' }});

  // Assign random initial positions
  graphData.nodes.forEach(function (n) {{
    g.addNode(n.key, Object.assign({{
      x: (Math.random() - 0.5) * 10,
      y: (Math.random() - 0.5) * 10,
    }}, n.attributes));
  }});

  graphData.edges.forEach(function (e) {{
    try {{
      g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
    }} catch (err) {{
      // skip duplicate edge keys gracefully
    }}
  }});

  // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
  const FA2 = graphologyLibrary.layoutForceAtlas2;
  FA2.assign(g, {{
    iterations: 500,
    settings: {{
      gravity: 1,
      scalingRatio: 2,
      slowDown: 5,
      barnesHutOptimize: g.order > 300,
    }},
  }});

  // ── Sigma renderer ──────────────────────────────────────────────────────
  const renderer = new Sigma(g, document.getElementById('container'), {{
    renderEdgeLabels: false,
    defaultEdgeColor: '#444',
    defaultNodeColor: '#95a5a6',
    labelColor: {{ color: '#ccc' }},
    labelSize: 11,
    edgeReducer: function (edge, data) {{
      return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
    }},
  }});

  // ── Stats panel ─────────────────────────────────────────────────────────
  document.getElementById('stats').textContent =
    graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';

  // ── Filter panel by node type ───────────────────────────────────────────
  const typeColors = {{}};
  graphData.nodes.forEach(function (n) {{
    // Nodes may come without attributes; treat them as type 'unknown'.
    const attrs = n.attributes || {{}};
    const t = attrs.entity_type || 'unknown';
    typeColors[t] = attrs.color || '#95a5a6';
  }});

  const hiddenTypes = new Set();
  const filtersDiv = document.getElementById('filters');

  Object.keys(typeColors).sort().forEach(function (type) {{
    const color = typeColors[type];
    const label = document.createElement('label');
    label.className = 'filter-item';

    const cb = document.createElement('input');
    cb.type = 'checkbox';
    cb.checked = true;
    cb.addEventListener('change', function () {{
      if (cb.checked) hiddenTypes.delete(type);
      else hiddenTypes.add(type);
      renderer.refresh();
    }});

    const dot = document.createElement('span');
    dot.className = 'color-dot';
    dot.style.background = color;

    label.appendChild(cb);
    label.appendChild(dot);
    label.appendChild(document.createTextNode(type));
    filtersDiv.appendChild(label);
  }});

  // Node reducer applies type filter. Uses the same 'unknown' fallback as the
  // filter panel so untyped nodes can be hidden as well.
  renderer.setSetting('nodeReducer', function (node, data) {{
    if (hiddenTypes.has(data.entity_type || 'unknown')) return Object.assign({{}}, data, {{ hidden: true }});
    return data;
  }});

  // ── Tooltip on hover ────────────────────────────────────────────────────
  const tooltip = document.getElementById('tooltip');

  renderer.on('enterNode', function (ref) {{
    const nodeAttrs = g.getNodeAttributes(ref.node);
    const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);

    let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
    html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';

    Object.keys(nodeAttrs).sort().forEach(function (k) {{
      if (!reserved.has(k) && !['status', 'domain', 'type', 'label'].includes(k)) {{
        html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
      }}
    }});

    tooltip.innerHTML = html;
    tooltip.style.display = 'block';
  }});

  renderer.on('leaveNode', function () {{
    tooltip.style.display = 'none';
  }});

  document.getElementById('container').addEventListener('mousemove', function (e) {{
    tooltip.style.left = (e.clientX + 16) + 'px';
    tooltip.style.top = (e.clientY + 16) + 'px';
  }});

  // Escapes text before it is injected through innerHTML.
  function escHtml(str) {{
    return String(str)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  }}
}})();
</script>
</body>
</html>
"""


def render_sigma_html(
    graph_data: dict,
    output_path: str,
    title: str = "OSINT Graph",
) -> str:
    """Write a standalone sigma.js HTML page that visualizes a graph.

    The graph is embedded in the page as JSON. In the browser the page runs
    a synchronous ForceAtlas2 layout (500 iterations) and renders with
    sigma.js v2.4, using a dark theme, a filter panel per node type and a
    hover tooltip with the node metadata. The title and the embedded JSON
    are escaped so that graph content cannot inject markup or terminate the
    inline script.

    Args:
        graph_data: Dict with 'nodes' and 'edges' in graphology/sigma format
            (the original notes name ops_to_sigma_json as the producer).
        output_path: Path of the HTML file to write; missing parent
            directories are created.
        title: Graph title shown in the panel and the browser tab.

    Returns:
        Absolute path of the HTML file that was written.

    Raises:
        Exception: If the file cannot be written to output_path.
        OSError: If the parent directory cannot be created.
    """
    # Local import: the module itself only imports json and os.
    from html import escape

    # "<" can only occur inside JSON strings, where \\u003c is an equivalent
    # spelling. Escaping it keeps "</script>" or "<!--" in the graph data from
    # ending or corrupting the inline <script> element.
    json_data = json.dumps(graph_data, ensure_ascii=False).replace("<", "\\u003c")

    page = _HTML_TEMPLATE.format(
        title=escape(title),
        json_data=json_data,
    )

    abs_path = os.path.abspath(output_path)
    os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)

    try:
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(page)
    except OSError as exc:
        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc

    return abs_path
|
||||||
@@ -0,0 +1,66 @@
|
|||||||
|
"""Split text into overlapping chunks with sentence-boundary awareness."""
|
||||||
|
|
||||||
|
|
||||||
|
def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each window spans at most ``chunk_size`` characters. When the window does
    not reach the end of the text, it is cut after the last sentence
    separator found inside it, provided that cut lies beyond the first 30%
    of the window; otherwise the window is cut at ``chunk_size``. Chunks are
    stripped and empty ones are dropped. The next window starts ``overlap``
    characters before the previous cut, but always strictly after the
    previous start, so the function terminates for any ``overlap``.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk, in characters.
        overlap: Number of characters shared by consecutive chunks. Values
            that would not advance the window (zero, negative, or at least as
            large as the chunk just produced) result in no overlap for that
            step.

    Returns:
        List of chunks. Empty if the text is empty or only whitespace.

    Raises:
        ValueError: If chunk_size is not positive and the text is not empty.
    """
    if not text:
        return []

    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    if chunk_size <= 0:
        # A non-positive window can never advance through the text.
        raise ValueError("chunk_size must be a positive integer")

    # Sentence separators; the latest cut among all of them wins.
    separators = ("。", "!", "?", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? ")
    min_cut_offset = int(chunk_size * 0.30)

    chunks: list[str] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + chunk_size

        if end < text_len:
            # Only accept a boundary past the first 30% of the window, so a
            # separator near the start does not produce a tiny chunk.
            min_pos = start + min_cut_offset
            best_end = None

            for sep in separators:
                # Last occurrence lying entirely inside text[start:end].
                pos = text.rfind(sep, start, end)
                if pos == -1:
                    continue
                cut = pos + len(sep)
                if cut > min_pos and (best_end is None or cut > best_end):
                    best_end = cut

            if best_end is not None:
                end = best_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step back by the overlap, but never to or before the current start
        # (overlap larger than the chunk just cut) and never past the cut
        # (negative overlap): either would loop forever or skip text.
        next_start = end - overlap
        if not (start < next_start <= end):
            next_start = end
        start = next_start

    return chunks
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
def main() -> None:
    """Print the project greeting to standard output."""
    greeting = "Hello from ontology-graph!"
    print(greeting)


# Run the greeting only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,935 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Ontology Graph Extraction\n",
|
||||||
|
"\n",
|
||||||
|
"Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
|
||||||
|
"- LLM: `claude -p --model haiku`\n",
|
||||||
|
"- Tipos: OSINT del registry + genéricos (concept, url, date_reference, quantity, text_fragment, coordinates)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ModuleNotFoundError",
|
||||||
|
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m 4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 10\u001b[39m \n",
|
||||||
|
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import sys, os, json, subprocess\n",
|
||||||
|
"\n",
|
||||||
|
"ROOT = '/home/lucas/fn_registry'\n",
|
||||||
|
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
|
||||||
|
"sys.path.insert(0, ROOT)\n",
|
||||||
|
"\n",
|
||||||
|
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||||
|
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||||
|
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||||
|
"\n",
|
||||||
|
"print('Registry root:', ROOT)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "KeyError",
|
||||||
|
"evalue": "'FN_REGISTRY_ROOT'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m 4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
|
||||||
|
"\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
|
||||||
|
"\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import sys, os, json, subprocess\n",
|
||||||
|
"\n",
|
||||||
|
"ROOT = os.environ['FN_REGISTRY_ROOT']\n",
|
||||||
|
"sys.path.insert(0, ROOT)\n",
|
||||||
|
"\n",
|
||||||
|
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||||
|
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||||
|
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||||
|
"\n",
|
||||||
|
"print('Registry root:', ROOT)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## LLM wrapper: claude -p + haiku"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def claude_haiku_json(messages: list[dict]) -> dict:\n",
|
||||||
|
" \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n",
|
||||||
|
" # Construir prompt desde messages\n",
|
||||||
|
" parts = []\n",
|
||||||
|
" for msg in messages:\n",
|
||||||
|
" role = msg['role']\n",
|
||||||
|
" content = msg['content']\n",
|
||||||
|
" if role == 'system':\n",
|
||||||
|
" parts.append(f\"[SYSTEM]\\n{content}\")\n",
|
||||||
|
" elif role == 'user':\n",
|
||||||
|
" parts.append(f\"[USER]\\n{content}\")\n",
|
||||||
|
" prompt = \"\\n\\n\".join(parts)\n",
|
||||||
|
" \n",
|
||||||
|
" result = subprocess.run(\n",
|
||||||
|
" ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
|
||||||
|
" capture_output=True, text=True, timeout=120\n",
|
||||||
|
" )\n",
|
||||||
|
" \n",
|
||||||
|
" if result.returncode != 0:\n",
|
||||||
|
" raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n",
|
||||||
|
" \n",
|
||||||
|
" # Extraer el campo 'result' del JSON envelope de claude\n",
|
||||||
|
" envelope = json.loads(result.stdout)\n",
|
||||||
|
" raw_text = envelope.get('result', '')\n",
|
||||||
|
" \n",
|
||||||
|
" # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n",
|
||||||
|
" return extract_json_from_llm(raw_text)\n",
|
||||||
|
"\n",
|
||||||
|
"# Test rapido\n",
|
||||||
|
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
|
||||||
|
"print('LLM wrapper OK:', test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Entity presets: OSINT + genéricos"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# --- Presets OSINT (del registry) ---\n",
|
||||||
|
"OSINT_PRESETS = [\n",
|
||||||
|
" {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
|
||||||
|
" \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
|
||||||
|
" \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
|
||||||
|
" \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
|
||||||
|
" \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
|
||||||
|
" \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
|
||||||
|
" \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
|
||||||
|
" \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
|
||||||
|
" \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
|
||||||
|
" \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
|
||||||
|
" \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
|
||||||
|
" \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
|
||||||
|
" \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
|
||||||
|
" {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
|
||||||
|
" \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# --- Presets genéricos (sin tipo Go, inline) ---\n",
|
||||||
|
"GENERIC_PRESETS = [\n",
|
||||||
|
" {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
|
||||||
|
" \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
|
||||||
|
" {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
|
||||||
|
" \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
|
||||||
|
" {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
|
||||||
|
" \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
|
||||||
|
" {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
|
||||||
|
" \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
|
||||||
|
" {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
|
||||||
|
" \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
|
||||||
|
" {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
|
||||||
|
" \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
|
||||||
|
"print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Relation types"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"RELATION_TYPES = [\n",
|
||||||
|
" # Personas / orgs\n",
|
||||||
|
" \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
|
||||||
|
" \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
|
||||||
|
" # Comunicacion\n",
|
||||||
|
" \"communicates_with\", \"sent_to\", \"received_from\",\n",
|
||||||
|
" # Ubicacion\n",
|
||||||
|
" \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
|
||||||
|
" # Eventos\n",
|
||||||
|
" \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
|
||||||
|
" # Documentos / conceptos\n",
|
||||||
|
" \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
|
||||||
|
" # Financiero\n",
|
||||||
|
" \"funds\", \"transacted_with\", \"invested_in\",\n",
|
||||||
|
" # Tecnico\n",
|
||||||
|
" \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
|
||||||
|
" # Generico\n",
|
||||||
|
" \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{len(RELATION_TYPES)} relation types')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Extraer documento\n",
|
||||||
|
"\n",
|
||||||
|
"Pon tu documento en `data/` y cambia el path."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf') # <-- cambiar\n",
|
||||||
|
"\n",
|
||||||
|
"# Progreso visible\n",
|
||||||
|
"def on_progress(msg, pct):\n",
|
||||||
|
" print(f' [{pct*100:5.1f}%] {msg}')\n",
|
||||||
|
"\n",
|
||||||
|
"result = extraction_pipeline(\n",
|
||||||
|
" file_path=DOC_PATH,\n",
|
||||||
|
" entity_presets=ALL_PRESETS,\n",
|
||||||
|
" relation_types=RELATION_TYPES,\n",
|
||||||
|
" llm_chat_json=claude_haiku_json,\n",
|
||||||
|
" chunk_size=800,\n",
|
||||||
|
" chunk_overlap=100,\n",
|
||||||
|
" confidence_threshold=0.5,\n",
|
||||||
|
" dedup_threshold=0.85,\n",
|
||||||
|
" on_progress=on_progress,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
|
||||||
|
"print(f'Relations: {result.stats.final_relations_count}')\n",
|
||||||
|
"print(f'Chunks: {result.stats.total_chunks}')\n",
|
||||||
|
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
|
||||||
|
"print(f'Entity types: {result.stats.entity_types_found}')\n",
|
||||||
|
"print(f'Relation types: {result.stats.relation_types_found}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Explorar resultados"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"# Entities\n",
|
||||||
|
"ent_rows = []\n",
|
||||||
|
"for e in result.entities:\n",
|
||||||
|
" ent_rows.append({\n",
|
||||||
|
" 'id': e.id,\n",
|
||||||
|
" 'name': e.name,\n",
|
||||||
|
" 'type': e.type_ref,\n",
|
||||||
|
" 'confidence': e.confidence,\n",
|
||||||
|
" 'attributes': e.attributes,\n",
|
||||||
|
" })\n",
|
||||||
|
"df_entities = pd.DataFrame(ent_rows)\n",
|
||||||
|
"print(f'=== Entities ({len(df_entities)}) ===')\n",
|
||||||
|
"df_entities.sort_values('type')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Relations\n",
|
||||||
|
"rel_rows = []\n",
|
||||||
|
"for r in result.relations:\n",
|
||||||
|
" rel_rows.append({\n",
|
||||||
|
" 'from_name': r.from_name,\n",
|
||||||
|
" 'relation': r.relation_type,\n",
|
||||||
|
" 'to_name': r.to_name,\n",
|
||||||
|
" 'confidence': r.confidence,\n",
|
||||||
|
" 'description': r.description,\n",
|
||||||
|
" })\n",
|
||||||
|
"df_relations = pd.DataFrame(rel_rows)\n",
|
||||||
|
"print(f'=== Relations ({len(df_relations)}) ===')\n",
|
||||||
|
"df_relations.sort_values('relation')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Visualizar grafo con sigma.js"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Colores por tipo de entidad\n",
|
||||||
|
"TYPE_COLORS = {\n",
|
||||||
|
" 'osint_person_go_cybersecurity': '#e74c3c',\n",
|
||||||
|
" 'osint_organization_go_cybersecurity': '#3498db',\n",
|
||||||
|
" 'osint_location_go_cybersecurity': '#2ecc71',\n",
|
||||||
|
" 'osint_event_go_cybersecurity': '#f39c12',\n",
|
||||||
|
" 'osint_email_go_cybersecurity': '#9b59b6',\n",
|
||||||
|
" 'osint_domain_go_cybersecurity': '#1abc9c',\n",
|
||||||
|
" 'osint_ip_address_go_cybersecurity': '#e67e22',\n",
|
||||||
|
" 'osint_phone_go_cybersecurity': '#95a5a6',\n",
|
||||||
|
" 'osint_social_media_go_cybersecurity': '#e91e63',\n",
|
||||||
|
" 'osint_document_go_cybersecurity': '#607d8b',\n",
|
||||||
|
" 'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
|
||||||
|
" 'osint_malware_go_cybersecurity': '#f44336',\n",
|
||||||
|
" 'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
|
||||||
|
" 'concept': '#00bcd4',\n",
|
||||||
|
" 'url': '#8bc34a',\n",
|
||||||
|
" 'date_reference': '#cddc39',\n",
|
||||||
|
" 'quantity': '#ffc107',\n",
|
||||||
|
" 'coordinates': '#4caf50',\n",
|
||||||
|
" 'text_fragment': '#78909c',\n",
|
||||||
|
"}\n",
|
||||||
|
"DEFAULT_COLOR = '#aaaaaa'\n",
|
||||||
|
"\n",
|
||||||
|
"def extraction_to_sigma(result) -> dict:\n",
|
||||||
|
" \"\"\"Convierte ExtractionResult a formato sigma.js/graphology.\"\"\"\n",
|
||||||
|
" # Contar degree para tamaño de nodo\n",
|
||||||
|
" degree = {}\n",
|
||||||
|
" for r in result.relations:\n",
|
||||||
|
" from_id = r.from_id or r.from_name\n",
|
||||||
|
" to_id = r.to_id or r.to_name\n",
|
||||||
|
" degree[from_id] = degree.get(from_id, 0) + 1\n",
|
||||||
|
" degree[to_id] = degree.get(to_id, 0) + 1\n",
|
||||||
|
"\n",
|
||||||
|
" nodes = []\n",
|
||||||
|
" for e in result.entities:\n",
|
||||||
|
" eid = e.id or e.name\n",
|
||||||
|
" nodes.append({\n",
|
||||||
|
" 'key': eid,\n",
|
||||||
|
" 'attributes': {\n",
|
||||||
|
" 'label': e.name,\n",
|
||||||
|
" 'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
|
||||||
|
" 'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
|
||||||
|
" 'type': e.type_ref,\n",
|
||||||
|
" **{k: str(v) for k, v in (e.attributes or {}).items() if v is not None},\n",
|
||||||
|
" }\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" edges = []\n",
|
||||||
|
" node_keys = {n['key'] for n in nodes}\n",
|
||||||
|
" for i, r in enumerate(result.relations):\n",
|
||||||
|
" from_id = r.from_id or r.from_name\n",
|
||||||
|
" to_id = r.to_id or r.to_name\n",
|
||||||
|
" if from_id in node_keys and to_id in node_keys:\n",
|
||||||
|
" edges.append({\n",
|
||||||
|
" 'key': f'e{i}',\n",
|
||||||
|
" 'source': from_id,\n",
|
||||||
|
" 'target': to_id,\n",
|
||||||
|
" 'attributes': {\n",
|
||||||
|
" 'label': r.relation_type,\n",
|
||||||
|
" 'type': r.relation_type,\n",
|
||||||
|
" }\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" return {'nodes': nodes, 'edges': edges}\n",
|
||||||
|
"\n",
|
||||||
|
"graph_data = extraction_to_sigma(result)\n",
|
||||||
|
"print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
|
||||||
|
"html_path = render_sigma_html(\n",
|
||||||
|
" graph_data=graph_data,\n",
|
||||||
|
" output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
|
||||||
|
" title='Ontology Graph',\n",
|
||||||
|
")\n",
|
||||||
|
"print(f'Graph saved: {html_path}')\n",
|
||||||
|
"print(f'Open in browser: file://{html_path}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Auto-discovery de nuevos tipos\n",
|
||||||
|
"\n",
|
||||||
|
"Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
|
||||||
|
" \"\"\"Pide a haiku que sugiera tipos nuevos basandose en entidades de baja confianza o genericas.\"\"\"\n",
|
||||||
|
" # Recopilar entidades clasificadas como concept/text_fragment (genéricos fallback)\n",
|
||||||
|
" generic_entities = [\n",
|
||||||
|
" {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
|
||||||
|
" for e in result.entities\n",
|
||||||
|
" if e.type_ref in ('concept', 'text_fragment', 'related_to')\n",
|
||||||
|
" ]\n",
|
||||||
|
" \n",
|
||||||
|
" if not generic_entities:\n",
|
||||||
|
" print('No hay entidades genéricas — los presets cubren todo.')\n",
|
||||||
|
" return []\n",
|
||||||
|
"\n",
|
||||||
|
" existing_labels = [p['label'] for p in existing_presets]\n",
|
||||||
|
" \n",
|
||||||
|
" prompt_msg = [\n",
|
||||||
|
" {'role': 'system', 'content': (\n",
|
||||||
|
" 'You analyze entities extracted from a document and suggest new entity type presets. '\n",
|
||||||
|
" 'Existing types: ' + ', '.join(existing_labels) + '. '\n",
|
||||||
|
" 'For entities that dont fit existing types, suggest new type presets. '\n",
|
||||||
|
" 'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
|
||||||
|
" '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
|
||||||
|
" 'Only suggest types that are genuinely different from existing ones. '\n",
|
||||||
|
" 'Return {\"new_presets\": []} if no new types are needed.'\n",
|
||||||
|
" )},\n",
|
||||||
|
" {'role': 'user', 'content': (\n",
|
||||||
|
" 'These entities were classified as generic (concept/text_fragment) '\n",
|
||||||
|
" 'because they didnt fit existing types:\\n\\n'\n",
|
||||||
|
" + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n",
|
||||||
|
" )}\n",
|
||||||
|
" ]\n",
|
||||||
|
" \n",
|
||||||
|
" resp = claude_haiku_json(prompt_msg)\n",
|
||||||
|
" new_presets = resp.get('new_presets', [])\n",
|
||||||
|
" \n",
|
||||||
|
" if new_presets:\n",
|
||||||
|
" print(f'Discovered {len(new_presets)} new types:')\n",
|
||||||
|
" for p in new_presets:\n",
|
||||||
|
" print(f\" - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
|
||||||
|
" else:\n",
|
||||||
|
" print('No new types needed.')\n",
|
||||||
|
" \n",
|
||||||
|
" return new_presets\n",
|
||||||
|
"\n",
|
||||||
|
"new_types = discover_new_types(result, ALL_PRESETS)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n",
|
||||||
|
"if new_types:\n",
|
||||||
|
" EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
|
||||||
|
" print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
|
||||||
|
" \n",
|
||||||
|
" result = extraction_pipeline(\n",
|
||||||
|
" file_path=DOC_PATH,\n",
|
||||||
|
" entity_presets=EXPANDED_PRESETS,\n",
|
||||||
|
" relation_types=RELATION_TYPES,\n",
|
||||||
|
" llm_chat_json=claude_haiku_json,\n",
|
||||||
|
" chunk_size=800,\n",
|
||||||
|
" chunk_overlap=100,\n",
|
||||||
|
" confidence_threshold=0.5,\n",
|
||||||
|
" dedup_threshold=0.85,\n",
|
||||||
|
" on_progress=on_progress,\n",
|
||||||
|
" )\n",
|
||||||
|
" \n",
|
||||||
|
" print(f'\\nEntities: {result.stats.final_entities_count}')\n",
|
||||||
|
" print(f'Relations: {result.stats.final_relations_count}')\n",
|
||||||
|
" \n",
|
||||||
|
" # Re-generar grafo\n",
|
||||||
|
" graph_data = extraction_to_sigma(result)\n",
|
||||||
|
" html_path = render_sigma_html(\n",
|
||||||
|
" graph_data=graph_data,\n",
|
||||||
|
" output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
|
||||||
|
" title='Ontology Graph (expanded)',\n",
|
||||||
|
" )\n",
|
||||||
|
" print(f'Updated graph: file://{html_path}')\n",
|
||||||
|
"else:\n",
|
||||||
|
" print('No re-extraction needed.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ModuleNotFoundError",
|
||||||
|
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m 8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 12\u001b[39m \n",
|
||||||
|
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import sys, os, json, subprocess\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"\n",
|
||||||
|
"ROOT = '/home/lucas/fn_registry'\n",
|
||||||
|
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
|
||||||
|
"sys.path.insert(0, ROOT)\n",
|
||||||
|
"sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
|
||||||
|
"\n",
|
||||||
|
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||||
|
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||||
|
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||||
|
"\n",
|
||||||
|
"print('OK: imports loaded')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"imports OK\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import sys, os, json, subprocess\n",
|
||||||
|
"\n",
|
||||||
|
"# Añadir lib/ al path\n",
|
||||||
|
"sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
|
||||||
|
"\n",
|
||||||
|
"from core_functions import extract_json_from_llm\n",
|
||||||
|
"from extraction_pipeline import extraction_pipeline\n",
|
||||||
|
"from render_sigma_html import render_sigma_html\n",
|
||||||
|
"\n",
|
||||||
|
"print('imports OK')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"LLM wrapper OK: {'ok': True}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def claude_haiku_json(messages: list[dict]) -> dict:\n",
|
||||||
|
" \"\"\"Wrapper: messages OpenAI-style -> claude -p --model haiku -> dict.\"\"\"\n",
|
||||||
|
" parts = []\n",
|
||||||
|
" for msg in messages:\n",
|
||||||
|
" role = msg['role']\n",
|
||||||
|
" content = msg['content']\n",
|
||||||
|
" if role == 'system':\n",
|
||||||
|
" parts.append(f'[SYSTEM]\\n{content}')\n",
|
||||||
|
" elif role == 'user':\n",
|
||||||
|
" parts.append(f'[USER]\\n{content}')\n",
|
||||||
|
" prompt = '\\n\\n'.join(parts)\n",
|
||||||
|
" \n",
|
||||||
|
" result = subprocess.run(\n",
|
||||||
|
" ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
|
||||||
|
" capture_output=True, text=True, timeout=120\n",
|
||||||
|
" )\n",
|
||||||
|
" if result.returncode != 0:\n",
|
||||||
|
" raise RuntimeError(f'claude -p failed: {result.stderr}')\n",
|
||||||
|
" \n",
|
||||||
|
" envelope = json.loads(result.stdout)\n",
|
||||||
|
" raw_text = envelope.get('result', '')\n",
|
||||||
|
" return extract_json_from_llm(raw_text)\n",
|
||||||
|
"\n",
|
||||||
|
"# Test\n",
|
||||||
|
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
|
||||||
|
"print('LLM wrapper OK:', test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"19 presets, 35 relation types\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"OSINT_PRESETS = [\n",
|
||||||
|
" {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
|
||||||
|
" 'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
|
||||||
|
" {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
|
||||||
|
" 'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
|
||||||
|
" {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
|
||||||
|
" 'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
|
||||||
|
" {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
|
||||||
|
" 'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
|
||||||
|
" {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
|
||||||
|
" 'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
|
||||||
|
" {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
|
||||||
|
" 'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
|
||||||
|
" {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
|
||||||
|
" 'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
|
||||||
|
" {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
|
||||||
|
" 'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
|
||||||
|
" {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
|
||||||
|
" 'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
|
||||||
|
" {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
|
||||||
|
" 'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
|
||||||
|
" {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
|
||||||
|
" 'metadata_fields': ['address', 'blockchain', 'balance']},\n",
|
||||||
|
" {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
|
||||||
|
" 'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
|
||||||
|
" {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
|
||||||
|
" 'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"GENERIC_PRESETS = [\n",
|
||||||
|
" {'type_ref': 'concept', 'label': 'Concept',\n",
|
||||||
|
" 'metadata_fields': ['name', 'category', 'definition']},\n",
|
||||||
|
" {'type_ref': 'url', 'label': 'URL/Link',\n",
|
||||||
|
" 'metadata_fields': ['url', 'domain', 'context']},\n",
|
||||||
|
" {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
|
||||||
|
" 'metadata_fields': ['date', 'precision', 'context']},\n",
|
||||||
|
" {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
|
||||||
|
" 'metadata_fields': ['value', 'unit', 'context']},\n",
|
||||||
|
" {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
|
||||||
|
" 'metadata_fields': ['lat', 'lon', 'label']},\n",
|
||||||
|
" {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
|
||||||
|
" 'metadata_fields': ['text', 'category', 'relevance']},\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
|
||||||
|
"\n",
|
||||||
|
"RELATION_TYPES = [\n",
|
||||||
|
" 'employs', 'works_for', 'founded', 'owns', 'controls',\n",
|
||||||
|
" 'member_of', 'affiliated_with', 'collaborates_with',\n",
|
||||||
|
" 'communicates_with', 'sent_to', 'received_from',\n",
|
||||||
|
" 'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
|
||||||
|
" 'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
|
||||||
|
" 'mentions', 'references', 'describes', 'authored', 'published',\n",
|
||||||
|
" 'funds', 'transacted_with', 'invested_in',\n",
|
||||||
|
" 'hosts', 'resolves_to', 'exploits', 'targets',\n",
|
||||||
|
" 'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 0.0%] Extracting text from file...\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 0.0%] Extracting entities from chunk 1/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 0.7%] Extracting entities from chunk 2/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 1.5%] Extracting entities from chunk 3/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 2.2%] Extracting entities from chunk 4/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 3.0%] Extracting entities from chunk 5/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
|
||||||
|
" candidates = extract_entities_llm(\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 3.7%] Extracting entities from chunk 6/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 4.4%] Extracting entities from chunk 7/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 5.2%] Extracting entities from chunk 8/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 5.9%] Extracting entities from chunk 9/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 6.7%] Extracting entities from chunk 10/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 7.4%] Extracting entities from chunk 11/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 8.1%] Extracting entities from chunk 12/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 8.9%] Extracting entities from chunk 13/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 9.6%] Extracting entities from chunk 14/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 10.4%] Extracting entities from chunk 15/54\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" [ 11.1%] Extracting entities from chunk 16/54\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
|
||||||
|
"\n",
|
||||||
|
"def on_progress(msg, pct):\n",
|
||||||
|
" print(f' [{pct*100:5.1f}%] {msg}')\n",
|
||||||
|
"\n",
|
||||||
|
"result = extraction_pipeline(\n",
|
||||||
|
" file_path=DOC_PATH,\n",
|
||||||
|
" entity_presets=ALL_PRESETS,\n",
|
||||||
|
" relation_types=RELATION_TYPES,\n",
|
||||||
|
" llm_chat_json=claude_haiku_json,\n",
|
||||||
|
" chunk_size=800,\n",
|
||||||
|
" chunk_overlap=100,\n",
|
||||||
|
" confidence_threshold=0.5,\n",
|
||||||
|
" dedup_threshold=0.85,\n",
|
||||||
|
" on_progress=on_progress,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
|
||||||
|
"print(f'Relations: {result.stats.final_relations_count}')\n",
|
||||||
|
"print(f'Chunks: {result.stats.total_chunks}')\n",
|
||||||
|
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
|
||||||
|
"print(f'Entity types: {result.stats.entity_types_found}')\n",
|
||||||
|
"print(f'Relation types: {result.stats.relation_types_found}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Pipeline optimizado\n",
|
||||||
|
"\n",
|
||||||
|
"- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
|
||||||
|
"- Chunks de 2000 chars\n",
|
||||||
|
"- Paralelizado con ThreadPoolExecutor"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||||
|
"from extract_text_from_file import extract_text_from_file\n",
|
||||||
|
"from core_functions import preprocess_text\n",
|
||||||
|
"from split_text_into_chunks import split_text_into_chunks\n",
|
||||||
|
"from deduplicate_entities import deduplicate_entities\n",
|
||||||
|
"from deduplicate_relations import deduplicate_relations\n",
|
||||||
|
"from entity_candidate import EntityCandidate\n",
|
||||||
|
"from relation_candidate import RelationCandidate\n",
|
||||||
|
"\n",
|
||||||
|
"def build_unified_prompt(entity_presets, relation_types):\n",
|
||||||
|
" \"\"\"System prompt que pide entities + relations + tipos nuevos en 1 sola llamada.\"\"\"\n",
|
||||||
|
" type_lines = []\n",
|
||||||
|
" for p in entity_presets:\n",
|
||||||
|
" fields = ', '.join(p.get('metadata_fields', []))\n",
|
||||||
|
" type_lines.append(f\"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]\")\n",
|
||||||
|
"\n",
|
||||||
|
" return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
|
||||||
|
"\n",
|
||||||
|
"ENTITY TYPES:\n",
|
||||||
|
"{chr(10).join(type_lines)}\n",
|
||||||
|
"\n",
|
||||||
|
"RELATION TYPES: {', '.join(relation_types)}\n",
|
||||||
|
"\n",
|
||||||
|
"OUTPUT FORMAT (strict JSON):\n",
|
||||||
|
"{{\n",
|
||||||
|
" \"entities\": [\n",
|
||||||
|
" {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"relations\": [\n",
|
||||||
|
" {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"suggested_types\": [\n",
|
||||||
|
" {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
|
||||||
|
" ]\n",
|
||||||
|
"}}\n",
|
||||||
|
"\n",
|
||||||
|
"RULES:\n",
|
||||||
|
"- Extract ALL entities explicitly mentioned in the text\n",
|
||||||
|
"- Use exact type_ref from the schema. Leave unknown attributes as null\n",
|
||||||
|
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
|
||||||
|
"- Relations: from_name and to_name MUST match extracted entity names exactly\n",
|
||||||
|
"- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
|
||||||
|
"- If no suggested types are needed, return \"suggested_types\": []\n",
|
||||||
|
"- Respond in the same language as the text for descriptions'''\n",
|
||||||
|
"\n",
|
||||||
|
"UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
|
||||||
|
"print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
[project]
|
||||||
|
name = "ontology-graph"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Exploratory notebooks for LLM-based entity and relation extraction into an ontology graph"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.13"
|
||||||
|
dependencies = [
|
||||||
|
"jupyter>=1.1.1",
|
||||||
|
"jupyter-collaboration>=4.3.0",
|
||||||
|
"jupyter-mcp-server>=0.4.0",
|
||||||
|
"jupyterlab>=4.5.6",
|
||||||
|
"matplotlib>=3.10.8",
|
||||||
|
"numpy>=2.4.4",
|
||||||
|
"pandas>=3.0.2",
|
||||||
|
]
|
||||||
Executable
+45
@@ -0,0 +1,45 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Jupyter Lab — modo colaborativo con autodeteccion de puerto
|
||||||
|
# Generado por write_jupyter_launcher (fn_registry)
|
||||||
|
|
||||||
|
# Print the first TCP port in [first, last] that neither `ss` nor `lsof`
# reports as having a listener. Defaults to the 8888-8899 range.
#
# Usage:    find_free_port [first_port] [last_port]
# Output:   the chosen port number on stdout.
# Fallback: if every port in the range looks busy, first_port is printed
#           anyway (same value the caller always received) and a warning goes
#           to stderr so the result is recognisable as a guess.
# Note:     when neither `ss` nor `lsof` is installed both probes fail to find
#           a listener, so the first port of the range is returned.
find_free_port() {
    local first=${1:-8888}
    local last=${2:-8899}
    local port

    for ((port = first; port <= last; port++)); do
        # A port counts as free only when BOTH tools find nothing on it.
        # The trailing space in the grep pattern keeps ":8888 " from matching
        # longer port numbers that merely start with the same digits.
        if ! ss -tln 2>/dev/null | grep -q ":${port} " && \
           ! lsof -i:"$port" >/dev/null 2>&1; then
            echo "$port"
            return
        fi
    done

    echo "find_free_port: no free port in ${first}-${last}; falling back to ${first}" >&2
    echo "$first"
}
|
||||||
|
|
||||||
|
# Port: first CLI argument when given, otherwise the result of find_free_port.
PORT=${1:-$(find_free_port)}

# Work from the script's own directory so .venv, .jupyter-port and root_dir
# resolve the same way regardless of where the script is invoked from.
cd "$(dirname "$0")" || {
    echo "ERROR: no se pudo acceder al directorio del script" >&2
    exit 1
}

# Activate the project virtualenv when present; a missing .venv is tolerated
# and whatever `python` is on PATH is used instead.
source .venv/bin/activate 2>/dev/null || true

if ! python -c "import jupyter_collaboration" 2>/dev/null; then
    echo "ERROR: jupyter-collaboration no esta instalado"
    echo "Instala con: uv add jupyter-collaboration"
    exit 1
fi

# Record the port only after the dependency check, so an aborted launch does
# not leave behind a port file for a server that never started.
echo "$PORT" > .jupyter-port

echo "════════════════════════════════════════════════"
echo "  Jupyter Lab + Colaboracion en puerto $PORT"
echo "════════════════════════════════════════════════"
echo ""
echo "  Abre: http://localhost:$PORT"
echo "  Ctrl+C para detener"
echo ""

# port_retries=0 makes Jupyter fail instead of silently moving to another
# port, which would leave .jupyter-port and the URL printed above wrong.
#
# NOTE(review): the flags below disable the token, the password and the XSRF
# check and accept requests from any origin, so any page open in a local
# browser can reach this server. Presumably this is needed by the MCP /
# collaboration client - confirm, and keep the server bound to localhost.
jupyter lab \
    --port="$PORT" \
    --ServerApp.port_retries=0 \
    --no-browser \
    --ServerApp.token='' \
    --ServerApp.password='' \
    --ServerApp.disable_check_xsrf=True \
    --ServerApp.allow_origin='*' \
    --ServerApp.root_dir="$(pwd)" \
    --collaborative
|
||||||
Reference in New Issue
Block a user