chore: initial sync
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
# JUPYTER HABILITADO EN ESTE ANALISIS
|
||||
|
||||
## Reglas OBLIGATORIAS para Claude
|
||||
|
||||
### 1. CODIGO INMUTABLE — NUNCA MODIFICAR CELDAS EXISTENTES
|
||||
- **PROHIBIDO** usar NotebookEdit para reemplazar celdas existentes
|
||||
- **SIEMPRE** anadir celdas NUEVAS al final del notebook
|
||||
- Si hay un error en una celda, crear celda nueva con la correccion
|
||||
- El historial de trabajo debe quedar intacto para trazabilidad
|
||||
|
||||
### 2. PROGRAMACION FUNCIONAL OBLIGATORIA
|
||||
- **Funciones puras**: sin efectos secundarios, mismo input -> mismo output
|
||||
- **Inmutabilidad**: nunca mutar datos, crear copias transformadas
|
||||
- **Composicion**: funciones pequenas que se combinan
|
||||
- Preferir: `map`, `filter`, `reduce`, list comprehensions
|
||||
- Evitar: loops con mutacion, `global`, modificar argumentos in-place
|
||||
|
||||
### 3. SIEMPRE usar MCP jupyter para ejecutar codigo Python
|
||||
- Las ejecuciones se ven en tiempo real en Jupyter Lab del usuario
|
||||
- Compartimos variables y estado del kernel
|
||||
- **NUNCA usar bash para ejecutar Python en este analisis**
|
||||
|
||||
### 4. Verificar Jupyter activo ANTES de ejecutar
|
||||
- Si no esta activo: pedir al usuario que ejecute `./run-jupyter-lab.sh`
|
||||
|
||||
### 5. Gestion de notebooks
|
||||
- Notebooks en la carpeta `notebooks/` o subcarpetas
|
||||
- Si un notebook tiene >50 celdas, crear uno nuevo
|
||||
- Nombrar descriptivamente: `01_exploracion.ipynb`, `02_limpieza.ipynb`
|
||||
|
||||
### 6. Gestion de Python
|
||||
- **SIEMPRE usar `uv`** para gestionar dependencias
|
||||
- Anadir paquetes con `uv add nombre_paquete`
|
||||
|
||||
### 7. Acceso al fn_registry
|
||||
- `FN_REGISTRY_ROOT` apunta a la raiz del registry
|
||||
- Para importar funciones Python: `sys.path.insert(0, os.path.join(os.environ["FN_REGISTRY_ROOT"], "python", "functions"))`
|
||||
- Para consultar registry.db: usar la CLI `sqlite3` o el modulo `sqlite3` de Python (`import sqlite3`) con la ruta `$FN_REGISTRY_ROOT/registry.db`
|
||||
|
||||
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
.venv/
|
||||
.mcp.json
|
||||
.jupyter-port
|
||||
.jupyter/
|
||||
.jupyter_ystore.db
|
||||
.ipython/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.ipynb_checkpoints/
|
||||
bin/
|
||||
data/
|
||||
.DS_Store
|
||||
@@ -0,0 +1 @@
|
||||
3.13
|
||||
+540
@@ -0,0 +1,540 @@
|
||||
"""Extracción de grafo ontológico desde un documento.
|
||||
|
||||
Uso: python extract.py <archivo> [workers]
|
||||
python extract.py data/condiciones-generales-bizum.pdf
|
||||
|
||||
Optimizaciones vs extraction_pipeline:
|
||||
- 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos)
|
||||
- Chunks de 2000 chars
|
||||
- Paralelizado con ThreadPoolExecutor
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib"))
|
||||
|
||||
from extract_text_from_file import extract_text_from_file
|
||||
from core_functions import preprocess_text, extract_json_from_llm
|
||||
from split_text_into_chunks import split_text_into_chunks
|
||||
from deduplicate_entities import deduplicate_entities
|
||||
from deduplicate_relations import deduplicate_relations
|
||||
from entity_candidate import EntityCandidate
|
||||
from relation_candidate import RelationCandidate
|
||||
from render_sigma_html import render_sigma_html
|
||||
|
||||
# ── Presets ────────────────────────────────────────────────────────────────────
|
||||
|
||||
# OSINT entity-type presets. Each row is (type_ref, label, metadata_fields)
# and is expanded into the dict shape the prompt builders consume.
OSINT_PRESETS = [
    {"type_ref": type_ref, "label": label, "metadata_fields": fields}
    for type_ref, label, fields in (
        ("person", "Person",
         ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]),
        ("organization", "Organization",
         ["legal_name", "country", "sector", "founded", "risk_score"]),
        ("location", "Location",
         ["lat", "lon", "address", "country", "city"]),
        ("event", "Event",
         ["event_type", "date", "location", "description", "severity"]),
        ("email", "Email",
         ["address", "provider", "verified", "breached"]),
        ("domain", "Domain",
         ["fqdn", "registrar", "created_date", "expires_date"]),
        ("ip_address", "IP Address",
         ["ip", "asn", "country", "isp", "geolocation"]),
        ("phone", "Phone",
         ["number", "country_code", "carrier", "phone_type"]),
        ("social_media", "Social Media Account",
         ["platform", "username", "url", "followers", "verified"]),
        ("document", "Document",
         ["title", "format", "classification", "source"]),
        ("crypto_wallet", "Crypto Wallet",
         ["address", "blockchain", "balance"]),
        ("malware", "Malware",
         ["family", "hash_sha256", "threat_level"]),
        ("vulnerability", "Vulnerability",
         ["cve_id", "cvss", "affected_product", "exploited"]),
    )
]
|
||||
|
||||
# Domain-agnostic fallback presets. Each row is (type_ref, label,
# metadata_fields) and is expanded into the same dict shape as OSINT presets.
GENERIC_PRESETS = [
    {"type_ref": type_ref, "label": label, "metadata_fields": fields}
    for type_ref, label, fields in (
        ("concept", "Concept", ["name", "category", "definition"]),
        ("url", "URL/Link", ["url", "domain", "context"]),
        ("date_reference", "Date/Time", ["date", "precision", "context"]),
        ("quantity", "Quantity/Amount", ["value", "unit", "context"]),
        ("coordinates", "Coordinates", ["lat", "lon", "label"]),
        ("text_fragment", "Key Text Fragment", ["text", "category", "relevance"]),
    )
]
|
||||
|
||||
# ── Custom presets (acumulativo, pensado para promoción al registry) ───────────
|
||||
|
||||
# Custom presets are stored in data/custom_presets.json, resolved relative to
# the directory that contains this script.
CUSTOM_PRESETS_PATH = os.path.join(
    os.path.dirname(__file__),
    "data",
    "custom_presets.json",
)
|
||||
|
||||
|
||||
def load_custom_presets() -> list[dict]:
    """Load the accumulated custom presets from ``CUSTOM_PRESETS_PATH``.

    Returns:
        The list stored under the top-level ``"presets"`` key, or an empty
        list when the file does not exist or the key is absent.
    """
    try:
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding (save_custom_presets writes UTF-8).
        with open(CUSTOM_PRESETS_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    return data.get("presets", [])


def save_custom_presets(presets: list[dict]) -> None:
    """Write custom presets to ``CUSTOM_PRESETS_PATH`` as UTF-8 JSON.

    The parent directory is created when missing. The file layout is meant
    for later promotion of presets to the registry:

        {
          "presets": [
            {
              "type_ref": "snake_case_id",
              "label": "Human Label",
              "metadata_fields": ["field1", "field2"],
              "reason": "why this type exists",
              "source_doc": "document where it was first discovered",
              "promoted": false   (true once registered in the registry)
            }
          ]
        }

    Args:
        presets: Complete list of presets to persist; it replaces whatever
            the file previously contained.
    """
    os.makedirs(os.path.dirname(CUSTOM_PRESETS_PATH), exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned; the locale default cannot represent them on every platform.
    with open(CUSTOM_PRESETS_PATH, "w", encoding="utf-8") as f:
        json.dump({"presets": presets}, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]:
    """Merge suggested entity types into the stored custom presets.

    Suggestions are de-duplicated by ``type_ref`` against the presets already
    on disk and against each other. The presets file is rewritten only when
    at least one new preset was added.

    Args:
        suggested: Candidate type descriptions. Items that are not dicts, or
            whose ``type_ref`` is missing, empty or not a string, are skipped
            instead of raising.
        source_doc: Name of the document the suggestions came from; stored
            on each new preset as ``source_doc``.

    Returns:
        The presets that were newly added (possibly empty).
    """
    existing = load_custom_presets()
    # Tolerate malformed stored entries instead of raising KeyError on them.
    known_refs = {p.get("type_ref") for p in existing if isinstance(p, dict)}

    added = []
    for s in suggested:
        if not isinstance(s, dict):
            continue
        ref = s.get("type_ref", "")
        if not isinstance(ref, str) or not ref or ref in known_refs:
            continue
        known_refs.add(ref)
        added.append({
            "type_ref": ref,
            "label": s.get("label", ref),
            "metadata_fields": s.get("metadata_fields", []),
            "reason": s.get("reason", ""),
            "source_doc": source_doc,
            "promoted": False,
        })

    if added:
        save_custom_presets(existing + added)

    return added
|
||||
|
||||
|
||||
# Closed vocabulary of relation types offered to the LLM, one thematic group
# per line; split() turns the text into the list callers expect.
RELATION_TYPES = (
    "employs works_for founded owns controls "
    "member_of affiliated_with collaborates_with "
    "communicates_with sent_to received_from "
    "located_in headquartered_in traveled_to operates_in "
    "participated_in caused occurred_at occurred_on "
    "mentions references describes authored published "
    "funds transacted_with invested_in "
    "hosts resolves_to exploits targets "
    "related_to part_of instance_of has_attribute"
).split()
|
||||
|
||||
# ── LLM wrapper ───────────────────────────────────────────────────────────────
|
||||
|
||||
def claude_haiku_json(messages: list[dict]) -> dict:
    """Run the ``claude`` CLI (haiku model) on chat-style messages and parse its JSON reply.

    ``system`` and ``user`` messages are rendered as ``[SYSTEM]`` / ``[USER]``
    sections separated by blank lines; messages with any other role are
    ignored. The prompt is passed as a command-line argument.

    Args:
        messages: Dicts with ``role`` and ``content`` keys.

    Returns:
        Whatever ``extract_json_from_llm`` produces from the ``result`` field
        of the CLI's JSON envelope.

    Raises:
        RuntimeError: If the CLI exits with a non-zero status.
        subprocess.TimeoutExpired: If the CLI runs longer than 120 seconds.
    """
    role_tags = {"system": "[SYSTEM]", "user": "[USER]"}
    prompt = "\n\n".join(
        f"{role_tags[m['role']]}\n{m['content']}"
        for m in messages
        if m["role"] in role_tags
    )

    command = ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        raise RuntimeError(f"claude -p failed: {completed.stderr[:200]}")

    reply = json.loads(completed.stdout).get("result", "")
    return extract_json_from_llm(reply)
|
||||
|
||||
# ── Unified prompt ─────────────────────────────────────────────────────────────
|
||||
|
||||
def build_unified_prompt(presets, rel_types):
    """Build the system prompt for single-pass entity + relation extraction.

    Args:
        presets: Entity-type dicts with ``label`` and ``type_ref`` keys and an
            optional ``metadata_fields`` list.
        rel_types: Allowed relation type names.

    Returns:
        The prompt text: entity schema, relation types, the strict JSON
        output format and the extraction rules, separated by blank lines.
    """
    type_lines = [
        "- {label} (type_ref: {ref}): [{fields}]".format(
            fields=", ".join(p.get("metadata_fields", [])),
            label=p["label"],
            ref=p["type_ref"],
        )
        for p in presets
    ]

    intro = (
        "You are an entity and relation extraction expert. "
        "Given text, extract ALL entities and relations in a single pass."
    )
    output_format = (
        'OUTPUT FORMAT (strict JSON):\n'
        '{\n'
        ' "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],\n'
        ' "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],\n'
        ' "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]\n'
        '}'
    )
    rules = (
        "RULES:\n"
        "- Extract ALL entities explicitly mentioned\n"
        "- Use exact type_ref from schema. Unknown attributes = null\n"
        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied\n"
        "- Relations: from_name/to_name MUST match entity names exactly\n"
        "- suggested_types: for important entities that do NOT fit any type, suggest a new type. "
        "Use those suggested type_refs for those entities in the entities array.\n"
        '- If no new types needed: "suggested_types": []\n'
        "- Respond in the same language as the text for descriptions"
    )

    sections = [
        intro,
        "ENTITY TYPES:\n" + "\n".join(type_lines),
        "RELATION TYPES: " + ", ".join(rel_types),
        output_format,
        rules,
    ]
    return "\n\n".join(sections)
|
||||
|
||||
# ── Process one chunk ──────────────────────────────────────────────────────────
|
||||
|
||||
def _list_of_dicts(value) -> list:
    """Return the dict items of *value* when it is a list, otherwise ``[]``."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _clean_name(value) -> str:
    """Return *value* stripped when it is a string, otherwise ``""``."""
    return value.strip() if isinstance(value, str) else ""


def _as_confidence(value, default: float = 0.5) -> float:
    """Coerce *value* to float, falling back to *default* when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str):
    """Extract entities, relations and suggested types from one text chunk.

    The model reply is validated field by field so one malformed item (null
    name, non-numeric confidence, null attributes, non-object entry) is
    skipped or defaulted instead of raising.

    Args:
        chunk_idx: Index of the chunk; recorded on every candidate.
        chunk_text: Text of the chunk, sent as the user message.
        system_prompt: Extraction instructions, sent as the system message.

    Returns:
        ``(entities, relations, suggested)``: lists of ``EntityCandidate``,
        ``RelationCandidate`` and suggested-type dicts. All three are empty
        when the LLM call fails or the reply is not a JSON object.
    """
    try:
        resp = claude_haiku_json([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk_text},
        ])
    except Exception as e:
        # Best effort: a failed chunk is reported and contributes nothing.
        print(f" [WARN] chunk {chunk_idx}: {e}")
        return [], [], []

    if not isinstance(resp, dict):
        print(f" [WARN] chunk {chunk_idx}: unexpected response type {type(resp).__name__}")
        return [], [], []

    entities = []
    for ent in _list_of_dicts(resp.get("entities")):
        name = _clean_name(ent.get("name"))
        if not name:
            continue
        attributes = ent.get("attributes")
        entities.append(EntityCandidate(
            name=name,
            type_ref=ent.get("type_ref") or "concept",
            attributes=attributes if isinstance(attributes, dict) else {},
            confidence=_as_confidence(ent.get("confidence", 0.5)),
            source_chunk_indices=[chunk_idx],
        ))

    relations = []
    for rel in _list_of_dicts(resp.get("relations")):
        fn = _clean_name(rel.get("from_name"))
        tn = _clean_name(rel.get("to_name"))
        if not fn or not tn:
            continue
        relations.append(RelationCandidate(
            from_name=fn,
            to_name=tn,
            relation_type=rel.get("relation_type") or "related_to",
            confidence=_as_confidence(rel.get("confidence", 0.5)),
            description=rel.get("description") or "",
            source_chunk_index=chunk_idx,
        ))

    suggested = _list_of_dicts(resp.get("suggested_types"))

    return entities, relations, suggested
|
||||
|
||||
# ── Sigma conversion ───────────────────────────────────────────────────────────
|
||||
|
||||
# Node colour per entity type_ref (hex RGB). Types missing from this table
# are given a fallback colour by the graph conversion code.
TYPE_COLORS = dict(
    # OSINT types
    person="#e74c3c",
    organization="#3498db",
    location="#2ecc71",
    event="#f39c12",
    email="#9b59b6",
    domain="#1abc9c",
    ip_address="#e67e22",
    phone="#95a5a6",
    social_media="#e91e63",
    document="#607d8b",
    crypto_wallet="#ff9800",
    malware="#f44336",
    vulnerability="#ff5722",
    # Generic types
    concept="#00bcd4",
    url="#8bc34a",
    date_reference="#cddc39",
    quantity="#ffc107",
    coordinates="#4caf50",
    text_fragment="#78909c",
)
|
||||
|
||||
def to_sigma(entities, relations, entity_id_map):
    """Convert deduplicated entities and relations into a sigma.js-style graph dict.

    Args:
        entities: Objects exposing ``name``, ``type_ref`` and ``attributes``
            (a dict or ``None``).
        relations: Objects exposing ``from_id``, ``from_name``, ``to_id``,
            ``to_name`` and ``relation_type``.
        entity_id_map: Mapping from entity name (exact, or lower-cased and
            stripped) to node id. Entities not found fall back to their name.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. There is one node per distinct
        id, sized by its degree. Edges are emitted only between existing
        nodes, without self-loops, and de-duplicated by
        ``(source, target, relation_type)``.
    """
    # Attribute names that are never copied from entity attributes
    # (the original code notes sigma.js reserves 'type' for the render program).
    reserved = {"type", "hidden", "x", "y"}

    name_to_uuid = {}
    for e in entities:
        name_to_uuid[e.name] = entity_id_map.get(
            e.name, entity_id_map.get(e.name.lower().strip(), e.name)
        )

    # Degree counts every relation endpoint, including duplicates and self-loops.
    degree = {}
    for r in relations:
        for endpoint in (r.from_id or r.from_name, r.to_id or r.to_name):
            degree[endpoint] = degree.get(endpoint, 0) + 1

    nodes = []
    seen_ids = set()
    for e in entities:
        node_id = name_to_uuid[e.name]
        if node_id in seen_ids:
            continue
        seen_ids.add(node_id)
        attrs = {
            k: str(v)
            for k, v in (e.attributes or {}).items()
            if v is not None and k not in reserved
        }
        nodes.append({
            "key": node_id,
            "attributes": {
                # Free-form attributes go first so they can never overwrite
                # the computed display fields below (e.g. an entity attribute
                # literally named "label" or "size").
                **attrs,
                "label": e.name,
                "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"),
                "size": 4 + min(degree.get(node_id, 0) * 2, 20),
                "entity_type": e.type_ref,
            },
        })

    edges = []
    seen_edges = set()
    for i, r in enumerate(relations):
        fid = r.from_id or r.from_name
        tid = r.to_id or r.to_name
        if fid not in seen_ids or tid not in seen_ids or fid == tid:
            continue
        edge_key = (fid, tid, r.relation_type)
        if edge_key in seen_edges:
            continue
        seen_edges.add(edge_key)
        edges.append({
            # The key uses the relation's position in the input, so keys stay
            # unique even though skipped relations leave gaps.
            "key": f"e{i}",
            "source": fid,
            "target": tid,
            "attributes": {"label": r.relation_type},
        })

    return {"nodes": nodes, "edges": edges}
|
||||
|
||||
# ── Reclasificación de entidades genéricas ─────────────────────────────────────
|
||||
|
||||
# type_refs considered "generic": entities carrying one of these are the
# candidates for reclassification into newly discovered types.
GENERIC_TYPE_REFS = set(
    "concept text_fragment url date_reference quantity coordinates".split()
)
|
||||
|
||||
|
||||
def reclassify_generic_entities(entities, new_presets, workers=4):
    """Reclassify generic entities into newly discovered types, in place.

    Instead of re-processing chunks, the generic entities are sent to the
    LLM in batches of 30 together with the new presets. The model answers
    with the entities that should change type. Only answers that point at a
    generic entity and name one of the new ``type_ref`` values are applied.

    Args:
        entities: Entity objects with mutable ``type_ref`` and ``attributes``.
            Matching entities are modified in place.
        new_presets: Preset dicts (``label``, ``type_ref``, optional
            ``metadata_fields``) describing the new types.
        workers: Maximum number of batches processed concurrently.

    Returns:
        Number of distinct entities whose type was changed. Returns 0
        without calling the LLM when there are no generic entities or no
        new presets.
    """
    generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS]
    if not generic or not new_presets:
        return 0

    type_lines = []
    for p in new_presets:
        fields = ", ".join(p.get("metadata_fields", []))
        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")

    # Used to reject answers that reference other entities or invent types.
    allowed_refs = {p["type_ref"] for p in new_presets}
    generic_indices = {i for i, _ in generic}

    system = (
        "You reclassify entities into more specific types. "
        "For each entity, decide if it fits one of the NEW types below better than its current generic type. "
        "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n"
        "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n"
        'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n'
        "Only include entities that should change. Omit those that should stay as-is."
    )

    # Batches of 30 entities keep each request within the model's context.
    batch_size = 30

    def _reclassify_batch(batch):
        """Ask the LLM about one batch; return its list of proposed changes."""
        items = [{"index": idx, "name": e.name, "current_type": e.type_ref,
                  "attributes": e.attributes} for idx, e in batch]
        try:
            resp = claude_haiku_json([
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(items, ensure_ascii=False)},
            ])
        except Exception as exc:
            # Best effort: report the failed batch instead of hiding it.
            print(f" [WARN] reclassify batch: {exc}")
            return []
        proposed = resp.get("reclassified") if isinstance(resp, dict) else None
        return proposed if isinstance(proposed, list) else []

    batches = [generic[i:i + batch_size] for i in range(0, len(generic), batch_size)]

    changed = set()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(_reclassify_batch, b) for b in batches]
        for future in as_completed(futures):
            for item in future.result():
                if not isinstance(item, dict):
                    continue
                idx = item.get("index")
                new_ref = item.get("type_ref")
                # bool is an int subclass; exclude it so True/False are not indices.
                if isinstance(idx, bool) or not isinstance(idx, int) or idx not in generic_indices:
                    continue
                if not isinstance(new_ref, str) or new_ref not in allowed_refs:
                    continue
                entity = entities[idx]
                entity.type_ref = new_ref
                new_attrs = item.get("attributes")
                if isinstance(new_attrs, dict) and new_attrs:
                    if entity.attributes is None:
                        entity.attributes = {}
                    entity.attributes.update(new_attrs)
                changed.add(idx)

    return len(changed)
|
||||
|
||||
|
||||
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: extract an ontology graph from a document.

    Usage: ``python extract.py <file> [workers]``. A relative ``<file>`` is
    resolved against this script's directory; ``workers`` defaults to 4.

    Steps: extract and preprocess text, split it into chunks, run the LLM
    extraction on the chunks in parallel, deduplicate entities and relations,
    register newly suggested types (reclassifying generic entities with
    them), then produce the HTML graph and an ``extraction_result.json`` in
    the ``data`` directory next to this script. Exits with status 1 when no
    file argument is given.
    """
    if len(sys.argv) < 2:
        print("Uso: python extract.py <archivo>")
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.isabs(file_path):
        # Relative paths are resolved against the script directory, not the CWD.
        file_path = os.path.join(os.path.dirname(__file__), file_path)

    workers = int(sys.argv[2]) if len(sys.argv) > 2 else 4

    print(f"=== Ontology Graph Extraction ===")
    print(f"File: {file_path}")
    print(f"Workers: {workers}")
    start = time.monotonic()

    # 1. Extract and preprocess text
    print("\n[1/5] Extracting text...")
    raw = extract_text_from_file(file_path)
    text = preprocess_text(raw)
    print(f" {len(text)} chars")

    # 2. Chunking
    print("[2/5] Chunking...")
    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
    print(f" {len(chunks)} chunks")

    # 3. Parallel extraction
    custom = load_custom_presets()
    # Only non-promoted custom presets are used (promoted ones are expected
    # to already live in the registry).
    active_custom = [p for p in custom if not p.get("promoted", False)]
    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
    print(f" Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)

    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
    all_entities = []
    all_relations = []
    all_suggested = []

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(process_chunk, i, chunk, system_prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in as_completed(futures):
            idx = futures[future]
            ents, rels, sugg = future.result()
            all_entities.extend(ents)
            all_relations.extend(rels)
            all_suggested.extend(sugg)
            print(f" chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations" +
                  (f", {len(sugg)} new types" if sugg else ""))

    # 4. Deduplication
    print(f"\n[4/5] Deduplicating...")
    print(f" Raw: {len(all_entities)} entities, {len(all_relations)} relations")

    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
    final_entities = dedup.entities
    entity_id_map = dedup.name_to_id

    final_relations = deduplicate_relations(all_relations, entity_id_map)

    print(f" Final: {len(final_entities)} entities, {len(final_relations)} relations")
    print(f" Merged: {dedup.total_before - dedup.total_after} entities, "
          f"{len(all_relations) - len(final_relations)} relations")

    # Keep the first well-formed suggestion per type_ref. Suggestions are raw
    # LLM output, so anything that is not an object with a string type_ref
    # is ignored rather than allowed to crash the run.
    unique_suggested = []
    seen = set()
    for s in all_suggested:
        if not isinstance(s, dict):
            continue
        key = s.get("type_ref", "")
        if isinstance(key, str) and key and key not in seen:
            seen.add(key)
            unique_suggested.append(s)

    # Register suggested types in custom_presets.json
    if all_suggested:
        source_doc = os.path.basename(file_path)
        added = merge_suggested_into_custom(unique_suggested, source_doc)
        total_custom = len(load_custom_presets())

        if added:
            print(f"\n New types registered ({len(added)}):")
            for p in added:
                print(f" + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
                print(f" Reason: {p['reason']}")
            print(f" Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")

            # Reclassify generic entities with the newly discovered types
            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
            if n_generic > 0:
                print(f"\n Reclassifying {n_generic} generic entities with new types...")
                changed = reclassify_generic_entities(final_entities, added, workers=workers)
                print(f" Reclassified: {changed}/{n_generic}")
        else:
            print(f"\n {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")

    # Per-type statistics
    type_counts = {}
    for e in final_entities:
        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
    print(f"\n Entity types:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    rel_counts = {}
    for r in final_relations:
        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
    print(f" Relation types:")
    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    # 5. Visualization
    print(f"\n[5/5] Generating graph...")
    graph = to_sigma(final_entities, final_relations, entity_id_map)
    out_dir = os.path.join(os.path.dirname(__file__), "data")
    # The output directory is not guaranteed to exist (e.g. when the input
    # file lives elsewhere and no custom presets were saved).
    os.makedirs(out_dir, exist_ok=True)
    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
    print(f" {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
    print(f" HTML: file://{html_path}")

    # Save the intermediate JSON. ensure_ascii=False writes raw non-ASCII
    # characters, so the file encoding is pinned to UTF-8.
    json_path = os.path.join(out_dir, "extraction_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "entities": [{"name": e.name, "type_ref": e.type_ref,
                          "confidence": e.confidence, "attributes": e.attributes}
                         for e in final_entities],
            "relations": [{"from": r.from_name, "to": r.to_name,
                           "type": r.relation_type, "confidence": r.confidence,
                           "description": r.description}
                          for r in final_relations],
            "suggested_types": [dict(s) for s in unique_suggested],
        }, f, ensure_ascii=False, indent=2)
    print(f" JSON: {json_path}")

    elapsed = time.monotonic() - start
    print(f"\nDone in {elapsed:.1f}s")
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,43 @@
|
||||
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
|
||||
|
||||
|
||||
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the entity-type section of an extraction system prompt.

    Each preset becomes a numbered entry showing its label and ``type_ref``,
    followed by an ``Attributes:`` line when it declares metadata fields.
    Entries are separated by blank lines.

    Args:
        entity_presets: Preset dicts with ``label``, ``type_ref`` and,
            optionally, ``metadata_fields``. Example:
                [{"type_ref": "osint_person_go_cybersecurity",
                  "label": "Person",
                  "metadata_fields": ["full_name", "alias"]}]

    Returns:
        The formatted prompt section, or an empty string when there are no
        presets.
    """
    if not entity_presets:
        return ""

    def _describe(position: int, preset: dict) -> str:
        """Format one preset as its header line plus optional attributes line."""
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        fields = preset.get("metadata_fields", [])
        header = f"{position}. {label} (type_ref: {type_ref})"
        if not fields:
            return header
        return header + "\n" + f" Attributes: {', '.join(fields)}"

    entries = [_describe(i, p) for i, p in enumerate(entity_presets, start=1)]
    return "\n\n".join(["Entity types available for extraction:", *entries])
|
||||
@@ -0,0 +1,22 @@
|
||||
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
|
||||
|
||||
|
||||
def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the allowed-relation-types section of an extraction system prompt.

    Args:
        relation_types: Names of the relation types the LLM may use, e.g.
            ``["funds", "employs", "communicates_with"]``.

    Returns:
        A header line followed by the comma-separated type names, or an
        empty string when no types are given.
    """
    return (
        "Allowed relation types:\n" + ", ".join(relation_types)
        if relation_types
        else ""
    )
|
||||
@@ -0,0 +1,814 @@
|
||||
"""Core functional programming utilities — pure functions for list/collection operations."""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from functools import reduce as _reduce
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def filter_list(xs: list, pred: Callable) -> list:
    """Return a new list with the elements of *xs* for which *pred* is truthy.

    The input list is left untouched.
    """
    kept = []
    for item in xs:
        if pred(item):
            kept.append(item)
    return kept
|
||||
|
||||
|
||||
def map_list(xs: list, fn: Callable) -> list:
    """Return a new list with *fn* applied to every element of *xs*.

    The input list is left untouched.
    """
    return list(map(fn, xs))
|
||||
|
||||
|
||||
def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
    """Fold *xs* from the left into one value.

    Starts from *initial* and applies ``fn(acc, x)`` for each element in
    order. Returns *initial* unchanged when *xs* is empty.
    """
    acc = initial
    for item in xs:
        acc = fn(acc, item)
    return acc
|
||||
|
||||
|
||||
def flat_map(xs: list, fn: Callable) -> list:
    """Apply *fn* to each element and concatenate the resulting iterables.

    Only one level is flattened: nested lists inside the results are kept.
    """
    return [out for item in xs for out in fn(item)]
|
||||
|
||||
|
||||
def flatten(xss: list) -> list:
    """Concatenate the inner iterables of *xss* into one list (one level only)."""
    return [item for inner in xss for item in inner]
|
||||
|
||||
|
||||
def chunk(xs: list, size: int) -> list:
    """Split *xs* into consecutive slices of at most *size* elements.

    The last slice may be shorter. A non-positive *size* yields an empty list.
    """
    if size <= 0:
        return []
    pieces = []
    start = 0
    total = len(xs)
    while start < total:
        pieces.append(xs[start:start + size])
        start += size
    return pieces
|
||||
|
||||
|
||||
def take(xs: list, n: int) -> list:
    """Return the first *n* elements of *xs* as a new list.

    Follows slice semantics: an *n* larger than the list returns everything,
    and a negative *n* drops that many elements from the end.
    """
    prefix = xs[0:n]
    return prefix
|
||||
|
||||
|
||||
def drop(xs: list, n: int) -> list:
    """Return *xs* without its first *n* elements, as a new list.

    Follows slice semantics: an *n* larger than the list returns an empty
    list, and a negative *n* keeps only that many elements from the end.
    """
    remainder = xs[slice(n, None)]
    return remainder
|
||||
|
||||
|
||||
def unique(xs: list) -> list:
    """Remove duplicates from *xs*, preserving first-occurrence order.

    Elements are compared by equality, so ``1``, ``1.0`` and ``True`` count
    as the same value. Hashable elements are tracked in a set. Unhashable
    ones (lists, dicts, ...) fall back to a linear scan instead of raising
    ``TypeError``.

    Returns:
        A new list; the input is not modified.
    """
    seen_hashable = set()
    seen_unhashable = []
    result = []
    for x in xs:
        try:
            is_new = x not in seen_hashable
            if is_new:
                seen_hashable.add(x)
        except TypeError:
            # Unhashable element: compare against earlier unhashable ones.
            is_new = x not in seen_unhashable
            if is_new:
                seen_unhashable.append(x)
        if is_new:
            result.append(x)
    return result
|
||||
|
||||
|
||||
def group_by(xs: list, key_fn: Callable) -> Dict:
    """Group the elements of *xs* by ``key_fn(element)``.

    Returns:
        A dict mapping each key to the list of its elements. Keys appear in
        first-seen order and elements keep their input order.
    """
    groups: Dict = {}
    for item in xs:
        groups.setdefault(key_fn(item), []).append(item)
    return groups
|
||||
|
||||
|
||||
def partition(xs: list, pred: Callable) -> Tuple[list, list]:
    """Split *xs* into ``(matches, non_matches)`` according to *pred*.

    *pred* is called once per element; relative order is kept in both lists.
    """
    accepted: list = []
    rejected: list = []
    for item in xs:
        target = accepted if pred(item) else rejected
        target.append(item)
    return (accepted, rejected)
|
||||
|
||||
|
||||
def find(xs: list, pred: Callable) -> Any:
    """Return the first element of *xs* satisfying *pred*, or ``None`` if none does."""
    return next((item for item in xs if pred(item)), None)
|
||||
|
||||
|
||||
def find_index(xs: list, pred: Callable) -> int:
    """Return the index of the first element satisfying *pred*, or ``-1`` if none does."""
    return next((pos for pos, item in enumerate(xs) if pred(item)), -1)
|
||||
|
||||
|
||||
def zip_with(xs: list, ys: list, fn: Callable) -> list:
    """Combine *xs* and *ys* pairwise with ``fn(x, y)``.

    Stops at the end of the shorter input.
    """
    return list(map(fn, xs, ys))
|
||||
|
||||
|
||||
def all_of(xs: list, pred: Callable) -> bool:
    """Return True when *pred* holds for every element of *xs*.

    Vacuously True for an empty list. Stops at the first failing element.
    """
    for item in xs:
        if not pred(item):
            return False
    return True
|
||||
|
||||
|
||||
def any_of(xs: list, pred: Callable) -> bool:
    """Return True when *pred* holds for at least one element of *xs*.

    False for an empty list. Stops at the first matching element.
    """
    for item in xs:
        if pred(item):
            return True
    return False
|
||||
|
||||
|
||||
def pipe(value: Any, *fns: Callable) -> Any:
    """Thread *value* through *fns* from left to right.

    ``pipe(x, f, g)`` equals ``g(f(x))``. With no functions the value is
    returned unchanged.
    """
    return _reduce(lambda acc, step: step(acc), fns, value)
|
||||
|
||||
|
||||
def compose(*fns: Callable) -> Callable:
    """Compose functions right to left.

    ``compose(f, g)(x)`` equals ``f(g(x))``. With no functions the result is
    the identity function.
    """
    def composed(x: Any) -> Any:
        # Apply the last function first, then walk back towards the first.
        return _reduce(lambda acc, step: step(acc), reversed(fns), x)

    return composed
|
||||
|
||||
|
||||
# ── Tree manipulation ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a list of nodes without their children.

    A dict is one node; a list is a sequence of sibling nodes. Each returned
    node is a deep copy of the original dict with its ``'nodes'`` key
    removed. Children are taken from every string key whose name contains
    ``'nodes'``, and nodes are returned in depth-first order.

    Args:
        structure: A node dict, a list of node dicts, or anything else
            (which yields an empty list).

    Returns:
        The flattened list of copied nodes. The input is not modified.
    """
    import copy
    if isinstance(structure, dict):
        # Drop 'nodes' before copying so descendants are not deep-copied at
        # every level only to be thrown away.
        own_fields = {k: v for k, v in structure.items() if k != 'nodes'}
        nodes = [copy.deepcopy(own_fields)]
        for key, value in structure.items():
            # Non-string keys cannot name a children list; skipping them
            # avoids a TypeError from the substring test.
            if isinstance(key, str) and 'nodes' in key:
                nodes.extend(flatten_tree(value))
        return nodes
    elif isinstance(structure, list):
        nodes = []
        for item in structure:
            nodes.extend(flatten_tree(item))
        return nodes
    return []
|
||||
|
||||
|
||||
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a tree in depth-first (pre-order) order.

    Unlike a copying flatten, the returned entries are the original dict
    objects, still carrying their ``'nodes'`` children.

    Args:
        structure: A node dict, a list of nodes, or anything else (ignored).

    Returns:
        The nodes in visiting order; ``[]`` for non dict/list input.
    """
    if isinstance(structure, list):
        collected = []
        for entry in structure:
            collected += tree_to_flat_list(entry)
        return collected

    if not isinstance(structure, dict):
        return []

    descendants = tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
    return [structure] + descendants
|
||||
|
||||
|
||||
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect copies of the leaf nodes of a hierarchical tree.

    A dict is a leaf when its ``'nodes'`` entry is missing or falsy; the leaf
    is deep-copied and any ``'nodes'`` key removed from the copy. For internal
    nodes the search continues under every key whose name contains ``'nodes'``.

    Args:
        structure: A node dict, a list of nodes, or anything else (ignored).

    Returns:
        Leaf copies in depth-first order; ``[]`` for non dict/list input.
    """
    import copy

    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]

    if not isinstance(structure, dict):
        return []

    if not structure.get('nodes'):
        leaf = copy.deepcopy(structure)
        leaf.pop('nodes', None)
        return [leaf]

    leaves = []
    for key in list(structure.keys()):
        if 'nodes' in key:
            leaves.extend(get_leaf_nodes(structure[key]))
    return leaves
|
||||
|
||||
|
||||
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Stamp sequential zero-padded IDs onto every node of a tree, in place.

    Each dict receives ``data['node_id'] = str(counter).zfill(4)`` in
    depth-first order, so with the default start the first node gets
    ``'0000'``. Children are looked up under every key whose name contains
    ``'nodes'``.

    Args:
        data: A node dict, a list of nodes, or anything else (left untouched).
        node_id: Counter value to assign to the first node encountered.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for element in data:
            node_id = write_node_ids(element, node_id)
        return node_id

    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        # Snapshot the child keys so the walk is unaffected by later edits.
        child_keys = [key for key in data if 'nodes' in key]
        for key in child_keys:
            node_id = write_node_ids(data[key], node_id)

    return node_id
|
||||
|
||||
|
||||
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items using their dotted ``'structure'`` codes.

    An item with code ``'1.2.3'`` is attached under the item with code
    ``'1.2'`` if that item appeared earlier; otherwise it becomes a root.
    Output nodes carry ``title``, ``start_index`` and ``end_index`` (read with
    ``.get``) and a ``nodes`` list that is omitted when empty.

    Args:
        data: Items in document order.

    Returns:
        List of root nodes.
    """
    def parent_code(code):
        # '1.2.3' -> '1.2'; top-level or empty codes have no parent.
        if not code:
            return None
        head, dot, _ = str(code).rpartition('.')
        return head if dot else None

    def prune(entry):
        # Recursively drop empty 'nodes' lists.
        children = entry['nodes']
        if children:
            for child in children:
                prune(child)
        else:
            del entry['nodes']
        return entry

    by_code = {}
    roots = []

    for item in data:
        code = item.get('structure')
        entry = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        by_code[code] = entry

        parent = parent_code(code)
        if parent and parent in by_code:
            by_code[parent]['nodes'].append(entry)
        else:
            roots.append(entry)

    return [prune(root) for root in roots]
|
||||
|
||||
|
||||
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the given keys removed at every level.

    Args:
        data: Nested structure of dicts and lists; other values pass through.
        fields: Keys to drop from every dict. Defaults to ``['text']``.

    Returns:
        New dicts/lists without the listed keys; leaf values are reused as-is.
    """
    dropped = ['text'] if fields is None else fields

    if isinstance(data, list):
        return [remove_tree_fields(element, dropped) for element in data]

    if isinstance(data, dict):
        pruned = {}
        for key, value in data.items():
            if key in dropped:
                continue
            pruned[key] = remove_tree_fields(value, dropped)
        return pruned

    return data
|
||||
|
||||
|
||||
def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Return a copy of a tree whose node fields follow the given key order.

    Every dict is rebuilt with only the keys listed in ``order``, in that
    order. Children under ``'nodes'`` are formatted recursively and the key is
    omitted when the formatted children are empty. The input tree is not
    modified.

    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Keys to keep, in output order. When empty or ``None`` the
            input is returned unchanged.

    Returns:
        The reordered structure; non dict/list values are returned as-is.
    """
    if not order:
        return structure

    if isinstance(structure, dict):
        formatted = {}
        for key in order:
            if key not in structure:
                continue
            if key == 'nodes':
                # Build formatted children separately rather than writing them
                # back into the caller's dict.
                children = format_tree_structure(structure['nodes'], order)
                if children:
                    formatted['nodes'] = children
            else:
                formatted[key] = structure[key]
        return formatted

    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]

    return structure
|
||||
|
||||
|
||||
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index every node of a tree by its ``node_id`` for direct lookup.

    Nodes are visited depth-first through their ``'nodes'`` children. Nodes
    with a missing or falsy ``node_id`` are skipped. When an ID repeats, the
    later node replaces the earlier one.

    Args:
        tree: List of root node dicts.

    Returns:
        Dict mapping ``node_id`` to the original node object.
    """
    def walk(nodes):
        for node in nodes:
            yield node
            children = node.get('nodes')
            if children:
                yield from walk(children)

    return {node['node_id']: node for node in walk(tree) if node.get('node_id')}
|
||||
|
||||
|
||||
# ── Text / JSON extraction ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response, tolerating common noise.

    The payload is the text between the first ```` ```json ```` fence and the
    last ```` ``` ```` fence, or up to the end of the text when the block is
    never closed. Without a fence the whole text is used. Parsing is then
    attempted on progressively more heavily repaired text:

    1. the payload as-is, so valid JSON is never altered;
    2. with Python ``None`` rewritten to ``null`` and all whitespace runs
       (including raw newlines) collapsed to single spaces;
    3. additionally with trailing commas before ``]`` or ``}`` removed.

    The repairs in steps 2-3 are textual and can also touch string contents,
    which is why they are only tried after the untouched payload fails.

    Args:
        content: Raw model output.

    Returns:
        The parsed JSON value (normally a dict), or ``{}`` if nothing parses
        or ``content`` is not a string.
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    start_idx = content.find(fence)
    if start_idx != -1:
        start_idx += len(fence)
        end_idx = content.rfind("```")
        # With no closing fence, rfind() lands on the opening one; take the
        # rest of the text instead of an empty slice.
        if end_idx < start_idx:
            end_idx = len(content)
        payload = content[start_idx:end_idx].strip()
    else:
        payload = content.strip()

    normalized = ' '.join(payload.replace('None', 'null').split())
    relaxed = re.sub(r',\s*([\]}])', r'\1', normalized)

    for candidate in (payload, normalized, relaxed):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            # Best-effort parser: fall through to the next repair level.
            continue
    return {}
|
||||
|
||||
|
||||
def parse_page_range(pages: str) -> List[int]:
    """Parse a page specification into a sorted list of unique page numbers.

    The specification is a comma-separated list of single pages (``'12'``)
    and inclusive ranges (``'5-7'``). Whitespace around items is ignored, as
    are empty items such as a trailing or doubled comma.

    Args:
        pages: Specification string, e.g. ``'1-3, 8, 12'``.

    Returns:
        Ascending list of distinct ints; ``[]`` when no item is given.

    Raises:
        ValueError: If an item is not an integer or a range has start > end.
    """
    collected = set()
    for raw_part in pages.split(','):
        part = raw_part.strip()
        if not part:
            # Tolerate '3,8,' or '3,,8' instead of failing on int('').
            continue
        if '-' in part:
            low_text, high_text = part.split('-', 1)
            start, end = int(low_text), int(high_text)
            if start > end:
                raise ValueError(f"Invalid range '{part}': start must be <= end")
            collected.update(range(start, end + 1))
        else:
            collected.add(int(part))
    return sorted(collected)
|
||||
|
||||
|
||||
# ── Markdown parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect the ATX headers (``#`` to ``######``) of a markdown document.

    Each line is stripped before matching, so indented headers are accepted.
    A line starting with three backticks toggles code-block mode, and headers
    inside a code block are ignored.

    Args:
        markdown_content: Full markdown text.

    Returns:
        Tuple ``(headers, lines)``. ``headers`` is a list of dicts with
        ``title``, ``level`` (1-6) and 1-based ``line_num``. ``lines`` is the
        document split on ``'\\n'``.
    """
    import re

    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^```')

    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False

    for line_num, raw in enumerate(lines, 1):
        text = raw.strip()
        if fence_re.match(text):
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = header_re.match(text)
        if found is None:
            continue
        hashes, heading = found.groups()
        headers.append({'title': heading.strip(), 'level': len(hashes), 'line_num': line_num})

    return headers, lines
|
||||
|
||||
|
||||
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers according to their levels.

    A header becomes a child of the closest preceding header with a strictly
    smaller level. Nodes get sequential zero-padded ``node_id`` values
    starting at ``'0001'``, and empty ``nodes`` lists are removed.

    Args:
        node_list: Dicts with ``title``, ``level`` and ``line_num``.

    Returns:
        List of root nodes, each with ``title``, ``node_id``, ``line_num`` and
        optionally ``nodes``.
    """
    if not node_list:
        return []

    roots = []
    ancestors = []  # open chain of (level, node), outermost first

    for counter, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(counter).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }

        # Close every open section that is not an ancestor of this header.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        siblings = ancestors[-1][1]['nodes'] if ancestors else roots
        siblings.append(entry)
        ancestors.append((level, entry))

    def prune(entries):
        for entry in entries:
            if entry['nodes']:
                prune(entry['nodes'])
            else:
                del entry['nodes']
        return entries

    return prune(roots)
|
||||
|
||||
|
||||
# ── Pagination / chunking ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group consecutive pages into text chunks of bounded token size.

    If all pages fit in ``max_tokens`` a single chunk with everything is
    returned. Otherwise pages are accumulated until adding the next one would
    exceed a target size, taken as the mean of ``max_tokens`` and the even
    split ``total / ceil(total / max_tokens)``. Each new chunk starts with the
    last ``overlap_pages`` pages of the previous one.

    A single page larger than the target still forms its own chunk; no empty
    chunk is emitted for it.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, aligned with ``page_contents``.
        max_tokens: Upper bound used to derive the target chunk size.
        overlap_pages: Number of trailing pages repeated at the start of the
            next chunk.

    Returns:
        List of chunk strings (pages concatenated without a separator).
    """
    import math

    num_tokens = sum(token_lengths)
    if num_tokens <= max_tokens:
        return ["".join(page_contents)]

    expected_parts = math.ceil(num_tokens / max_tokens)
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)

    subsets = []
    current_subset = []
    current_token_count = 0

    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Only close a chunk that actually holds pages; otherwise an oversized
        # first page would push an empty string into the output.
        if current_subset and current_token_count + page_tokens > avg_tokens:
            subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])

        current_subset.append(page_content)
        current_token_count += page_tokens

    if current_subset:
        subsets.append(''.join(current_subset))

    return subsets
|
||||
|
||||
|
||||
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Estimate the offset between printed page numbers and physical indices.

    For every pair the difference ``physical_index - page`` is computed. Pairs
    missing a key or holding non-subtractable values are skipped. The most
    frequent difference wins, and ties go to the one seen first.

    Args:
        pairs: Dicts with ``physical_index`` and ``page`` entries.

    Returns:
        The most common difference, or 0 when no pair is usable.
    """
    tally: Dict[int, int] = {}
    for pair in pairs:
        try:
            delta = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        tally[delta] = tally.get(delta, 0) + 1

    if not tally:
        return 0

    # max() keeps the first maximal key, i.e. the earliest-seen difference.
    return max(tally, key=tally.get)
|
||||
|
||||
|
||||
# ── Text preprocessing ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.

    Args:
        text: Raw text to normalize.

    Returns:
        Text with ``\\n`` line endings, every line stripped of surrounding
        whitespace, at most one blank line in a row, and no leading or
        trailing whitespace.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first, so whitespace-only lines become truly empty and
    # are counted as blank lines by the collapse below.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to at most 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
||||
|
||||
|
||||
def get_text_stats(text: str) -> dict:
    """Compute character, line and word counts for a text.

    Lines are counted as the number of ``'\\n'`` characters plus one, so the
    empty string counts as one line. Words are whitespace-separated tokens.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with int values under ``total_chars``, ``total_lines`` and
        ``total_words``.
    """
    newline_count = text.count('\n')
    words = text.split()
    stats = {
        'total_chars': len(text),
        'total_lines': newline_count + 1,
        'total_words': len(words),
    }
    return stats
|
||||
|
||||
|
||||
# ── Git URL parsing ──────────────────────────────────────────────────────────
|
||||
|
||||
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
|
||||
|
||||
|
||||
def _sanitize_git_segment(segment: str) -> str:
|
||||
"""Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
|
||||
if segment.endswith(".git"):
|
||||
segment = segment[:-4]
|
||||
return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
|
||||
|
||||
|
||||
def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return the 'org/repo' path component.

    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
    Returns None if the URL does not match any known host or is malformed.

    Host matching is case-insensitive for URL schemes, because ``urlparse``
    lowercases the hostname. The SSH shorthand accepts the host either as
    written or lowercased, so both forms behave the same.

    Args:
        url: Repository URL in any supported format.
        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        'org/repo' string or None.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    def org_repo(path: str) -> Optional[str]:
        # The first two non-empty path segments are org and repo; both must
        # survive sanitization.
        segments = [s for s in path.split("/") if s]
        if len(segments) < 2:
            return None
        org = _sanitize_git_segment(segments[0])
        repo = _sanitize_git_segment(segments[1])
        if not org or not repo:
            return None
        return f"{org}/{repo}"

    if url.startswith("git@"):
        # git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon:
            return None
        if host not in hosts and host.lower() not in hosts:
            return None
        return org_repo(path)

    if url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
        except ValueError:
            # e.g. an unbalanced IPv6 bracket: malformed, per the contract.
            return None
        if (parsed.hostname or "") not in hosts:
            return None
        return org_repo(parsed.path)

    return None
|
||||
|
||||
|
||||
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if url points to a clonable git repository.

    Accepts org/repo and org/repo/tree/<ref> paths for http(s) URLs.
    Rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
    SSH shorthand, ssh:// and git:// URLs are accepted whenever the host is
    known. Malformed URLs yield False.

    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    # SSH shorthand — always repo-level if host matches. The host is accepted
    # as written or lowercased, matching urlparse's handling of URL schemes.
    if url.startswith("git@"):
        host, colon, _ = url[len("git@"):].partition(":")
        return bool(colon) and (host in hosts or host.lower() in hosts)

    if not url.startswith(("ssh://", "git://", "http://", "https://")):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # Unparseable netloc (e.g. an unbalanced IPv6 bracket).
        return False

    if (parsed.hostname or "") not in hosts:
        return False

    # git:// and ssh:// — always repo-level if host matches
    if url.startswith(("ssh://", "git://")):
        return True

    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
    segments = [s for s in parsed.path.split("/") if s]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
|
||||
|
||||
|
||||
def validate_git_ssh_uri(url: str) -> None:
    """Check that ``url`` has the SSH shorthand shape ``git@host:path``.

    Only the ``git@`` prefix, the presence of ``:`` after it, and a non-empty
    path are verified.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: If the prefix, the colon or the path is missing.
    """
    if not url.startswith("git@"):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")

    _host, colon, repo_path = url[len("git@"):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not repo_path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown parsing utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.

    The opening ``---`` must be the very first line. The closing ``---`` must
    stand alone on a later line. Both ``\\n`` and ``\\r\\n`` line endings are
    accepted, delimiters may carry trailing spaces or tabs, and the closing
    delimiter may be the last line of the text without a final newline.

    The block is parsed with PyYAML when it is installed. If PyYAML is missing
    or rejects the block, a minimal ``key: value`` line parser is used instead
    and all values stay strings.

    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.

    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter is found, or when YAML
        parsing succeeds but does not produce a mapping.
    """
    pattern = re.compile(
        r'\A---[ \t]*\r?\n(.*?)(?:\r?\n)?^---[ \t]*(?:\r?\n|\Z)',
        re.DOTALL | re.MULTILINE,
    )
    match = pattern.match(content)
    if not match:
        return content, None

    raw = match.group(1)
    remaining = content[match.end():]

    try:
        import yaml  # type: ignore
    except ImportError:
        yaml = None

    if yaml is not None:
        try:
            data = yaml.safe_load(raw)
        except yaml.YAMLError:
            pass  # fall through to the simple parser below
        else:
            return remaining, data if isinstance(data, dict) else None

    # Fallback: simple key: value parser (no yaml dependency)
    data = {}
    for line in raw.splitlines():
        if ':' in line:
            key, _, value = line.partition(':')
            data[key.strip()] = value.strip()

    return remaining, data
|
||||
|
||||
|
||||
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (# to ######) outside code blocks and HTML comments.

    A heading is a line that starts at column 0 with one to six ``#``
    characters, followed by spaces or tabs and a non-empty title on the same
    line. Because the ``#`` must be the first character of the line, indented
    lines and backslash-escaped ``\\#`` lines can never match and need no
    separate filtering.

    Args:
        content: Markdown text to search.

    Returns:
        List of (start_pos, end_pos, title, level) for each heading found, in
        document order. Positions are offsets into ``content``.
    """
    # Spans in which a '#' line is not a heading: fenced code and HTML comments.
    excluded: List[Tuple[int, int]] = [
        (m.start(), m.end())
        for pattern in (r'```.*?```', r'<!--.*?-->')
        for m in re.finditer(pattern, content, re.DOTALL)
    ]

    def is_excluded(pos: int) -> bool:
        return any(start <= pos < end for start, end in excluded)

    results: List[Tuple[int, int, str, int]] = []
    # [ \t]+ rather than \s+: \s also matches newlines, which would let a bare
    # '##' line swallow the following paragraph as its title.
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        if is_excluded(m.start()):
            continue
        title = m.group(2).strip()
        if not title:
            continue
        results.append((m.start(), m.end(), title, len(m.group(1))))

    return results
|
||||
|
||||
|
||||
def estimate_token_count(content: str) -> int:
    """Approximate the token count of a text without running a tokenizer.

    Characters in the CJK ranges used below are weighted 0.7 tokens each.
    Every other non-whitespace character is weighted 0.3. The weighted sum is
    truncated to an int.

    Args:
        content: Text to estimate.

    Returns:
        Estimated integer token count.
    """
    cjk_count = len(re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content))
    # CJK characters are themselves non-whitespace, so the remaining characters
    # are the non-whitespace total minus the CJK ones.
    other_count = len(re.findall(r'\S', content)) - cjk_count
    return int(cjk_count * 0.7 + other_count * 0.3)
|
||||
|
||||
|
||||
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split text into parts that stay within token and character budgets.

    The text is cut at blank lines (``'\\n\\n'``) and paragraphs are packed
    greedily. A new part starts when adding the next paragraph would push the
    estimated tokens or the character count over its limit. A paragraph that
    breaks a limit on its own is emitted separately, sliced into pieces of
    ``max_chars`` characters.

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part.

    Returns:
        List of string parts; ``[content]`` when nothing was produced.
    """
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0

    for para in content.split('\n\n'):
        para_tokens = estimate_token_count(para)
        para_chars = len(para)

        oversized = para_tokens > max_tokens or para_chars > max_chars
        overflow = (pending_tokens + para_tokens > max_tokens
                    or pending_chars + para_chars > max_chars)

        # In both cases the paragraphs gathered so far are closed as one part.
        if oversized or overflow:
            if pending:
                parts.append('\n\n'.join(pending))
                pending = []
            pending_tokens = 0
            pending_chars = 0

        if oversized:
            # Force-cut: fixed-width slices, each becoming its own part.
            parts.extend(para[i:i + max_chars] for i in range(0, para_chars, max_chars))
            continue

        pending.append(para)
        pending_tokens += para_tokens
        pending_chars += para_chars

    if pending:
        parts.append('\n\n'.join(pending))

    return parts if parts else [content]
|
||||
|
||||
|
||||
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
    with underscores and trims leading/trailing underscores. If the result is
    longer than ``max_length`` it is truncated and suffixed with ``_`` plus
    the first 8 hex digits of the SHA-256 of the original text.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Safe path-friendly string, ``'section'`` when nothing usable remains.
        When ``max_length`` is too small to hold the 9-character suffix, only
        as much of the hash as fits is returned (at least one character).
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')

    if not cleaned:
        return 'section'

    if len(cleaned) <= max_length:
        return cleaned

    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    suffix = '_' + digest
    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would cut from the wrong end and overshoot
        # max_length; fall back to a bare hash prefix.
        return digest[:max(max_length, 1)]
    return cleaned[:keep] + suffix
|
||||
@@ -0,0 +1,283 @@
|
||||
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from deduplication_result import DeduplicationResult
|
||||
from normalize_entity_name import normalize_entity_name
|
||||
from merge_entity_attributes import merge_entity_attributes
|
||||
|
||||
|
||||
# ── Similitud helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
|
||||
"""Distancia de edicion Levenshtein entre dos strings."""
|
||||
if a == b:
|
||||
return 0
|
||||
if not a:
|
||||
return len(b)
|
||||
if not b:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
|
||||
"""Similitud de Jaccard entre dos conjuntos de tokens."""
|
||||
set_a = set(tokens_a)
|
||||
set_b = set(tokens_b)
|
||||
if not set_a and not set_b:
|
||||
return 1.0
|
||||
inter = len(set_a & set_b)
|
||||
union = len(set_a | set_b)
|
||||
return inter / union if union else 0.0
|
||||
|
||||
|
||||
def _name_similarity(a: str, b: str) -> float:
    """Score how alike two normalized names are, in the range 0.0-1.0.

    The base score is the larger of the length-normalized Levenshtein
    similarity and the Jaccard similarity of the whitespace-separated tokens.
    Two bonuses of 0.3 each (capped at 1.0) are then applied: one when either
    name contains the other as a substring, and one when either name is the
    acronym of the other's tokens.
    """
    if a == b:
        return 1.0

    # Character-level similarity.
    longest = max(len(a), len(b))
    lev_sim = 1.0 - (_levenshtein(a, b) / longest) if longest else 1.0

    # Token-level similarity.
    words_a = a.split()
    words_b = b.split()
    score = max(lev_sim, _jaccard(words_a, words_b))

    # Containment bonus: one name appears inside the other.
    contained = a in b or b in a
    if contained:
        score = min(1.0, score + 0.3)

    # Acronym bonus, checked in both directions.
    acronym = _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a)
    if acronym:
        score = min(1.0, score + 0.3)

    return score
|
||||
|
||||
|
||||
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
|
||||
"""Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
|
||||
if not candidate or not tokens:
|
||||
return False
|
||||
initials = "".join(t[0] for t in tokens if t).upper()
|
||||
return candidate.upper() == initials
|
||||
|
||||
|
||||
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
|
||||
|
||||
|
||||
def _is_exact_type(entity_type: str) -> bool:
|
||||
"""Tipos tecnicos donde solo se acepta matching exacto."""
|
||||
return entity_type.lower() in _EXACT_TYPES
|
||||
|
||||
|
||||
# ── Union-Find ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class _UnionFind:
|
||||
def __init__(self, n: int) -> None:
|
||||
self._parent = list(range(n))
|
||||
self._rank = [0] * n
|
||||
|
||||
def find(self, x: int) -> int:
|
||||
while self._parent[x] != x:
|
||||
self._parent[x] = self._parent[self._parent[x]]
|
||||
x = self._parent[x]
|
||||
return x
|
||||
|
||||
def union(self, x: int, y: int) -> None:
|
||||
rx, ry = self.find(x), self.find(y)
|
||||
if rx == ry:
|
||||
return
|
||||
if self._rank[rx] < self._rank[ry]:
|
||||
rx, ry = ry, rx
|
||||
self._parent[ry] = rx
|
||||
if self._rank[rx] == self._rank[ry]:
|
||||
self._rank[rx] += 1
|
||||
|
||||
|
||||
# ── Implementacion principal ────────────────────────────────────────────────────
|
||||
|
||||
def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are normalized, compared pairwise with ``_name_similarity`` and the
    matching pairs are joined through a union-find, so merges are transitive.
    Each resulting cluster yields one canonical entity: the name comes from
    the member with the highest confidence, attributes are merged with
    ``merge_entity_attributes``, and source chunks and original names are
    concatenated without duplicates.

    For technical types (ip, email, domain, crypto_wallet, phone) only exact
    equality of the normalized names is accepted and ``name_threshold`` is
    ignored. When ``same_type_only`` is False this applies as soon as either
    member of the pair has a technical type.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: Minimum similarity score (0-1) to treat two names as
            the same.
        same_type_only: If True, only candidates with equal ``type_ref`` are
            compared.

    Returns:
        DeduplicationResult with the merged entities, the resolution maps
        (normalized canonical name -> id, and every original/normalized name
        -> id), the merge log and the before/after counts.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize names once.
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]

    # Step 2: union-find over all candidate indices.
    uf = _UnionFind(n)

    # Step 3: pairwise comparison (restricted to equal types if requested).
    merge_pairs: list[tuple[int, int, float]] = []

    for i in range(n):
        for j in range(i + 1, n):
            type_i = candidates[i].type_ref
            type_j = candidates[j].type_ref
            if same_type_only and type_i != type_j:
                continue

            ni, nj = normalized[i], normalized[j]

            # Exact matching must hold if *either* side is a technical type;
            # checking only the first would let such an entity be fuzzy-merged
            # in mixed-type mode.
            if _is_exact_type(type_i) or _is_exact_type(type_j):
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue

            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 4: group indices by union-find root.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Step 5: merge each cluster.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    # Matched pairs grouped by final root, used to report a score per cluster.
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))

    for root, indices in clusters.items():
        cluster_candidates = [candidates[idx] for idx in indices]

        if len(cluster_candidates) == 1:
            # Singleton: carried over as-is, no merge-log entry.
            c = cluster_candidates[0]
            canonical_name = c.name
            canonical_norm = normalized[indices[0]]
            merged_attrs = c.attributes
            merged_confidence = c.confidence
            merged_chunks = list(c.source_chunk_indices)
            merged_from = list(c.merged_from) if c.merged_from else [c.name]
        else:
            # The candidate with the highest confidence becomes canonical.
            best = max(cluster_candidates, key=lambda c: c.confidence)
            canonical_name = best.name
            canonical_norm = normalize_entity_name(best.name, best.type_ref)

            merged_attrs = merge_entity_attributes(
                [c.attributes for c in cluster_candidates]
            )
            merged_confidence = max(c.confidence for c in cluster_candidates)

            # Ordered union of source chunk indices.
            merged_chunks = []
            seen_chunks: set[int] = set()
            for c in cluster_candidates:
                for idx in c.source_chunk_indices:
                    if idx not in seen_chunks:
                        merged_chunks.append(idx)
                        seen_chunks.add(idx)

            # Ordered union of original names, including earlier merges.
            merged_from = []
            seen_names: set[str] = set()
            for c in cluster_candidates:
                names_to_add = c.merged_from if c.merged_from else [c.name]
                for nm in names_to_add:
                    if nm not in seen_names:
                        merged_from.append(nm)
                        seen_names.add(nm)

            # Merge log entry.
            other_names = [c.name for c in cluster_candidates if c is not best]
            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": other_names,
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        entity = EntityCandidate(
            name=canonical_name,
            name_normalized=canonical_norm,
            type_ref=cluster_candidates[0].type_ref,
            type_label=cluster_candidates[0].type_label,
            attributes=merged_attrs,
            confidence=merged_confidence,
            source_chunk_indices=merged_chunks,
            merged_from=merged_from,
        )
        merged_entities.append(entity)

        # Populate resolution maps.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
|
||||
@@ -0,0 +1,189 @@
|
||||
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Importar levenshtein_distance desde cybersecurity ---
|
||||
# Soporta dos contextos:
|
||||
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
|
||||
# 2. Ejecutado desde la raiz del registry (fn run)
|
||||
def _levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between ``a`` and ``b``.

    Local fallback used when the shared implementation cannot be imported.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        a: first string.
        b: second string.

    Returns:
        Minimum number of single-character insertions, deletions and
        substitutions needed to turn one string into the other.
    """
    # The distance is symmetric, so put the shorter string on the column axis
    # to keep the rows as small as possible.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    if not shorter:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        current = [row]
        for col, ch_short in enumerate(shorter, start=1):
            insertion = current[col - 1] + 1
            deletion = previous[col] + 1
            substitution = previous[col - 1] + (0 if ch_long == ch_short else 1)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
|
||||
|
||||
|
||||
# Prefer the registry's shared Levenshtein implementation when it can be
# imported; otherwise use the local fallback defined in this module.
try:
    _here = os.path.dirname(os.path.abspath(__file__))
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    from cybersecurity import levenshtein_distance as _lev
except ImportError:
    _lev = None  # type: ignore

# `from package import name` binds a *submodule* when the package does not
# export the function itself. Calling a module would fail at the first fuzzy
# lookup, so unwrap the function of the same name if that is what we got.
if _lev is not None and not callable(_lev):
    _lev = getattr(_lev, "levenshtein_distance", None)

# Anything that is still not callable is discarded in favour of the fallback.
levenshtein_distance = _lev if callable(_lev) else _levenshtein_distance
|
||||
|
||||
|
||||
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
    """Resolve ``name`` against the keys of ``entity_id_map`` by fuzzy match.

    Scans every key and keeps the one with the smallest Levenshtein distance
    to ``name``; on ties the first key in iteration order wins.

    A candidate is ignored when its distance is as large as the longer of the
    two strings: that means no character is shared, so it is not a match at
    all. Without this guard an empty or very short name (up to ``threshold``
    characters) would "match" any short key.

    Args:
        name: name to resolve (already lowercased and stripped).
        entity_id_map: mapping normalized_name -> entity_id.
        threshold: maximum edit distance accepted as a match (default 3).

    Returns:
        entity_id of the best match, or '' when there is no acceptable match.
    """
    if not name:
        return ""

    best_id = ""
    best_dist = threshold + 1
    name_len = len(name)

    for key, entity_id in entity_id_map.items():
        key_len = len(key)
        # The edit distance can never be smaller than the length difference,
        # so such keys cannot beat the current best: skip the costly call.
        if abs(name_len - key_len) >= best_dist:
            continue

        dist = levenshtein_distance(name, key)
        # Every character had to be edited: nothing in common, not a match.
        if dist >= max(name_len, key_len):
            continue

        if dist < best_dist:
            best_dist = dist
            best_id = entity_id
            if dist == 0:
                break  # cannot be improved

    return best_id if best_dist <= threshold else ""
|
||||
|
||||
|
||||
def _normalized_id_map(entity_id_map: dict[str, str]) -> dict[str, str]:
    """Return a copy of ``entity_id_map`` keyed by lowercased, stripped names."""
    normalized: dict[str, str] = {}
    for key, entity_id in entity_id_map.items():
        normalized.setdefault(key.lower().strip(), entity_id)
    # Keys that were already normalized take priority over case variants, so
    # every lookup that succeeded on the raw map still returns the same ID.
    for key, entity_id in entity_id_map.items():
        if key == key.lower().strip():
            normalized[key] = entity_id
    return normalized


def _resolve_entity_id(name: str, normalized_map: dict[str, str]) -> str:
    """Resolve ``name`` to an entity ID: exact normalized lookup, then fuzzy match."""
    key = name.lower().strip()
    return normalized_map.get(key, "") or _fuzzy_resolve(key, normalized_map)


def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Deduplicate candidate relations, resolving names to final entity IDs.

    Algorithm:
    1. For each RelationCandidate, resolve from_name and to_name to an
       entity_id through entity_id_map. The lookup is case-insensitive on
       both sides: the map keys are normalized as well as the query, so maps
       keyed by original-case names resolve correctly. If there is no exact
       match, a fuzzy match based on levenshtein_distance is tried. If that
       fails too, the relation is dropped with a warning.
    2. Drop self-loops (from_id == to_id).
    3. Deduplicate by (from_id, to_id, relation_type):
       - description: unique descriptions joined with '; '
       - confidence: maximum of the group
       - names and source_chunk_index: taken from the first relation seen
    4. Return the cleaned list of RelationCandidate with from_id and to_id set.

    Args:
        relations: list of RelationCandidate with the original from_name/to_name.
        entity_id_map: mapping name -> entity_id (output of
            deduplicate_entities). Lets names that were merged be resolved.

    Returns:
        Deduplicated list of RelationCandidate with from_id and to_id resolved.

    Raises:
        ImportError: if the relation_candidate module cannot be imported.
    """
    if not relations:
        return []

    # Import the type lazily; the extra sys.path entry makes it importable
    # both from datascience/ and from the registry root.
    _types_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "..", "..", "python", "types", "datascience",
    )
    if _types_path not in sys.path:
        sys.path.insert(0, _types_path)
    from relation_candidate import RelationCandidate

    normalized_map = _normalized_id_map(entity_id_map)
    resolved: list = []

    for rel in relations:
        from_id = _resolve_entity_id(rel.from_name, normalized_map)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue

        to_id = _resolve_entity_id(rel.to_name, normalized_map)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue

        # Both ends collapsed into the same entity: not a real relation.
        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue

        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )

    # Group by (from_id, to_id, relation_type); dicts keep first-seen order.
    groups: dict[tuple, list] = {}
    for rel in resolved:
        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)

    result: list = []
    for (from_id, to_id, rel_type), group in groups.items():
        if len(group) == 1:
            result.append(group[0])
            continue

        # Merge: highest confidence plus the unique descriptions in order.
        seen_desc: set[str] = set()
        descriptions: list[str] = []
        for member in group:
            if member.description and member.description not in seen_desc:
                descriptions.append(member.description)
                seen_desc.add(member.description)

        result.append(
            RelationCandidate(
                from_name=group[0].from_name,
                to_name=group[0].to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(descriptions),
                confidence=max(member.confidence for member in group),
                source_chunk_index=group[0].source_chunk_index,
            )
        )

    return result
|
||||
@@ -0,0 +1,22 @@
|
||||
"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
|
||||
|
||||
@dataclass
class DeduplicationResult:
    """Outcome of the entity deduplication step.

    ``name_to_id`` maps EVERY original name (merged variants included) to
    its final ID, so relations that use any variant of a name can still be
    resolved.
    """

    # Deduplicated entities, one per merged cluster.
    entities: list[EntityCandidate]
    # Normalized canonical name -> final entity ID.
    entity_id_map: dict[str, str]
    # Original names and normalized canonical names -> final entity ID.
    name_to_id: dict[str, str]
    # One dict per merged cluster: "canonical", "merged", "score", "reason".
    merge_log: list[dict] = field(default_factory=list)
    # Number of candidates before and after deduplication.
    total_before: int = 0
    total_after: int = 0
|
||||
@@ -0,0 +1,34 @@
|
||||
"""EntityCandidate — candidato de entidad extraido por el LLM."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class EntityCandidate:
    """Entity candidate extracted by the LLM.

    It may come from a single chunk or be the result of merging several
    extractions. ``merged_from`` keeps the original names for debugging.
    """

    name: str
    name_normalized: str = ""
    type_ref: str = ""
    type_label: str = ""
    attributes: dict = field(default_factory=dict)
    confidence: float = 0.0
    # Indices of the chunks in which this entity was found.
    source_chunk_indices: list[int] = field(default_factory=list)
    merged_from: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the candidate to a plain dictionary.

        Values are returned as-is (no copies), in field declaration order.
        """
        exported = (
            "name",
            "name_normalized",
            "type_ref",
            "type_label",
            "attributes",
            "confidence",
            "source_chunk_indices",
            "merged_from",
        )
        return {attr: getattr(self, attr) for attr in exported}
|
||||
@@ -0,0 +1,145 @@
|
||||
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
|
||||
|
||||
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
    """Build the system prompt used for entity extraction.

    The prompt has three parts: a fixed task description, one bullet per
    entity type in ``entity_schema`` (with its metadata fields when present),
    and the output format plus rules, ending with ``language_instruction``.

    Args:
        entity_schema: dicts with optional 'label', 'type_ref' and
            'metadata_fields' keys.
        language_instruction: sentence appended as the last rule.

    Returns:
        The prompt as a single newline-joined string.
    """
    header = [
        "You are an entity extraction expert. Given text, extract all entities",
        "matching these types. For each entity, provide: name, type_ref,",
        "attributes (matching the metadata_fields for that type), and a",
        "confidence score (0.0-1.0).",
        "",
        "Entity types:",
    ]

    type_lines: list[str] = []
    for entry in entity_schema:
        type_lines.append(
            f"- {entry.get('label', 'Unknown')} (type_ref: {entry.get('type_ref', '')})"
        )
        field_names = entry.get("metadata_fields", [])
        if field_names:
            type_lines.append(f" fields: {', '.join(field_names)}")

    footer = [
        "",
        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
        "",
        "Rules:",
        "- Only extract entities explicitly mentioned in the text",
        "- Use the exact type_ref from the schema",
        "- Leave unknown attributes as null",
        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
        f"- {language_instruction}",
    ]

    return "\n".join([*header, *type_lines, *footer])
|
||||
|
||||
|
||||
def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity-type schema, calls the LLM and
    validates the answer, returning a list of EntityCandidate. Malformed
    items are skipped instead of raising: non-dict items, names that are not
    non-blank strings, and type_refs that are not in the schema.

    Args:
        text: text chunk to analyse.
        entity_schema: list of types with their metadata fields. Each entry
            is a dict with the keys 'type_ref', 'label' and optionally
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: function that receives a list of OpenAI-style messages
            and returns a dict with the LLM's JSON answer. Interface:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: language instruction for the LLM. Defaults to
            "Respond in English."

    Returns:
        List of extracted EntityCandidate. Empty when the LLM call fails,
        when the response is not a dict holding an 'entities' list, or when
        no valid entity is found.

    Raises:
        ValueError: if entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # The keys double as the set of type_refs accepted from the LLM.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    messages = [
        {"role": "system", "content": _build_system_prompt(entity_schema, language_instruction)},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # The callable is untrusted: it may hand back None, a list, a string...
    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        # Later steps call str methods on the name, so require a real,
        # non-blank string here.
        name = item.get("name", "")
        if not isinstance(name, str) or not name.strip():
            continue

        # The isinstance check also keeps unhashable values (list/dict) away
        # from the dict membership test, which would raise TypeError.
        type_ref = item.get("type_ref", "")
        if not isinstance(type_ref, str) or type_ref not in type_ref_to_label:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the LLM left as null.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        # `x != x` is true only for NaN, which min()/max() would otherwise
        # silently turn into 1.0.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates
|
||||
@@ -0,0 +1,141 @@
|
||||
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from relation_candidate import RelationCandidate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between entities of a text chunk using an LLM.

    Given the original text and the entities already extracted, asks the LLM
    to identify relations between pairs of entities. Relations whose
    from_name or to_name does not match an existing entity are discarded.
    Relation types that are not allowed are replaced by "related_to".
    Malformed fields never raise: a non-string description becomes "" and an
    invalid confidence becomes 0.0.

    Args:
        text: text chunk (the same one used to extract the entities).
        entities: entities already extracted from the chunk.
        relation_types: allowed relation types, e.g. ["funds", "employs",
            "communicates_with", "owns", "related_to"].
        llm_chat_json: injected function that receives a list of messages
            (dicts with "role" and "content") and returns a dict with the
            LLM's JSON answer.
        language_instruction: language instruction for the LLM.

    Returns:
        List of validated RelationCandidate. Empty if there are fewer than 2
        entities, if the LLM call fails, if the response is not a dict with a
        'relations' list, or if no valid relation is found.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list and allowed types as shown to the model.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # The callable is untrusted: it may hand back None, a list, a string...
    raw_relations = response.get("relations", []) if isinstance(response, dict) else None
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both ends must name a known entity. The isinstance check also keeps
        # unhashable values away from the set membership test.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        # Descriptions are later joined with '; ', so anything that is not a
        # string (null, list, number) is replaced by an empty description.
        description = item.get("description", "")
        if not isinstance(description, str):
            description = ""

        confidence = item.get("confidence", 0.0)
        # `x != x` is true only for NaN, which min()/max() would otherwise
        # silently turn into 1.0.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=confidence,
            )
        )

    return results
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Extract plain text from PDF, Markdown, or TXT files."""
|
||||
|
||||
|
||||
# Lowercase file extensions handled by extract_text_from_file; also listed in
# its "unsupported extension" error message.
SUPPORTED_EXTENSIONS = {
    ".markdown",
    ".md",
    ".pdf",
    ".txt",
}
|
||||
|
||||
|
||||
def _detect_encoding(data: bytes) -> str:
    """Guess the text encoding of ``data`` using a chain of fallbacks.

    Order: strict UTF-8 decoding, then ``charset_normalizer``, then
    ``chardet``. The two detector libraries are optional and are skipped
    when they are not installed.

    Args:
        data: raw bytes read from the file.

    Returns:
        An encoding name. Falls back to "utf-8" when nothing better is found;
        the caller is expected to decode with replacement in that case.
    """
    # 1) If the bytes are valid UTF-8 there is nothing to detect.
    try:
        data.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return "utf-8"

    # 2) charset_normalizer (optional dependency)
    try:
        from charset_normalizer import from_bytes

        best_guess = from_bytes(data).best()
        if best_guess is not None and best_guess.encoding:
            return best_guess.encoding
    except ImportError:
        pass

    # 3) chardet (optional dependency)
    try:
        import chardet

        guessed = (chardet.detect(data) or {}).get("encoding")
        if guessed:
            return guessed
    except ImportError:
        pass

    # Nothing conclusive: report UTF-8 as a last resort.
    return "utf-8"
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining them with double newlines; the document is always closed, even
    if a page fails to extract. For text-based files (.md, .markdown, .txt)
    reads the file with automatic encoding detection and falls back to
    UTF-8 with replacement characters if the detected encoding fails.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the underlying file handle instead of leaking it.
            doc.close()
        return "\n\n".join(pages)

    elif ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or an encoding name Python does not know.
            return raw.decode("utf-8", errors="replace")

    else:
        raise ValueError(
            f"Unsupported file extension: '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )
|
||||
@@ -0,0 +1,208 @@
|
||||
"""Pipeline de extraccion de entidades y relaciones desde un documento."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
|
||||
|
||||
from extract_text_from_file import extract_text_from_file
|
||||
from core_functions import preprocess_text
|
||||
from split_text_into_chunks import split_text_into_chunks
|
||||
from build_entity_schema_prompt import build_entity_schema_prompt
|
||||
from build_relation_schema_prompt import build_relation_schema_prompt
|
||||
from extract_entities_llm import extract_entities_llm
|
||||
from extract_relations_llm import extract_relations_llm
|
||||
from deduplicate_entities import deduplicate_entities
|
||||
from deduplicate_relations import deduplicate_relations
|
||||
from entity_candidate import EntityCandidate
|
||||
from extraction_result import ExtractionResult
|
||||
from extraction_stats import ExtractionStats
|
||||
|
||||
|
||||
def extraction_pipeline(
    file_path: str,
    entity_presets: list[dict],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    confidence_threshold: float = 0.5,
    dedup_threshold: float = 0.85,
    on_progress: Callable[[str, float], None] | None = None,
) -> ExtractionResult:
    """Run the full entity and relation extraction pipeline on a document.

    Orchestrates extract_text_from_file -> preprocess_text ->
    split_text_into_chunks -> extract_entities_llm per chunk ->
    deduplicate_entities -> extract_relations_llm per chunk ->
    deduplicate_relations.

    Failures while extracting text or while calling the per-chunk extractors
    are reported with ``warnings.warn`` and treated as empty results, so one
    bad chunk does not abort the run. Exceptions raised by ``on_progress``
    are ignored.

    Args:
        file_path: path to the file to process (PDF, Markdown, TXT).
        entity_presets: list of dicts with type_ref, label and metadata_fields.
            Example: [{"type_ref": "osint_person_go_cybersecurity",
                       "label": "Person",
                       "metadata_fields": ["full_name", "nationality"]}]
        relation_types: relation types allowed for extraction.
            Example: ["funds", "employs", "communicates_with", "owns"]
        llm_chat_json: injected function that receives OpenAI-style messages
            and returns a dict with the already-parsed JSON answer. Not
            coupled to any provider.
        chunk_size: number of characters per chunk (default 500).
        chunk_overlap: overlap between consecutive chunks (default 50).
        confidence_threshold: minimum confidence for a candidate entity to
            be kept before deduplication (default 0.5).
        dedup_threshold: minimum similarity score to merge entities
            (default 0.85).
        on_progress: optional progress callback (message: str, pct: float
            0-1). 0-40%: entity extraction, 40-80%: relation extraction,
            80-100%: deduplication.

    Returns:
        ExtractionResult with deduplicated entities and relations plus the
        statistics of the run.

    Raises:
        FileNotFoundError: if file_path does not exist.
        ValueError: if entity_presets is empty.
    """
    if not entity_presets:
        raise ValueError("entity_presets no puede estar vacio")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")

    def report(message: str, fraction: float) -> None:
        # Progress reporting is best-effort: a broken callback must never
        # interrupt the extraction.
        if on_progress is None:
            return
        try:
            on_progress(message, fraction)
        except Exception:
            pass

    started_at = time.monotonic()
    stats = ExtractionStats()

    # -- Step 1: extract text ------------------------------------------------
    report("Extracting text from file...", 0.0)
    try:
        raw_text = extract_text_from_file(file_path)
    except Exception as exc:
        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
        raw_text = ""

    # -- Step 2: preprocess ---------------------------------------------------
    clean_text = preprocess_text(raw_text)
    stats.total_chars = len(clean_text)

    # -- Step 3: split into chunks -------------------------------------------
    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
    total = len(chunks)
    stats.total_chunks = total

    if total == 0:
        stats.processing_time_seconds = time.monotonic() - started_at
        return ExtractionResult(entities=[], relations=[], stats=stats)

    # -- Step 4: extract entities chunk by chunk -----------------------------
    all_raw_entities: list[EntityCandidate] = []
    for idx, chunk in enumerate(chunks):
        report(f"Extracting entities from chunk {idx + 1}/{total}", (idx / total) * 0.4)
        try:
            found = extract_entities_llm(
                text=chunk,
                entity_schema=entity_presets,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_entities_llm chunk {idx}: {exc}"
            )
            found = []

        for candidate in found:
            # Record which chunk the candidate came from.
            if idx not in candidate.source_chunk_indices:
                candidate.source_chunk_indices.append(idx)
            all_raw_entities.append(candidate)

    # -- Step 5: keep only sufficiently confident candidates ------------------
    filtered_entities = [
        candidate
        for candidate in all_raw_entities
        if candidate.confidence >= confidence_threshold
    ]
    stats.raw_entities_count = len(filtered_entities)

    for candidate in filtered_entities:
        seen_so_far = stats.entity_types_found.get(candidate.type_ref, 0)
        stats.entity_types_found[candidate.type_ref] = seen_so_far + 1

    # -- Step 6: deduplicate entities ----------------------------------------
    report("Deduplicating entities...", 0.4)
    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)

    stats.final_entities_count = dedup_result.total_after
    stats.entities_merged = dedup_result.total_before - dedup_result.total_after

    final_entities = dedup_result.entities
    # name_to_id (original name -> entity_id) is what relation resolution uses.
    entity_id_map = dedup_result.name_to_id

    # -- Step 7: extract relations chunk by chunk ----------------------------
    all_raw_relations = []
    for idx, chunk in enumerate(chunks):
        report("Extracting relations...", 0.4 + (idx / total) * 0.4)

        # Entities seen in this chunk; when there are none, offer every entity.
        chunk_entities = [
            entity for entity in final_entities if idx in entity.source_chunk_indices
        ] or final_entities

        if len(chunk_entities) < 2:
            continue

        try:
            chunk_relations = extract_relations_llm(
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_relations_llm chunk {idx}: {exc}"
            )
            chunk_relations = []

        for relation in chunk_relations:
            relation.source_chunk_index = idx
        all_raw_relations.extend(chunk_relations)

    stats.raw_relations_count = len(all_raw_relations)

    for relation in all_raw_relations:
        seen_so_far = stats.relation_types_found.get(relation.relation_type, 0)
        stats.relation_types_found[relation.relation_type] = seen_so_far + 1

    # -- Step 8: deduplicate relations ---------------------------------------
    report("Deduplicating relations...", 0.8)
    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)

    stats.final_relations_count = len(final_relations)
    stats.relations_merged = stats.raw_relations_count - len(final_relations)
    stats.processing_time_seconds = time.monotonic() - started_at

    report("Done", 1.0)

    return ExtractionResult(
        entities=final_entities,
        relations=final_relations,
        stats=stats,
    )
|
||||
@@ -0,0 +1,20 @@
|
||||
"""ExtractionResult — resultado final del pipeline de extraccion."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from extraction_stats import ExtractionStats
|
||||
from relation_candidate import RelationCandidate
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Final result of the entity and relation extraction pipeline.

    Holds the deduplicated entities and relations together with the
    statistics of the whole run.
    """

    # Entities after deduplication.
    entities: list[EntityCandidate]
    # Relations after deduplication.
    relations: list[RelationCandidate]
    # Counters and timing for the run; a fresh ExtractionStats by default.
    stats: ExtractionStats = field(default_factory=ExtractionStats)
|
||||
@@ -0,0 +1,25 @@
|
||||
"""ExtractionStats — estadisticas del proceso de extraccion."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class ExtractionStats:
    """Statistics of an extraction run.

    Useful for reporting and debugging. Records counts before and after
    deduplication, the processing time and the distribution of the entity
    and relation types that were found.
    """

    # Input size.
    total_chunks: int = 0
    total_chars: int = 0

    # Entity counts before and after deduplication, and their difference.
    raw_entities_count: int = 0
    final_entities_count: int = 0
    entities_merged: int = 0

    # Relation counts before and after deduplication.
    raw_relations_count: int = 0
    final_relations_count: int = 0
    relations_merged: int = 0
    relations_discarded: int = 0

    # Occurrences per type, keyed by type_ref / relation_type.
    entity_types_found: dict[str, int] = field(default_factory=dict)
    relation_types_found: dict[str, int] = field(default_factory=dict)

    # Duration of the run in seconds.
    processing_time_seconds: float = 0.0
|
||||
@@ -0,0 +1,78 @@
|
||||
"""Combina atributos de multiples candidatos de la misma entidad."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Attribute names that get a dedicated conflict-resolution rule in
# merge_entity_attributes.

# Numeric fields: the maximum value wins.
_NUMERIC_FIELDS = {
    "balance",
    "cvss",
    "risk_score",
}

# Date fields where the minimum (oldest) value wins.
_DATE_MIN_FIELDS = {
    "created_date",
    "first_seen",
}

# Date fields where the maximum (most recent) value wins.
_DATE_MAX_FIELDS = {
    "expires_date",
    "last_seen",
}

# Boolean fields combined with a logical OR.
_BOOL_FIELDS = {
    "exploited",
    "verified",
}
|
||||
|
||||
|
||||
def merge_entity_attributes(attr_list: list[dict]) -> dict:
|
||||
"""Combina atributos de multiples candidatos de la misma entidad.
|
||||
|
||||
Para cada campo presente en cualquier candidato recopila todos los valores
|
||||
non-null y aplica heuristicas de resolucion por tipo de campo:
|
||||
- Numerico (risk_score, balance, cvss): max
|
||||
- Fecha min (first_seen, created_date): min (mas antigua)
|
||||
- Fecha max (last_seen, expires_date): max (mas reciente)
|
||||
- Lista (cualquier valor de tipo list): union sin duplicados
|
||||
- Boolean (verified, exploited): OR logico
|
||||
- String: el mas largo
|
||||
|
||||
Args:
|
||||
attr_list: Lista de dicts con los atributos de cada candidato.
|
||||
|
||||
Returns:
|
||||
Dict con los atributos fusionados.
|
||||
"""
|
||||
if not attr_list:
|
||||
return {}
|
||||
|
||||
# Recopilar todas las claves presentes en cualquier candidato
|
||||
all_keys: set[str] = set()
|
||||
for attrs in attr_list:
|
||||
all_keys.update(attrs.keys())
|
||||
|
||||
merged: dict = {}
|
||||
|
||||
for key in all_keys:
|
||||
# Recopilar valores non-null
|
||||
values = [attrs[key] for attrs in attr_list if key in attrs and attrs[key] is not None]
|
||||
|
||||
if not values:
|
||||
merged[key] = None
|
||||
continue
|
||||
|
||||
if len(values) == 1:
|
||||
merged[key] = values[0]
|
||||
continue
|
||||
|
||||
# Todos iguales
|
||||
if all(v == values[0] for v in values):
|
||||
merged[key] = values[0]
|
||||
continue
|
||||
|
||||
# Resolver conflicto segun tipo de campo
|
||||
if key in _NUMERIC_FIELDS:
|
||||
merged[key] = max(values)
|
||||
elif key in _DATE_MIN_FIELDS:
|
||||
merged[key] = min(values)
|
||||
elif key in _DATE_MAX_FIELDS:
|
||||
merged[key] = max(values)
|
||||
elif key in _BOOL_FIELDS:
|
||||
merged[key] = any(values)
|
||||
elif isinstance(values[0], list):
|
||||
# Union de listas sin duplicados, preservando orden de aparicion
|
||||
seen: list = []
|
||||
for lst in values:
|
||||
for item in lst:
|
||||
if item not in seen:
|
||||
seen.append(item)
|
||||
merged[key] = seen
|
||||
else:
|
||||
# String u otro: usar el mas largo
|
||||
str_values = [str(v) for v in values]
|
||||
merged[key] = max(str_values, key=len)
|
||||
|
||||
return merged
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# Honorific / rank prefix at the very start of a person name.
_TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
)

# Legal-form / generic corporate word at the very end of an organization name.
_LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
)

# Punctuation left dangling once a suffix is removed ("Acme, Inc." -> "Acme, ").
_TRAILING_SEPARATORS = re.compile(r"[\s,;&\-]+$")

_MULTI_SPACE = re.compile(r"\s+")


def _strip_legal_suffixes(name: str) -> str:
    """Remove trailing legal suffixes and dangling separators from *name*.

    Suffixes are removed repeatedly so the result is the same whether the
    input was "Acme Holdings Inc." or "Acme Holdings". Stripping stops
    before the name would become empty, so a name that consists only of a
    suffix word (e.g. "Group") is kept as is.
    """
    result = name
    while True:
        stripped = _TRAILING_SEPARATORS.sub("", _LEGAL_SUFFIXES.sub("", result))
        if not stripped or stripped == result:
            return result
        result = stripped


def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    The rule applied depends on ``entity_type`` (matched case-insensitively,
    surrounding whitespace ignored); ``name`` is always stripped first:

    - ``ip`` / ``email``: lower-cased.
    - ``domain``: lower-cased, trailing dots and a leading ``www.`` removed.
    - ``crypto_wallet``: returned unchanged apart from the strip, because
      addresses can be case-sensitive.
    - ``phone``: everything except digits and ``+`` is removed.
    - ``person``: one leading title is removed, "Last, First" is turned
      into "First Last", whitespace is collapsed, result is title-cased.
    - ``organization``: trailing legal suffixes (and the punctuation that
      preceded them) are removed, whitespace is collapsed, result is
      title-cased.
    - anything else: lower-cased with whitespace collapsed.

    Args:
        name: Entity name to normalize.
        entity_type: Entity type selecting the rule; empty means default.

    Returns:
        The normalized name.
    """
    name = name.strip()
    kind = entity_type.lower().strip()

    if kind in ("ip", "email"):
        return name.lower()

    if kind == "domain":
        domain = name.lower().rstrip(".")
        return domain[4:] if domain.startswith("www.") else domain

    if kind == "crypto_wallet":
        return name

    if kind == "phone":
        return re.sub(r"[^\d+]", "", name)

    if kind == "person":
        person = _TITLES.sub("", name).strip()
        if "," in person:
            # "Last, First" -> "First Last" (split on the first comma only).
            last, first = (part.strip() for part in person.split(",", 1))
            person = f"{first} {last}"
        return _MULTI_SPACE.sub(" ", person).strip().title()

    if kind == "organization":
        organization = _strip_legal_suffixes(name)
        return _MULTI_SPACE.sub(" ", organization).strip().title()

    return _MULTI_SPACE.sub(" ", name.lower()).strip()
|
||||
@@ -0,0 +1,35 @@
|
||||
"""RelationCandidate — candidato de relacion extraido por el LLM."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class RelationCandidate:
    """Relation between two entities, as proposed by the LLM.

    ``from_name`` and ``to_name`` carry the raw names found in the text.
    ``from_id`` and ``to_id`` are filled in during the deduplication phase,
    once the names are resolved against the final EntityCandidate objects.
    """

    from_name: str
    to_name: str
    from_id: str = ""
    to_id: str = ""
    relation_type: str = ""
    description: str = ""
    confidence: float = 0.0
    source_chunk_index: int = -1

    def to_dict(self) -> dict:
        """Return the candidate's fields as a plain dictionary."""
        # Explicit key list: fixes both the set of keys and their order.
        keys = (
            "from_name",
            "to_name",
            "from_id",
            "to_id",
            "relation_type",
            "description",
            "confidence",
            "source_chunk_index",
        )
        return {key: getattr(self, key) for key in keys}
|
||||
@@ -0,0 +1,234 @@
|
||||
"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Page skeleton filled in with str.format: {title} and {json_data} are the
# only placeholders, every literal brace of the CSS/JS is doubled.
_HTML_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>{title}</title>
<script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
<style>
  * {{ box-sizing: border-box; margin: 0; padding: 0; }}
  body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
  #container {{ width: 100vw; height: 100vh; }}
  #panel {{
    position: absolute; top: 12px; right: 12px;
    background: rgba(10, 10, 30, 0.88);
    border: 1px solid rgba(255,255,255,0.12);
    padding: 16px; border-radius: 10px;
    z-index: 10; min-width: 200px; max-width: 260px;
    backdrop-filter: blur(6px);
  }}
  #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
  #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
  #filters {{ display: flex; flex-direction: column; gap: 6px; }}
  .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
  .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
  .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
  #tooltip {{
    position: absolute; display: none;
    background: rgba(5, 5, 20, 0.95);
    border: 1px solid rgba(255,255,255,0.15);
    padding: 10px 14px; border-radius: 8px;
    pointer-events: none; z-index: 20;
    max-width: 300px; font-size: 12px; line-height: 1.6;
  }}
  #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
  #tooltip .tt-row {{ display: flex; gap: 6px; }}
  #tooltip .tt-key {{ color: #888; min-width: 80px; }}
  #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
</style>
</head>
<body>
<div id="container"></div>
<div id="panel">
  <h3>{title}</h3>
  <div id="stats"></div>
  <div id="filters"></div>
</div>
<div id="tooltip"></div>

<script>
(function () {{
  const graphData = {json_data};

  // ── Build graphology graph ──────────────────────────────────────────────
  const Graph = graphology.Graph || graphology;
  const g = new Graph({{ multi: true, type: 'directed' }});

  // Assign random initial positions
  graphData.nodes.forEach(function (n) {{
    g.addNode(n.key, Object.assign({{
      x: (Math.random() - 0.5) * 10,
      y: (Math.random() - 0.5) * 10,
    }}, n.attributes));
  }});

  graphData.edges.forEach(function (e) {{
    try {{
      g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
    }} catch (err) {{
      // skip duplicate edge keys gracefully
    }}
  }});

  // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
  const FA2 = graphologyLibrary.layoutForceAtlas2;
  FA2.assign(g, {{
    iterations: 500,
    settings: {{
      gravity: 1,
      scalingRatio: 2,
      slowDown: 5,
      barnesHutOptimize: g.order > 300,
    }},
  }});

  // ── Sigma renderer ──────────────────────────────────────────────────────
  const renderer = new Sigma(g, document.getElementById('container'), {{
    renderEdgeLabels: false,
    defaultEdgeColor: '#444',
    defaultNodeColor: '#95a5a6',
    labelColor: {{ color: '#ccc' }},
    labelSize: 11,
    edgeReducer: function (edge, data) {{
      return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
    }},
  }});

  // ── Stats panel ─────────────────────────────────────────────────────────
  document.getElementById('stats').textContent =
    graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';

  // ── Filter panel by node type ───────────────────────────────────────────
  const typeColors = {{}};
  graphData.nodes.forEach(function (n) {{
    const attrs = n.attributes || {{}};
    const t = attrs.entity_type || 'unknown';
    typeColors[t] = attrs.color || '#95a5a6';
  }});

  const hiddenTypes = new Set();
  const filtersDiv = document.getElementById('filters');

  Object.keys(typeColors).sort().forEach(function (type) {{
    const color = typeColors[type];
    const label = document.createElement('label');
    label.className = 'filter-item';

    const cb = document.createElement('input');
    cb.type = 'checkbox';
    cb.checked = true;
    cb.addEventListener('change', function () {{
      if (cb.checked) hiddenTypes.delete(type);
      else hiddenTypes.add(type);
      renderer.refresh();
    }});

    const dot = document.createElement('span');
    dot.className = 'color-dot';
    dot.style.background = color;

    label.appendChild(cb);
    label.appendChild(dot);
    label.appendChild(document.createTextNode(type));
    filtersDiv.appendChild(label);
  }});

  // Node reducer applies type filter. Nodes without entity_type are listed
  // under 'unknown' in the panel, so they must be looked up under that key.
  renderer.setSetting('nodeReducer', function (node, data) {{
    if (hiddenTypes.has(data.entity_type || 'unknown')) return Object.assign({{}}, data, {{ hidden: true }});
    return data;
  }});

  // ── Tooltip on hover ────────────────────────────────────────────────────
  const tooltip = document.getElementById('tooltip');

  renderer.on('enterNode', function (ref) {{
    const nodeAttrs = g.getNodeAttributes(ref.node);
    // Rendering attributes plus the three rows that are printed explicitly.
    const skipped = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden', 'entity_type', 'status', 'domain']);

    let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
    html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';

    Object.keys(nodeAttrs).sort().forEach(function (k) {{
      if (!skipped.has(k)) {{
        html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
      }}
    }});

    tooltip.innerHTML = html;
    tooltip.style.display = 'block';
  }});

  renderer.on('leaveNode', function () {{
    tooltip.style.display = 'none';
  }});

  document.getElementById('container').addEventListener('mousemove', function (e) {{
    tooltip.style.left = (e.clientX + 16) + 'px';
    tooltip.style.top = (e.clientY + 16) + 'px';
  }});

  // Escape text before it is concatenated into tooltip.innerHTML.
  function escHtml(str) {{
    return String(str)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }}
}})();
</script>
</body>
</html>
"""


def render_sigma_html(
    graph_data: dict,
    output_path: str,
    title: str = "OSINT Graph",
) -> str:
    """Write a standalone sigma.js HTML page that visualizes a graph.

    The graph is serialized to JSON and embedded in the page, which loads
    graphology, graphology-library and sigma.js from a CDN, runs a
    synchronous 500-iteration ForceAtlas2 layout and renders the result
    with a dark theme, a per-node-type filter panel and a hover tooltip.

    Values in ``graph_data`` and ``title`` are not assumed to be trusted:
    the title is HTML-escaped, and the JSON is escaped so that no string in
    it can close the surrounding ``<script>`` element.

    Args:
        graph_data: Dict with ``'nodes'`` and ``'edges'`` lists in
            graphology/sigma format (the page reads ``key`` and
            ``attributes`` on nodes, and ``key``, ``source``, ``target``
            and ``attributes`` on edges).
        output_path: Path of the HTML file to write; missing parent
            directories are created.
        title: Title shown in the panel and in the browser tab.

    Returns:
        Absolute path of the written HTML file.

    Raises:
        Exception: If the directory or the file cannot be written
            (chained from the underlying ``OSError``).
    """
    # Function-scope import: the module itself only imports json and os.
    from html import escape

    # "<", ">" and "&" can only occur inside JSON strings, where the \uXXXX
    # form is equivalent; escaping them keeps "</script>" or "<!--" in the
    # data from terminating the inline script. U+2028/U+2029 are escaped
    # because older JavaScript engines reject them raw in string literals.
    json_data = (
        json.dumps(graph_data, ensure_ascii=False)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )

    page = _HTML_TEMPLATE.format(
        title=escape(str(title)),
        json_data=json_data,
    )

    abs_path = os.path.abspath(output_path)

    try:
        # Inside the try so a failure to create the directory is reported
        # the same way as a failure to write the file.
        os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(page)
    except OSError as exc:
        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc

    return abs_path
|
||||
@@ -0,0 +1,66 @@
|
||||
"""Split text into overlapping chunks with sentence-boundary awareness."""
|
||||
|
||||
|
||||
def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks, cutting at sentence boundaries.

    Each window is at most ``chunk_size`` characters long. When a window
    does not reach the end of the text, it is shortened to end just after
    the last sentence separator found inside it, provided that cut lies
    beyond the first 30% of the window; otherwise the window is cut at
    ``chunk_size``. The next window starts ``overlap`` characters before
    the end of the previous one. Chunks are stripped of surrounding
    whitespace and empty chunks are dropped.

    The scan always moves forward: if applying the overlap would not
    advance the start position (``overlap`` larger than the chunk that was
    just produced) or ``overlap`` is not positive, the next window starts
    exactly where the previous one ended. The scan stops as soon as a
    window reaches the end of the text, so no chunk is emitted that is
    entirely contained in the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk, in characters.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        List of chunks; empty if the text is empty or only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive and ``text`` is not
            empty.
    """
    if not text:
        return []

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    # Sentence separators. Their order is irrelevant: the latest cut wins.
    separators = ("。", "!", "?", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? ")

    chunks: list[str] = []
    text_len = len(text)
    min_cut_offset = int(chunk_size * 0.30)
    start = 0

    while start < text_len:
        end = start + chunk_size

        if end < text_len:
            # Latest position right after a separator lying fully inside
            # text[start:end]; -1 when the window contains no separator.
            best_end = -1
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos != -1:
                    best_end = max(best_end, pos + len(sep))
            # Ignore cuts that would leave less than ~30% of a full chunk.
            if best_end > start + min_cut_offset:
                end = best_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_len:
            # The window reached the end of the text; a further window
            # would only repeat the tail of this chunk.
            break

        # Step back by the overlap only if that still moves forward.
        next_start = end - overlap
        start = next_start if start < next_start < end else end

    return chunks
|
||||
@@ -0,0 +1,6 @@
|
||||
def main() -> None:
    """Print the project's greeting line to standard output."""
    greeting = "Hello from ontology-graph!"
    print(greeting)


# Run the greeting only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,935 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Ontology Graph Extraction\n",
|
||||
"\n",
|
||||
"Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
|
||||
"- LLM: `claude -p --model haiku`\n",
|
||||
"- Tipos: OSINT del registry + genéricos (concept, url, date, quantity, text_fragment, coordinates)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m 4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 10\u001b[39m \n",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys, os, json, subprocess\n",
|
||||
"\n",
|
||||
"ROOT = '/home/lucas/fn_registry'\n",
|
||||
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
|
||||
"sys.path.insert(0, ROOT)\n",
|
||||
"\n",
|
||||
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||
"\n",
|
||||
"print('Registry root:', ROOT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "'FN_REGISTRY_ROOT'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m 4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
|
||||
"\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys, os, json, subprocess\n",
|
||||
"\n",
|
||||
"ROOT = os.environ['FN_REGISTRY_ROOT']\n",
|
||||
"sys.path.insert(0, ROOT)\n",
|
||||
"\n",
|
||||
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||
"\n",
|
||||
"print('Registry root:', ROOT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## LLM wrapper: claude -p + haiku"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def claude_haiku_json(messages: list[dict]) -> dict:\n",
|
||||
" \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n",
|
||||
" # Construir prompt desde messages\n",
|
||||
" parts = []\n",
|
||||
" for msg in messages:\n",
|
||||
" role = msg['role']\n",
|
||||
" content = msg['content']\n",
|
||||
" if role == 'system':\n",
|
||||
" parts.append(f\"[SYSTEM]\\n{content}\")\n",
|
||||
" elif role == 'user':\n",
|
||||
" parts.append(f\"[USER]\\n{content}\")\n",
|
||||
" prompt = \"\\n\\n\".join(parts)\n",
|
||||
" \n",
|
||||
" result = subprocess.run(\n",
|
||||
" ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
|
||||
" capture_output=True, text=True, timeout=120\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" if result.returncode != 0:\n",
|
||||
" raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n",
|
||||
" \n",
|
||||
" # Extraer el campo 'result' del JSON envelope de claude\n",
|
||||
" envelope = json.loads(result.stdout)\n",
|
||||
" raw_text = envelope.get('result', '')\n",
|
||||
" \n",
|
||||
" # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n",
|
||||
" return extract_json_from_llm(raw_text)\n",
|
||||
"\n",
|
||||
"# Test rapido\n",
|
||||
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
|
||||
"print('LLM wrapper OK:', test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Entity presets: OSINT + genéricos"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# --- Presets OSINT (del registry) ---\n",
|
||||
"OSINT_PRESETS = [\n",
|
||||
" {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
|
||||
" \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
|
||||
" {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
|
||||
" \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
|
||||
" {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
|
||||
" \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
|
||||
" {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
|
||||
" \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
|
||||
" {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
|
||||
" \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
|
||||
" {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
|
||||
" \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
|
||||
" {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
|
||||
" \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
|
||||
" {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
|
||||
" \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
|
||||
" {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
|
||||
" \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
|
||||
" {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
|
||||
" \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
|
||||
" {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
|
||||
" \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
|
||||
" {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
|
||||
" \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
|
||||
" {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
|
||||
" \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# --- Presets genéricos (sin tipo Go, inline) ---\n",
|
||||
"GENERIC_PRESETS = [\n",
|
||||
" {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
|
||||
" \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
|
||||
" {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
|
||||
" \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
|
||||
" {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
|
||||
" \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
|
||||
" {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
|
||||
" \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
|
||||
" {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
|
||||
" \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
|
||||
" {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
|
||||
" \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
|
||||
"print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Relation types"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RELATION_TYPES = [\n",
|
||||
" # Personas / orgs\n",
|
||||
" \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
|
||||
" \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
|
||||
" # Comunicacion\n",
|
||||
" \"communicates_with\", \"sent_to\", \"received_from\",\n",
|
||||
" # Ubicacion\n",
|
||||
" \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
|
||||
" # Eventos\n",
|
||||
" \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
|
||||
" # Documentos / conceptos\n",
|
||||
" \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
|
||||
" # Financiero\n",
|
||||
" \"funds\", \"transacted_with\", \"invested_in\",\n",
|
||||
" # Tecnico\n",
|
||||
" \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
|
||||
" # Generico\n",
|
||||
" \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f'{len(RELATION_TYPES)} relation types')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Extraer documento\n",
|
||||
"\n",
|
||||
"Pon tu documento en `data/` y cambia el path."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf') # <-- cambiar\n",
|
||||
"\n",
|
||||
"# Progreso visible\n",
|
||||
"def on_progress(msg, pct):\n",
|
||||
" print(f' [{pct*100:5.1f}%] {msg}')\n",
|
||||
"\n",
|
||||
"result = extraction_pipeline(\n",
|
||||
" file_path=DOC_PATH,\n",
|
||||
" entity_presets=ALL_PRESETS,\n",
|
||||
" relation_types=RELATION_TYPES,\n",
|
||||
" llm_chat_json=claude_haiku_json,\n",
|
||||
" chunk_size=800,\n",
|
||||
" chunk_overlap=100,\n",
|
||||
" confidence_threshold=0.5,\n",
|
||||
" dedup_threshold=0.85,\n",
|
||||
" on_progress=on_progress,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
|
||||
"print(f'Relations: {result.stats.final_relations_count}')\n",
|
||||
"print(f'Chunks: {result.stats.total_chunks}')\n",
|
||||
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
|
||||
"print(f'Entity types: {result.stats.entity_types_found}')\n",
|
||||
"print(f'Relation types: {result.stats.relation_types_found}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Explorar resultados"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Entities\n",
|
||||
"ent_rows = []\n",
|
||||
"for e in result.entities:\n",
|
||||
" ent_rows.append({\n",
|
||||
" 'id': e.id,\n",
|
||||
" 'name': e.name,\n",
|
||||
" 'type': e.type_ref,\n",
|
||||
" 'confidence': e.confidence,\n",
|
||||
" 'attributes': e.attributes,\n",
|
||||
" })\n",
|
||||
"df_entities = pd.DataFrame(ent_rows)\n",
|
||||
"print(f'=== Entities ({len(df_entities)}) ===')\n",
|
||||
"df_entities.sort_values('type')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Relations\n",
|
||||
"rel_rows = []\n",
|
||||
"for r in result.relations:\n",
|
||||
" rel_rows.append({\n",
|
||||
" 'from_name': r.from_name,\n",
|
||||
" 'relation': r.relation_type,\n",
|
||||
" 'to_name': r.to_name,\n",
|
||||
" 'confidence': r.confidence,\n",
|
||||
" 'description': r.description,\n",
|
||||
" })\n",
|
||||
"df_relations = pd.DataFrame(rel_rows)\n",
|
||||
"print(f'=== Relations ({len(df_relations)}) ===')\n",
|
||||
"df_relations.sort_values('relation')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualizar grafo con sigma.js"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Node colour per entity type_ref.
TYPE_COLORS = {
    'osint_person_go_cybersecurity': '#e74c3c',
    'osint_organization_go_cybersecurity': '#3498db',
    'osint_location_go_cybersecurity': '#2ecc71',
    'osint_event_go_cybersecurity': '#f39c12',
    'osint_email_go_cybersecurity': '#9b59b6',
    'osint_domain_go_cybersecurity': '#1abc9c',
    'osint_ip_address_go_cybersecurity': '#e67e22',
    'osint_phone_go_cybersecurity': '#95a5a6',
    'osint_social_media_go_cybersecurity': '#e91e63',
    'osint_document_go_cybersecurity': '#607d8b',
    'osint_crypto_wallet_go_cybersecurity': '#ff9800',
    'osint_malware_go_cybersecurity': '#f44336',
    'osint_vulnerability_go_cybersecurity': '#ff5722',
    'concept': '#00bcd4',
    'url': '#8bc34a',
    'date_reference': '#cddc39',
    'quantity': '#ffc107',
    'coordinates': '#4caf50',
    'text_fragment': '#78909c',
}
# Colour used for any type_ref missing from TYPE_COLORS.
DEFAULT_COLOR = '#aaaaaa'


def extraction_to_sigma(result) -> dict:
    """Convert an ExtractionResult into a sigma.js/graphology-style graph dict.

    Args:
        result: object exposing ``entities`` (items with ``id``, ``name``,
            ``type_ref``, ``attributes``) and ``relations`` (items with
            ``from_id``, ``from_name``, ``to_id``, ``to_name``,
            ``relation_type``).

    Returns:
        ``{'nodes': [...], 'edges': [...]}``. Each node is
        ``{'key', 'attributes'}`` and each edge is
        ``{'key', 'source', 'target', 'attributes'}``. Relations whose
        endpoints are not both present as nodes are omitted from ``edges``.
    """
    # Attribute names this function computes itself. An entity attribute
    # with the same name must not replace them (the 'coordinates' preset,
    # for instance, has a 'label' field), so it is kept as 'attr_<name>'.
    reserved = ('label', 'color', 'size', 'type')

    # Count how many relations touch each endpoint; this drives node size.
    degree = {}
    for r in result.relations:
        for endpoint in (r.from_id or r.from_name, r.to_id or r.to_name):
            degree[endpoint] = degree.get(endpoint, 0) + 1

    nodes = []
    for e in result.entities:
        # Fall back to the name when the entity has no id.
        eid = e.id or e.name
        extras = {
            (f'attr_{k}' if k in reserved else k): str(v)
            for k, v in (e.attributes or {}).items()
            if v is not None
        }
        nodes.append({
            'key': eid,
            'attributes': {
                'label': e.name,
                'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),
                # Base size 4, +2 per relation, growth capped at +20.
                'size': 4 + min(degree.get(eid, 0) * 2, 20),
                'type': e.type_ref,
                **extras,
            },
        })

    node_keys = {n['key'] for n in nodes}
    edges = []
    for i, r in enumerate(result.relations):
        from_id = r.from_id or r.from_name
        to_id = r.to_id or r.to_name
        # Skip relations that point at an entity that is not a node.
        if from_id in node_keys and to_id in node_keys:
            edges.append({
                # Key uses the relation's index in result.relations, so
                # keys stay stable even when some relations are skipped.
                'key': f'e{i}',
                'source': from_id,
                'target': to_id,
                'attributes': {
                    'label': r.relation_type,
                    'type': r.relation_type,
                },
            })

    return {'nodes': nodes, 'edges': edges}
|
||||
"\n",
|
||||
"graph_data = extraction_to_sigma(result)\n",
|
||||
"print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Write the graph HTML into the 'data' directory that sits next to the
# current working directory's parent.
project_dir = os.path.dirname(os.getcwd())
output_dir = os.path.join(project_dir, 'data')
graph_file = os.path.join(output_dir, 'ontology_graph.html')

html_path = render_sigma_html(
    graph_data=graph_data,
    output_path=graph_file,
    title='Ontology Graph',
)

for line in (f'Graph saved: {html_path}', f'Open in browser: file://{html_path}'):
    print(line)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Auto-discovery de nuevos tipos\n",
|
||||
"\n",
|
||||
"Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def discover_new_types(result, existing_presets: list[dict], llm_chat_json=None) -> list[dict]:
    """Ask the LLM to suggest new entity presets for generically-typed entities.

    Entities whose ``type_ref`` is one of the generic fallbacks are sent
    (at most 30 of them) to the LLM together with the labels of the existing
    presets; the LLM is asked to answer ``{"new_presets": [...]}``.

    Args:
        result: object exposing ``entities`` (items with ``name``,
            ``type_ref`` and ``attributes``).
        existing_presets: current presets; each must have a ``label``.
        llm_chat_json: optional callable taking the message list and
            returning the parsed JSON response. Defaults to
            ``claude_haiku_json``.

    Returns:
        The usable suggested presets, each guaranteed to have a string
        ``type_ref`` not already present, a ``label`` and a list
        ``metadata_fields``. Empty when there are no generic entities or
        nothing usable was suggested.
    """
    # NOTE(review): 'related_to' is listed in RELATION_TYPES, not among the
    # entity presets; it is kept here to preserve the original filter.
    generic_entities = [
        {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}
        for e in result.entities
        if e.type_ref in ('concept', 'text_fragment', 'related_to')
    ]

    if not generic_entities:
        print('No hay entidades genéricas — los presets cubren todo.')
        return []

    existing_labels = [p['label'] for p in existing_presets]

    prompt_msg = [
        {'role': 'system', 'content': (
            'You analyze entities extracted from a document and suggest new entity type presets. '
            'Existing types: ' + ', '.join(existing_labels) + '. '
            'For entities that dont fit existing types, suggest new type presets. '
            'Output JSON: {"new_presets": [{"type_ref": "snake_case_id", "label": "Human Label", '
            '"metadata_fields": ["field1", "field2", ...]}]}. '
            'Only suggest types that are genuinely different from existing ones. '
            'Return {"new_presets": []} if no new types are needed.'
        )},
        {'role': 'user', 'content': (
            'These entities were classified as generic (concept/text_fragment) '
            'because they didnt fit existing types:\n\n'
            # default=str: attribute values only need to be readable in the
            # prompt, so non-JSON values are stringified instead of raising.
            + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2, default=str)
        )}
    ]

    chat = claude_haiku_json if llm_chat_json is None else llm_chat_json
    resp = chat(prompt_msg)

    # The response is model output: validate its shape before trusting it.
    suggested = resp.get('new_presets') if isinstance(resp, dict) else None
    if not isinstance(suggested, list):
        suggested = []

    seen_refs = {p.get('type_ref') for p in existing_presets}
    new_presets = []
    for p in suggested:
        if not isinstance(p, dict):
            continue
        type_ref = p.get('type_ref')
        # Drop entries without an identifier and duplicates of known types.
        if not isinstance(type_ref, str) or not type_ref or type_ref in seen_refs:
            continue
        seen_refs.add(type_ref)
        fields = p.get('metadata_fields')
        new_presets.append({
            **p,
            'label': p.get('label') or type_ref,
            'metadata_fields': list(fields) if isinstance(fields, list) else [],
        })

    if new_presets:
        print(f'Discovered {len(new_presets)} new types:')
        for p in new_presets:
            print(f"  - {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
    else:
        print('No new types needed.')

    return new_presets
|
||||
"\n",
|
||||
"new_types = discover_new_types(result, ALL_PRESETS)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# When new types were discovered, re-run the extraction with the expanded
# preset list and regenerate the graph; otherwise there is nothing to do.
if not new_types:
    print('No re-extraction needed.')
else:
    EXPANDED_PRESETS = [*ALL_PRESETS, *new_types]
    print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')

    # Same settings as the first extraction run, only the presets differ.
    reextract_kwargs = dict(
        file_path=DOC_PATH,
        entity_presets=EXPANDED_PRESETS,
        relation_types=RELATION_TYPES,
        llm_chat_json=claude_haiku_json,
        chunk_size=800,
        chunk_overlap=100,
        confidence_threshold=0.5,
        dedup_threshold=0.85,
        on_progress=on_progress,
    )
    result = extraction_pipeline(**reextract_kwargs)

    print(f'\nEntities: {result.stats.final_entities_count}')
    print(f'Relations: {result.stats.final_relations_count}')

    # Rebuild the graph and overwrite the previously rendered HTML file.
    graph_data = extraction_to_sigma(result)
    expanded_graph_file = os.path.join(output_dir, 'ontology_graph.html')
    html_path = render_sigma_html(
        graph_data=graph_data,
        output_path=expanded_graph_file,
        title='Ontology Graph (expanded)',
    )
    print(f'Updated graph: file://{html_path}')
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m 8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 12\u001b[39m \n",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys, os, json, subprocess\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"ROOT = '/home/lucas/fn_registry'\n",
|
||||
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
|
||||
"sys.path.insert(0, ROOT)\n",
|
||||
"sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
|
||||
"\n",
|
||||
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
|
||||
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
|
||||
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
|
||||
"\n",
|
||||
"print('OK: imports loaded')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"imports OK\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys, os, json, subprocess\n",
|
||||
"\n",
|
||||
"# Añadir lib/ al path\n",
|
||||
"sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
|
||||
"\n",
|
||||
"from core_functions import extract_json_from_llm\n",
|
||||
"from extraction_pipeline import extraction_pipeline\n",
|
||||
"from render_sigma_html import render_sigma_html\n",
|
||||
"\n",
|
||||
"print('imports OK')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"LLM wrapper OK: {'ok': True}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
def claude_haiku_json(messages: list[dict]) -> dict:
    """Send OpenAI-style messages to ``claude -p --model haiku``; return parsed JSON.

    System and user messages are flattened into one prompt of
    ``[SYSTEM]`` / ``[USER]`` sections separated by blank lines; messages
    with any other role are left out. The CLI's JSON envelope is decoded and
    its ``result`` text is handed to ``extract_json_from_llm``.

    Raises:
        RuntimeError: if the ``claude`` process exits with a non-zero status.
    """
    role_tags = {'system': 'SYSTEM', 'user': 'USER'}

    sections = []
    for message in messages:
        role, content = message['role'], message['content']
        tag = role_tags.get(role)
        if tag is not None:
            sections.append(f'[{tag}]\n{content}')

    command = [
        'claude', '-p', '--model', 'haiku', '--output-format', 'json',
        '\n\n'.join(sections),
    ]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if completed.returncode:
        raise RuntimeError(f'claude -p failed: {completed.stderr}')

    # The CLI wraps the model's answer in an envelope; the text is in 'result'.
    return extract_json_from_llm(json.loads(completed.stdout).get('result', ''))
|
||||
"\n",
|
||||
"# Test\n",
|
||||
"test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
|
||||
"print('LLM wrapper OK:', test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"19 presets, 35 relation types\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
# Entity presets for the OSINT / cybersecurity types. Each preset gives the
# type_ref identifier, a human-readable label and its metadata field names.
OSINT_PRESETS = [
    dict(type_ref='osint_person_go_cybersecurity', label='Person',
         metadata_fields=['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']),
    dict(type_ref='osint_organization_go_cybersecurity', label='Organization',
         metadata_fields=['legal_name', 'country', 'sector', 'founded', 'risk_score']),
    dict(type_ref='osint_location_go_cybersecurity', label='Location',
         metadata_fields=['lat', 'lon', 'address', 'country', 'city']),
    dict(type_ref='osint_event_go_cybersecurity', label='Event',
         metadata_fields=['event_type', 'date', 'location', 'description', 'severity']),
    dict(type_ref='osint_email_go_cybersecurity', label='Email',
         metadata_fields=['address', 'provider', 'verified', 'breached']),
    dict(type_ref='osint_domain_go_cybersecurity', label='Domain',
         metadata_fields=['fqdn', 'registrar', 'created_date', 'expires_date']),
    dict(type_ref='osint_ip_address_go_cybersecurity', label='IP Address',
         metadata_fields=['ip', 'asn', 'country', 'isp', 'geolocation']),
    dict(type_ref='osint_phone_go_cybersecurity', label='Phone',
         metadata_fields=['number', 'country_code', 'carrier', 'phone_type']),
    dict(type_ref='osint_social_media_go_cybersecurity', label='Social Media Account',
         metadata_fields=['platform', 'username', 'url', 'followers', 'verified']),
    dict(type_ref='osint_document_go_cybersecurity', label='Document',
         metadata_fields=['title', 'format', 'classification', 'source']),
    dict(type_ref='osint_crypto_wallet_go_cybersecurity', label='Crypto Wallet',
         metadata_fields=['address', 'blockchain', 'balance']),
    dict(type_ref='osint_malware_go_cybersecurity', label='Malware',
         metadata_fields=['family', 'hash_sha256', 'threat_level']),
    dict(type_ref='osint_vulnerability_go_cybersecurity', label='Vulnerability',
         metadata_fields=['cve_id', 'cvss', 'affected_product', 'exploited']),
]

# Domain-independent presets.
GENERIC_PRESETS = [
    dict(type_ref='concept', label='Concept',
         metadata_fields=['name', 'category', 'definition']),
    dict(type_ref='url', label='URL/Link',
         metadata_fields=['url', 'domain', 'context']),
    dict(type_ref='date_reference', label='Date/Time',
         metadata_fields=['date', 'precision', 'context']),
    dict(type_ref='quantity', label='Quantity/Amount',
         metadata_fields=['value', 'unit', 'context']),
    dict(type_ref='coordinates', label='Coordinates',
         metadata_fields=['lat', 'lon', 'label']),
    dict(type_ref='text_fragment', label='Key Text Fragment',
         metadata_fields=['text', 'category', 'relevance']),
]

# OSINT presets first, then the generic ones.
ALL_PRESETS = [*OSINT_PRESETS, *GENERIC_PRESETS]

# Relation vocabulary, one whitespace-separated group per line.
RELATION_TYPES = (
    'employs works_for founded owns controls '
    'member_of affiliated_with collaborates_with '
    'communicates_with sent_to received_from '
    'located_in headquartered_in traveled_to operates_in '
    'participated_in caused occurred_at occurred_on '
    'mentions references describes authored published '
    'funds transacted_with invested_in '
    'hosts resolves_to exploits targets '
    'related_to part_of instance_of has_attribute'
).split()

print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 0.0%] Extracting text from file...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 0.0%] Extracting entities from chunk 1/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 0.7%] Extracting entities from chunk 2/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 1.5%] Extracting entities from chunk 3/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 2.2%] Extracting entities from chunk 4/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 3.0%] Extracting entities from chunk 5/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
|
||||
" candidates = extract_entities_llm(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 3.7%] Extracting entities from chunk 6/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 4.4%] Extracting entities from chunk 7/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 5.2%] Extracting entities from chunk 8/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 5.9%] Extracting entities from chunk 9/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 6.7%] Extracting entities from chunk 10/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 7.4%] Extracting entities from chunk 11/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 8.1%] Extracting entities from chunk 12/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 8.9%] Extracting entities from chunk 13/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 9.6%] Extracting entities from chunk 14/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 10.4%] Extracting entities from chunk 15/54\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" [ 11.1%] Extracting entities from chunk 16/54\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
|
||||
"\n",
|
def on_progress(msg, pct):
    """Print one progress line: ``pct`` scaled to a percentage, then ``msg``."""
    percent = pct * 100
    print('  [{:5.1f}%] {}'.format(percent, msg))
|
||||
"\n",
|
# Run the extraction over DOC_PATH with the base presets, reporting progress
# through on_progress.
pipeline_kwargs = dict(
    file_path=DOC_PATH,
    entity_presets=ALL_PRESETS,
    relation_types=RELATION_TYPES,
    llm_chat_json=claude_haiku_json,
    chunk_size=800,
    chunk_overlap=100,
    confidence_threshold=0.5,
    dedup_threshold=0.85,
    on_progress=on_progress,
)
result = extraction_pipeline(**pipeline_kwargs)

# Summary of the run, read from the statistics attached to the result.
stats = result.stats
print(f'\nEntities: {stats.final_entities_count}')
print(f'Relations: {stats.final_relations_count}')
print(f'Chunks: {stats.total_chunks}')
print(f'Time: {stats.processing_time_seconds:.1f}s')
print(f'Entity types: {stats.entity_types_found}')
print(f'Relation types: {stats.relation_types_found}')
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pipeline optimizado\n",
|
||||
"\n",
|
||||
"- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
|
||||
"- Chunks de 2000 chars\n",
|
||||
"- Paralelizado con ThreadPoolExecutor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||
"from extract_text_from_file import extract_text_from_file\n",
|
||||
"from core_functions import preprocess_text\n",
|
||||
"from split_text_into_chunks import split_text_into_chunks\n",
|
||||
"from deduplicate_entities import deduplicate_entities\n",
|
||||
"from deduplicate_relations import deduplicate_relations\n",
|
||||
"from entity_candidate import EntityCandidate\n",
|
||||
"from relation_candidate import RelationCandidate\n",
|
||||
"\n",
|
def build_unified_prompt(entity_presets, relation_types):
    """Build the system prompt for single-call extraction.

    The prompt asks for entities, relations and suggested new types in one
    LLM call. It lists every preset as
    ``- <label> (type_ref: <type_ref>): [<metadata fields>]`` followed by the
    comma-separated relation types, the expected JSON output format and the
    extraction rules.

    Args:
        entity_presets: dicts with ``label`` and ``type_ref``;
            ``metadata_fields`` is optional and defaults to no fields.
        relation_types: relation type names.

    Returns:
        The prompt text.
    """
    entity_block = '\n'.join(
        f"- {preset['label']} (type_ref: {preset['type_ref']}): "
        f"[{', '.join(preset.get('metadata_fields', []))}]"
        for preset in entity_presets
    )
    relation_list = ', '.join(relation_types)

    # Doubled braces below are literal braces in the rendered prompt.
    return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.

ENTITY TYPES:
{entity_block}

RELATION TYPES: {relation_list}

OUTPUT FORMAT (strict JSON):
{{
  "entities": [
    {{"name": "...", "type_ref": "...", "attributes": {{...}}, "confidence": 0.9}}
  ],
  "relations": [
    {{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}}
  ],
  "suggested_types": [
    {{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["field1", "field2"], "reason": "why this type is needed"}}
  ]
}}

RULES:
- Extract ALL entities explicitly mentioned in the text
- Use exact type_ref from the schema. Leave unknown attributes as null
- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied
- Relations: from_name and to_name MUST match extracted entity names exactly
- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.
- If no suggested types are needed, return "suggested_types": []
- Respond in the same language as the text for descriptions'''
|
||||
"\n",
|
||||
"UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
|
||||
"print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
[project]
|
||||
name = "ontology-graph"
|
||||
version = "0.1.0"
|
||||
description = "LLM-based entity/relation extraction from documents, visualized as a sigma.js ontology graph"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"jupyter>=1.1.1",
|
||||
"jupyter-collaboration>=4.3.0",
|
||||
"jupyter-mcp-server>=0.4.0",
|
||||
"jupyterlab>=4.5.6",
|
||||
"matplotlib>=3.10.8",
|
||||
"numpy>=2.4.4",
|
||||
"pandas>=3.0.2",
|
||||
]
|
||||
Executable
+45
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
# Jupyter Lab — modo colaborativo con autodeteccion de puerto
|
||||
# Generado por write_jupyter_launcher (fn_registry)
|
||||
|
||||
# Print the first port in 8888-8899 that neither `ss` nor `lsof` reports as
# in use. If every candidate is busy, print 8888 anyway (so the caller still
# gets a value) and warn on stderr instead of failing silently.
find_free_port() {
    local port
    for port in {8888..8899}; do
        # A port counts as free only when both checks come up empty. A
        # missing `ss` or `lsof` makes its check report "free".
        if ! ss -tln 2>/dev/null | grep -q ":${port} " && \
           ! lsof -i:"$port" >/dev/null 2>&1; then
            echo "$port"
            return
        fi
    done
    echo "WARNING: no free port in 8888-8899, falling back to 8888" >&2
    echo 8888
}
|
||||
|
||||
# Port: first CLI argument if given, otherwise the result of find_free_port.
PORT=${1:-$(find_free_port)}

# Work from the script's own directory. Abort if it cannot be entered,
# otherwise .jupyter-port and .venv below would be resolved against the
# caller's directory.
cd "$(dirname "$0")" || exit 1

# Persist the chosen port in .jupyter-port.
echo "$PORT" > .jupyter-port

# Activate the local virtualenv when present (best effort).
source .venv/bin/activate 2>/dev/null || true

# jupyter-collaboration must be importable before launching with --collaborative.
if ! python -c "import jupyter_collaboration" 2>/dev/null; then
    echo "ERROR: jupyter-collaboration no esta instalado" >&2
    echo "Instala con: uv add jupyter-collaboration" >&2
    exit 1
fi

echo "════════════════════════════════════════════════"
echo "  Jupyter Lab + Colaboracion en puerto $PORT"
echo "════════════════════════════════════════════════"
echo ""
echo "  Abre: http://localhost:$PORT"
echo "  Ctrl+C para detener"
echo ""

# NOTE(review): token and password auth are disabled, XSRF checks are off
# and any origin is allowed. Presumably acceptable only because the server
# is meant for local use; confirm it is never reachable from other hosts.
jupyter lab \
    --port="$PORT" \
    --no-browser \
    --ServerApp.token='' \
    --ServerApp.password='' \
    --ServerApp.disable_check_xsrf=True \
    --ServerApp.allow_origin='*' \
    --ServerApp.root_dir="$(pwd)" \
    --collaborative
|
||||
Reference in New Issue
Block a user