feat: enrichers, panel de ingest y menu contextual en el grafo

- Añade enricher.go + directorio enrichers/ para enriquecer entidades con fuentes externas. - Nuevos componentes frontend: IngestPanel (panel de ingesta de datos) y NodeContextMenu (menu contextual sobre nodos del grafo). - Retira SearchBar y lib/utils.ts; la busqueda se integra dentro de los paneles existentes. - Ajusta tipos (types.go, types.ts, wailsjs/go) y theming (postcss + app.css + Mantine). - Actualiza app.go y wails.json para exponer las nuevas capacidades. - Añade directorio projects/ con estado inicial. - Rebuild del frontend (dist actualizado).
2026-04-13 23:32:55 +02:00
parent 23198eee0c
commit c9fd4aa84c
42 changed files with 2615 additions and 1543 deletions
@@ -0,0 +1,58 @@
+"""Enricher: Extract text from a document file."""
+
+import sys
+import json
+import os
+
+sys.path.insert(0, os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), "python", "functions", "core"))
+
+from extract_text_from_file import extract_text_from_file
+
+
+def main():
+    entity = json.load(sys.stdin)
+    file_path = (entity.get("metadata") or {}).get("file_path", "")
+
+    if not file_path:
+        json.dump({"error": "No file_path in entity metadata"}, sys.stdout)
+        return
+
+    if not os.path.exists(file_path):
+        json.dump({"error": f"File not found: {file_path}"}, sys.stdout)
+        return
+
+    text = extract_text_from_file(file_path)
+
+    result = {
+        "entities": [
+            {
+                "name": f"Text: {os.path.basename(file_path)}",
+                "type_ref": "text",
+                "description": f"Text extracted from {os.path.basename(file_path)}",
+                "tags": ["extracted"],
+                "metadata": {
+                    "content_preview": text[:500],
+                    "source": file_path,
+                    "char_count": len(text),
+                    "full_content": text,
+                },
+                "notes": "",
+            }
+        ],
+        "relations": [
+            {
+                "name": "extracted_from",
+                "from_entity": "__NEW_0__",
+                "to_entity": "__SOURCE__",
+                "description": f"Text extracted from document",
+                "weight": 1.0,
+                "tags": [],
+                "notes": "",
+            }
+        ],
+    }
+    json.dump(result, sys.stdout, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,243 @@
+"""Enricher: Extract entities + relations from text using LLM (claude -p haiku)."""
+
+import sys
+import json
+import os
+import subprocess
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Registry functions
+ROOT = os.environ.get("FN_REGISTRY_ROOT", "")
+sys.path.insert(0, os.path.join(ROOT, "python", "functions", "core"))
+sys.path.insert(0, os.path.join(ROOT, "python", "functions", "datascience"))
+sys.path.insert(0, os.path.join(ROOT, "python", "functions", "cybersecurity"))
+sys.path.insert(0, os.path.join(ROOT, "analysis", "ontology_graph", "lib"))
+
+from core_functions import extract_json_from_llm, preprocess_text
+from split_text_into_chunks import split_text_into_chunks
+from deduplicate_entities import deduplicate_entities
+from deduplicate_relations import deduplicate_relations
+
+# ── Presets ────────────────────────────────────────────────────────────────────
+
+ENTITY_PRESETS = [
+    {"type_ref": "person", "label": "Person",
+     "metadata_fields": ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]},
+    {"type_ref": "organization", "label": "Organization",
+     "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"]},
+    {"type_ref": "location", "label": "Location",
+     "metadata_fields": ["lat", "lon", "address", "country", "city"]},
+    {"type_ref": "event", "label": "Event",
+     "metadata_fields": ["event_type", "date", "location", "description", "severity"]},
+    {"type_ref": "email", "label": "Email",
+     "metadata_fields": ["address", "provider", "verified", "breached"]},
+    {"type_ref": "domain", "label": "Domain",
+     "metadata_fields": ["fqdn", "registrar", "created_date", "expires_date"]},
+    {"type_ref": "ip_address", "label": "IP Address",
+     "metadata_fields": ["ip", "asn", "country", "isp", "geolocation"]},
+    {"type_ref": "phone", "label": "Phone",
+     "metadata_fields": ["number", "country_code", "carrier", "phone_type"]},
+    {"type_ref": "document", "label": "Document",
+     "metadata_fields": ["title", "format", "classification", "source"]},
+    {"type_ref": "url", "label": "URL/Link",
+     "metadata_fields": ["url", "domain", "context"]},
+    {"type_ref": "concept", "label": "Concept",
+     "metadata_fields": ["name", "category", "definition"]},
+    {"type_ref": "date_reference", "label": "Date/Time",
+     "metadata_fields": ["date", "precision", "context"]},
+    {"type_ref": "quantity", "label": "Quantity/Amount",
+     "metadata_fields": ["value", "unit", "context"]},
+]
+
+RELATION_TYPES = [
+    "employs", "works_for", "founded", "owns", "controls",
+    "member_of", "affiliated_with", "collaborates_with",
+    "communicates_with", "sent_to", "received_from",
+    "located_in", "headquartered_in", "operates_in",
+    "participated_in", "caused", "occurred_at", "occurred_on",
+    "mentions", "references", "describes", "authored", "published",
+    "funds", "transacted_with", "invested_in",
+    "hosts", "resolves_to", "exploits", "targets",
+    "related_to", "part_of", "instance_of", "has_attribute",
+]
+
+# ── Load custom presets ────────────────────────────────────────────────────────
+
+CUSTOM_PRESETS_PATH = os.path.join(ROOT, "analysis", "ontology_graph", "data", "custom_presets.json")
+
+def load_custom_presets():
+    if os.path.exists(CUSTOM_PRESETS_PATH):
+        with open(CUSTOM_PRESETS_PATH) as f:
+            data = json.load(f)
+        return [p for p in data.get("presets", []) if not p.get("promoted", False)]
+    return []
+
+# ── LLM ────────────────────────────────────────────────────────────────────────
+
+def claude_haiku_json(messages):
+    parts = []
+    for msg in messages:
+        if msg["role"] == "system":
+            parts.append(f"[SYSTEM]\n{msg['content']}")
+        elif msg["role"] == "user":
+            parts.append(f"[USER]\n{msg['content']}")
+    prompt = "\n\n".join(parts)
+
+    result = subprocess.run(
+        ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt],
+        capture_output=True, text=True, timeout=120,
+    )
+    if result.returncode != 0:
+        return {}
+    envelope = json.loads(result.stdout)
+    return extract_json_from_llm(envelope.get("result", ""))
+
+# ── Prompt ─────────────────────────────────────────────────────────────────────
+
+def build_prompt(presets, rel_types):
+    type_lines = []
+    for p in presets:
+        fields = ", ".join(p.get("metadata_fields", []))
+        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")
+
+    return (
+        "You are an entity and relation extraction expert. "
+        "Given text, extract ALL entities and relations in a single pass.\n\n"
+        "ENTITY TYPES:\n" + "\n".join(type_lines) + "\n\n"
+        "RELATION TYPES: " + ", ".join(rel_types) + "\n\n"
+        'OUTPUT FORMAT (strict JSON):\n'
+        '{\n'
+        '  "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],\n'
+        '  "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}]\n'
+        '}\n\n'
+        "RULES:\n"
+        "- Extract ALL entities explicitly mentioned\n"
+        "- Use exact type_ref from schema. Unknown attributes = null\n"
+        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied\n"
+        "- Relations: from_name/to_name MUST match entity names exactly\n"
+        "- Respond in the same language as the text for descriptions"
+    )
+
+# ── Process chunk ──────────────────────────────────────────────────────────────
+
+def process_chunk(chunk_text, system_prompt):
+    try:
+        resp = claude_haiku_json([
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": chunk_text},
+        ])
+        return resp.get("entities", []), resp.get("relations", [])
+    except Exception:
+        return [], []
+
+# ── Main ───────────────────────────────────────────────────────────────────────
+
+def main():
+    entity = json.load(sys.stdin)
+    text = (entity.get("metadata") or {}).get("full_content", "")
+
+    if not text:
+        json.dump({"error": "No text content in entity metadata"}, sys.stdout)
+        return
+
+    text = preprocess_text(text)
+    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
+
+    all_presets = ENTITY_PRESETS + load_custom_presets()
+    system_prompt = build_prompt(all_presets, RELATION_TYPES)
+
+    # Parallel extraction
+    from entity_candidate import EntityCandidate
+    from relation_candidate import RelationCandidate
+
+    all_entities = []
+    all_relations_raw = []
+
+    with ThreadPoolExecutor(max_workers=4) as pool:
+        futures = {pool.submit(process_chunk, chunk, system_prompt): i for i, chunk in enumerate(chunks)}
+        for future in as_completed(futures):
+            ents, rels = future.result()
+            for e in ents:
+                name = e.get("name", "").strip()
+                if name and e.get("confidence", 0) >= 0.5:
+                    all_entities.append(EntityCandidate(
+                        name=name,
+                        type_ref=e.get("type_ref", "concept"),
+                        attributes=e.get("attributes", {}),
+                        confidence=float(e.get("confidence", 0.5)),
+                        source_chunk_indices=[futures[future]],
+                    ))
+            for r in rels:
+                fn = r.get("from_name", "").strip()
+                tn = r.get("to_name", "").strip()
+                if fn and tn:
+                    all_relations_raw.append(RelationCandidate(
+                        from_name=fn, to_name=tn,
+                        relation_type=r.get("relation_type", "related_to"),
+                        confidence=float(r.get("confidence", 0.5)),
+                        description=r.get("description", ""),
+                        source_chunk_index=futures[future],
+                    ))
+
+    # Dedup
+    if all_entities:
+        dedup = deduplicate_entities(all_entities, name_threshold=0.85)
+        final_entities = dedup.entities
+        entity_id_map = dedup.name_to_id
+        final_relations = deduplicate_relations(all_relations_raw, entity_id_map)
+    else:
+        final_entities = []
+        final_relations = []
+
+    # Convert to enricher output format
+    entities_out = []
+    relations_out = []
+
+    for i, e in enumerate(final_entities):
+        attrs = {k: str(v) for k, v in (e.attributes or {}).items() if v is not None}
+        entities_out.append({
+            "name": e.name,
+            "type_ref": e.type_ref,
+            "description": f"Extracted from text ({e.confidence:.0%} confidence)",
+            "tags": ["extracted", "llm"],
+            "metadata": attrs,
+            "notes": "",
+        })
+
+    # Build name→index map for relations
+    name_to_idx = {}
+    for i, e in enumerate(final_entities):
+        name_to_idx[e.name] = i
+        name_to_idx[e.name.lower().strip()] = i
+
+    for r in final_relations:
+        from_idx = name_to_idx.get(r.from_name) or name_to_idx.get(r.from_name.lower().strip())
+        to_idx = name_to_idx.get(r.to_name) or name_to_idx.get(r.to_name.lower().strip())
+        if from_idx is not None and to_idx is not None:
+            relations_out.append({
+                "name": r.relation_type,
+                "from_entity": f"__NEW_{from_idx}__",
+                "to_entity": f"__NEW_{to_idx}__",
+                "description": r.description,
+                "weight": r.confidence,
+                "tags": ["extracted"],
+                "notes": "",
+            })
+
+    # Also connect all entities to source text node
+    for i in range(len(entities_out)):
+        relations_out.append({
+            "name": "extracted_from",
+            "from_entity": f"__NEW_{i}__",
+            "to_entity": "__SOURCE__",
+            "description": "Entity extracted from text",
+            "weight": 1.0,
+            "tags": [],
+            "notes": "",
+        })
+
+    json.dump({"entities": entities_out, "relations": relations_out}, sys.stdout, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,71 @@
+"""Enricher: Extract URLs from a text node."""
+
+import sys
+import json
+import os
+
+sys.path.insert(0, os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), "python", "functions", "cybersecurity"))
+
+from cybersecurity import extract_urls
+
+
+def main():
+    entity = json.load(sys.stdin)
+    text = (entity.get("metadata") or {}).get("full_content", "")
+
+    if not text:
+        text = entity.get("description", "")
+
+    if not text:
+        json.dump({"error": "No text content found in entity"}, sys.stdout)
+        return
+
+    urls = extract_urls(text)
+
+    # Deduplicate
+    seen = set()
+    unique_urls = []
+    for u in urls:
+        normalized = u.rstrip("/").lower()
+        if normalized not in seen:
+            seen.add(normalized)
+            unique_urls.append(u)
+
+    entities = []
+    relations = []
+
+    for i, url in enumerate(unique_urls):
+        # Extract domain from URL
+        domain = ""
+        try:
+            from urllib.parse import urlparse
+            domain = urlparse(url).netloc
+        except Exception:
+            pass
+
+        entities.append({
+            "name": url[:80],
+            "type_ref": "url",
+            "description": f"URL found in text",
+            "tags": ["extracted"],
+            "metadata": {
+                "url": url,
+                "domain": domain,
+            },
+            "notes": "",
+        })
+        relations.append({
+            "name": "contains",
+            "from_entity": "__SOURCE__",
+            "to_entity": f"__NEW_{i}__",
+            "description": "URL found in text",
+            "weight": 1.0,
+            "tags": [],
+            "notes": "",
+        })
+
+    json.dump({"entities": entities, "relations": relations}, sys.stdout, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,52 @@
+"""Enricher: Fetch HTTP headers for a URL and update entity metadata."""
+
+import sys
+import json
+
+try:
+    import httpx
+except ImportError:
+    import urllib.request
+
+
+def main():
+    entity = json.load(sys.stdin)
+    url = (entity.get("metadata") or {}).get("url") or entity.get("name", "")
+
+    if not url:
+        json.dump({"error": "No URL found in entity"}, sys.stdout)
+        return
+
+    try:
+        try:
+            resp = httpx.head(url, follow_redirects=True, timeout=10)
+            headers = dict(resp.headers)
+            status = resp.status_code
+        except NameError:
+            req = urllib.request.Request(url, method="HEAD")
+            with urllib.request.urlopen(req, timeout=10) as resp:
+                headers = dict(resp.headers)
+                status = resp.status
+    except Exception as e:
+        json.dump({"error": f"Failed to fetch headers: {e}"}, sys.stdout)
+        return
+
+    # Return metadata update for the source entity
+    result = {
+        "entities": [],
+        "relations": [],
+        "metadata_update": {
+            "entity_id": entity["id"],
+            "metadata": {
+                "status_code": status,
+                "server": headers.get("server", ""),
+                "content_type": headers.get("content-type", ""),
+                "x_powered_by": headers.get("x-powered-by", ""),
+            },
+        },
+    }
+    json.dump(result, sys.stdout, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,54 @@
+"""Enricher: Fetch URL and produce a text node."""
+
+import sys
+import json
+import os
+
+sys.path.insert(0, os.path.join(os.environ.get("FN_REGISTRY_ROOT", ""), "python", "functions", "core"))
+
+from fetch_and_parse_url import fetch_and_parse_url
+
+
+def main():
+    entity = json.load(sys.stdin)
+    url = (entity.get("metadata") or {}).get("url") or entity.get("name", "")
+
+    if not url:
+        json.dump({"error": "No URL found in entity metadata or name"}, sys.stdout)
+        return
+
+    text = fetch_and_parse_url(url)
+
+    result = {
+        "entities": [
+            {
+                "name": f"Text: {url[:60]}",
+                "type_ref": "text",
+                "description": f"Text extracted from {url}",
+                "tags": ["extracted"],
+                "metadata": {
+                    "content_preview": text[:500],
+                    "source": url,
+                    "char_count": len(text),
+                    "full_content": text,
+                },
+                "notes": "",
+            }
+        ],
+        "relations": [
+            {
+                "name": "extracted_from",
+                "from_entity": "__NEW_0__",
+                "to_entity": "__SOURCE__",
+                "description": "Text extracted from URL",
+                "weight": 1.0,
+                "tags": [],
+                "notes": "",
+            }
+        ],
+    }
+    json.dump(result, sys.stdout, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()