Merge issue 0013 — Paste & Extract panel
- Panel ImGui dockable: textarea, Extract button, preview tables (entities + relations) - Subprocess directo a enrichers/paste_extract/run.py (no usa jobs system; preview puro) - Pipeline Python emite preview JSON; commit a operations.db lo hace C++ con dedupe (type_ref, name) - 12 tests pytest nuevos (paste_extract enricher + extract_panel logic) - GLiNER/GLiREL path cableado pero no ejercitado en tests (modelos pesados); validacion interactiva pendiente Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -27,6 +27,7 @@ add_imgui_app(graph_explorer
|
||||
jobs.cpp
|
||||
enrichers.cpp
|
||||
chat.cpp
|
||||
extract_panel.cpp
|
||||
# --- viz ---
|
||||
${FN_CPP_ROOT_DIR}/functions/viz/graph_renderer.cpp
|
||||
${FN_CPP_ROOT_DIR}/functions/viz/graph_force_layout.cpp
|
||||
|
||||
@@ -30,6 +30,10 @@ uses_functions:
|
||||
- fullscreen_window_cpp_core
|
||||
- badge_cpp_core
|
||||
- empty_state_cpp_core
|
||||
# paste & extract panel (issue 0013) — invoca enrichers/paste_extract/run.py
|
||||
# via subprocess directo (no via jobs); uses extract_iocs + opcional hybrid.
|
||||
- extract_iocs_py_cybersecurity
|
||||
- extract_graph_hybrid_py_pipelines
|
||||
uses_types: []
|
||||
framework: "imgui"
|
||||
entry_point: "main.cpp"
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
id: paste_extract
|
||||
name: "Paste & Extract"
|
||||
description: "Extrae entidades y relaciones de texto pegado en el panel Extract. Cascada: extract_iocs (regex) + GLiNER + GLiREL si estan disponibles, fallback a solo regex. Modo preview (no escribe). El panel C++ procesa el JSON y persiste lo seleccionado via entity_ops."
|
||||
applies_to: []
|
||||
emits: [Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone, Person, Organization, Location]
|
||||
relations: []
|
||||
uses_functions:
|
||||
- extract_iocs_py_cybersecurity
|
||||
- extract_graph_hybrid_py_pipelines
|
||||
params:
|
||||
- { name: text, type: string, default: "", description: "Texto a analizar (lo pasa el panel)" }
|
||||
- { name: types, type: string, default: "", description: "CSV de tipos IoC; vacio = todos" }
|
||||
- { name: max_entities, type: int, default: 200 }
|
||||
- { name: use_hybrid, type: bool, default: "false", description: "Si true intenta cargar GLiNER/GLiREL" }
|
||||
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Enricher paste_extract — modo preview puro para el panel "Paste & Extract".
|
||||
|
||||
A diferencia del resto de enrichers, este NO escribe a operations.db. Recibe
|
||||
el texto via `params.text` y devuelve un JSON con las entidades y relaciones
|
||||
propuestas. La aplicacion (panel C++) procesa la propuesta, el usuario marca
|
||||
cuales aceptar y la propia app persiste con dedupe via entity_ops.
|
||||
|
||||
Cascada de extraccion (graceful fallback):
|
||||
1. `extract_iocs(text, types_list)` — regex puro, siempre disponible.
|
||||
2. Si `use_hybrid=true` y los modelos cargan correctamente → ademas
|
||||
`extract_graph_hybrid` (GLiNER + GLiREL + LLM opcional). Si la
|
||||
carga del modelo falla por dependencias o por tiempo se ignora
|
||||
silenciosamente — el panel muestra solo lo que llego.
|
||||
|
||||
Wire protocol estandar (issue 0026), pero tolera ausencia de `node_id` y de
|
||||
`ops_db_path`: este enricher es global, no pertenece a un nodo.
|
||||
|
||||
Output JSON:
|
||||
{
|
||||
"entities": [
|
||||
{"id": "<temp>", "type_ref": "Email", "name": "x@y.z",
|
||||
"metadata": {...}, "source": "regex|hybrid",
|
||||
"start": 12, "end": 27, "confidence": 1.0}
|
||||
],
|
||||
"relations": [
|
||||
{"from_id": "<temp>", "to_id": "<temp>", "name": "RELATED_TO",
|
||||
"source": "hybrid", "confidence": 0.7}
|
||||
],
|
||||
"stats": {"layers": ["regex"], "n_entities": N, "n_relations": M}
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
# IoC type name (as reported by extract_iocs) -> (entity type_ref, name of
# the metadata field under which the matched value is stored).
_TYPE_MAP = dict(
    email=("Email", "address"),
    ip_address=("IPAddress", "address"),
    domain=("Domain", "name"),
    file_hash=("FileHash", "value"),
    crypto_wallet=("CryptoWallet", "address"),
    cve_id=("CVE", "id"),
    mac_address=("MACAddress", "address"),
    phone_number=("Phone", "number"),
)
|
||||
|
||||
|
||||
def progress(p: float, stage: str = "") -> None:
    """Report progress on stderr.

    Writes one line of the form ``PROGRESS:<p with two decimals> <stage>``
    and flushes the stream right away.
    """
    print(f"PROGRESS:{p:.2f} {stage}\n", end="", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Write ``msg`` followed by a newline to stderr and flush."""
    print(f"{msg}\n", end="", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _setup_registry_path(registry_root: str) -> None:
    """Make the registry's Python packages importable.

    Two locations are considered, in this order:
      1. ``<this_dir>/_vendored/`` - if that directory exists it is put at
         the front of ``sys.path`` (unless already listed) and
         ``registry_root`` is not consulted at all.
      2. ``<registry_root>/python/functions/`` - added at the front of
         ``sys.path`` when ``registry_root`` is non-empty, the directory
         exists and it is not already listed.
    """
    here = os.path.dirname(__file__)
    vendored_dir = os.path.join(here, "_vendored")
    if os.path.isdir(vendored_dir):
        if vendored_dir not in sys.path:
            sys.path.insert(0, vendored_dir)
        return

    if not registry_root:
        return

    functions_dir = os.path.join(registry_root, "python", "functions")
    if functions_dir in sys.path or not os.path.isdir(functions_dir):
        return
    sys.path.insert(0, functions_dir)
|
||||
|
||||
|
||||
def _run_regex(text: str, types_list, max_entities: int) -> tuple[list, list]:
    """Layer 1: extract IoCs with ``extract_iocs`` and shape them as proposals.

    Args:
        text: Text to analyse.
        types_list: IoC type filter, forwarded unchanged to ``extract_iocs``.
        max_entities: Upper bound on the number of entities returned. Zero
            or a negative value yields no entities.

    Returns:
        ``(entities, relations)``. ``relations`` is always an empty list
        because this layer does not produce relations. If ``extract_iocs``
        cannot be imported, the failure is logged and ``([], [])`` is
        returned.
    """
    try:
        from cybersecurity.extract_iocs import extract_iocs  # type: ignore
    except Exception as e:
        log(f"extract_iocs no importable: {e}")
        return [], []

    def _offset(value) -> int:
        # A null or non-numeric span must not abort the whole extraction;
        # -1 is the value already used for "unknown".
        try:
            return int(value)
        except (TypeError, ValueError):
            return -1

    iocs = extract_iocs(text, types_list)
    seen = set()
    entities = []
    for i, it in enumerate(iocs):
        # Check the cap before appending so max_entities <= 0 returns
        # nothing instead of one entity.
        if len(entities) >= max_entities:
            break
        ioc_type = it.get("type")
        value = it.get("value") or it.get("address") or it.get("name") or ""
        if not ioc_type or not value:
            continue
        key = (ioc_type, value)
        if key in seen:
            continue
        seen.add(key)
        # Unknown IoC types keep their own name as type_ref and store the
        # match under "value".
        type_ref, value_field = _TYPE_MAP.get(ioc_type, (ioc_type, "value"))
        entities.append({
            "id": f"tmp_e_{i}",
            "type_ref": type_ref,
            "name": value,
            "metadata": {value_field: value},
            "source": "regex",
            "start": _offset(it.get("start", -1)),
            "end": _offset(it.get("end", -1)),
            "confidence": 1.0,
        })
    return entities, []  # the regex layer produces no relations
|
||||
|
||||
|
||||
def _try_run_hybrid(text: str, registry_root: str,
                    confidence_threshold: float = 0.6,
                    ) -> tuple[list, list]:
    """Optional layer 2: run ``extract_graph_hybrid`` over ``text``.

    Any import error, model-load error or extraction error is logged and
    turned into ``([], [])`` so the panel keeps working with regex only
    (GLiNER/GLiREL may not be installed).

    Args:
        text: Text to analyse; passed to the pipeline as a single chunk.
        registry_root: Registry root. When non-empty,
            ``<registry_root>/python/functions/pipelines`` is made
            importable before the pipeline is imported.
        confidence_threshold: Forwarded to ``extract_graph_hybrid``.

    Returns:
        ``(entities, relations)`` as lists of dicts. Entity ids are
        ``tmp_h_<n>`` and relations refer to those ids. Entities are
        unique by ``(type_ref, name)``; the first occurrence wins.
    """
    try:
        # The pipeline lives under `pipelines/`. Skip the path tweak when
        # there is no registry_root (it would put a cwd-relative path at
        # the front of sys.path) and when the entry is already present.
        if registry_root:
            pipelines_dir = os.path.join(registry_root, "python", "functions",
                                         "pipelines")
            if pipelines_dir not in sys.path:
                sys.path.insert(0, pipelines_dir)
        from extract_graph_hybrid import extract_graph_hybrid  # type: ignore
        from datascience.gliner_load_model import gliner_load_model  # type: ignore
        from datascience.glirel_load_model import glirel_load_model  # type: ignore
    except Exception as e:
        log(f"hybrid pipeline no disponible: {e}")
        return [], []

    # Generic schema for semantic entities; a caller-supplied schema is
    # not supported yet.
    entity_schema = [
        {"type_ref": "Person", "label": "person"},
        {"type_ref": "Organization", "label": "organization"},
        {"type_ref": "Location", "label": "location"},
    ]
    relation_types = ["works_at", "located_in", "part_of", "related_to"]

    try:
        gliner_model = gliner_load_model()
        glirel_model = glirel_load_model()
    except Exception as e:
        log(f"hybrid: load modelo fallo: {e}")
        return [], []

    try:
        ents, rels = extract_graph_hybrid(
            chunks=[text],
            entity_schema=entity_schema,
            relation_types=relation_types,
            gliner_model=gliner_model,
            glirel_model=glirel_model,
            llm_chat_json=None,
            confidence_threshold=confidence_threshold,
        )
    except Exception as e:
        log(f"hybrid: extract fallo: {e}")
        return [], []

    def _offset(value) -> int:
        # Missing, null or non-numeric span -> -1 ("unknown").
        try:
            return int(value)
        except (TypeError, ValueError):
            return -1

    out_entities = []
    name_to_idx: dict[tuple[str, str], int] = {}
    for ec in ents:
        key = (ec.type_ref, ec.name)
        if key in name_to_idx:
            # Keep only the first occurrence. Otherwise relations would be
            # bound to a later duplicate that the caller's merge discards,
            # and they would be lost with it.
            continue
        attrs = getattr(ec, "attributes", None) or {}
        idx = len(out_entities)
        out_entities.append({
            "id": f"tmp_h_{idx}",
            "type_ref": ec.type_ref,
            "name": ec.name,
            "metadata": dict(attrs),
            "source": "hybrid",
            "start": _offset(attrs.get("start", -1)),
            "end": _offset(attrs.get("end", -1)),
            "confidence": float(ec.confidence),
        })
        name_to_idx[key] = idx

    out_relations = []
    for rc in rels:
        # Resolve both endpoints to tmp ids; relations whose endpoints
        # cannot be matched to an emitted entity are skipped.
        # NOTE(review): the from_*/to_* attribute names are read with
        # getattr - confirm they match the pipeline's relation type.
        from_key = (getattr(rc, "from_type_ref", None), getattr(rc, "from_name", None))
        to_key = (getattr(rc, "to_type_ref", None), getattr(rc, "to_name", None))
        if None in from_key or None in to_key:
            continue
        fi = name_to_idx.get(from_key)
        ti = name_to_idx.get(to_key)
        if fi is None or ti is None:
            continue
        out_relations.append({
            "from_id": f"tmp_h_{fi}",
            "to_id": f"tmp_h_{ti}",
            "name": getattr(rc, "name", "RELATED_TO") or "RELATED_TO",
            "source": "hybrid",
            "confidence": float(getattr(rc, "confidence", 0.0)),
        })
    return out_entities, out_relations
|
||||
|
||||
|
||||
def main() -> int:
    """Entry point: read the context JSON from stdin, print a preview JSON.

    The context is a JSON object; ``params`` (``text``, ``types``,
    ``max_entities``, ``use_hybrid``) and ``registry_root`` are read from
    it. Nothing is written to any database: the result is a JSON document
    with ``entities``, ``relations`` and ``stats`` printed on stdout.

    Returns:
        0 on success; 2 when stdin is not a JSON object or ``params.text``
        is missing/blank (in the latter case an error JSON is printed too).
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        # Valid JSON that is not an object (list, number, ...) has no params.
        log("stdin JSON must be an object")
        return 2

    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    registry_root = ctx.get("registry_root") or ""

    raw_text = params.get("text")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        log("missing params.text")
        print(json.dumps({"error": "missing params.text",
                          "entities": [], "relations": [],
                          "stats": {"layers": [], "n_entities": 0,
                                    "n_relations": 0}}))
        return 2

    raw_types = params.get("types")
    types_csv = raw_types.strip() if isinstance(raw_types, str) else ""
    # An empty or all-separator CSV means "no filter" (None).
    types_list = [t.strip() for t in types_csv.split(",") if t.strip()] or None

    # A null or unparsable max_entities falls back to the default; a
    # negative one is treated as zero.
    try:
        max_entities = max(0, int(params.get("max_entities", 200)))
    except (TypeError, ValueError):
        max_entities = 200

    use_hybrid_raw = params.get("use_hybrid", False)
    if isinstance(use_hybrid_raw, str):
        use_hybrid = use_hybrid_raw.strip().lower() in ("1", "true", "yes", "on")
    else:
        use_hybrid = bool(use_hybrid_raw)

    progress(0.05, "init")
    _setup_registry_path(registry_root)

    progress(0.20, "regex")
    regex_entities, _ = _run_regex(text, types_list, max_entities)
    layers = ["regex"]

    hybrid_entities: list = []
    hybrid_relations: list = []
    if use_hybrid:
        progress(0.40, "hybrid")
        hybrid_entities, hybrid_relations = _try_run_hybrid(text, registry_root)
        if hybrid_entities or hybrid_relations:
            layers.append("hybrid")

    # Merge regex + hybrid, keeping the first entity per (type_ref, name).
    # Every incoming tmp id - including ids of dropped duplicates - is
    # mapped to the entity that survived, so relations pointing at a
    # duplicate are re-pointed instead of lost.
    progress(0.85, "merge")
    kept_by_key: dict[tuple, dict] = {}
    survivor_of: dict[str, dict] = {}
    entities: list[dict] = []
    for src in (regex_entities, hybrid_entities):
        for e in src:
            key = (e["type_ref"], e["name"])
            kept = kept_by_key.get(key)
            if kept is None:
                if len(entities) >= max_entities:
                    # Over the cap: drop it, but keep scanning so later
                    # duplicates of kept entities still get mapped.
                    continue
                kept = kept_by_key[key] = e
                entities.append(e)
            survivor_of[e["id"]] = kept

    # Renumber the surviving entities as tmp_0..tmp_{N-1}.
    for i, e in enumerate(entities):
        e["id"] = f"tmp_{i}"
    id_remap: dict[str, str] = {old: kept["id"]
                                for old, kept in survivor_of.items()}

    relations: list[dict] = []
    seen_relations = set()
    for r in hybrid_relations:
        fi = id_remap.get(r["from_id"])
        ti = id_remap.get(r["to_id"])
        if fi is None or ti is None:
            continue
        rel_name = r.get("name") or "RELATED_TO"
        rel_key = (fi, ti, rel_name)
        if rel_key in seen_relations:
            # Re-pointing can make two relations identical; emit one.
            continue
        seen_relations.add(rel_key)
        relations.append({
            "from_id": fi,
            "to_id": ti,
            "name": rel_name,
            "source": r.get("source") or "hybrid",
            "confidence": float(r.get("confidence", 0.0)),
        })

    progress(1.0, "done")
    print(json.dumps({
        "entities": entities,
        "relations": relations,
        "stats": {
            "layers": layers,
            "n_entities": len(entities),
            "n_relations": len(relations),
        },
    }, ensure_ascii=False))
    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
+1079
File diff suppressed because it is too large
Load Diff
+139
@@ -0,0 +1,139 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// Panel "Paste & Extract" (issue 0013).
|
||||
//
|
||||
// Textarea grande para pegar texto. Boton Extract lanza el script
|
||||
// `enrichers/paste_extract/run.py` en un hilo aparte (no bloquea UI).
|
||||
// El script devuelve un JSON con entidades y relaciones propuestas (modo
|
||||
// preview — no escribe a operations.db). El panel muestra dos tablas
|
||||
// (entidades / relaciones) con checkboxes; al pulsar "Apply Selected"
|
||||
// se persisten via entity_ops con dedupe por (type_ref, name).
|
||||
//
|
||||
// Threading: una llamada Extract a la vez (flag atomico ExtractPanelState::busy). El hilo
|
||||
// rellena la propuesta tras hacerse el subprocess. Apply corre en el
|
||||
// thread principal y dispara reload del grafo via app.want_reload.
|
||||
//
|
||||
// El enricher esta declarado en `enrichers/paste_extract/manifest.yaml`
|
||||
// pero NO se invoca via el sistema de jobs — el panel lo lanza
|
||||
// directamente. Vivir en `enrichers/` permite que se distribuya y que
|
||||
// el script use el mismo Python runtime resolution que el resto.
|
||||
|
||||
namespace ge {

struct AppState;

// One entity proposed by the extractor. Fields are kept as strings so they
// can be edited inline before Apply.
struct ProposedEntity {
    std::string tmp_id;        // "tmp_0", "tmp_1", ... referenced by relations
    std::string type_ref;      // editable
    std::string name;          // editable
    std::string source;        // "regex" | "hybrid"
    int start_offset{-1};      // span within the pasted text
    int end_offset{-1};
    double confidence{1.0};
    std::string metadata_json; // JSON literal (not editable in v1)
    bool selected{true};

    // Mutable buffers for inline editing in ImGui.
    char type_buf[64]{};
    char name_buf[256]{};
};

// One relation proposed by the extractor; endpoints are entity tmp ids.
struct ProposedRelation {
    std::string from_tmp_id;
    std::string to_tmp_id;
    std::string name;          // e.g. "works_at"
    std::string source;        // "hybrid" | ...
    double confidence{0.0};
    bool selected{true};
};

// Outcome of one Extract run.
struct ExtractResult {
    std::vector<ProposedEntity> entities;
    std::vector<ProposedRelation> relations;
    std::vector<std::string> layers;
    std::string error;         // empty when OK
    std::string stderr_tail;
};

struct ExtractPanelState {
    // Text buffer behind the textarea. Grows dynamically.
    std::vector<char> text_buf;
    bool text_initialized{false};

    // Result of the last Extract (filled in by the worker thread).
    std::shared_ptr<ExtractResult> result;
    std::mutex result_mu;
    std::atomic<bool> busy{false};
    std::atomic<bool> new_result{false};  // a fresh result is available

    // Status message (shown in the footer), refreshed by the worker.
    std::string status;

    // Stats of the last apply.
    int last_apply_entities{0};
    int last_apply_relations{0};
    int last_apply_dedup{0};

    // Toggle: use hybrid (GLiNER/GLiREL) if it is available?
    bool use_hybrid{false};

    // Worker thread; joinable while it is alive.
    std::thread worker;
};

// Configures the paths the worker needs to invoke Python. Call once after
// `jobs_init` (re-uses the Python runtime resolver + paths).
void extract_panel_init(
    const char* enrichers_dir,
    const char* app_dir,
    const char* registry_root);

// Releases the worker thread if it is running (cancelable). Call at app
// shutdown.
void extract_panel_shutdown();

// Renders the panel. Returns without drawing if app.panel_extract is false.
void extract_panel_render(AppState& app);

// Applies the entities/relations marked as selected to the given
// operations.db. Inserts new entities with dedupe by (type_ref, name),
// re-using the existing id when one is found. Then inserts the relations
// whose endpoints (mapped tmp_id -> real_id) are both valid.
//
// Counts are returned through out_added_entities, out_dedup_entities and
// out_added_relations. Relations that do not resolve are tolerated and
// counted in out_skipped_relations. The caller decides whether to set
// app.want_reload.
//
// This function is testable in isolation (it does not touch ImGui).
bool extract_panel_apply(
    const char* ops_db_path,
    const ExtractResult& result,
    int* out_added_entities,
    int* out_dedup_entities,
    int* out_added_relations,
    int* out_skipped_relations);

// Internal helper exposed for tests: parses the JSON produced by
// `enrichers/paste_extract/run.py`. Returns true if parsing succeeded.
// On error, result.error is filled in.
bool extract_panel_parse_result(
    const std::string& json_text,
    ExtractResult* result);

// Spawns the Python subprocess that performs the extraction. Synchronous
// (blocks the caller's thread); the panel calls it from a separate
// std::thread so the UI does not freeze. Exposed in case tests want to
// call it directly (not for now: the Python side is covered by pytest and
// the C++ side through parse_result + apply).
bool extract_panel_run_subprocess(
    const std::string& text,
    bool use_hybrid,
    ExtractResult* out);

}  // namespace ge
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "jobs.h"
|
||||
#include "enrichers.h"
|
||||
#include "chat.h"
|
||||
#include "extract_panel.h"
|
||||
|
||||
#include "../../../../cpp/vendor/sqlite3/sqlite3.h"
|
||||
|
||||
@@ -1235,6 +1236,7 @@ static fn_ui::PanelToggle g_panels[] = {
|
||||
{"Table", nullptr, &g_app.panel_table},
|
||||
{"Jobs", nullptr, &g_app.panel_jobs},
|
||||
{"Echo", nullptr, &g_app.panel_chat},
|
||||
{"Extract", nullptr, &g_app.panel_extract},
|
||||
};
|
||||
|
||||
static void render() {
|
||||
@@ -2178,6 +2180,12 @@ static void render() {
|
||||
ImGui::SetNextWindowSize(ImVec2(520.0f, 720.0f), ImGuiCond_FirstUseEver);
|
||||
ge::chat_render(&g_app.panel_chat);
|
||||
|
||||
// Extract panel (issue 0013) — flotante, dockeable.
|
||||
ImGui::SetNextWindowPos (ImVec2(vp->WorkPos.x + W * 0.30f, top + 50.0f),
|
||||
ImGuiCond_FirstUseEver);
|
||||
ImGui::SetNextWindowSize(ImVec2(720.0f, 640.0f), ImGuiCond_FirstUseEver);
|
||||
ge::extract_panel_render(g_app);
|
||||
|
||||
// Enricher config window (abierto desde context menu Run enricher).
|
||||
render_enricher_config_window();
|
||||
|
||||
@@ -2484,6 +2492,12 @@ int main(int argc, char** argv) {
|
||||
(int)ge::enrichers_all().size());
|
||||
}
|
||||
|
||||
// Extract panel (issue 0013) — invoca enrichers/paste_extract/run.py
|
||||
// directamente en su propio hilo, sin pasar por el sistema de jobs.
|
||||
ge::extract_panel_init(enrichers_dir.c_str(),
|
||||
app_dir.c_str(),
|
||||
registry_root.c_str());
|
||||
|
||||
// Chat panel (claude -p) — el agente invoca gx-cli para mutar
|
||||
// operations.db. agent_mutations counter en graph_explorer.db dispara
|
||||
// reload del viewport en cada cambio.
|
||||
@@ -2573,6 +2587,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
// Cleanup
|
||||
ge::chat_shutdown();
|
||||
ge::extract_panel_shutdown();
|
||||
ge::jobs_shutdown();
|
||||
if (g_layout_storage) {
|
||||
fn_ui::layout_storage_close(g_layout_storage);
|
||||
|
||||
@@ -0,0 +1,367 @@
|
||||
"""Tests del enricher paste_extract (issue 0013).
|
||||
|
||||
paste_extract es modo PREVIEW puro: no escribe a operations.db. Recibe
|
||||
texto via params.text y devuelve un JSON con entidades y relaciones
|
||||
propuestas. La aplicacion (panel C++) procesa el JSON y persiste con
|
||||
dedupe via el codigo C++ (probado en TU separadas si se quisiera).
|
||||
|
||||
Decision: NO probamos la cascada hibrida (GLiNER+GLiREL) en pytest —
|
||||
los modelos pesan cientos de MB y tardan segundos en cargar. El
|
||||
contrato del script en `use_hybrid=false` es lo que cubre el panel
|
||||
en la primera iteracion. Si hybrid esta disponible, simplemente
|
||||
añade entidades adicionales: la logica de merge y dedupe se ejerce
|
||||
con regex+regex (mismo texto pasado dos veces) y con stubs en otros
|
||||
tests.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, run_enricher, SCHEMA_SQL,
|
||||
)
|
||||
|
||||
|
||||
def _resolve_real_registry_root() -> Path | None:
    """Locate the real fn_registry root, or return ``None``.

    A directory qualifies only when it contains BOTH ``registry.db`` and
    ``cmd/fn/main.go``. Candidates are tried in this order:

      1. the ``FN_REGISTRY_ROOT`` environment variable (needed in
         worktrees, where the repo is not an ancestor of this file);
      2. every ancestor directory of this file, nearest first;
      3. two well-known locations: ``~/fn_registry`` and
         ``/home/lucas/fn_registry``.
    """
    def _has_markers(candidate: Path) -> bool:
        if not (candidate / "registry.db").exists():
            return False
        return (candidate / "cmd" / "fn" / "main.go").exists()

    override = os.environ.get("FN_REGISTRY_ROOT")
    if override:
        override_path = Path(override)
        if _has_markers(override_path):
            return override_path

    for parent_dir in Path(__file__).resolve().parents:
        if _has_markers(parent_dir):
            return parent_dir

    # Hard-coded fallbacks, tried only after the ancestor search fails.
    well_known = (Path.home() / "fn_registry", Path("/home/lucas/fn_registry"))
    return next((c for c in well_known if _has_markers(c)), None)
|
||||
|
||||
|
||||
REAL_REGISTRY_ROOT = _resolve_real_registry_root()
|
||||
|
||||
|
||||
@pytest.fixture
def real_registry_root():
    """Fixture yielding the resolved registry root.

    Use it instead of `registry_root` when the enricher needs to import
    Python packages from the registry. The test is skipped when no root
    was found.
    """
    root = REAL_REGISTRY_ROOT
    if root is not None:
        return root
    pytest.skip("fn_registry root not found from this worktree")
    return root
|
||||
|
||||
|
||||
SAMPLE_BANKING = (
|
||||
"Acme Corp anuncio que su CEO bad@evil.com firmo un acuerdo. "
|
||||
"Servidores afectados: 192.0.2.55 y 10.0.0.12. "
|
||||
"Vulnerabilidad: CVE-2024-12345. Hash IOC: 44d88612fea8a8f36de82e1278abb02f."
|
||||
)
|
||||
|
||||
|
||||
def _make_ctx(*, ops_db, app_dir, registry_root, text, **params):
    """Build an enricher context for paste_extract.

    paste_extract needs neither node_id nor ops_db_path, so the node
    fields are passed as empty strings. ``params`` is set to ``text`` plus
    any extra keyword arguments.
    """
    payload = {"text": text}
    payload.update(params)
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="",
        node_name="",
        node_type="",
    )
    ctx["params"] = payload
    return ctx
|
||||
|
||||
|
||||
def test_paste_extract_returns_entities_no_db_write(ops_db, app_dir, real_registry_root):
    """Preview mode: entities are parsed but NOTHING is written to operations.db."""
    # The docstring must be the first statement to be a docstring at all;
    # the alias assignment used to precede it.
    registry_root = real_registry_root
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=SAMPLE_BANKING)
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 0, err
    assert out is not None
    assert "entities" in out
    assert "relations" in out
    assert "stats" in out
    assert out["stats"]["layers"] == ["regex"]

    # Expected types (Email and CVE are the ones asserted here).
    types = {e["type_ref"] for e in out["entities"]}
    assert "Email" in types, types
    assert "CVE" in types, types

    # Every entity carries the fields of the contract.
    for e in out["entities"]:
        assert isinstance(e["id"], str) and e["id"].startswith("tmp_"), e
        assert e["type_ref"] and e["name"]
        assert e["source"] in ("regex", "hybrid")
        assert "metadata" in e
        # start/end are ints.
        assert isinstance(e["start"], int)
        assert isinstance(e["end"], int)

    # Crucial: nothing was written to the DB (preview mode).
    assert list_entities(ops_db) == []
    assert list_relations(ops_db) == []
|
||||
|
||||
|
||||
def test_paste_extract_dedupes_within_run(ops_db, app_dir, real_registry_root):
    """Text with duplicates: each (type_ref, name) appears exactly once."""
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    text = ("Email a foo@bar.com y otra vez foo@bar.com. "
            "IP 192.0.2.10. Repite IP 192.0.2.10.")
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=text)
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 0, err

    keys = [(e["type_ref"], e["name"]) for e in out["entities"]]
    assert len(keys) == len(set(keys)), keys
    assert ("Email", "foo@bar.com") in keys
    assert ("IPAddress", "192.0.2.10") in keys
|
||||
|
||||
|
||||
def test_paste_extract_empty_text_fails_clean(ops_db, app_dir, real_registry_root):
    """No params.text: exit code 2 plus a JSON with an error and no entities."""
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text="")
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 2, err
    assert out is not None
    assert out["entities"] == []
    assert "error" in out
|
||||
|
||||
|
||||
def test_paste_extract_max_entities_truncates(ops_db, app_dir, real_registry_root):
    """max_entities=N cuts the list down to the first N entities found."""
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    text = " ".join(f"contact{i:03d}@example.org" for i in range(50))
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=text, max_entities=10)
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 0, err
    assert len(out["entities"]) == 10
|
||||
|
||||
|
||||
def test_paste_extract_types_filter(ops_db, app_dir, real_registry_root):
    """params.types filters which IoC types are extracted."""
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=SAMPLE_BANKING, types="email")
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 0, err
    types = {e["type_ref"] for e in out["entities"]}
    # Email only: with the "email" filter no other type may come back.
    assert types == {"Email"}, types
|
||||
|
||||
|
||||
def test_paste_extract_use_hybrid_false_skips_layer(ops_db, app_dir, real_registry_root):
    """use_hybrid=False: stats.layers is exactly ['regex']."""
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=SAMPLE_BANKING, use_hybrid=False)
    rc, out, err = run_enricher("paste_extract", ctx)
    assert rc == 0, err
    assert out["stats"]["layers"] == ["regex"]
|
||||
|
||||
|
||||
def test_paste_extract_idempotent_runs_no_duplicate_proposal(
        ops_db, app_dir, real_registry_root):
    """Running paste_extract twice on the same text gives the same proposal.

    Deduplication at *commit* time is the C++ panel's responsibility; this
    test only checks that both preview runs return the same set of
    (type_ref, name) pairs.
    """
    # Docstring moved to the first statement so it is a real docstring.
    registry_root = real_registry_root
    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                    text=SAMPLE_BANKING)
    rc1, out1, _ = run_enricher("paste_extract", ctx)
    rc2, out2, _ = run_enricher("paste_extract", ctx)
    assert rc1 == 0 and rc2 == 0
    keys1 = sorted((e["type_ref"], e["name"]) for e in out1["entities"])
    keys2 = sorted((e["type_ref"], e["name"]) for e in out2["entities"])
    assert keys1 == keys2
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Apply-side tests — replican la logica de extract_panel_apply en Python para
|
||||
# verificar el contrato de dedupe que el panel C++ implementa. Ejercitan que
|
||||
# (1) entidades nuevas se insertan, (2) duplicadas (type_ref, name) reusan id,
|
||||
# (3) las relaciones cuyos endpoints estan en la BD se persisten, (4) las que
|
||||
# no, se descartan.
|
||||
#
|
||||
# El panel C++ se prueba al compilar (build verde) y en runtime via la CLI;
|
||||
# aqui validamos el *contrato* del JSON output que el panel consume.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _apply_proposal_python(ops_db_path, proposal: dict) -> dict:
    """Python reference implementation of the apply step.

    Selected entities are inserted into ``entities`` unless a row with the
    same (type_ref, name) already exists, in which case that row's id is
    reused. Selected relations are inserted into ``relations`` when both
    endpoints resolve to an entity handled in this call and no relation
    with the same (from, to, name) exists; otherwise they count as skipped.
    Items without a ``selected`` key are treated as selected.

    Returns:
        Dict with ``added_entities``, ``dedup_entities``,
        ``added_relations`` and ``skipped_relations`` counts.
    """
    stamp = "2026-01-01T00:00:00Z"
    real_ids = {}  # proposal tmp id -> id of the row in the DB
    counts = {"added_entities": 0, "dedup_entities": 0,
              "added_relations": 0, "skipped_relations": 0}

    conn = sqlite3.connect(ops_db_path)
    try:
        for pos, ent in enumerate(proposal.get("entities", [])):
            if not ent.get("selected", True):
                continue
            type_ref = ent["type_ref"]
            ent_name = ent["name"]
            row = conn.execute(
                "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
                (type_ref, ent_name)).fetchone()
            if row:
                real_ids[ent["id"]] = row[0]
                counts["dedup_entities"] += 1
                continue
            fresh_id = f"{type_ref}_{pos}_{ent_name}"
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, "
                " metadata, created_at, updated_at) "
                "VALUES (?, ?, ?, 'panel:paste_extract', ?, ?, ?)",
                (fresh_id, ent_name, type_ref,
                 json.dumps(ent.get("metadata", {})), stamp, stamp))
            real_ids[ent["id"]] = fresh_id
            counts["added_entities"] += 1

        for pos, rel in enumerate(proposal.get("relations", [])):
            if not rel.get("selected", True):
                continue
            src_id = real_ids.get(rel["from_id"])
            dst_id = real_ids.get(rel["to_id"])
            if not (src_id and dst_id):
                counts["skipped_relations"] += 1
                continue
            rel_name = rel.get("name") or "RELATED_TO"
            already = conn.execute(
                "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
                "AND name=? LIMIT 1", (src_id, dst_id, rel_name)).fetchone()
            if already:
                counts["skipped_relations"] += 1
                continue
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, "
                " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
                (f"rel_{pos}_{rel_name}", rel_name, src_id, dst_id, stamp, stamp))
            counts["added_relations"] += 1

        conn.commit()
        return counts
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_apply_inserts_only_selected(ops_db):
    """Only entities flagged ``selected=True`` are inserted.

    Three entities are proposed and the middle one is unselected; the
    apply step must report two additions and ``list_entities`` must show
    only the two selected type_refs.
    """
    # (temporary id, type_ref, name, selected) -- the IPAddress row is the
    # deliberately unselected one.
    specs = [
        ("tmp_0", "Email", "a@b.com", True),
        ("tmp_1", "IPAddress", "1.2.3.4", False),
        ("tmp_2", "CVE", "CVE-2024-1", True),
    ]
    candidates = [
        {"id": tmp_id, "type_ref": type_ref, "name": label,
         "metadata": {}, "selected": chosen}
        for tmp_id, type_ref, label, chosen in specs
    ]

    outcome = _apply_proposal_python(
        ops_db, {"entities": candidates, "relations": []})

    assert outcome["added_entities"] == 2
    stored_types = set()
    for row in list_entities(ops_db):
        stored_types.add(row["type_ref"])
    assert stored_types == {"Email", "CVE"}
|
||||
|
||||
|
||||
def test_apply_dedupes_by_type_and_name(ops_db):
    """Re-applying the same proposal does not duplicate its entity.

    The first application adds the entity; the second must count it as a
    dedup hit and leave a single row visible through ``list_entities``.
    """
    email = {"id": "tmp_0", "type_ref": "Email", "name": "x@y.z",
             "metadata": {}, "selected": True}
    payload = {"entities": [email], "relations": []}

    first_run = _apply_proposal_python(ops_db, payload)
    second_run = _apply_proposal_python(ops_db, payload)

    assert first_run["added_entities"] == 1
    assert second_run["added_entities"] == 0
    assert second_run["dedup_entities"] == 1
    # Exactly one row is listed after both runs.
    stored = list_entities(ops_db)
    assert len(stored) == 1
|
||||
|
||||
|
||||
def test_apply_inserts_relations_when_endpoints_resolve(ops_db):
    """A selected relation is persisted when both endpoints are selected."""
    alice = {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
             "metadata": {}, "selected": True}
    acme = {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
            "metadata": {}, "selected": True}
    # The relation references its endpoints by their temporary proposal ids.
    works_at = {"from_id": alice["id"], "to_id": acme["id"],
                "name": "works_at", "selected": True}

    outcome = _apply_proposal_python(
        ops_db, {"entities": [alice, acme], "relations": [works_at]})

    assert outcome["added_entities"] == 2
    assert outcome["added_relations"] == 1
    stored = list_relations(ops_db, name="works_at")
    assert len(stored) == 1
|
||||
|
||||
|
||||
def test_apply_skips_relation_if_endpoint_unselected(ops_db):
    """A relation is dropped when one of its endpoints is not selected."""
    alice = {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
             "metadata": {}, "selected": True}
    # Unselected on purpose: this endpoint is never inserted.
    acme = {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
            "metadata": {}, "selected": False}
    works_at = {"from_id": alice["id"], "to_id": acme["id"],
                "name": "works_at", "selected": True}

    outcome = _apply_proposal_python(
        ops_db, {"entities": [alice, acme], "relations": [works_at]})

    assert outcome["added_entities"] == 1
    assert outcome["added_relations"] == 0
    assert outcome["skipped_relations"] == 1
|
||||
|
||||
|
||||
def test_apply_dedupes_relation_on_repeat(ops_db):
    """A repeated (from, to, name) relation is not inserted a second time.

    The second application of the same proposal must add no relation and
    report it as skipped instead.
    """
    alice = {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
             "metadata": {}, "selected": True}
    acme = {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
            "metadata": {}, "selected": True}
    works_at = {"from_id": alice["id"], "to_id": acme["id"],
                "name": "works_at", "selected": True}
    payload = {"entities": [alice, acme], "relations": [works_at]}

    first_run = _apply_proposal_python(ops_db, payload)
    second_run = _apply_proposal_python(ops_db, payload)

    assert first_run["added_relations"] == 1
    assert second_run["added_relations"] == 0
    assert second_run["skipped_relations"] == 1
|
||||
Reference in New Issue
Block a user