diff --git a/CMakeLists.txt b/CMakeLists.txt index 2205c01..26d27fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ add_imgui_app(graph_explorer jobs.cpp enrichers.cpp chat.cpp + extract_panel.cpp # --- viz --- ${FN_CPP_ROOT_DIR}/functions/viz/graph_renderer.cpp ${FN_CPP_ROOT_DIR}/functions/viz/graph_force_layout.cpp diff --git a/app.md b/app.md index d530d7b..89d321c 100644 --- a/app.md +++ b/app.md @@ -30,6 +30,10 @@ uses_functions: - fullscreen_window_cpp_core - badge_cpp_core - empty_state_cpp_core + # paste & extract panel (issue 0013) — invoca enrichers/paste_extract/run.py + # via subprocess directo (no via jobs); uses extract_iocs + opcional hybrid. + - extract_iocs_py_cybersecurity + - extract_graph_hybrid_py_pipelines uses_types: [] framework: "imgui" entry_point: "main.cpp" diff --git a/enrichers/paste_extract/manifest.yaml b/enrichers/paste_extract/manifest.yaml new file mode 100644 index 0000000..1404589 --- /dev/null +++ b/enrichers/paste_extract/manifest.yaml @@ -0,0 +1,14 @@ +id: paste_extract +name: "Paste & Extract" +description: "Extrae entidades y relaciones de texto pegado en el panel Extract. Cascada: extract_iocs (regex) + GLiNER + GLiREL si estan disponibles, fallback a solo regex. Modo preview (no escribe). El panel C++ procesa el JSON y persiste lo seleccionado via entity_ops." 
+applies_to: [] +emits: [Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone, Person, Organization, Location] +relations: [] +uses_functions: + - extract_iocs_py_cybersecurity + - extract_graph_hybrid_py_pipelines +params: + - { name: text, type: string, default: "", description: "Texto a analizar (lo pasa el panel)" } + - { name: types, type: string, default: "", description: "CSV de tipos IoC; vacio = todos" } + - { name: max_entities, type: int, default: 200 } + - { name: use_hybrid, type: bool, default: "false", description: "Si true intenta cargar GLiNER/GLiREL" }
diff --git a/enrichers/paste_extract/run.py b/enrichers/paste_extract/run.py
new file mode 100644
index 0000000..4323df0
--- /dev/null
+++ b/enrichers/paste_extract/run.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+"""Enricher paste_extract: pure preview mode for the "Paste & Extract" panel.
+
+Unlike the other enrichers, this one does NOT write to operations.db. It
+receives the text via `params.text` and returns a JSON document with the
+proposed entities and relations. The application (C++ panel) processes the
+proposal, the user picks what to accept, and the app itself persists the
+selection with dedupe via entity_ops.
+
+Extraction cascade (graceful fallback):
+  1. `extract_iocs(text, types_list)`: pure regex layer.
+  2. If `use_hybrid=true` and the models load correctly, additionally
+     `extract_graph_hybrid` (GLiNER + GLiREL, no LLM callback). Import,
+     load and extraction failures are logged to stderr and the layer is
+     skipped, so the panel only shows what arrived.
+
+Standard wire protocol (issue 0026), but tolerant of a missing `node_id`
+and `ops_db_path`: this enricher is global, it does not belong to a node.
+The request on stdin is decoded as UTF-8 and the result is written to
+stdout as a single UTF-8 line, independent of the locale encoding.
+
+Output JSON:
+{
+  "entities": [
+    {"id": "tmp_0", "type_ref": "Email", "name": "x@y.z",
+     "metadata": {...}, "source": "regex|hybrid",
+     "start": 12, "end": 27, "confidence": 1.0}
+  ],
+  "relations": [
+    {"from_id": "tmp_0", "to_id": "tmp_1", "name": "RELATED_TO",
+     "source": "hybrid", "confidence": 0.7}
+  ],
+  "stats": {"layers": ["regex"], "n_entities": N, "n_relations": M}
+}
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+
+# Entity cap used when `params.max_entities` is missing or not a positive int.
+_DEFAULT_MAX_ENTITIES = 200
+
+# IoC type reported by extract_iocs -> (entity type_ref, metadata field name).
+_TYPE_MAP = {
+    "email": ("Email", "address"),
+    "ip_address": ("IPAddress", "address"),
+    "domain": ("Domain", "name"),
+    "file_hash": ("FileHash", "value"),
+    "crypto_wallet": ("CryptoWallet", "address"),
+    "cve_id": ("CVE", "id"),
+    "mac_address": ("MACAddress", "address"),
+    "phone_number": ("Phone", "number"),
+}
+
+
+def progress(p: float, stage: str = "") -> None:
+    """Report progress `p` and an optional stage label on stderr."""
+    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
+    sys.stderr.flush()
+
+
+def log(msg: str) -> None:
+    """Write a diagnostic line to stderr; stdout is reserved for the result."""
+    sys.stderr.write(f"{msg}\n")
+    sys.stderr.flush()
+
+
+def _read_request() -> bytes | str:
+    """Return the raw request from stdin, as bytes whenever possible.
+
+    `json.loads` decodes bytes as UTF-8 by itself, which avoids the
+    locale-dependent decoding of the text-mode `sys.stdin`.
+    """
+    raw = getattr(sys.stdin, "buffer", None)
+    return sys.stdin.read() if raw is None else raw.read()
+
+
+def _emit(payload: dict) -> None:
+    """Write `payload` to stdout as exactly one line of UTF-8 JSON."""
+    line = json.dumps(payload, ensure_ascii=False) + "\n"
+    raw = getattr(sys.stdout, "buffer", None)
+    if raw is None:
+        sys.stdout.write(line)
+        return
+    sys.stdout.flush()
+    raw.write(line.encode("utf-8", errors="replace"))
+    raw.flush()
+
+
+def _setup_registry_path(registry_root: str) -> None:
+    """Try to make the registry packages importable.
+
+    Lookup order:
+      1. The `_vendored` directory next to this script; when it exists it
+         wins and the registry root is not consulted.
+      2. `python/functions` under `registry_root`.
+    """
+    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
+    if os.path.isdir(vendored):
+        if vendored not in sys.path:
+            sys.path.insert(0, vendored)
+        return
+    if registry_root:
+        py_funcs = os.path.join(registry_root, "python", "functions")
+        if os.path.isdir(py_funcs) and py_funcs not in sys.path:
+            sys.path.insert(0, py_funcs)
+
+
+def _run_regex(text: str, types_list, max_entities: int) -> tuple[list, list]:
+    """Layer 1 (extract_iocs). Return `(entities, relations)`.
+
+    Returns two empty lists when `extract_iocs` cannot be imported. The
+    relations list is always empty: the regex layer only finds entities.
+    """
+    try:
+        from cybersecurity.extract_iocs import extract_iocs  # type: ignore
+    except Exception as e:
+        log(f"extract_iocs no importable: {e}")
+        return [], []
+
+    iocs = extract_iocs(text, types_list)
+    seen = set()
+    entities = []
+    for i, it in enumerate(iocs):
+        ioc_type = it.get("type")
+        value = it.get("value") or it.get("address") or it.get("name") or ""
+        if not ioc_type or not value:
+            continue
+        key = (ioc_type, value)
+        if key in seen:
+            continue
+        seen.add(key)
+        # Unknown IoC types pass through with a generic "value" field.
+        type_ref, value_field = _TYPE_MAP.get(ioc_type, (ioc_type, "value"))
+        entities.append({
+            "id": f"tmp_e_{i}",
+            "type_ref": type_ref,
+            "name": value,
+            "metadata": {value_field: value},
+            "source": "regex",
+            "start": int(it.get("start", -1)),
+            "end": int(it.get("end", -1)),
+            "confidence": 1.0,
+        })
+        if len(entities) >= max_entities:
+            break
+    return entities, []
+
+
+def _try_run_hybrid(text: str, registry_root: str,
+                    confidence_threshold: float = 0.6,
+                    ) -> tuple[list, list]:
+    """Optional layer 2 (extract_graph_hybrid). Return `(entities, relations)`.
+
+    Any import, model-load or extraction error is logged and turned into
+    two empty lists: GLiNER/GLiREL may not be installed, and the panel
+    flow must keep working when only the regex layer is available.
+    """
+    try:
+        # The pipeline lives under `pipelines/`. Only touch sys.path when a
+        # registry root was provided: joining an empty root would add a
+        # relative path that resolves against the current directory.
+        if registry_root:
+            pipelines_dir = os.path.join(registry_root, "python", "functions",
+                                         "pipelines")
+            if pipelines_dir not in sys.path:
+                sys.path.insert(0, pipelines_dir)
+        from extract_graph_hybrid import extract_graph_hybrid  # type: ignore
+        from datascience.gliner_load_model import gliner_load_model  # type: ignore
+        from datascience.glirel_load_model import glirel_load_model  # type: ignore
+    except Exception as e:
+        log(f"hybrid pipeline no disponible: {e}")
+        return [], []
+
+    # Generic schema for semantic entities; a caller-supplied schema is not
+    # supported yet.
+    entity_schema = [
+        {"type_ref": "Person", "label": "person"},
+        {"type_ref": "Organization", "label": "organization"},
+        {"type_ref": "Location", "label": "location"},
+    ]
+    relation_types = ["works_at", "located_in", "part_of", "related_to"]
+
+    try:
+        gliner_model = gliner_load_model()
+        glirel_model = glirel_load_model()
+    except Exception as e:
+        log(f"hybrid: load modelo fallo: {e}")
+        return [], []
+
+    try:
+        ents, rels = extract_graph_hybrid(
+            chunks=[text],
+            entity_schema=entity_schema,
+            relation_types=relation_types,
+            gliner_model=gliner_model,
+            glirel_model=glirel_model,
+            llm_chat_json=None,
+            confidence_threshold=confidence_threshold,
+        )
+    except Exception as e:
+        log(f"hybrid: extract fallo: {e}")
+        return [], []
+
+    out_entities = []
+    name_to_idx: dict[tuple[str, str], int] = {}
+    for ec in ents:
+        idx = len(out_entities)
+        attrs = getattr(ec, "attributes", None) or {}
+        out_entities.append({
+            "id": f"tmp_h_{idx}",
+            "type_ref": ec.type_ref,
+            "name": ec.name,
+            "metadata": dict(attrs),
+            "source": "hybrid",
+            "start": int(attrs.get("start", -1)),
+            "end": int(attrs.get("end", -1)),
+            "confidence": float(ec.confidence),
+        })
+        name_to_idx[(ec.type_ref, ec.name)] = idx
+
+    out_relations = []
+    for rc in rels:
+        # Map the relation endpoints to temporary ids; relations whose
+        # endpoints cannot be matched to an extracted entity are dropped.
+        from_key = (getattr(rc, "from_type_ref", None), getattr(rc, "from_name", None))
+        to_key = (getattr(rc, "to_type_ref", None), getattr(rc, "to_name", None))
+        if None in from_key or None in to_key:
+            continue
+        fi = name_to_idx.get(from_key)
+        ti = name_to_idx.get(to_key)
+        if fi is None or ti is None:
+            continue
+        out_relations.append({
+            "from_id": f"tmp_h_{fi}",
+            "to_id": f"tmp_h_{ti}",
+            "name": getattr(rc, "name", "RELATED_TO") or "RELATED_TO",
+            "source": "hybrid",
+            "confidence": float(getattr(rc, "confidence", 0.0)),
+        })
+    return out_entities, out_relations
+
+
+def main() -> int:
+    """Read the request from stdin, run the cascade, print the proposal.
+
+    Returns the process exit code: 0 on success, 2 when stdin is not a
+    JSON object or `params.text` is empty.
+    """
+    try:
+        ctx = json.loads(_read_request())
+    except Exception as e:
+        log(f"stdin not valid JSON: {e}")
+        return 2
+    if not isinstance(ctx, dict):
+        log("stdin not valid JSON: expected an object")
+        return 2
+
+    params = ctx.get("params") or {}
+    registry_root = ctx.get("registry_root") or ""
+
+    text = (params.get("text") or "").strip()
+    if not text:
+        log("missing params.text")
+        _emit({"error": "missing params.text",
+               "entities": [], "relations": [],
+               "stats": {"layers": [], "n_entities": 0,
+                         "n_relations": 0}})
+        return 2
+
+    types_csv = (params.get("types") or "").strip()
+    types_list = [t.strip() for t in types_csv.split(",") if t.strip()] \
+        if types_csv else None
+    # A null, malformed or non-positive cap falls back to the default
+    # instead of aborting the run with a traceback.
+    try:
+        max_entities = int(params.get("max_entities") or _DEFAULT_MAX_ENTITIES)
+    except (TypeError, ValueError):
+        max_entities = _DEFAULT_MAX_ENTITIES
+    if max_entities < 1:
+        max_entities = _DEFAULT_MAX_ENTITIES
+    use_hybrid_raw = params.get("use_hybrid", False)
+    if isinstance(use_hybrid_raw, str):
+        use_hybrid = use_hybrid_raw.strip().lower() in ("1", "true", "yes", "on")
+    else:
+        use_hybrid = bool(use_hybrid_raw)
+
+    progress(0.05, "init")
+    _setup_registry_path(registry_root)
+
+    progress(0.20, "regex")
+    regex_entities, _ = _run_regex(text, types_list, max_entities)
+    layers = ["regex"]
+
+    hybrid_entities: list = []
+    hybrid_relations: list = []
+    if use_hybrid:
+        progress(0.40, "hybrid")
+        hybrid_entities, hybrid_relations = _try_run_hybrid(text, registry_root)
+        if hybrid_entities or hybrid_relations:
+            layers.append("hybrid")
+
+    # Merge regex + hybrid, keeping the first entity per (type_ref, name).
+    # Every incoming id, including the ids of dropped duplicates, is mapped
+    # to the surviving entity so that relations pointing at a duplicate are
+    # re-targeted instead of silently lost.
+    progress(0.85, "merge")
+    kept_index: dict[tuple[str, str], int] = {}
+    old_id_to_index: dict[str, int] = {}
+    entities: list[dict] = []
+    for e in (*regex_entities, *hybrid_entities):
+        key = (e["type_ref"], e["name"])
+        idx = kept_index.get(key)
+        if idx is None:
+            if len(entities) >= max_entities:
+                continue  # over the cap: new entities are dropped
+            idx = len(entities)
+            kept_index[key] = idx
+            entities.append(e)
+        old_id_to_index[e["id"]] = idx
+
+    # Reassign temporary ids after the merge so they are a stable 0..N-1.
+    for i, e in enumerate(entities):
+        e["id"] = f"tmp_{i}"
+    id_remap = {old: f"tmp_{idx}" for old, idx in old_id_to_index.items()}
+
+    relations: list[dict] = []
+    for r in hybrid_relations:
+        fi = id_remap.get(r["from_id"])
+        ti = id_remap.get(r["to_id"])
+        if fi is None or ti is None:
+            continue  # an endpoint fell outside the entity cap
+        relations.append({
+            "from_id": fi,
+            "to_id": ti,
+            "name": r.get("name") or "RELATED_TO",
+            "source": r.get("source") or "hybrid",
+            "confidence": float(r.get("confidence", 0.0)),
+        })
+
+    progress(1.0, "done")
+    _emit({
+        "entities": entities,
+        "relations": relations,
+        "stats": {
+            "layers": layers,
+            "n_entities": len(entities),
+            "n_relations": len(relations),
+        },
+    })
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/extract_panel.cpp b/extract_panel.cpp new file mode 100644 index 0000000..96b3038 --- /dev/null +++ b/extract_panel.cpp @@ -0,0 +1,1079 @@ +#include "extract_panel.h" +#include "views.h" +#include "entity_ops.h" + +#include "imgui.h" +#include "core/icons_tabler.h" +#include "core/tokens.h" + +#include "../../../../cpp/vendor/sqlite3/sqlite3.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include + #include + #include + #include + #include + #include +#endif + +namespace ge { + +// --------------------------------------------------------------------------- +// Estado del modulo +// 
--------------------------------------------------------------------------- + +namespace { + +struct ModuleState { + std::string enrichers_dir; + std::string app_dir; + std::string registry_root; +}; + +ModuleState g_mod; +ExtractPanelState* g_panel = nullptr; + +ExtractPanelState& panel_state() { + if (!g_panel) { + g_panel = new ExtractPanelState(); + g_panel->text_buf.assign(8192, 0); + g_panel->text_initialized = true; + } + return *g_panel; +} + +bool file_exists(const std::string& p) { + struct stat st{}; + return !p.empty() && stat(p.c_str(), &st) == 0 && !S_ISDIR(st.st_mode); +} + +long long now_ms_local() { + using namespace std::chrono; + return duration_cast(system_clock::now() + .time_since_epoch()).count(); +} + +std::string now_iso_local() { + auto t = std::time(nullptr); + std::tm tm_utc{}; +#ifdef _WIN32 + gmtime_s(&tm_utc, &t); +#else + gmtime_r(&t, &tm_utc); +#endif + char buf[32]; + std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &tm_utc); + return buf; +} + +// --------------------------------------------------------------------------- +// JSON parser minimo +// --------------------------------------------------------------------------- + +const char* skip_ws(const char* p, const char* end) { + while (p < end) { + char c = *p; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { ++p; continue; } + break; + } + return p; +} + +bool parse_string(const char*& p, const char* end, std::string* out) { + if (p >= end || *p != '"') return false; + ++p; + out->clear(); + while (p < end) { + char c = *p++; + if (c == '"') return true; + if (c == '\\') { + if (p >= end) return false; + char esc = *p++; + switch (esc) { + case '"': out->push_back('"'); break; + case '\\': out->push_back('\\'); break; + case '/': out->push_back('/'); break; + case 'b': out->push_back('\b'); break; + case 'f': out->push_back('\f'); break; + case 'n': out->push_back('\n'); break; + case 'r': out->push_back('\r'); break; + case 't': out->push_back('\t'); break; + case 
'u': { + if (p + 4 > end) return false; + unsigned cp = 0; + for (int i = 0; i < 4; ++i) { + char h = p[i]; cp <<= 4; + if (h >= '0' && h <= '9') cp |= (h - '0'); + else if (h >= 'a' && h <= 'f') cp |= (h - 'a' + 10); + else if (h >= 'A' && h <= 'F') cp |= (h - 'A' + 10); + else return false; + } + p += 4; + if (cp < 0x80) out->push_back((char)cp); + else if (cp < 0x800) { + out->push_back((char)(0xC0 | (cp >> 6))); + out->push_back((char)(0x80 | (cp & 0x3F))); + } else { + out->push_back((char)(0xE0 | (cp >> 12))); + out->push_back((char)(0x80 | ((cp >> 6) & 0x3F))); + out->push_back((char)(0x80 | (cp & 0x3F))); + } + break; + } + default: out->push_back(esc); break; + } + } else { + out->push_back(c); + } + } + return false; +} + +bool skip_value(const char*& p, const char* end, std::string* lit_out = nullptr) { + p = skip_ws(p, end); + if (p >= end) return false; + const char* start = p; + if (*p == '"') { + std::string tmp; + bool ok = parse_string(p, end, &tmp); + if (ok && lit_out) { lit_out->assign("\""); *lit_out += tmp; *lit_out += "\""; } + return ok; + } + if (*p == '{' || *p == '[') { + char open = *p, close = (open == '{' ? 
'}' : ']'); + int depth = 0; + bool in_str = false; + const char* obj_start = p; + while (p < end) { + char c = *p++; + if (in_str) { + if (c == '\\' && p < end) { ++p; continue; } + if (c == '"') in_str = false; + continue; + } + if (c == '"') { in_str = true; continue; } + if (c == open) ++depth; + else if (c == close) { --depth; if (depth == 0) break; } + } + if (depth != 0) return false; + if (lit_out) lit_out->assign(obj_start, p); + return true; + } + while (p < end) { + char c = *p; + if (c == ',' || c == '}' || c == ']' || + c == ' ' || c == '\t' || c == '\n' || c == '\r') break; + ++p; + } + if (lit_out) lit_out->assign(start, p); + return true; +} + +bool expect_char(const char*& p, const char* end, char c) { + p = skip_ws(p, end); + if (p >= end || *p != c) return false; + ++p; + return true; +} + +bool parse_entities_array(const char*& p, const char* end, + std::vector* out) +{ + if (!expect_char(p, end, '[')) return false; + p = skip_ws(p, end); + if (p < end && *p == ']') { ++p; return true; } + while (p < end) { + if (!expect_char(p, end, '{')) return false; + ProposedEntity e; + e.confidence = 1.0; + while (p < end) { + p = skip_ws(p, end); + if (p < end && *p == '}') { ++p; break; } + std::string key; + if (!parse_string(p, end, &key)) return false; + if (!expect_char(p, end, ':')) return false; + p = skip_ws(p, end); + if (key == "metadata") { + std::string lit; + if (!skip_value(p, end, &lit)) return false; + e.metadata_json = std::move(lit); + } else if (p < end && *p == '"') { + std::string val; + if (!parse_string(p, end, &val)) return false; + if (key == "id") e.tmp_id = std::move(val); + else if (key == "type_ref") e.type_ref = std::move(val); + else if (key == "name") e.name = std::move(val); + else if (key == "source") e.source = std::move(val); + } else { + std::string lit; + if (!skip_value(p, end, &lit)) return false; + if (key == "start") e.start_offset = std::atoi(lit.c_str()); + else if (key == "end") e.end_offset = 
std::atoi(lit.c_str()); + else if (key == "confidence") e.confidence = std::atof(lit.c_str()); + } + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + } + std::snprintf(e.type_buf, sizeof(e.type_buf), "%s", e.type_ref.c_str()); + std::snprintf(e.name_buf, sizeof(e.name_buf), "%s", e.name.c_str()); + e.selected = true; + out->push_back(std::move(e)); + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + if (p < end && *p == ']') { ++p; return true; } + } + return false; +} + +bool parse_relations_array(const char*& p, const char* end, + std::vector* out) +{ + if (!expect_char(p, end, '[')) return false; + p = skip_ws(p, end); + if (p < end && *p == ']') { ++p; return true; } + while (p < end) { + if (!expect_char(p, end, '{')) return false; + ProposedRelation r; + while (p < end) { + p = skip_ws(p, end); + if (p < end && *p == '}') { ++p; break; } + std::string key; + if (!parse_string(p, end, &key)) return false; + if (!expect_char(p, end, ':')) return false; + p = skip_ws(p, end); + if (p < end && *p == '"') { + std::string val; + if (!parse_string(p, end, &val)) return false; + if (key == "from_id") r.from_tmp_id = std::move(val); + else if (key == "to_id") r.to_tmp_id = std::move(val); + else if (key == "name") r.name = std::move(val); + else if (key == "source") r.source = std::move(val); + } else { + std::string lit; + if (!skip_value(p, end, &lit)) return false; + if (key == "confidence") r.confidence = std::atof(lit.c_str()); + } + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + } + r.selected = true; + out->push_back(std::move(r)); + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + if (p < end && *p == ']') { ++p; return true; } + } + return false; +} + +bool parse_layers_array(const char*& p, const char* end, + std::vector* out) +{ + if (!expect_char(p, end, '[')) return false; + p = skip_ws(p, end); + if (p < end && *p == ']') { ++p; return true; } + while (p < end) { + 
std::string s; + if (!parse_string(p, end, &s)) return false; + out->push_back(std::move(s)); + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + if (p < end && *p == ']') { ++p; return true; } + } + return false; +} + +bool parse_object_paste_extract(const char*& p, const char* end, + ExtractResult* res) +{ + if (!expect_char(p, end, '{')) return false; + while (p < end) { + p = skip_ws(p, end); + if (p < end && *p == '}') { ++p; return true; } + std::string key; + if (!parse_string(p, end, &key)) return false; + if (!expect_char(p, end, ':')) return false; + p = skip_ws(p, end); + if (key == "entities") { + if (!parse_entities_array(p, end, &res->entities)) return false; + } else if (key == "relations") { + if (!parse_relations_array(p, end, &res->relations)) return false; + } else if (key == "stats") { + if (!expect_char(p, end, '{')) return false; + while (p < end) { + p = skip_ws(p, end); + if (p < end && *p == '}') { ++p; break; } + std::string sk; + if (!parse_string(p, end, &sk)) return false; + if (!expect_char(p, end, ':')) return false; + p = skip_ws(p, end); + if (sk == "layers") { + if (!parse_layers_array(p, end, &res->layers)) return false; + } else { + if (!skip_value(p, end, nullptr)) return false; + } + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + } + } else if (key == "error") { + std::string s; + if (p < end && *p == '"') { + if (!parse_string(p, end, &s)) return false; + res->error = std::move(s); + } else { + if (!skip_value(p, end, nullptr)) return false; + } + } else { + if (!skip_value(p, end, nullptr)) return false; + } + p = skip_ws(p, end); + if (p < end && *p == ',') { ++p; continue; } + } + return false; +} + +// --------------------------------------------------------------------------- +// Subprocess +// --------------------------------------------------------------------------- + +std::string python_path() { + if (const char* env = std::getenv("FN_PYTHON"); env && *env && 
file_exists(env)) { + return env; + } + if (!g_mod.registry_root.empty()) { + std::string p = g_mod.registry_root + "/python/.venv/bin/python3"; +#ifdef _WIN32 + return p; +#else + if (file_exists(p)) return p; +#endif + } +#ifdef _WIN32 + return "python.exe"; +#else + return "python3"; +#endif +} + +std::string build_stdin_payload(const std::string& text, bool use_hybrid) { + auto esc = [](const std::string& s) { + std::string o; o.reserve(s.size() + 8); + for (char c : s) { + switch (c) { + case '"': o += "\\\""; break; + case '\\': o += "\\\\"; break; + case '\n': o += "\\n"; break; + case '\r': o += "\\r"; break; + case '\t': o += "\\t"; break; + default: + if ((unsigned char)c < 0x20) { + char b[8]; + std::snprintf(b, sizeof(b), "\\u%04x", (unsigned char)c); + o += b; + } else { o += c; } + } + } + return o; + }; + std::ostringstream o; + o << "{" + << "\"node_id\":\"\"," + << "\"node_name\":\"\"," + << "\"node_type\":\"\"," + << "\"metadata\":{}," + << "\"ops_db_path\":\"\"," + << "\"app_dir\":\"" << esc(g_mod.app_dir) << "\"," + << "\"cache_dir\":\"\"," + << "\"registry_root\":\"" << esc(g_mod.registry_root) << "\"," + << "\"params\":{" + << "\"text\":\"" << esc(text) << "\"," + << "\"use_hybrid\":" << (use_hybrid ? 
"true" : "false") + << "}" + << "}"; + return o.str(); +} + +#ifdef _WIN32 + +bool spawn_python_blocking(const std::string& script_path, + const std::string& stdin_payload, + std::string* stdout_buf, + std::string* stderr_tail, + int* exit_code) +{ + SECURITY_ATTRIBUTES sa{}; sa.nLength = sizeof(sa); sa.bInheritHandle = TRUE; + HANDLE in_r=nullptr, in_w=nullptr, out_r=nullptr, out_w=nullptr, + err_r=nullptr, err_w=nullptr; + if (!CreatePipe(&in_r,&in_w,&sa,0) || + !CreatePipe(&out_r,&out_w,&sa,0) || + !CreatePipe(&err_r,&err_w,&sa,0)) return false; + SetHandleInformation(in_w, HANDLE_FLAG_INHERIT, 0); + SetHandleInformation(out_r, HANDLE_FLAG_INHERIT, 0); + SetHandleInformation(err_r, HANDLE_FLAG_INHERIT, 0); + + std::string py = python_path(); + std::string cmd = "\"" + py + "\" \"" + script_path + "\""; + int n = MultiByteToWideChar(CP_UTF8, 0, cmd.c_str(), (int)cmd.size(), + nullptr, 0); + std::wstring w(n, 0); + MultiByteToWideChar(CP_UTF8, 0, cmd.c_str(), (int)cmd.size(), w.data(), n); + std::vector wbuf(w.begin(), w.end()); wbuf.push_back(0); + + STARTUPINFOW si{}; si.cb = sizeof(si); si.dwFlags = STARTF_USESTDHANDLES; + si.hStdInput=in_r; si.hStdOutput=out_w; si.hStdError=err_w; + PROCESS_INFORMATION pi{}; + BOOL ok = CreateProcessW(nullptr, wbuf.data(), nullptr, nullptr, TRUE, + CREATE_NO_WINDOW, nullptr, nullptr, &si, &pi); + CloseHandle(in_r); CloseHandle(out_w); CloseHandle(err_w); + if (!ok) { + if (stderr_tail) *stderr_tail = "CreateProcessW failed"; + CloseHandle(in_w); CloseHandle(out_r); CloseHandle(err_r); + return false; + } + DWORD wn = 0; + WriteFile(in_w, stdin_payload.data(), (DWORD)stdin_payload.size(), + &wn, nullptr); + CloseHandle(in_w); + + std::thread err_t([&]() { + char b[1024]; + while (true) { + DWORD m = 0; + if (!ReadFile(err_r, b, sizeof(b), &m, nullptr) || m == 0) break; + if (stderr_tail) { + stderr_tail->append(b, (size_t)m); + if (stderr_tail->size() > 4096) + stderr_tail->erase(0, stderr_tail->size() - 4096); + } + } + }); + 
{ + char b[4096]; + while (true) { + DWORD m = 0; + if (!ReadFile(out_r, b, sizeof(b), &m, nullptr) || m == 0) break; + if (stdout_buf) stdout_buf->append(b, (size_t)m); + if (stdout_buf && stdout_buf->size() > 1024 * 1024) break; + } + } + CloseHandle(out_r); + WaitForSingleObject(pi.hProcess, INFINITE); + DWORD ec = 0; GetExitCodeProcess(pi.hProcess, &ec); + if (exit_code) *exit_code = (int)ec; + err_t.join(); + CloseHandle(err_r); CloseHandle(pi.hProcess); CloseHandle(pi.hThread); + return true; +} + +#else + +bool spawn_python_blocking(const std::string& script_path, + const std::string& stdin_payload, + std::string* stdout_buf, + std::string* stderr_tail, + int* exit_code) +{ + int p_in[2]={-1,-1}, p_out[2]={-1,-1}, p_err[2]={-1,-1}; + if (pipe(p_in) != 0 || pipe(p_out) != 0 || pipe(p_err) != 0) return false; + pid_t pid = fork(); + if (pid < 0) return false; + if (pid == 0) { + dup2(p_in[0], 0); dup2(p_out[1], 1); dup2(p_err[1], 2); + close(p_in[0]); close(p_in[1]); + close(p_out[0]); close(p_out[1]); + close(p_err[0]); close(p_err[1]); + std::string py = python_path(); + const char* argv[] = { py.c_str(), script_path.c_str(), nullptr }; + execv(py.c_str(), (char* const*)argv); + execvp(py.c_str(), (char* const*)argv); + _exit(127); + } + close(p_in[0]); close(p_out[1]); close(p_err[1]); + if (!stdin_payload.empty()) { + size_t left = stdin_payload.size(); + const char* p = stdin_payload.data(); + while (left > 0) { + ssize_t n = write(p_in[1], p, left); + if (n < 0) { if (errno == EINTR) continue; break; } + p += n; left -= (size_t)n; + } + } + close(p_in[1]); + std::thread err_t([&]() { + char b[1024]; + while (true) { + ssize_t n = read(p_err[0], b, sizeof(b)); + if (n <= 0) break; + if (stderr_tail) { + stderr_tail->append(b, (size_t)n); + if (stderr_tail->size() > 4096) + stderr_tail->erase(0, stderr_tail->size() - 4096); + } + } + }); + { + char b[4096]; + while (true) { + ssize_t n = read(p_out[0], b, sizeof(b)); + if (n <= 0) break; + if (stdout_buf) 
stdout_buf->append(b, (size_t)n); + if (stdout_buf && stdout_buf->size() > 1024 * 1024) break; + } + } + close(p_out[0]); + int status = 0; + waitpid(pid, &status, 0); + err_t.join(); + close(p_err[0]); + if (exit_code) { + *exit_code = WIFEXITED(status) ? WEXITSTATUS(status) : -1; + } + return true; +} + +#endif + +// --------------------------------------------------------------------------- +// Apply (lado SQLite) +// --------------------------------------------------------------------------- + +bool find_existing_entity(sqlite3* db, const std::string& type_ref, + const std::string& name, std::string* out_id) +{ + sqlite3_stmt* st = nullptr; + const char* sql = + "SELECT id FROM entities WHERE type_ref = ? AND name = ? LIMIT 1"; + if (sqlite3_prepare_v2(db, sql, -1, &st, nullptr) != SQLITE_OK) return false; + sqlite3_bind_text(st, 1, type_ref.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 2, name.c_str(), -1, SQLITE_TRANSIENT); + bool found = false; + if (sqlite3_step(st) == SQLITE_ROW) { + const unsigned char* t = sqlite3_column_text(st, 0); + if (t) { *out_id = (const char*)t; found = true; } + } + sqlite3_finalize(st); + return found; +} + +bool insert_entity_with_metadata(sqlite3* db, + const std::string& id, + const std::string& name, + const std::string& type_ref, + const std::string& metadata_json, + const std::string& ts) +{ + sqlite3_stmt* st = nullptr; + const char* sql = + "INSERT INTO entities (id, name, type_ref, source, metadata, " + " created_at, updated_at) " + "VALUES (?, ?, ?, 'panel:paste_extract', ?, ?, ?)"; + if (sqlite3_prepare_v2(db, sql, -1, &st, nullptr) != SQLITE_OK) return false; + sqlite3_bind_text(st, 1, id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 2, name.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 3, type_ref.c_str(), -1, SQLITE_TRANSIENT); + const std::string md = metadata_json.empty() ? 
std::string("{}") + : metadata_json; + sqlite3_bind_text(st, 4, md.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 5, ts.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 6, ts.c_str(), -1, SQLITE_TRANSIENT); + bool ok = sqlite3_step(st) == SQLITE_DONE; + sqlite3_finalize(st); + return ok; +} + +bool relation_exists(sqlite3* db, const std::string& from_id, + const std::string& to_id, const std::string& name) +{ + sqlite3_stmt* st = nullptr; + const char* sql = + "SELECT 1 FROM relations WHERE from_entity = ? AND to_entity = ? " + "AND name = ? LIMIT 1"; + if (sqlite3_prepare_v2(db, sql, -1, &st, nullptr) != SQLITE_OK) return false; + sqlite3_bind_text(st, 1, from_id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 2, to_id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 3, name.c_str(), -1, SQLITE_TRANSIENT); + bool found = (sqlite3_step(st) == SQLITE_ROW); + sqlite3_finalize(st); + return found; +} + +bool insert_relation_simple(sqlite3* db, const std::string& id, + const std::string& name, + const std::string& from_id, + const std::string& to_id, + const std::string& ts) +{ + sqlite3_stmt* st = nullptr; + const char* sql = + "INSERT INTO relations (id, name, from_entity, to_entity, " + " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)"; + if (sqlite3_prepare_v2(db, sql, -1, &st, nullptr) != SQLITE_OK) return false; + sqlite3_bind_text(st, 1, id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 2, name.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 3, from_id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 4, to_id.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 5, ts.c_str(), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(st, 6, ts.c_str(), -1, SQLITE_TRANSIENT); + bool ok = sqlite3_step(st) == SQLITE_DONE; + sqlite3_finalize(st); + return ok; +} + +} // namespace + +// --------------------------------------------------------------------------- +// API publica +// 
---------------------------------------------------------------------------
+
+// Copies the three paths into module state (g_mod); null pointers become
+// empty strings.
+void extract_panel_init(const char* enrichers_dir,
+                        const char* app_dir,
+                        const char* registry_root)
+{
+    g_mod.enrichers_dir = enrichers_dir ? enrichers_dir : "";
+    g_mod.app_dir       = app_dir ? app_dir : "";
+    g_mod.registry_root = registry_root ? registry_root : "";
+}
+
+// Destroys the panel state, if it was ever created. Joining blocks until an
+// in-flight extraction finishes; there is no cancellation path.
+void extract_panel_shutdown() {
+    if (!g_panel) return;
+    if (g_panel->worker.joinable()) g_panel->worker.join();
+    delete g_panel;
+    g_panel = nullptr;
+}
+
+// Parses the enricher output into *res (entities, relations, layers and error
+// are reset first). Only the last line that contains non-whitespace is
+// parsed: trailing whitespace is trimmed and everything up to the preceding
+// '\n' is ignored. Returns true only when parsing succeeds and res->error is
+// still empty afterwards.
+bool extract_panel_parse_result(const std::string& json_text,
+                                ExtractResult* res)
+{
+    if (!res) return false;
+    res->entities.clear();
+    res->relations.clear();
+    res->layers.clear();
+    res->error.clear();
+    if (json_text.empty()) {
+        res->error = "empty result";
+        return false;
+    }
+    // Trim trailing whitespace, then locate the start of the final line.
+    size_t end = json_text.size();
+    while (end > 0 && (json_text[end-1] == '\n' || json_text[end-1] == '\r' ||
+                       json_text[end-1] == ' '  || json_text[end-1] == '\t'))
+        --end;
+    size_t start = end > 0 ? json_text.rfind('\n', end - 1) : std::string::npos;
+    start = (start == std::string::npos) ? 0 : start + 1;
+    const char* p = json_text.data() + start;
+    const char* e = json_text.data() + end;
+    p = skip_ws(p, e);
+    if (!parse_object_paste_extract(p, e, res)) {
+        res->error = "json parse error";
+        return false;
+    }
+    return res->error.empty();
+}
+
+// Runs <enrichers_dir>/paste_extract/run.py synchronously and parses its
+// stdout into *out. Blocks the calling thread. On failure out->error holds
+// "paste_extract/run.py not found", "spawn failed", "exit <rc>" or a parse
+// error; out->stderr_tail is filled whenever the process was spawned.
+bool extract_panel_run_subprocess(const std::string& text,
+                                  bool use_hybrid,
+                                  ExtractResult* out)
+{
+    if (!out) return false;
+    out->entities.clear();
+    out->relations.clear();
+    out->layers.clear();
+    out->error.clear();
+    out->stderr_tail.clear();
+
+    std::string script;
+    if (!g_mod.enrichers_dir.empty()) {
+        script = g_mod.enrichers_dir + "/paste_extract/run.py";
+    }
+    if (script.empty() || !file_exists(script)) {
+        out->error = "paste_extract/run.py not found";
+        return false;
+    }
+    std::string stdin_payload = build_stdin_payload(text, use_hybrid);
+    std::string stdout_buf;
+    std::string stderr_tail;
+    int rc = -1;
+    if (!spawn_python_blocking(script, stdin_payload, &stdout_buf,
+                               &stderr_tail, &rc)) {
+        out->error = "spawn failed";
+        out->stderr_tail = std::move(stderr_tail);
+        return false;
+    }
+    out->stderr_tail = std::move(stderr_tail);
+    if (rc != 0) {
+        char b[64];
+        std::snprintf(b, sizeof(b), "exit %d", rc);
+        out->error = b;
+        return false;
+    }
+    return extract_panel_parse_result(stdout_buf, out);
+}
+
+// Persists the selected entities and relations of `result` into the SQLite
+// database at ops_db_path, inside a single transaction.
+//
+// Entities: the edited buffers (type_buf / name_buf) win over the parsed
+// values when non-empty; entries whose type or name end up empty are skipped.
+// An existing (type_ref, name) row is reused and counted as dedup, otherwise
+// a new row is inserted. Relations: inserted only when both endpoints resolve
+// through the tmp_id map and the same (from, to, name) does not exist yet;
+// every other selected relation is counted as skipped.
+//
+// Returns false, with all counters left at 0 and the transaction rolled
+// back, if the database cannot be opened or any statement, including BEGIN
+// and COMMIT, fails. The out_* pointers may be null.
+bool extract_panel_apply(const char* ops_db_path,
+                         const ExtractResult& result,
+                         int* out_added_entities,
+                         int* out_dedup_entities,
+                         int* out_added_relations,
+                         int* out_skipped_relations)
+{
+    if (out_added_entities)    *out_added_entities    = 0;
+    if (out_dedup_entities)    *out_dedup_entities    = 0;
+    if (out_added_relations)   *out_added_relations   = 0;
+    if (out_skipped_relations) *out_skipped_relations = 0;
+    if (!ops_db_path || !*ops_db_path) return false;
+
+    sqlite3* db = nullptr;
+    if (sqlite3_open_v2(ops_db_path, &db, SQLITE_OPEN_READWRITE, nullptr)
+            != SQLITE_OK) {
+        if (db) sqlite3_close(db);
+        return false;
+    }
+    // A failed BEGIN would leave the inserts below running in autocommit
+    // mode, where the rollback paths could no longer undo them.
+    if (sqlite3_exec(db, "BEGIN", nullptr, nullptr, nullptr) != SQLITE_OK) {
+        sqlite3_close(db);
+        return false;
+    }
+
+    // Shared failure path: undo the open transaction and release the handle.
+    auto abort_tx = [db]() {
+        sqlite3_exec(db, "ROLLBACK", nullptr, nullptr, nullptr);
+        sqlite3_close(db);
+        return false;
+    };
+
+    std::string ts = now_iso_local();
+    // tmp_id -> id of the row that now represents that entity.
+    std::unordered_map<std::string, std::string> map_id;
+    map_id.reserve(result.entities.size());
+
+    int idx = -1;  // position within result.entities, part of new ids
+    int added_e = 0, dedup_e = 0;
+    for (const auto& e : result.entities) {
+        ++idx;
+        if (!e.selected) continue;
+        std::string type_ref = e.type_buf[0] ? std::string(e.type_buf) : e.type_ref;
+        std::string name     = e.name_buf[0] ? std::string(e.name_buf) : e.name;
+        if (type_ref.empty() || name.empty()) continue;
+        std::string sql_id;
+        if (find_existing_entity(db, type_ref, name, &sql_id)) {
+            map_id[e.tmp_id] = sql_id;
+            ++dedup_e;
+            continue;
+        }
+        // Built with std::string so that a long type_ref cannot truncate the
+        // timestamp/index suffix that keeps the id unique.
+        std::string new_id = type_ref + "_" +
+                             std::to_string(now_ms_local()) + "_" +
+                             std::to_string(idx);
+        if (!insert_entity_with_metadata(db, new_id, name, type_ref,
+                                         e.metadata_json, ts)) {
+            return abort_tx();
+        }
+        map_id[e.tmp_id] = new_id;
+        ++added_e;
+    }
+
+    int added_r = 0, skipped_r = 0;
+    int ridx = 0;  // 1-based position within result.relations
+    for (const auto& r : result.relations) {
+        ++ridx;
+        if (!r.selected) continue;
+        auto it_from = map_id.find(r.from_tmp_id);
+        auto it_to   = map_id.find(r.to_tmp_id);
+        if (it_from == map_id.end() || it_to == map_id.end()) {
+            ++skipped_r; continue;
+        }
+        std::string name = r.name.empty() ? std::string("RELATED_TO") : r.name;
+        if (relation_exists(db, it_from->second, it_to->second, name)) {
+            ++skipped_r; continue;
+        }
+        char id_buf[96];
+        std::snprintf(id_buf, sizeof(id_buf), "rel_%lld_%d",
+                      now_ms_local(), ridx);
+        if (!insert_relation_simple(db, id_buf, name,
+                                    it_from->second, it_to->second, ts)) {
+            return abort_tx();
+        }
+        ++added_r;
+    }
+
+    // Only report success once the data is durably committed.
+    if (sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr) != SQLITE_OK) {
+        return abort_tx();
+    }
+    sqlite3_close(db);
+
+    if (out_added_entities)    *out_added_entities    = added_e;
+    if (out_dedup_entities)    *out_dedup_entities    = dedup_e;
+    if (out_added_relations)   *out_added_relations   = added_r;
+    if (out_skipped_relations) *out_skipped_relations = skipped_r;
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Render (ImGui)
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Replaces the status line while holding result_mu, the mutex the worker
+// holds when it publishes its own status.
+void set_status(ExtractPanelState& s, const char* text) {
+    std::lock_guard<std::mutex> lk(s.result_mu);
+    s.status = text;
+}
+
+// Returns a copy of the status line taken under result_mu.
+std::string status_snapshot(ExtractPanelState& s) {
+    std::lock_guard<std::mutex> lk(s.result_mu);
+    return s.status;
+}
+
+// Starts one extraction on a worker thread; does nothing while one is
+// already running. The worker publishes the result and the status line under
+// result_mu, then clears `busy`.
+void launch_extract_async(ExtractPanelState& s, bool use_hybrid) {
+    if (s.busy.load()) return;
+    if (s.worker.joinable()) s.worker.join();  // reap a finished previous run
+
+    // Snapshot the text: the input buffer can change while the worker runs.
+    std::string text(s.text_buf.data());
+    s.busy.store(true);
+    set_status(s, "Extracting...");
+    s.new_result.store(false);
+
+    s.worker = std::thread([&s, text, use_hybrid]() {
+        auto r = std::make_shared<ExtractResult>();
+        bool ok = extract_panel_run_subprocess(text, use_hybrid, r.get());
+        {
+            std::lock_guard<std::mutex> lk(s.result_mu);
+            s.result = r;
+            char buf[128];
+            if (ok) {
+                std::snprintf(buf, sizeof(buf),
+                              "OK — %zu entities, %zu relations",
+                              r->entities.size(), r->relations.size());
+            } else {
+                std::snprintf(buf, sizeof(buf), "ERROR: %s",
+                              r->error.c_str());
+            }
+            s.status = buf;
+        }
+        s.new_result.store(true);
+        s.busy.store(false);
+    });
+}
+
+} // namespace
+
+// Draws the "Extract" window: hybrid toggle, Extract button, status line,
+// the paste area and, once a result exists, the editable entity and relation
+// tables with "Apply Selected". Returns immediately when app.panel_extract
+// is false. A successful apply sets app.want_reload.
+void extract_panel_render(AppState& app) {
+    if (!app.panel_extract) return;
+
+    ExtractPanelState& s = panel_state();
+
+    if (!ImGui::Begin("Extract", &app.panel_extract)) {
+        ImGui::End();
+        return;
+    }
+
+    // Top bar: hybrid toggle + Extract button + status.
+    ImGui::Checkbox("Use hybrid (GLiNER/GLiREL)", &s.use_hybrid);
+    ImGui::SameLine();
+    bool busy = s.busy.load();
+    ImGui::BeginDisabled(busy || s.text_buf.size() < 2 || s.text_buf[0] == 0);
+    if (ImGui::Button(busy ? "Extracting..." : (TI_BOLT " Extract"))) {
+        launch_extract_async(s, s.use_hybrid);
+    }
+    ImGui::EndDisabled();
+    ImGui::SameLine();
+    // The worker may rewrite the status at any time, so draw from a copy.
+    const std::string status_text = status_snapshot(s);
+    ImGui::TextDisabled("%s", status_text.c_str());
+
+    // Multi-line text input. The buffer grows when the user pastes a long
+    // text (ImGuiInputTextFlags_CallbackResize).
+    auto resize_cb = [](ImGuiInputTextCallbackData* data) -> int {
+        if (data->EventFlag == ImGuiInputTextFlags_CallbackResize) {
+            auto* buf = (std::vector<char>*)data->UserData;
+            buf->resize(data->BufTextLen + 1);
+            data->Buf = buf->data();
+        }
+        return 0;
+    };
+
+    ImVec2 input_size(-1.0f, ImGui::GetContentRegionAvail().y * 0.45f);
+    ImGui::InputTextMultiline("##paste_text",
+        s.text_buf.data(), s.text_buf.size(),
+        input_size,
+        ImGuiInputTextFlags_CallbackResize, resize_cb, &s.text_buf);
+
+    ImGui::Separator();
+
+    // Proposal tables. Work on a private reference so the worker can publish
+    // a newer result without invalidating what is being drawn.
+    std::shared_ptr<ExtractResult> res;
+    {
+        std::lock_guard<std::mutex> lk(s.result_mu);
+        res = s.result;
+    }
+    if (!res) {
+        ImGui::TextDisabled("Pega texto y pulsa Extract para ver propuestas.");
+        ImGui::End();
+        return;
+    }
+    if (!res->error.empty()) {
+        ImGui::TextColored(ImVec4(0.95f,0.45f,0.45f,1.0f),
+                           "Error: %s", res->error.c_str());
+        if (!res->stderr_tail.empty()) {
+            ImGui::TextWrapped("%s", res->stderr_tail.c_str());
+        }
+        ImGui::End();
+        return;
+    }
+
+    if (!res->layers.empty()) {
+        std::string layers = "Layers: ";
+        for (size_t i = 0; i < res->layers.size(); ++i) {
+            if (i) layers += ", ";
+            layers += res->layers[i];
+        }
+        ImGui::TextDisabled("%s", layers.c_str());
+    }
+
+    // Toolbar: select all / none.
+    if (ImGui::SmallButton("All")) {
+        for (auto& e : res->entities)  e.selected = true;
+        for (auto& r : res->relations) r.selected = true;
+    }
+    ImGui::SameLine();
+    if (ImGui::SmallButton("None")) {
+        for (auto& e : res->entities)  e.selected = false;
+        for (auto& r : res->relations) r.selected = false;
+    }
+    ImGui::SameLine();
+    int sel_e = 0, sel_r = 0;
+    for (const auto& e : res->entities)  if (e.selected) ++sel_e;
+    for (const auto& r : res->relations) if (r.selected) ++sel_r;
+    ImGui::TextDisabled("Selected: %d entities, %d relations", sel_e, sel_r);
+
+    // Apply Selected.
+    ImGui::SameLine();
+    bool can_apply = (sel_e + sel_r) > 0 && !app.input_db_path.empty();
+    ImGui::BeginDisabled(!can_apply);
+    if (ImGui::Button(TI_CHECK " Apply Selected")) {
+        int ae=0, de=0, ar=0, sr=0;
+        bool ok = extract_panel_apply(app.input_db_path.c_str(),
+                                      *res, &ae, &de, &ar, &sr);
+        if (ok) {
+            s.last_apply_entities  = ae;
+            s.last_apply_relations = ar;
+            s.last_apply_dedup     = de;
+            char buf[160];
+            std::snprintf(buf, sizeof(buf),
+                "Applied: +%d entities, +%d relations (%d deduped, %d skipped)",
+                ae, ar, de, sr);
+            // An extraction may be running concurrently, hence the lock.
+            set_status(s, buf);
+            app.want_reload = true;
+        } else {
+            set_status(s, "Apply failed");
+        }
+    }
+    ImGui::EndDisabled();
+    if (app.input_db_path.empty()) {
+        ImGui::SameLine();
+        ImGui::TextColored(ImVec4(1.0f,0.7f,0.3f,1.0f),
+                           "(no operations.db loaded)");
+    }
+
+    // Entity table.
+    if (!res->entities.empty() &&
+        ImGui::CollapsingHeader("Entities", ImGuiTreeNodeFlags_DefaultOpen)) {
+        if (ImGui::BeginTable("##ents", 5,
+                ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg |
+                ImGuiTableFlags_ScrollY,
+                ImVec2(0.0f, 200.0f))) {
+            ImGui::TableSetupColumn("", ImGuiTableColumnFlags_WidthFixed, 28.0f);
+            ImGui::TableSetupColumn("Type", ImGuiTableColumnFlags_WidthFixed, 120.0f);
+            ImGui::TableSetupColumn("Name");
+            ImGui::TableSetupColumn("Span", ImGuiTableColumnFlags_WidthFixed, 90.0f);
+            ImGui::TableSetupColumn("Src", ImGuiTableColumnFlags_WidthFixed, 60.0f);
+            ImGui::TableHeadersRow();
+
+            for (size_t i = 0; i < res->entities.size(); ++i) {
+                auto& e = res->entities[i];
+                ImGui::TableNextRow();
+                ImGui::PushID((int)i);
+
+                ImGui::TableNextColumn();
+                ImGui::Checkbox("##sel", &e.selected);
+
+                ImGui::TableNextColumn();
+                ImGui::SetNextItemWidth(-1);
+                ImGui::InputText("##type", e.type_buf, sizeof(e.type_buf));
+
+                ImGui::TableNextColumn();
+                ImGui::SetNextItemWidth(-1);
+                ImGui::InputText("##name", e.name_buf, sizeof(e.name_buf));
+
+                ImGui::TableNextColumn();
+                if (e.start_offset >= 0) {
+                    ImGui::Text("%d-%d", e.start_offset, e.end_offset);
+                } else {
+                    ImGui::TextDisabled("—");
+                }
+
+                ImGui::TableNextColumn();
+                ImGui::TextDisabled("%s", e.source.c_str());
+
+                ImGui::PopID();
+            }
+            ImGui::EndTable();
+        }
+    }
+
+    // Relation table.
+    if (!res->relations.empty() &&
+        ImGui::CollapsingHeader("Relations", ImGuiTreeNodeFlags_DefaultOpen)) {
+        if (ImGui::BeginTable("##rels", 5,
+                ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg |
+                ImGuiTableFlags_ScrollY,
+                ImVec2(0.0f, 160.0f))) {
+            ImGui::TableSetupColumn("", ImGuiTableColumnFlags_WidthFixed, 28.0f);
+            ImGui::TableSetupColumn("From", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+            ImGui::TableSetupColumn("Name");
+            ImGui::TableSetupColumn("To", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+            ImGui::TableSetupColumn("Conf", ImGuiTableColumnFlags_WidthFixed, 60.0f);
+            ImGui::TableHeadersRow();
+
+            // Lookup helper: tmp_id -> entity name (shown in From/To). Names
+            // longer than 18 bytes are cut to 15 plus "..."; an unknown
+            // tmp_id is shown as-is.
+            auto entity_label = [&](const std::string& tmp) -> std::string {
+                for (const auto& e : res->entities) {
+                    if (e.tmp_id == tmp) {
+                        std::string n = e.name_buf[0] ? std::string(e.name_buf) : e.name;
+                        if (n.size() > 18) n = n.substr(0, 15) + "...";
+                        return n;
+                    }
+                }
+                return tmp;
+            };
+
+            for (size_t i = 0; i < res->relations.size(); ++i) {
+                auto& r = res->relations[i];
+                ImGui::TableNextRow();
+                ImGui::PushID(2000 + (int)i);
+
+                ImGui::TableNextColumn();
+                ImGui::Checkbox("##sel", &r.selected);
+
+                ImGui::TableNextColumn();
+                ImGui::TextUnformatted(entity_label(r.from_tmp_id).c_str());
+
+                ImGui::TableNextColumn();
+                ImGui::TextUnformatted(r.name.c_str());
+
+                ImGui::TableNextColumn();
+                ImGui::TextUnformatted(entity_label(r.to_tmp_id).c_str());
+
+                ImGui::TableNextColumn();
+                ImGui::Text("%.2f", r.confidence);
+
+                ImGui::PopID();
+            }
+            ImGui::EndTable();
+        }
+    }
+
+    ImGui::End();
+}
+
+} // namespace ge
diff --git a/extract_panel.h b/extract_panel.h
new file mode 100644
index 0000000..19d4601
--- /dev/null
+++ b/extract_panel.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// "Paste & Extract" panel (issue 0013).
+//
+// Large text area for pasted text. The Extract button launches the script
+// `enrichers/paste_extract/run.py` on a separate thread (UI is not blocked).
+// The script returns a JSON with proposed entities and relations (preview
+// mode — it does not write to operations.db). The panel shows two tables
+// (entities / relations) with checkboxes; pressing "Apply Selected"
+// persists them via entity_ops, deduplicated by (type_ref, name).
+//
+// Threading: one Extract call at a time (ExtractPanelState::busy). The
+// thread fills in the proposal once the subprocess has finished. Apply runs
+// on the main thread and triggers a graph reload via app.want_reload.
+//
+// The enricher is declared in `enrichers/paste_extract/manifest.yaml`
+// but is NOT invoked through the jobs system — the panel launches it
+// directly. Living under `enrichers/` lets it be distributed and lets the
+// script use the same Python runtime resolution as the other enrichers.
+
+namespace ge {
+
+struct AppState;
+
+// One entity proposed by the extractor. Type and name are also kept in char
+// buffers so they can be edited inline before Apply.
+struct ProposedEntity {
+    std::string tmp_id;         // "tmp_0", "tmp_1", ...; referenced by relations
+    std::string type_ref;       // editable
+    std::string name;           // editable
+    std::string source;         // "regex" | "hybrid"
+    int         start_offset = -1;  // span within the pasted text
+    int         end_offset   = -1;
+    double      confidence   = 1.0;
+    std::string metadata_json;  // JSON literal (not editable in v1)
+    bool        selected     = true;
+
+    // Mutable buffers for inline editing in ImGui.
+    char type_buf[64]  = {};
+    char name_buf[256] = {};
+};
+
+struct ProposedRelation {
+    std::string from_tmp_id;
+    std::string to_tmp_id;
+    std::string name;        // e.g. "works_at"
+    std::string source;      // "hybrid" | ...
+    double      confidence = 0.0;
+    bool        selected   = true;
+};
+
+struct ExtractResult {
+    std::vector<ProposedEntity>   entities;
+    std::vector<ProposedRelation> relations;
+    std::vector<std::string>      layers;
+    std::string error;        // empty on success
+    std::string stderr_tail;
+};
+
+struct ExtractPanelState {
+    // Text buffer backing the text area. Grows dynamically.
+    std::vector<char> text_buf;
+    bool text_initialized = false;
+
+    // Result of the last Extract (published by the worker thread).
+    std::shared_ptr<ExtractResult> result;
+    std::mutex result_mu;
+    std::atomic<bool> busy{false};
+    std::atomic<bool> new_result{false};  // a fresh result is available
+
+    // Status message shown in the panel; written by the worker and the UI.
+    std::string status;
+
+    // Stats of the last apply.
+    int last_apply_entities = 0;
+    int last_apply_relations = 0;
+    int last_apply_dedup = 0;
+
+    // Toggle: use hybrid (GLiNER/GLiREL) when it is available?
+    bool use_hybrid = false;
+
+    // Worker thread; stays joinable until it has been joined.
+    std::thread worker;
+};
+
+// Configures the paths the worker needs to invoke Python. Call once
+// after `jobs_init` (reuses the Python runtime resolver + paths).
+void extract_panel_init(const char* enrichers_dir,
+                        const char* app_dir,
+                        const char* registry_root);
+
+// Joins the worker thread if it is joinable (this blocks until a running
+// extraction ends; it is not cancelled). Call at application shutdown.
+void extract_panel_shutdown();
+
+// Renders the panel. Returns without drawing if app.panel_extract is false.
+void extract_panel_render(AppState& app);
+
+// Applies the entities/relations marked as selected to the given
+// operations.db. Inserts new entities, deduplicating by
+// (type_ref, name); reuses the existing id when one is found. Then
+// inserts the relations whose endpoints (mapped via tmp_id ->
+// real_id) both resolve.
+//
+// Returns the counts in out_added_entities, out_dedup_entities,
+// out_added_relations. Relations that do not resolve or already exist are
+// counted in out_skipped_relations. The caller sets app.want_reload.
+//
+// This function can be tested in isolation (it does not touch ImGui).
+bool extract_panel_apply(const char* ops_db_path,
+                         const ExtractResult& result,
+                         int* out_added_entities,
+                         int* out_dedup_entities,
+                         int* out_added_relations,
+                         int* out_skipped_relations);
+
+// Internal helper exposed for tests: parses the last non-blank line of the
+// JSON emitted by `enrichers/paste_extract/run.py`. Returns true if parsing
+// succeeds and result.error stays empty; on error result.error is set.
+bool extract_panel_parse_result(const std::string& json_text,
+                                ExtractResult* result);
+
+// Spawns the Python subprocess that extracts. Synchronous (blocks the
+// caller's thread). The panel invokes it on a separate std::thread so the
+// UI does not freeze. Exposed in case tests want to call it
+// directly (not for now — the tests cover the Python side via
+// pytest, and the C++ side via parse_result + apply).
+bool extract_panel_run_subprocess(const std::string& text, + bool use_hybrid, + ExtractResult* out); + +} // namespace ge diff --git a/issues/0013-paste-extract-panel.md b/issues/completed/0013-paste-extract-panel.md similarity index 100% rename from issues/0013-paste-extract-panel.md rename to issues/completed/0013-paste-extract-panel.md diff --git a/main.cpp b/main.cpp index 4bfb82a..9cc8c11 100644 --- a/main.cpp +++ b/main.cpp @@ -30,6 +30,7 @@ #include "jobs.h" #include "enrichers.h" #include "chat.h" +#include "extract_panel.h" #include "../../../../cpp/vendor/sqlite3/sqlite3.h" @@ -1235,6 +1236,7 @@ static fn_ui::PanelToggle g_panels[] = { {"Table", nullptr, &g_app.panel_table}, {"Jobs", nullptr, &g_app.panel_jobs}, {"Echo", nullptr, &g_app.panel_chat}, + {"Extract", nullptr, &g_app.panel_extract}, }; static void render() { @@ -2178,6 +2180,12 @@ static void render() { ImGui::SetNextWindowSize(ImVec2(520.0f, 720.0f), ImGuiCond_FirstUseEver); ge::chat_render(&g_app.panel_chat); + // Extract panel (issue 0013) — flotante, dockeable. + ImGui::SetNextWindowPos (ImVec2(vp->WorkPos.x + W * 0.30f, top + 50.0f), + ImGuiCond_FirstUseEver); + ImGui::SetNextWindowSize(ImVec2(720.0f, 640.0f), ImGuiCond_FirstUseEver); + ge::extract_panel_render(g_app); + // Enricher config window (abierto desde context menu Run enricher). render_enricher_config_window(); @@ -2484,6 +2492,12 @@ int main(int argc, char** argv) { (int)ge::enrichers_all().size()); } + // Extract panel (issue 0013) — invoca enrichers/paste_extract/run.py + // directamente en su propio hilo, sin pasar por el sistema de jobs. + ge::extract_panel_init(enrichers_dir.c_str(), + app_dir.c_str(), + registry_root.c_str()); + // Chat panel (claude -p) — el agente invoca gx-cli para mutar // operations.db. agent_mutations counter en graph_explorer.db dispara // reload del viewport en cada cambio. 
@@ -2573,6 +2587,7 @@ int main(int argc, char** argv) {
 
     // Cleanup
     ge::chat_shutdown();
+    ge::extract_panel_shutdown();
     ge::jobs_shutdown();
     if (g_layout_storage) {
         fn_ui::layout_storage_close(g_layout_storage);
diff --git a/tests/test_paste_extract.py b/tests/test_paste_extract.py
new file mode 100644
index 0000000..1a782a8
--- /dev/null
+++ b/tests/test_paste_extract.py
@@ -0,0 +1,367 @@
+"""Tests for the paste_extract enricher (issue 0013).
+
+paste_extract is pure PREVIEW mode: it does not write to operations.db. It
+receives text via params.text and returns a JSON with proposed entities and
+relations. The application (C++ panel) processes the JSON and persists it,
+with dedupe, in C++ code (which could be tested in separate TUs).
+
+Decision: the hybrid cascade (GLiNER+GLiREL) is NOT tested in pytest —
+the models weigh hundreds of MB and take seconds to load. The script
+contract under `use_hybrid=false` is what the panel relies on in the
+first iteration. If hybrid is available it simply adds further
+entities: the merge and dedupe logic is exercised with regex+regex
+(the same text passed twice) and with stubs in other
+tests.
+"""
+from __future__ import annotations
+
+import json
+import os
+import sqlite3
+from pathlib import Path
+
+import pytest
+
+from conftest import (
+    base_ctx, list_entities, list_relations, run_enricher, SCHEMA_SQL,
+)
+
+
+def _resolve_real_registry_root() -> Path | None:
+    """Locate the real fn_registry root via registry.db + cmd/fn/main.go.
+
+    The conftest has a fallback that returns `/home/lucas` when it finds a
+    stray registry.db in HOME — that breaks the tests that depend on
+    importing `python.functions.cybersecurity.extract_iocs`. Here a directory
+    only qualifies when BOTH markers exist (`registry.db` AND `cmd/fn/main.go`).
+
+    In worktrees the repo is not an ancestor, so an override through the
+    `FN_REGISTRY_ROOT` env var is accepted; well-known paths are tried last.
+    """
+    env = os.environ.get("FN_REGISTRY_ROOT")
+    if env:
+        p = Path(env)
+        if (p / "registry.db").exists() and \
+                (p / "cmd" / "fn" / "main.go").exists():
+            return p
+    p = Path(__file__).resolve()
+    for ancestor in p.parents:
+        if (ancestor / "registry.db").exists() and \
+                (ancestor / "cmd" / "fn" / "main.go").exists():
+            return ancestor
+    # Hardcoded fallback — well-known checkout locations outside the worktree.
+    for cand in [Path.home() / "fn_registry", Path("/home/lucas/fn_registry")]:
+        if (cand / "registry.db").exists() and \
+                (cand / "cmd" / "fn" / "main.go").exists():
+            return cand
+    return None
+
+
+REAL_REGISTRY_ROOT = _resolve_real_registry_root()
+
+
+@pytest.fixture
+def real_registry_root():
+    """Use this instead of `registry_root` when the enricher needs to
+    import Python packages from the registry; skips if none was found."""
+    if REAL_REGISTRY_ROOT is None:
+        pytest.skip("fn_registry root not found from this worktree")
+    return REAL_REGISTRY_ROOT
+
+
+SAMPLE_BANKING = (
+    "Acme Corp anuncio que su CEO bad@evil.com firmo un acuerdo. "
+    "Servidores afectados: 192.0.2.55 y 10.0.0.12. "
+    "Vulnerabilidad: CVE-2024-12345. Hash IOC: 44d88612fea8a8f36de82e1278abb02f."
+)
+
+
+def _make_ctx(*, ops_db, app_dir, registry_root, text, **params):
+    """Build a ctx with empty node fields; `text` and extras go in params."""
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="", node_name="", node_type="")
+    ctx["params"] = {"text": text, **params}
+    return ctx
+
+
+def test_paste_extract_returns_entities_no_db_write(ops_db, app_dir, real_registry_root):
+    """Preview mode: parses entities but does NOT write to operations.db."""
+    registry_root = real_registry_root
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=SAMPLE_BANKING)
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 0, err
+    assert out is not None
+    assert "entities" in out
+    assert "relations" in out
+    assert "stats" in out
+    assert out["stats"]["layers"] == ["regex"]
+
+    # Expected types (only Email and CVE are asserted here).
+    types = {e["type_ref"] for e in out["entities"]}
+    assert "Email" in types, types
+    assert "CVE" in types, types
+
+    # Every entity carries the fields of the contract.
+    for e in out["entities"]:
+        assert isinstance(e["id"], str) and e["id"].startswith("tmp_"), e
+        assert e["type_ref"] and e["name"]
+        assert e["source"] in ("regex", "hybrid")
+        assert "metadata" in e
+        # start/end are ints (>=0 for regex matches).
+        assert isinstance(e["start"], int)
+        assert isinstance(e["end"], int)
+
+    # Crucial: nothing was written to the DB (preview mode).
+    assert list_entities(ops_db) == []
+    assert list_relations(ops_db) == []
+
+
+def test_paste_extract_dedupes_within_run(ops_db, app_dir, real_registry_root):
+    """Text with duplicates → each (type_ref, name) appears only once."""
+    registry_root = real_registry_root
+    text = ("Email a foo@bar.com y otra vez foo@bar.com. "
+            "IP 192.0.2.10. Repite IP 192.0.2.10.")
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=text)
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 0, err
+
+    keys = [(e["type_ref"], e["name"]) for e in out["entities"]]
+    assert len(keys) == len(set(keys)), keys
+    assert ("Email", "foo@bar.com") in keys
+    assert ("IPAddress", "192.0.2.10") in keys
+
+
+def test_paste_extract_empty_text_fails_clean(ops_db, app_dir, real_registry_root):
+    """Empty params.text → exit 2 plus JSON with an error and no entities."""
+    registry_root = real_registry_root
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text="")
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 2, err
+    assert out is not None
+    assert out["entities"] == []
+    assert "error" in out
+
+
+def test_paste_extract_max_entities_truncates(ops_db, app_dir, real_registry_root):
+    """max_entities=N caps the list at N entities."""
+    registry_root = real_registry_root
+    text = " ".join(f"contact{i:03d}@example.org" for i in range(50))
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=text, max_entities=10)
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 0, err
+    assert len(out["entities"]) == 10
+
+
+def test_paste_extract_types_filter(ops_db, app_dir, real_registry_root):
+    """params.types filters which IoC types are extracted."""
+    registry_root = real_registry_root
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=SAMPLE_BANKING, types="email")
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 0, err
+    types = {e["type_ref"] for e in out["entities"]}
+    # Email only — with types="email" no other entity type may come back.
+    assert types == {"Email"}, types
+
+
+def test_paste_extract_use_hybrid_false_skips_layer(ops_db, app_dir, real_registry_root):
+    """use_hybrid=False ⇒ stats.layers == ['regex'] (no hybrid layer)."""
+    registry_root = real_registry_root
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=SAMPLE_BANKING, use_hybrid=False)
+    rc, out, err = run_enricher("paste_extract", ctx)
+    assert rc == 0, err
+    assert out["stats"]["layers"] == ["regex"]
+
+
+def test_paste_extract_idempotent_runs_no_duplicate_proposal(
+        ops_db, app_dir, real_registry_root):
+    """Calling paste_extract twice with the same text yields the same
+    proposal — dedupe at *commit* time is the C++ panel's job, but the
+    preview script already returns a duplicate-free list."""
+    registry_root = real_registry_root
+    ctx = _make_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                    text=SAMPLE_BANKING)
+    rc1, out1, _ = run_enricher("paste_extract", ctx)
+    rc2, out2, _ = run_enricher("paste_extract", ctx)
+    assert rc1 == 0 and rc2 == 0
+    keys1 = sorted((e["type_ref"], e["name"]) for e in out1["entities"])
+    keys2 = sorted((e["type_ref"], e["name"]) for e in out2["entities"])
+    assert keys1 == keys2
+
+
+# -----------------------------------------------------------------------------
+# Apply-side tests — they replicate the logic of extract_panel_apply in Python
+# to check the dedupe contract the C++ panel implements. They exercise that
+# (1) new entities are inserted, (2) duplicates (type_ref, name) reuse the id,
+# (3) relations whose endpoints are in the DB are persisted, (4) those whose
+# endpoints are not, are dropped.
+#
+# The C++ panel is checked by compiling (green build) and at runtime via the
+# CLI; here we validate the *contract* of the JSON output the panel consumes.
+# -----------------------------------------------------------------------------
+
+
+def _apply_proposal_python(ops_db_path, proposal: dict) -> dict:
+    """Reference implementation of extract_panel_apply in Python.
+
+    Mirrors the C++ rules (skip unselected or empty entities, dedupe by
+    (type_ref, name), skip unresolved or existing relations); ids differ.
+    It validates the contract only; it does not exercise the C++ code.
+    """
+    conn = sqlite3.connect(ops_db_path)
+    try:
+        ts = "2026-01-01T00:00:00Z"
+        map_id = {}
+        added_e = 0
+        dedup_e = 0
+        for i, e in enumerate(proposal.get("entities", [])):
+            tref, name = e.get("type_ref"), e.get("name")
+            if not e.get("selected", True) or not tref or not name:
+                continue
+            existing = conn.execute(
+                "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
+                (tref, name)).fetchone()
+            if existing:
+                map_id[e["id"]] = existing[0]
+                dedup_e += 1
+            else:
+                new_id = f"{tref}_{i}_{name}"
+                conn.execute(
+                    "INSERT INTO entities (id, name, type_ref, source, "
+                    " metadata, created_at, updated_at) "
+                    "VALUES (?, ?, ?, 'panel:paste_extract', ?, ?, ?)",
+                    (new_id, name, tref,
+                     json.dumps(e.get("metadata", {})), ts, ts))
+                map_id[e["id"]] = new_id
+                added_e += 1
+        added_r = 0
+        skipped_r = 0
+        for j, r in enumerate(proposal.get("relations", [])):
+            if not r.get("selected", True):
+                continue
+            f, t = map_id.get(r["from_id"]), map_id.get(r["to_id"])
+            if not f or not t:
+                skipped_r += 1
+                continue
+            name = r.get("name") or "RELATED_TO"
+            existing = conn.execute(
+                "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
+                "AND name=? LIMIT 1", (f, t, name)).fetchone()
+            if existing:
+                skipped_r += 1
+                continue
+            conn.execute(
+                "INSERT INTO relations (id, name, from_entity, to_entity, "
+                " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
+                (f"rel_{j}_{f}_{name}_{t}", name, f, t, ts, ts))
+            added_r += 1
+        conn.commit()
+        return {"added_entities": added_e, "dedup_entities": dedup_e,
+                "added_relations": added_r, "skipped_relations": skipped_r}
+    finally:
+        conn.close()
+
+
+def test_apply_inserts_only_selected(ops_db):
+    """Only entities with selected=true are inserted."""
+    proposal = {
+        "entities": [
+            {"id": "tmp_0", "type_ref": "Email", "name": "a@b.com",
+             "metadata": {}, "selected": True},
+            {"id": "tmp_1", "type_ref": "IPAddress", "name": "1.2.3.4",
+             "metadata": {}, "selected": False},  # NOT selected
+            {"id": "tmp_2", "type_ref": "CVE", "name": "CVE-2024-1",
+             "metadata": {}, "selected": True},
+        ],
+        "relations": [],
+    }
+    stats = _apply_proposal_python(ops_db, proposal)
+    assert stats["added_entities"] == 2
+    types = {e["type_ref"] for e in list_entities(ops_db)}
+    assert types == {"Email", "CVE"}
+
+
+def test_apply_dedupes_by_type_and_name(ops_db):
+    """Re-applying the same proposal does NOT duplicate entities."""
+    proposal = {
+        "entities": [
+            {"id": "tmp_0", "type_ref": "Email", "name": "x@y.z",
+             "metadata": {}, "selected": True},
+        ],
+        "relations": [],
+    }
+    s1 = _apply_proposal_python(ops_db, proposal)
+    s2 = _apply_proposal_python(ops_db, proposal)
+    assert s1["added_entities"] == 1
+    assert s2["added_entities"] == 0
+    assert s2["dedup_entities"] == 1
+    # Only one row in the DB.
+    rows = list_entities(ops_db)
+    assert len(rows) == 1
+
+
+def test_apply_inserts_relations_when_endpoints_resolve(ops_db):
+    """Relations with valid (selected) endpoints are persisted."""
+    proposal = {
+        "entities": [
+            {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
+             "metadata": {}, "selected": True},
+            {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
+             "metadata": {}, "selected": True},
+        ],
+        "relations": [
+            {"from_id": "tmp_0", "to_id": "tmp_1",
+             "name": "works_at", "selected": True},
+        ],
+    }
+    stats = _apply_proposal_python(ops_db, proposal)
+    assert stats["added_entities"] == 2
+    assert stats["added_relations"] == 1
+    rels = list_relations(ops_db, name="works_at")
+    assert len(rels) == 1
+
+
+def test_apply_skips_relation_if_endpoint_unselected(ops_db):
+    """If an endpoint is not selected, its relation is dropped."""
+    proposal = {
+        "entities": [
+            {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
+             "metadata": {}, "selected": True},
+            {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
+             "metadata": {}, "selected": False},  # NOT inserted
+        ],
+        "relations": [
+            {"from_id": "tmp_0", "to_id": "tmp_1",
+             "name": "works_at", "selected": True},
+        ],
+    }
+    stats = _apply_proposal_python(ops_db, proposal)
+    assert stats["added_entities"] == 1
+    assert stats["added_relations"] == 0
+    assert stats["skipped_relations"] == 1
+
+
+def test_apply_dedupes_relation_on_repeat(ops_db):
+    """A repeated (from, to, name) relation is not duplicated."""
+    proposal = {
+        "entities": [
+            {"id": "tmp_0", "type_ref": "Person", "name": "Alice",
+             "metadata": {}, "selected": True},
+            {"id": "tmp_1", "type_ref": "Organization", "name": "Acme",
+             "metadata": {}, "selected": True},
+        ],
+        "relations": [
+            {"from_id": "tmp_0", "to_id": "tmp_1",
+             "name": "works_at", "selected": True},
+        ],
+    }
+    s1 = _apply_proposal_python(ops_db, proposal)
+    s2 = _apply_proposal_python(ops_db, proposal)
+    assert s1["added_relations"] == 1
+    assert s2["added_relations"] == 0
+    assert s2["skipped_relations"] == 1
diff --git a/views.h b/views.h
index da3ab19..e83015d 100644
--- a/views.h
+++ b/views.h
@@ -62,6 +62,7 @@ struct AppState {
     bool panel_note = false;
     bool panel_jobs = false; // issue 0026
     bool panel_chat = false; // claude -p chat (issue 0001)
+    bool panel_extract = false; // paste & extract (issue 0013)
 
     bool show_filters_modal = false;
     bool show_open_modal = false;