52495af779
Manifest YAML puede declarar 'auto_group_threshold: <int>' a nivel top-level. enrichers.cpp lo parsea y lo guarda en EnricherSpec. jobs.cpp lo inyecta como campo opcional 'auto_group_threshold' en el JSON stdin del subprocess. Los enrichers Python que crean Groups (web_search, split_words, split_sentences, extract_iocs_text) leen el campo y, si viene > 0, lo usan en lugar de su DEFAULT_GROUP_THRESHOLD. Helper _coerce_threshold tolera int / str / None / 0 cayendo al default.
354 lines
12 KiB
C++
354 lines
12 KiB
C++
#include "enrichers.h"
|
|
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <dirent.h>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <sys/stat.h>
|
|
|
|
namespace ge {
|
|
|
|
namespace {
|
|
|
|
// Registry of every enricher collected by the most recent enrichers_load()
// call, disabled entries included. enrichers_load() clears it before each
// scan; the accessor functions further down read from it.
std::vector<EnricherSpec> g_enrichers;
|
|
|
|
// Returns a copy of `s` with leading and trailing whitespace removed, where
// "whitespace" is whatever std::isspace reports. An empty or all-whitespace
// input yields an empty string.
std::string strip(const std::string& s) {
    const auto is_blank = [](char ch) {
        // Go through unsigned char: std::isspace must not receive a
        // negative value.
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };
    const auto first = std::find_if_not(s.begin(), s.end(), is_blank);
    // Scan backwards, but never past `first`, so the two ends cannot cross.
    const auto last = std::find_if_not(s.rbegin(),
                                       std::string::const_reverse_iterator(first),
                                       is_blank)
                          .base();
    return std::string(first, last);
}
|
|
|
|
// Removes one pair of matching surrounding quotes ("..." or '...') from `s`.
// Anything else is returned unchanged: strings shorter than two characters,
// mismatched delimiters, or unquoted text. No unescaping is performed.
std::string strip_quotes(const std::string& s) {
    if (s.size() < 2) return s;

    const char opener = s.front();
    const bool quoted = (opener == '"' || opener == '\'') && s.back() == opener;
    if (!quoted) return s;

    return s.substr(1, s.size() - 2);
}
|
|
|
|
// Returns `s` with every character passed through std::tolower. The
// argument is taken by value, so the caller's string is left untouched.
std::string lower(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char ch) {
        // The unsigned char parameter keeps std::tolower away from negative
        // values.
        return static_cast<char>(std::tolower(ch));
    });
    return s;
}
|
|
|
|
// Parsea una lista inline `[a, b, c]` o "[Webpage, Url]". Tolerante a
|
|
// espacios y a comillas simples/dobles dentro. NO soporta listas
|
|
// multi-linea — el manifest las usa siempre inline.
|
|
std::vector<std::string> parse_inline_list(const std::string& v) {
|
|
std::vector<std::string> out;
|
|
std::string s = strip(v);
|
|
if (s.size() < 2 || s.front() != '[' || s.back() != ']') return out;
|
|
s = s.substr(1, s.size() - 2);
|
|
std::string token;
|
|
auto flush = [&]() {
|
|
std::string t = strip_quotes(strip(token));
|
|
if (!t.empty()) out.push_back(std::move(t));
|
|
token.clear();
|
|
};
|
|
for (char c : s) {
|
|
if (c == ',') flush();
|
|
else token.push_back(c);
|
|
}
|
|
flush();
|
|
return out;
|
|
}
|
|
|
|
// Splits `s` at commas that sit at nesting level zero, honouring quotes and
// [] / {} nesting. Inline YAML such as `{ name: limit, type: int, default: 10 }`
// may carry quoted strings containing commas, which a naive split would
// break apart.
//
// Pieces are returned verbatim (quotes and surrounding spaces included).
// Empty pieces between consecutive commas are kept; an empty trailing piece
// is dropped.
std::vector<std::string> split_top_level(const std::string& s) {
    std::vector<std::string> parts;
    std::string piece;
    int brackets = 0;        // current [] depth
    int braces = 0;          // current {} depth
    char open_quote = '\0';  // active quote character, '\0' when outside quotes

    for (const char ch : s) {
        if (open_quote != '\0') {
            // Inside quotes everything is literal until the matching quote.
            piece += ch;
            if (ch == open_quote) open_quote = '\0';
            continue;
        }
        switch (ch) {
            case '"':
            case '\'':
                open_quote = ch;
                break;
            case '[':
                ++brackets;
                break;
            case ']':
                --brackets;
                break;
            case '{':
                ++braces;
                break;
            case '}':
                --braces;
                break;
            case ',':
                if (brackets == 0 && braces == 0) {
                    // Top-level separator: emit the piece, drop the comma.
                    parts.push_back(piece);
                    piece.clear();
                    continue;
                }
                break;
            default:
                break;
        }
        piece += ch;
    }

    if (!piece.empty()) parts.push_back(piece);
    return parts;
}
|
|
|
|
// Parsea un objeto YAML inline `{ name: x, type: int, default: 10 }` a un
|
|
// EnricherParam. Retorna true si al menos `name` se resolvio.
|
|
bool parse_inline_param(const std::string& v, EnricherParam* out) {
|
|
std::string s = strip(v);
|
|
if (s.size() < 2 || s.front() != '{' || s.back() != '}') return false;
|
|
s = s.substr(1, s.size() - 2);
|
|
for (auto& kv : split_top_level(s)) {
|
|
size_t colon = kv.find(':');
|
|
if (colon == std::string::npos) continue;
|
|
std::string k = strip(kv.substr(0, colon));
|
|
std::string val = strip_quotes(strip(kv.substr(colon + 1)));
|
|
if (k == "name") out->name = val;
|
|
else if (k == "type") out->type = lower(val);
|
|
else if (k == "default") out->default_value = val;
|
|
else if (k == "description") out->description = val;
|
|
else if (k == "desc") out->description = val;
|
|
}
|
|
if (out->type.empty()) out->type = "string";
|
|
return !out->name.empty();
|
|
}
|
|
|
|
// Manifest YAML soportado (subset):
|
|
// id: fetch_webpage
|
|
// name: "Fetch web page"
|
|
// description: "..."
|
|
// applies_to: [Webpage, Url]
|
|
// lang: python <- issue 0033: go|python|bash (default python)
|
|
// exec: run <- basename del binario/script (default "run")
|
|
// params:
|
|
// - { name: timeout_s, type: int, default: 15 }
|
|
// - { name: region, type: string, default: "" }
|
|
//
|
|
// Solo el bloque `params:` se parsea con detalle. Otros bloques con valor
|
|
// vacio seguido de lineas indentadas (`emits:`, `relations:`,
|
|
// `uses_functions:`) se ignoran como antes.
|
|
bool parse_manifest(const std::string& path, EnricherSpec* out) {
|
|
std::ifstream f(path);
|
|
if (!f) return false;
|
|
std::string line;
|
|
bool in_skip_block = false;
|
|
bool in_params_block = false;
|
|
while (std::getline(f, line)) {
|
|
// Strip CR de Windows.
|
|
if (!line.empty() && line.back() == '\r') line.pop_back();
|
|
|
|
// Linea blanca o comentario.
|
|
std::string trim = strip(line);
|
|
if (trim.empty() || trim.front() == '#') continue;
|
|
|
|
// Si la linea NO empieza con whitespace, salimos de los bloques
|
|
// anidados — el siguiente top-level reinicia el contexto.
|
|
bool indented = !line.empty() && std::isspace((unsigned char)line.front());
|
|
if (!indented) {
|
|
in_skip_block = false;
|
|
in_params_block = false;
|
|
}
|
|
if (in_skip_block) continue;
|
|
if (in_params_block) {
|
|
// Linea esperada: ` - { name: x, type: int, default: 10 }`.
|
|
// Tolera variaciones de indent y comilla.
|
|
std::string body = trim;
|
|
if (!body.empty() && body.front() == '-') {
|
|
body = strip(body.substr(1));
|
|
}
|
|
EnricherParam p;
|
|
if (parse_inline_param(body, &p)) {
|
|
out->params.push_back(std::move(p));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
size_t colon = trim.find(':');
|
|
if (colon == std::string::npos) continue;
|
|
|
|
std::string key = strip(trim.substr(0, colon));
|
|
std::string val = strip(trim.substr(colon + 1));
|
|
|
|
if (key == "id") out->id = strip_quotes(val);
|
|
else if (key == "name") out->name = strip_quotes(val);
|
|
else if (key == "description") out->description = strip_quotes(val);
|
|
else if (key == "applies_to") out->applies_to = parse_inline_list(val);
|
|
else if (key == "lang") out->lang = lower(strip_quotes(val));
|
|
else if (key == "exec") out->exec_basename = strip_quotes(val);
|
|
else if (key == "params") {
|
|
// `params: []` — vacio explicito, nada que hacer.
|
|
// `params:` — siguiente bloque indentado son items.
|
|
std::string vs = strip(val);
|
|
if (vs.empty()) in_params_block = true;
|
|
// Si fuese inline (`params: [{...}]`) — formato no usado en
|
|
// los manifests actuales, lo ignoramos.
|
|
}
|
|
else if (key == "auto_group_threshold") {
|
|
// Issue 0035e: override del threshold de auto-grouping. Si el
|
|
// valor no es un entero parseable, se ignora (queda en 0 =
|
|
// usar default interno del enricher).
|
|
try {
|
|
int v = std::stoi(strip_quotes(val));
|
|
if (v > 0) out->auto_group_threshold = v;
|
|
} catch (...) { /* ignore */ }
|
|
}
|
|
else if (key == "emits" && val.empty()) in_skip_block = true;
|
|
else if (key == "relations" && val.empty()) in_skip_block = true;
|
|
else if (key == "uses_functions" && val.empty()) in_skip_block = true;
|
|
}
|
|
|
|
// Defaults — preservan retrocompat con manifests existentes que no
|
|
// declaran lang/exec.
|
|
if (out->lang.empty()) out->lang = "python";
|
|
if (out->exec_basename.empty()) out->exec_basename = "run";
|
|
|
|
// Validar lang reconocido. Manifests con lang invalido se cargan
|
|
// pero quedan disabled — asi la UI puede informar y el usuario
|
|
// arregla el manifest.
|
|
if (out->lang != "python" && out->lang != "go" && out->lang != "bash") {
|
|
out->disabled = true;
|
|
out->disabled_reason = "lang invalido: '" + out->lang + "'";
|
|
}
|
|
|
|
return !out->id.empty();
|
|
}
|
|
|
|
// Resuelve el path al ejecutable/script segun lang + plataforma.
|
|
// Devuelve "" si no encuentra el archivo y rellena `reason`.
|
|
std::string resolve_run_path(const std::string& dir,
|
|
const EnricherSpec& spec,
|
|
std::string* reason) {
|
|
#ifdef _WIN32
|
|
const char sep = '\\';
|
|
const char* go_ext = ".exe";
|
|
#else
|
|
const char sep = '/';
|
|
const char* go_ext = "";
|
|
#endif
|
|
auto exists = [](const std::string& p) {
|
|
struct stat st{};
|
|
return stat(p.c_str(), &st) == 0 && !S_ISDIR(st.st_mode);
|
|
};
|
|
|
|
std::string base = dir + sep + spec.exec_basename;
|
|
|
|
if (spec.lang == "python") {
|
|
std::string p = base + ".py";
|
|
if (exists(p)) return p;
|
|
if (reason) *reason = "no existe " + p;
|
|
return "";
|
|
}
|
|
if (spec.lang == "bash") {
|
|
std::string p = base + ".sh";
|
|
if (exists(p)) return p;
|
|
if (reason) *reason = "no existe " + p;
|
|
return "";
|
|
}
|
|
if (spec.lang == "go") {
|
|
// En Windows: <base>.exe. En Linux: <base> (sin extension).
|
|
std::string p = base + go_ext;
|
|
if (exists(p)) return p;
|
|
if (reason) {
|
|
*reason = "binario Go no compilado: " + p
|
|
+ " (corre el build script del enricher)";
|
|
}
|
|
return "";
|
|
}
|
|
if (reason) *reason = "lang no soportado";
|
|
return "";
|
|
}
|
|
|
|
} // namespace
|
|
|
|
int enrichers_load(const char* enrichers_dir) {
|
|
g_enrichers.clear();
|
|
if (!enrichers_dir || !*enrichers_dir) return -1;
|
|
|
|
// En Windows los UNC paths esperan backslashes consistentes; mixed
|
|
// separators (`\\wsl$\<distro>\foo/bar`) confunden a opendir de MinGW.
|
|
std::string dir = enrichers_dir;
|
|
#ifdef _WIN32
|
|
for (char& c : dir) if (c == '/') c = '\\';
|
|
#endif
|
|
|
|
DIR* d = opendir(dir.c_str());
|
|
if (!d) {
|
|
std::fprintf(stderr, "[enrichers] opendir failed: %s\n", dir.c_str());
|
|
return -1;
|
|
}
|
|
|
|
struct dirent* ent;
|
|
while ((ent = readdir(d)) != nullptr) {
|
|
if (ent->d_name[0] == '.') continue;
|
|
|
|
#ifdef _WIN32
|
|
const char sep = '\\';
|
|
#else
|
|
const char sep = '/';
|
|
#endif
|
|
std::string sub = dir + sep + ent->d_name;
|
|
struct stat st{};
|
|
if (stat(sub.c_str(), &st) != 0 || !S_ISDIR(st.st_mode)) continue;
|
|
|
|
std::string manifest = sub + sep + "manifest.yaml";
|
|
if (stat(manifest.c_str(), &st) != 0) continue;
|
|
|
|
EnricherSpec spec;
|
|
if (!parse_manifest(manifest, &spec)) {
|
|
std::fprintf(stderr, "[enrichers] parse failed: %s\n", manifest.c_str());
|
|
continue;
|
|
}
|
|
|
|
// Resolver el ejecutable segun lang. Si falla (binario Go no
|
|
// compilado, script ausente, etc.) registramos el spec como
|
|
// disabled — sigue apareciendo en `enrichers_all()` para que
|
|
// la UI pueda mostrar warning, pero `enrichers_for_type` lo
|
|
// oculta del menu de ejecucion.
|
|
std::string reason;
|
|
std::string run_path = resolve_run_path(sub, spec, &reason);
|
|
if (run_path.empty()) {
|
|
spec.disabled = true;
|
|
if (spec.disabled_reason.empty()) spec.disabled_reason = reason;
|
|
std::fprintf(stderr, "[enrichers] %s deshabilitado: %s\n",
|
|
spec.id.c_str(), spec.disabled_reason.c_str());
|
|
}
|
|
spec.run_path = run_path;
|
|
g_enrichers.push_back(std::move(spec));
|
|
}
|
|
closedir(d);
|
|
|
|
std::sort(g_enrichers.begin(), g_enrichers.end(),
|
|
[](const EnricherSpec& a, const EnricherSpec& b) {
|
|
return a.name < b.name;
|
|
});
|
|
return (int)g_enrichers.size();
|
|
}
|
|
|
|
const std::vector<EnricherSpec>& enrichers_all() {
|
|
return g_enrichers;
|
|
}
|
|
|
|
std::vector<EnricherSpec> enrichers_for_type(const char* type_ref) {
|
|
std::vector<EnricherSpec> out;
|
|
if (!type_ref || !*type_ref) return out;
|
|
std::string want = lower(type_ref);
|
|
for (const auto& e : g_enrichers) {
|
|
if (e.disabled) continue; // no ofrecer enrichers no resueltos
|
|
if (e.applies_to.empty()) {
|
|
out.push_back(e);
|
|
continue;
|
|
}
|
|
for (const auto& t : e.applies_to) {
|
|
if (lower(t) == want) { out.push_back(e); break; }
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
|
|
// Looks up a registered enricher by exact (case-sensitive) id.
//
// Returns a pointer to the first matching registry entry, or nullptr when
// `id` is null/empty or nothing matches. The pointer refers to an element
// of the registry, which enrichers_load() clears and refills.
const EnricherSpec* enricher_by_id(const char* id) {
    if (id == nullptr || *id == '\0') return nullptr;

    const auto hit = std::find_if(
        g_enrichers.begin(), g_enrichers.end(),
        [id](const EnricherSpec& spec) { return spec.id == id; });
    return hit == g_enrichers.end() ? nullptr : &*hit;
}
|
|
|
|
} // namespace ge
|