Files
navegator_dashboard/autoextract_panel.cpp
egutierrez 3ad26e4f6b chore: auto-commit (4 archivos)
- app.md
- appicon.ico
- autoextract_panel.cpp
- recipes_panel.cpp

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 00:31:34 +02:00

601 lines
20 KiB
C++

// autoextract_panel — UI para AutoExtract:
// 1. URL -> Open & Analyze: pipeline Python
// cdp_open_url_and_wait + cdp_get_ax_tree + llm_propose_scraping_schema.
// 2. Editar schema propuesto (selectores + keep checkbox + tipo).
// 3. Test extraction via Runtime.evaluate (JS construido).
// 4. Save as recipe -> YAML en projects/navegator/profiles/default/recipes/.
// + INSERT en data_factory.db (subprocess sqlite3 inline).
//
// La comunicacion con Python es subprocess (py_subprocess.h) — el script Python
// reusa funciones del registry via sys.path injection desde FN_REGISTRY_ROOT.
#include "imgui.h"
#include "core/icons_tabler.h"
#include "core/tokens.h"
#include "session_state.h"
#include "py_subprocess.h"
#include "picker_state.h"
#include "app_base.h"
#include "crude_json.h"
#include <algorithm>
#include <atomic>
#include <cerrno>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#else
# include <sys/stat.h>
# include <sys/types.h>
#endif
namespace navegator {
namespace {
// One row of the proposed scraping schema, as shown in the editor table.
struct SchemaField {
    std::string field;     // output key written into the extracted object
    std::string selector;  // selector handed to document.querySelector
    std::string sample;    // sample value reported by the analysis step (display only)
    std::string type;      // string|number|bool|url
    bool keep{true};       // unchecked rows are left out of the JS and the recipe
};
// Shared state of the AutoExtract panel. The std::string / std::vector members
// are read and written under `mu` by both the UI code and the worker threads;
// `busy` is an atomic single-flight flag for background jobs.
struct AutoExtractState {
    std::mutex mu;

    // Text buffers bound directly to ImGui::InputText widgets.
    char url_input[1024] = "https://news.ycombinator.com";
    char recipe_name[256]{};

    std::atomic<bool> busy{false};

    std::string status;             // one-line progress / result message
    std::string last_error;         // last failure message, shown in the error colour
    std::string raw_python_output;  // raw stdout of the last subprocess (diagnostics)

    std::vector<SchemaField> schema;  // proposed schema, editable in the table
    std::string proposed_tab_id;      // tab id returned by the analysis pipeline
    std::string test_output;          // stdout of the last test extraction
};

AutoExtractState g_ax;
// Escapes `s` so it can be embedded between double quotes in the generated
// extraction JS: backslash, double quote, LF and CR are replaced by their
// backslash escapes; every other byte is copied through unchanged.
std::string js_escape(const std::string& s) {
    std::string escaped;
    escaped.reserve(s.size() + 4);
    for (const char ch : s) {
        if (ch == '\\') {
            escaped.append("\\\\");
        } else if (ch == '"') {
            escaped.append("\\\"");
        } else if (ch == '\n') {
            escaped.append("\\n");
        } else if (ch == '\r') {
            escaped.append("\\r");
        } else {
            escaped.push_back(ch);
        }
    }
    return escaped;
}
// Turns a free-form recipe name into a file-name-safe slug.
//  - ASCII letters are lower-cased, ASCII digits are kept.
//  - Space, '-' and '_' each become '-'.
//  - Every other byte (punctuation, path separators, non-ASCII) is dropped.
//  - An input that yields nothing falls back to "recipe".
// Classification uses explicit ASCII ranges rather than std::isalnum /
// std::tolower so the result does not depend on the process locale and the
// function needs no <cctype> include.
std::string slugify(const std::string& s) {
    std::string out;
    out.reserve(s.size());
    for (const char c : s) {
        const bool lower_or_digit = (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
        if (lower_or_digit) {
            out += c;
        } else if (c >= 'A' && c <= 'Z') {
            out += static_cast<char>(c - 'A' + 'a');
        } else if (c == ' ' || c == '-' || c == '_') {
            out += '-';
        }
    }
    if (out.empty()) out = "recipe";
    return out;
}
void run_open_and_analyze(int port, std::string url) {
if (g_ax.busy.exchange(true)) return;
{
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.status = "Spawning python pipeline...";
g_ax.last_error.clear();
g_ax.schema.clear();
}
std::thread([port, url]() {
const char* code = R"PY(
import sys, os, json, traceback
root = os.environ.get('FN_REGISTRY_ROOT', '')
if not root:
print(json.dumps({"error": "FN_REGISTRY_ROOT not set"}))
sys.exit(2)
for sub in ('pipelines','core','infra'):
sys.path.insert(0, os.path.join(root, 'python', 'functions', sub))
try:
from cdp_open_url_and_wait import cdp_open_url_and_wait
from cdp_get_ax_tree import cdp_get_ax_tree
from llm_propose_scraping_schema import llm_propose_scraping_schema
url = sys.argv[1]
port = int(sys.argv[2])
tab_id = cdp_open_url_and_wait(port, url, timeout_s=30)
ax = cdp_get_ax_tree(port, tab_id)
schema = llm_propose_scraping_schema(url, ax)
out = {"tab_id": tab_id}
if isinstance(schema, dict):
out.update(schema)
# llm_propose_scraping_schema returns "schema" key; remap to "fields" for parser.
if "schema" in schema and "fields" not in schema:
out["fields"] = schema["schema"]
else:
out["fields"] = schema
print(json.dumps(out))
except Exception as e:
print(json.dumps({"error": str(e), "trace": traceback.format_exc()}))
sys.exit(1)
)PY";
std::vector<std::string> argv;
argv.push_back(py_resolve_interpreter());
argv.push_back("-c");
argv.push_back(code);
argv.push_back(url);
argv.push_back(std::to_string(port));
PyResult r = py_run(argv, 120000);
{
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.raw_python_output = r.stdout_data;
}
// Debug dump: stdout + diagnostic context to disk for offline inspection.
{
FILE* f = std::fopen(fn::local_path("autoextract_last.txt"), "wb");
if (f) {
std::fprintf(f, "EXIT=%d ERR=%s\n", r.exit_code, r.error.c_str());
std::fprintf(f, "STDOUT_LEN=%zu\n--- STDOUT ---\n", r.stdout_data.size());
std::fwrite(r.stdout_data.data(), 1, r.stdout_data.size(), f);
std::fclose(f);
}
}
if (r.exit_code != 0 || r.stdout_data.empty()) {
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.last_error = r.error.empty() ? "python exited non-zero" : r.error;
g_ax.status = "Failed";
g_ax.busy.store(false);
return;
}
// Parse JSON (puede haber varias lineas — tomamos la ultima no vacia).
std::string json_line;
{
std::stringstream ss(r.stdout_data);
std::string line;
while (std::getline(ss, line)) {
if (!line.empty() && line.front() == '{') json_line = line;
}
}
if (json_line.empty()) {
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.last_error = "no JSON object in stdout";
g_ax.status = "Failed";
g_ax.busy.store(false);
return;
}
crude_json::value v = crude_json::value::parse(json_line);
if (!v.is_object()) {
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.last_error = "stdout is not a JSON object";
g_ax.status = "Failed";
g_ax.busy.store(false);
return;
}
if (v.contains("error")) {
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.last_error = v["error"].is_string() ? v["error"].get<std::string>() : "error";
g_ax.status = "Failed";
g_ax.busy.store(false);
return;
}
std::vector<SchemaField> fields;
std::string tab_id;
if (v.contains("tab_id") && v["tab_id"].is_string()) tab_id = v["tab_id"].get<std::string>();
if (v.contains("fields") && v["fields"].is_array()) {
const auto& arr = v["fields"].get<crude_json::array>();
for (size_t i = 0; i < arr.size(); ++i) {
const auto& f = arr[i];
if (!f.is_object()) continue;
SchemaField sf;
auto getstr = [&](const char* k){
if (!f.contains(k)) return std::string();
const auto& x = f[k];
if (x.is_string()) return x.get<std::string>();
if (x.is_null()) return std::string();
return x.dump();
};
sf.field = getstr("field");
sf.selector = getstr("selector");
sf.sample = getstr("sample_value");
if (sf.sample.empty()) sf.sample = getstr("sample");
sf.type = getstr("type");
if (sf.type.empty()) sf.type = "string";
sf.keep = true;
if (!sf.field.empty()) fields.push_back(std::move(sf));
}
}
{
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.schema = std::move(fields);
g_ax.proposed_tab_id = tab_id;
g_ax.status = "Schema proposed (" + std::to_string(g_ax.schema.size()) + " fields)";
}
g_ax.busy.store(false);
}).detach();
}
std::string build_extraction_js(const std::vector<SchemaField>& schema) {
std::ostringstream js;
js << "(function(){var o={};";
for (const auto& f : schema) {
if (!f.keep || f.field.empty() || f.selector.empty()) continue;
// Map type -> coerce.
std::string sel = js_escape(f.selector);
js << "try{var e=document.querySelector(\"" << sel << "\");";
js << "o[\"" << js_escape(f.field) << "\"]=";
if (f.type == "number") {
js << "e?parseFloat((e.innerText||e.textContent||'').replace(/[^0-9.\\-]/g,'')):null;";
} else if (f.type == "url") {
js << "e?(e.href||e.src||null):null;";
} else if (f.type == "bool") {
js << "e?true:false;";
} else {
js << "e?(e.innerText||e.textContent||'').trim():null;";
}
js << "}catch(_){o[\"" << js_escape(f.field) << "\"]=null;}";
}
js << "return o;})()";
return js.str();
}
// Runs the current schema's extraction JS in tab `tab_id` of the browser on
// CDP port `port`, on a detached worker thread.
//
// The JS comes from build_extraction_js over a snapshot of g_ax.schema. The
// inline Python script evaluates it in the tab and prints {"result": ...} or
// {"error": ...}; its stdout is published verbatim into g_ax.test_output.
//
// Single-flight: a no-op if another job holds g_ax.busy. The flag is released
// when the worker finishes.
void run_test_extraction(int port, const std::string& tab_id) {
    // Claim the busy flag first so a rejected call does no work at all.
    if (g_ax.busy.exchange(true)) return;

    std::vector<SchemaField> sc;
    {
        std::lock_guard<std::mutex> lk(g_ax.mu);
        sc = g_ax.schema;
        g_ax.status = "Running extraction in tab...";
        g_ax.last_error.clear();  // do not show a stale error next to "Test OK"
        g_ax.test_output.clear();
    }
    const std::string js = build_extraction_js(sc);

    std::thread([port, tab_id, js]() {
        // Clears the busy flag when the worker returns.
        struct BusyRelease {
            ~BusyRelease() { g_ax.busy.store(false); }
        } busy_release;

        // NOTE: passed to `python -c`; top-level statements must start at
        // column 0 inside the raw string.
        const char* code = R"PY(
import sys, os, json, traceback
root = os.environ.get('FN_REGISTRY_ROOT', '')
if not root:
    print(json.dumps({"error":"FN_REGISTRY_ROOT not set"})); sys.exit(2)
for sub in ('pipelines','core','infra','browser'):
    sys.path.insert(0, os.path.join(root, 'python', 'functions', sub))
try:
    port = int(sys.argv[1])
    tab_id = sys.argv[2]
    js = sys.argv[3]
    # Best-effort: reuse cdp_runtime_evaluate si existe; sino, hablar CDP directo.
    try:
        from cdp_runtime_evaluate import cdp_runtime_evaluate
        out = cdp_runtime_evaluate(port, tab_id, js, return_by_value=True)
    except Exception:
        import urllib.request, json as _j, websocket
        ws_url = None
        with urllib.request.urlopen(f'http://127.0.0.1:{port}/json') as r:
            for t in _j.loads(r.read()):
                if t.get('id') == tab_id:
                    ws_url = t.get('webSocketDebuggerUrl'); break
        if not ws_url: raise RuntimeError('tab not found')
        w = websocket.create_connection(ws_url, timeout=10)
        w.send(_j.dumps({"id":1,"method":"Runtime.evaluate","params":{
            "expression": js, "returnByValue": True}}))
        out = _j.loads(w.recv()).get('result', {}).get('result', {}).get('value')
        w.close()
    print(json.dumps({"result": out}))
except Exception as e:
    print(json.dumps({"error": str(e), "trace": traceback.format_exc()})); sys.exit(1)
)PY";
        std::vector<std::string> argv;
        argv.push_back(py_resolve_interpreter());
        argv.push_back("-c");
        argv.push_back(code);
        argv.push_back(std::to_string(port));
        argv.push_back(tab_id);
        argv.push_back(js);
        PyResult r = py_run(argv, 30000);
        {
            std::lock_guard<std::mutex> lk(g_ax.mu);
            // stdout is shown in both cases: on failure it carries the
            // script's {"error": ..., "trace": ...} object.
            g_ax.test_output = r.stdout_data;
            g_ax.raw_python_output = r.stdout_data;
            if (r.exit_code != 0) {
                g_ax.last_error = r.error.empty() ? "python exited non-zero" : r.error;
                g_ax.status = "Test failed";
            } else {
                g_ax.status = "Test OK";
            }
        }
    }).detach();
}
// Returns `s` as a YAML scalar: unchanged when it is safe as a plain scalar,
// otherwise wrapped in double quotes with the necessary escapes.
//
// A value is quoted when it
//  - is empty,
//  - contains ':', '#', '"' or '\'',
//  - starts or ends with a space/tab,
//  - starts with a YAML indicator character (e.g. '[', '{', '*', '&', '-'),
//    which CSS selectors such as "[data-id]" or "*" commonly do,
//  - looks like a number, or is a reserved word (true/false/null/yes/no/on/off/~),
//    so it is not re-typed by the YAML loader,
//  - contains a control character.
// Inside quotes, '"' and '\\' are backslash-escaped and control characters are
// written as \n, \r, \t or \xNN (a raw newline inside a double-quoted scalar
// would be folded by YAML and change the value).
std::string yaml_quote(const std::string& s) {
    const auto is_control = [](char c) {
        const unsigned char u = static_cast<unsigned char>(c);
        return u < 0x20 || u == 0x7f;
    };
    const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };
    const auto is_blank = [](char c) { return c == ' ' || c == '\t'; };

    bool needs = s.empty();
    if (!needs) {
        const std::string indicators = "-?[]{},&*!|>%@`";
        const char first = s.front();
        needs = s.find_first_of(":#\"'") != std::string::npos
             || is_blank(first) || is_blank(s.back())
             || indicators.find(first) != std::string::npos
             || is_digit(first)
             || ((first == '+' || first == '.') && s.size() > 1 && is_digit(s[1]));
    }
    if (!needs) {
        for (const char c : s) {
            if (is_control(c)) { needs = true; break; }
        }
    }
    if (!needs) {
        // Case-insensitive match against words a YAML loader would re-type.
        std::string lowered;
        lowered.reserve(s.size());
        for (const char c : s) {
            lowered += (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
        }
        static const char* const reserved[] = {
            "true", "false", "null", "yes", "no", "on", "off", "~"};
        for (const char* word : reserved) {
            if (lowered == word) { needs = true; break; }
        }
    }
    if (!needs) return s;

    static const char hex_digits[] = "0123456789ABCDEF";
    std::string out = "\"";
    out.reserve(s.size() + 2);
    for (const char c : s) {
        switch (c) {
            case '"':  out += "\\\""; break;
            case '\\': out += "\\\\"; break;
            case '\n': out += "\\n";  break;
            case '\r': out += "\\r";  break;
            case '\t': out += "\\t";  break;
            default:
                if (is_control(c)) {
                    const unsigned char u = static_cast<unsigned char>(c);
                    out += "\\x";
                    out += hex_digits[u >> 4];
                    out += hex_digits[u & 0x0f];
                } else {
                    out += c;
                }
        }
    }
    out += "\"";
    return out;
}
// Creates the single directory `path` (parents are not created).
// Returns true when the directory was created or the path already exists.
bool ensure_dir(const std::string& path) {
#ifdef _WIN32
    if (CreateDirectoryA(path.c_str(), nullptr)) return true;
    return GetLastError() == ERROR_ALREADY_EXISTS;
#else
    const int rc = mkdir(path.c_str(), 0755);
    return rc == 0 || errno == EEXIST;
#endif
}
// Saves the current schema as a recipe.
//
//  1. Writes <root>/projects/navegator/profiles/default/recipes/<slug>.yaml,
//     where <root> is py_resolve_registry_root() and <slug> is slugify(name).
//     The YAML holds the recipe name, the URL, an optional wait_selector step,
//     the extraction JS (block scalar) and the kept schema rows.
//  2. Fires a detached Python subprocess that registers the recipe as a node
//     in apps/data_factory/data_factory.db (INSERT OR IGNORE). Its outcome is
//     deliberately not waited for or reported.
//
// Success/failure of step 1 is published in g_ax.status / g_ax.last_error.
void save_recipe(const std::string& name, const std::string& url) {
    const auto fail = [](const std::string& message) {
        std::lock_guard<std::mutex> lk(g_ax.mu);
        g_ax.last_error = message;
        g_ax.status = "Save failed";
    };

    const std::string root = py_resolve_registry_root();
    if (root.empty()) {
        fail("FN_REGISTRY_ROOT not set; cannot resolve recipes dir");
        return;
    }
    const std::string slug = slugify(name);

#ifdef _WIN32
    const char sep_ch = '\\';
    const size_t first_split = 2;  // never try to create the "C:" drive prefix
#else
    const char sep_ch = '/';
    const size_t first_split = 0;  // never try to create the empty path before a leading '/'
#endif
    const std::string sep(1, sep_ch);
    const std::string dir = root + sep + "projects" + sep + "navegator" + sep + "profiles"
                          + sep + "default" + sep + "recipes";

    // Create each parent in turn (best effort: "already exists" and other
    // failures are ignored here; a missing directory surfaces when the file
    // cannot be opened below).
    const auto make_dir = [](const std::string& p) {
#ifdef _WIN32
        CreateDirectoryA(p.c_str(), nullptr);
#else
        mkdir(p.c_str(), 0755);
#endif
    };
    for (size_t i = 0; i < dir.size(); ++i) {
        if (dir[i] == sep_ch && i > first_split) make_dir(dir.substr(0, i));
    }
    make_dir(dir);

    const std::string path = dir + sep + slug + ".yaml";

    std::vector<SchemaField> sc;
    {
        std::lock_guard<std::mutex> lk(g_ax.mu);
        sc = g_ax.schema;
    }
    const std::string js = build_extraction_js(sc);

    std::ostringstream y;
    y << "name: " << yaml_quote(slug) << "\n";
    y << "url_pattern: " << yaml_quote(url) << "\n";
    y << "steps:\n";
    // Wait for the first row the user kept (a row they unchecked may carry a
    // selector that never matches).
    for (const auto& f : sc) {
        if (f.keep && !f.selector.empty()) {
            y << "  - wait_selector: " << yaml_quote(f.selector) << "\n";
            break;
        }
    }
    y << "  - js: |\n";
    // Block-scalar content must be indented deeper than its key: 6 spaces.
    {
        std::stringstream ss(js);
        std::string line;
        while (std::getline(ss, line)) y << "      " << line << "\n";
    }
    y << "output:\n";
    y << "  schema:\n";
    // One block mapping per kept row, one key per line ("k: v k: v" on a
    // single line is not valid YAML).
    for (const auto& f : sc) {
        if (!f.keep) continue;
        y << "    - name: " << yaml_quote(f.field) << "\n"
          << "      type: " << yaml_quote(f.type) << "\n"
          << "      selector: " << yaml_quote(f.selector) << "\n";
    }
    y << "  sink: data_factory.runs\n";

    std::ofstream file(path, std::ios::binary);
    if (!file) {
        fail("could not write " + path);
        return;
    }
    file << y.str();
    file.close();
    if (file.fail()) {  // short write / failed flush (e.g. disk full)
        fail("could not write " + path);
        return;
    }

    // INSERT into data_factory.db through a Python subprocess. Runs detached
    // so a slow or failing insert never blocks the UI.
    std::thread([root, slug]() {
        // NOTE: passed to `python -c`; top-level statements must start at
        // column 0 inside the raw string.
        const char* code = R"PY(
import sys, os, sqlite3, traceback
try:
    root = sys.argv[1]
    name = sys.argv[2]
    db_path = os.path.join(root, 'apps', 'data_factory', 'data_factory.db')
    if not os.path.exists(db_path):
        print("data_factory.db not found at " + db_path); sys.exit(0)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute("""INSERT OR IGNORE INTO nodes(id, kind, name, function_id, description,
        schedule_cron, enabled, tags_csv, created_at, updated_at)
        VALUES(?, 'extractor', ?, 'cdp_extract_recipe_py_pipelines', ?, '',
        1, 'navegator,recipe', datetime('now'), datetime('now'))""",
        (name, name, "auto-extract recipe " + name))
    conn.commit(); conn.close()
    print("ok")
except Exception as e:
    print(traceback.format_exc())
)PY";
        std::vector<std::string> argv;
        argv.push_back(py_resolve_interpreter());
        argv.push_back("-c");
        argv.push_back(code);
        argv.push_back(root);
        argv.push_back(slug);
        (void)py_run(argv, 10000);
    }).detach();

    {
        std::lock_guard<std::mutex> lk(g_ax.mu);
        g_ax.last_error.clear();  // drop any error left over from an earlier attempt
        g_ax.status = "Recipe saved: " + path;
    }
}
} // anon
// Draws the AutoExtract window for the current frame.
//
// p_open is forwarded to ImGui::Begin as the window's open flag.
//
// Layout, top to bottom: URL input + "Open & Analyze" button, status / error
// lines, the editable schema table, then "Test extraction", the recipe name
// input and "Save as recipe", and finally the last test output (if any).
// Shared strings and the schema in g_ax are read/written under g_ax.mu.
void render_autoextract_panel(bool* p_open) {
if (!ImGui::Begin(TI_BOX " AutoExtract", p_open)) {
// Begin() returned false: nothing to draw, but End() must still be called.
ImGui::End();
return;
}
// Snapshot the selected CDP port under the session lock.
int port = 0;
{
std::lock_guard<std::mutex> lk(g_session().mu);
port = g_session().selected_port;
}
// No browser selected yet: show a hint instead of the panel.
if (port <= 0) {
ImGui::TextDisabled("Select a browser in the Browsers panel.");
ImGui::End();
return;
}
ImGui::TextUnformatted("URL:");
ImGui::SameLine();
ImGui::SetNextItemWidth(420);
ImGui::InputText("##ax_url", g_ax.url_input, sizeof(g_ax.url_input));
ImGui::SameLine();
// Read the busy flag once; the same snapshot guards every BeginDisabled /
// EndDisabled pair below so the pairs always match within this frame.
bool busy = g_ax.busy.load();
if (busy) ImGui::BeginDisabled();
if (ImGui::Button(TI_PLAYER_PLAY " Open & Analyze")) {
run_open_and_analyze(port, g_ax.url_input);
}
if (busy) ImGui::EndDisabled();
if (busy) {
ImGui::SameLine();
ImGui::TextDisabled("working...");
}
// Status and last error, read under the lock.
{
std::lock_guard<std::mutex> lk(g_ax.mu);
if (!g_ax.status.empty()) ImGui::Text("Status: %s", g_ax.status.c_str());
if (!g_ax.last_error.empty()) {
ImGui::PushStyleColor(ImGuiCol_Text, fn_tokens::colors::error);
ImGui::TextWrapped("Error: %s", g_ax.last_error.c_str());
ImGui::PopStyleColor();
}
}
ImGui::Separator();
ImGui::TextDisabled("Proposed schema (editable):");
// Work on a copy of the schema so the mutex is not held while drawing the
// table; the copy is written back below only if a widget changed it.
std::vector<SchemaField> sc_copy;
{
std::lock_guard<std::mutex> lk(g_ax.mu);
sc_copy = g_ax.schema;
}
// LAYOUT-TABLE — schema editor form with editable inline InputText/Checkbox widgets; keep BeginTable inline.
if (ImGui::BeginTable("##ax_schema", 5, ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg)) {
ImGui::TableSetupColumn("field");
ImGui::TableSetupColumn("selector");
ImGui::TableSetupColumn("sample");
ImGui::TableSetupColumn("type");
ImGui::TableSetupColumn("keep");
ImGui::TableHeadersRow();
bool dirty = false;
for (size_t i = 0; i < sc_copy.size(); ++i) {
ImGui::TableNextRow();
// Scope widget IDs to the row so "##field" etc. are unique per row.
ImGui::PushID((int)i);
ImGui::TableNextColumn();
// Each text cell is edited through a fixed-size stack buffer refilled from
// the schema every frame. snprintf truncates to the buffer size, so an
// edit to a longer value stores the truncated text.
char fb[128]; std::snprintf(fb, sizeof(fb), "%s", sc_copy[i].field.c_str());
if (ImGui::InputText("##field", fb, sizeof(fb))) { sc_copy[i].field = fb; dirty = true; }
ImGui::TableNextColumn();
char sb[512]; std::snprintf(sb, sizeof(sb), "%s", sc_copy[i].selector.c_str());
if (ImGui::InputText("##selector", sb, sizeof(sb))) { sc_copy[i].selector = sb; dirty = true; }
ImGui::TableNextColumn();
// The sample column is display-only.
ImGui::TextWrapped("%s", sc_copy[i].sample.c_str());
ImGui::TableNextColumn();
char tb[32]; std::snprintf(tb, sizeof(tb), "%s", sc_copy[i].type.c_str());
if (ImGui::InputText("##type", tb, sizeof(tb))) { sc_copy[i].type = tb; dirty = true; }
ImGui::TableNextColumn();
bool keep = sc_copy[i].keep;
if (ImGui::Checkbox("##keep", &keep)) { sc_copy[i].keep = keep; dirty = true; }
ImGui::PopID();
}
ImGui::EndTable();
// Publish the edited copy.
// NOTE(review): this replaces the whole schema, so it would overwrite a
// schema a worker thread published after the copy was taken this frame.
if (dirty) {
std::lock_guard<std::mutex> lk(g_ax.mu);
g_ax.schema = sc_copy;
}
}
ImGui::Separator();
std::string tab_id;
{
std::lock_guard<std::mutex> lk(g_ax.mu);
tab_id = g_ax.proposed_tab_id;
}
if (busy) ImGui::BeginDisabled();
// The button is always drawn; a click is ignored while no tab id is known
// (i.e. before a successful "Open & Analyze").
if (ImGui::Button(TI_FLASK " Test extraction") && !tab_id.empty()) {
run_test_extraction(port, tab_id);
}
if (busy) ImGui::EndDisabled();
ImGui::SameLine();
ImGui::TextDisabled("Recipe name:");
ImGui::SameLine();
ImGui::SetNextItemWidth(200);
ImGui::InputText("##rname", g_ax.recipe_name, sizeof(g_ax.recipe_name));
ImGui::SameLine();
// Saving requires a non-empty recipe name; otherwise the click is a no-op.
if (ImGui::Button(TI_DEVICE_FLOPPY " Save as recipe")) {
if (g_ax.recipe_name[0]) save_recipe(g_ax.recipe_name, g_ax.url_input);
}
// Last test output, drawn while holding the lock because the widget reads
// the string's own storage.
{
std::lock_guard<std::mutex> lk(g_ax.mu);
if (!g_ax.test_output.empty()) {
ImGui::Separator();
ImGui::TextDisabled("Test output:");
// InputTextMultiline takes a mutable char*, hence the cast on c_str().
// NOTE(review): relies on the ReadOnly flag to keep ImGui from writing
// into the std::string's buffer — confirm against the ImGui version in use.
ImGui::InputTextMultiline("##test_out", (char*)g_ax.test_output.c_str(),
g_ax.test_output.size() + 1,
ImVec2(-1, 120), ImGuiInputTextFlags_ReadOnly);
}
}
ImGui::End();
}
} // namespace navegator