// autoextract_panel — UI para AutoExtract: // 1. URL -> Open & Analyze: pipeline Python // cdp_open_url_and_wait + cdp_get_ax_tree + llm_propose_scraping_schema. // 2. Editar schema propuesto (selectores + keep checkbox + tipo). // 3. Test extraction via Runtime.evaluate (JS construido). // 4. Save as recipe -> YAML en projects/navegator/profiles/default/recipes/. // + INSERT en data_factory.db (subprocess sqlite3 inline). // // La comunicacion con Python es subprocess (py_subprocess.h) — el script Python // reusa funciones del registry via sys.path injection desde FN_REGISTRY_ROOT. #include "imgui.h" #include "core/icons_tabler.h" #include "core/tokens.h" #include "session_state.h" #include "py_subprocess.h" #include "picker_state.h" #include "app_base.h" #include "crude_json.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN # include #else # include # include #endif namespace navegator { namespace { struct SchemaField { std::string field; std::string selector; std::string sample; std::string type; // string|number|bool|url bool keep = true; }; struct AutoExtractState { std::mutex mu; char url_input[1024] = "https://news.ycombinator.com"; char recipe_name[256] = ""; std::atomic busy{false}; std::string status; std::string last_error; std::string raw_python_output; // diagnostico std::vector schema; std::string proposed_tab_id; std::string test_output; }; AutoExtractState g_ax; // JSON-escape de selector para serializar el JS de extraccion. std::string js_escape(const std::string& s) { std::string out; out.reserve(s.size() + 4); for (char c : s) { switch (c) { case '\\': out += "\\\\"; break; case '"': out += "\\\""; break; case '\n': out += "\\n"; break; case '\r': out += "\\r"; break; default: out += c; } } return out; } std::string slugify(const std::string& s) { std::string out; out.reserve(s.size()); for (char c : s) { if (std::isalnum((unsigned char)c)) out += (char)std::tolower((unsigned char)c); else if (c == ' ' || c == '-' || c == '_') out += '-'; } if (out.empty()) out = "recipe"; return out; } void run_open_and_analyze(int port, std::string url) { if (g_ax.busy.exchange(true)) return; { std::lock_guard lk(g_ax.mu); g_ax.status = "Spawning python pipeline..."; g_ax.last_error.clear(); g_ax.schema.clear(); } std::thread([port, url]() { const char* code = R"PY( import sys, os, json, traceback root = os.environ.get('FN_REGISTRY_ROOT', '') if not root: print(json.dumps({"error": "FN_REGISTRY_ROOT not set"})) sys.exit(2) for sub in ('pipelines','core','infra'): sys.path.insert(0, os.path.join(root, 'python', 'functions', sub)) try: from cdp_open_url_and_wait import cdp_open_url_and_wait from cdp_get_ax_tree import cdp_get_ax_tree from llm_propose_scraping_schema import llm_propose_scraping_schema url = sys.argv[1] port = int(sys.argv[2]) tab_id = cdp_open_url_and_wait(port, url, timeout_s=30) ax = cdp_get_ax_tree(port, tab_id) schema = llm_propose_scraping_schema(url, ax) out = {"tab_id": tab_id} if isinstance(schema, dict): out.update(schema) # llm_propose_scraping_schema returns "schema" key; remap to "fields" for parser. if "schema" in schema and "fields" not in schema: out["fields"] = schema["schema"] else: out["fields"] = schema print(json.dumps(out)) except Exception as e: print(json.dumps({"error": str(e), "trace": traceback.format_exc()})) sys.exit(1) )PY"; std::vector argv; argv.push_back(py_resolve_interpreter()); argv.push_back("-c"); argv.push_back(code); argv.push_back(url); argv.push_back(std::to_string(port)); PyResult r = py_run(argv, 120000); { std::lock_guard lk(g_ax.mu); g_ax.raw_python_output = r.stdout_data; } // Debug dump: stdout + diagnostic context to disk for offline inspection. { FILE* f = std::fopen(fn::local_path("autoextract_last.txt"), "wb"); if (f) { std::fprintf(f, "EXIT=%d ERR=%s\n", r.exit_code, r.error.c_str()); std::fprintf(f, "STDOUT_LEN=%zu\n--- STDOUT ---\n", r.stdout_data.size()); std::fwrite(r.stdout_data.data(), 1, r.stdout_data.size(), f); std::fclose(f); } } if (r.exit_code != 0 || r.stdout_data.empty()) { std::lock_guard lk(g_ax.mu); g_ax.last_error = r.error.empty() ? "python exited non-zero" : r.error; g_ax.status = "Failed"; g_ax.busy.store(false); return; } // Parse JSON (puede haber varias lineas — tomamos la ultima no vacia). std::string json_line; { std::stringstream ss(r.stdout_data); std::string line; while (std::getline(ss, line)) { if (!line.empty() && line.front() == '{') json_line = line; } } if (json_line.empty()) { std::lock_guard lk(g_ax.mu); g_ax.last_error = "no JSON object in stdout"; g_ax.status = "Failed"; g_ax.busy.store(false); return; } crude_json::value v = crude_json::value::parse(json_line); if (!v.is_object()) { std::lock_guard lk(g_ax.mu); g_ax.last_error = "stdout is not a JSON object"; g_ax.status = "Failed"; g_ax.busy.store(false); return; } if (v.contains("error")) { std::lock_guard lk(g_ax.mu); g_ax.last_error = v["error"].is_string() ? v["error"].get() : "error"; g_ax.status = "Failed"; g_ax.busy.store(false); return; } std::vector fields; std::string tab_id; if (v.contains("tab_id") && v["tab_id"].is_string()) tab_id = v["tab_id"].get(); if (v.contains("fields") && v["fields"].is_array()) { const auto& arr = v["fields"].get(); for (size_t i = 0; i < arr.size(); ++i) { const auto& f = arr[i]; if (!f.is_object()) continue; SchemaField sf; auto getstr = [&](const char* k){ if (!f.contains(k)) return std::string(); const auto& x = f[k]; if (x.is_string()) return x.get(); if (x.is_null()) return std::string(); return x.dump(); }; sf.field = getstr("field"); sf.selector = getstr("selector"); sf.sample = getstr("sample_value"); if (sf.sample.empty()) sf.sample = getstr("sample"); sf.type = getstr("type"); if (sf.type.empty()) sf.type = "string"; sf.keep = true; if (!sf.field.empty()) fields.push_back(std::move(sf)); } } { std::lock_guard lk(g_ax.mu); g_ax.schema = std::move(fields); g_ax.proposed_tab_id = tab_id; g_ax.status = "Schema proposed (" + std::to_string(g_ax.schema.size()) + " fields)"; } g_ax.busy.store(false); }).detach(); } std::string build_extraction_js(const std::vector& schema) { std::ostringstream js; js << "(function(){var o={};"; for (const auto& f : schema) { if (!f.keep || f.field.empty() || f.selector.empty()) continue; // Map type -> coerce. std::string sel = js_escape(f.selector); js << "try{var e=document.querySelector(\"" << sel << "\");"; js << "o[\"" << js_escape(f.field) << "\"]="; if (f.type == "number") { js << "e?parseFloat((e.innerText||e.textContent||'').replace(/[^0-9.\\-]/g,'')):null;"; } else if (f.type == "url") { js << "e?(e.href||e.src||null):null;"; } else if (f.type == "bool") { js << "e?true:false;"; } else { js << "e?(e.innerText||e.textContent||'').trim():null;"; } js << "}catch(_){o[\"" << js_escape(f.field) << "\"]=null;}"; } js << "return o;})()"; return js.str(); } void run_test_extraction(int port, const std::string& tab_id) { std::vector sc; { std::lock_guard lk(g_ax.mu); sc = g_ax.schema; } std::string js = build_extraction_js(sc); if (g_ax.busy.exchange(true)) return; { std::lock_guard lk(g_ax.mu); g_ax.status = "Running extraction in tab..."; g_ax.test_output.clear(); } std::thread([port, tab_id, js]() { const char* code = R"PY( import sys, os, json, traceback root = os.environ.get('FN_REGISTRY_ROOT', '') if not root: print(json.dumps({"error":"FN_REGISTRY_ROOT not set"})); sys.exit(2) for sub in ('pipelines','core','infra','browser'): sys.path.insert(0, os.path.join(root, 'python', 'functions', sub)) try: port = int(sys.argv[1]) tab_id = sys.argv[2] js = sys.argv[3] # Best-effort: reuse cdp_runtime_evaluate si existe; sino, hablar CDP directo. try: from cdp_runtime_evaluate import cdp_runtime_evaluate out = cdp_runtime_evaluate(port, tab_id, js, return_by_value=True) except Exception: import urllib.request, json as _j, websocket ws_url = None with urllib.request.urlopen(f'http://127.0.0.1:{port}/json') as r: for t in _j.loads(r.read()): if t.get('id') == tab_id: ws_url = t.get('webSocketDebuggerUrl'); break if not ws_url: raise RuntimeError('tab not found') w = websocket.create_connection(ws_url, timeout=10) w.send(_j.dumps({"id":1,"method":"Runtime.evaluate","params":{ "expression": js, "returnByValue": True}})) out = _j.loads(w.recv()).get('result', {}).get('result', {}).get('value') w.close() print(json.dumps({"result": out})) except Exception as e: print(json.dumps({"error": str(e), "trace": traceback.format_exc()})); sys.exit(1) )PY"; std::vector argv; argv.push_back(py_resolve_interpreter()); argv.push_back("-c"); argv.push_back(code); argv.push_back(std::to_string(port)); argv.push_back(tab_id); argv.push_back(js); PyResult r = py_run(argv, 30000); { std::lock_guard lk(g_ax.mu); if (r.exit_code != 0) { g_ax.last_error = r.error.empty() ? "python exited non-zero" : r.error; g_ax.status = "Test failed"; g_ax.test_output = r.stdout_data; } else { g_ax.test_output = r.stdout_data; g_ax.status = "Test OK"; } g_ax.raw_python_output = r.stdout_data; } g_ax.busy.store(false); }).detach(); } std::string yaml_quote(const std::string& s) { bool needs = s.empty() || s.find_first_of(":#\"'") != std::string::npos || s.find_first_of(" \t") == 0; if (!needs) return s; std::string out = "\""; for (char c : s) { if (c == '"' || c == '\\') out += '\\'; out += c; } out += "\""; return out; } bool ensure_dir(const std::string& path) { #ifdef _WIN32 return CreateDirectoryA(path.c_str(), nullptr) || GetLastError() == ERROR_ALREADY_EXISTS; #else if (mkdir(path.c_str(), 0755) == 0) return true; return errno == EEXIST; #endif } void save_recipe(const std::string& name, const std::string& url) { std::string root = py_resolve_registry_root(); if (root.empty()) { std::lock_guard lk(g_ax.mu); g_ax.last_error = "FN_REGISTRY_ROOT not set; cannot resolve recipes dir"; g_ax.status = "Save failed"; return; } std::string slug = slugify(name); std::string sep = #ifdef _WIN32 "\\"; #else "/"; #endif std::string dir = root + sep + "projects" + sep + "navegator" + sep + "profiles" + sep + "default" + sep + "recipes"; // crear directorios padres uno a uno (best effort). #ifdef _WIN32 { std::string acc; for (size_t i = 0; i < dir.size(); ++i) { if (dir[i] == '\\' && i > 2) { acc.assign(dir.begin(), dir.begin() + i); CreateDirectoryA(acc.c_str(), nullptr); } } CreateDirectoryA(dir.c_str(), nullptr); } #else { std::string acc; for (size_t i = 0; i < dir.size(); ++i) { if (dir[i] == '/' && i > 0) { acc.assign(dir.begin(), dir.begin() + i); mkdir(acc.c_str(), 0755); } } mkdir(dir.c_str(), 0755); } #endif std::string path = dir + sep + slug + ".yaml"; std::vector sc; std::string url_used = url; { std::lock_guard lk(g_ax.mu); sc = g_ax.schema; } std::string js = build_extraction_js(sc); std::ostringstream y; y << "name: " << yaml_quote(slug) << "\n"; y << "url_pattern: " << yaml_quote(url_used) << "\n"; y << "steps:\n"; if (!sc.empty() && !sc.front().selector.empty()) { y << " - wait_selector: " << yaml_quote(sc.front().selector) << "\n"; } y << " - js: |\n"; // indent js with 6 spaces { std::stringstream ss(js); std::string line; while (std::getline(ss, line)) y << " " << line << "\n"; } y << "output:\n"; y << " schema:\n"; for (const auto& f : sc) { if (!f.keep) continue; y << " - name: " << yaml_quote(f.field) << " type: " << yaml_quote(f.type) << " selector: " << yaml_quote(f.selector) << "\n"; } y << " sink: data_factory.runs\n"; std::ofstream f(path, std::ios::binary); if (!f) { std::lock_guard lk(g_ax.mu); g_ax.last_error = "could not write " + path; g_ax.status = "Save failed"; return; } f << y.str(); f.close(); // INSERT en data_factory.db via subprocess sqlite3 (NO bloquea si falla). std::thread([root, slug]() { const char* code = R"PY( import sys, os, sqlite3, traceback try: root = sys.argv[1] name = sys.argv[2] db_path = os.path.join(root, 'apps', 'data_factory', 'data_factory.db') if not os.path.exists(db_path): print("data_factory.db not found at " + db_path); sys.exit(0) conn = sqlite3.connect(db_path) cur = conn.cursor() cur.execute("""INSERT OR IGNORE INTO nodes(id, kind, name, function_id, description, schedule_cron, enabled, tags_csv, created_at, updated_at) VALUES(?, 'extractor', ?, 'cdp_extract_recipe_py_pipelines', ?, '', 1, 'navegator,recipe', datetime('now'), datetime('now'))""", (name, name, "auto-extract recipe " + name)) conn.commit(); conn.close() print("ok") except Exception as e: print(traceback.format_exc()) )PY"; std::vector argv; argv.push_back(py_resolve_interpreter()); argv.push_back("-c"); argv.push_back(code); argv.push_back(root); argv.push_back(slug); (void)py_run(argv, 10000); }).detach(); { std::lock_guard lk(g_ax.mu); g_ax.status = "Recipe saved: " + path; } } } // anon void render_autoextract_panel(bool* p_open) { if (!ImGui::Begin(TI_BOX " AutoExtract", p_open)) { ImGui::End(); return; } int port = 0; { std::lock_guard lk(g_session().mu); port = g_session().selected_port; } if (port <= 0) { ImGui::TextDisabled("Select a browser in the Browsers panel."); ImGui::End(); return; } ImGui::TextUnformatted("URL:"); ImGui::SameLine(); ImGui::SetNextItemWidth(420); ImGui::InputText("##ax_url", g_ax.url_input, sizeof(g_ax.url_input)); ImGui::SameLine(); bool busy = g_ax.busy.load(); if (busy) ImGui::BeginDisabled(); if (ImGui::Button(TI_PLAYER_PLAY " Open & Analyze")) { run_open_and_analyze(port, g_ax.url_input); } if (busy) ImGui::EndDisabled(); if (busy) { ImGui::SameLine(); ImGui::TextDisabled("working..."); } { std::lock_guard lk(g_ax.mu); if (!g_ax.status.empty()) ImGui::Text("Status: %s", g_ax.status.c_str()); if (!g_ax.last_error.empty()) { ImGui::PushStyleColor(ImGuiCol_Text, fn_tokens::colors::error); ImGui::TextWrapped("Error: %s", g_ax.last_error.c_str()); ImGui::PopStyleColor(); } } ImGui::Separator(); ImGui::TextDisabled("Proposed schema (editable):"); std::vector sc_copy; { std::lock_guard lk(g_ax.mu); sc_copy = g_ax.schema; } // LAYOUT-TABLE — schema editor form con InputText/Checkbox editables inline; keep BeginTable inline. if (ImGui::BeginTable("##ax_schema", 5, ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg)) { ImGui::TableSetupColumn("field"); ImGui::TableSetupColumn("selector"); ImGui::TableSetupColumn("sample"); ImGui::TableSetupColumn("type"); ImGui::TableSetupColumn("keep"); ImGui::TableHeadersRow(); bool dirty = false; for (size_t i = 0; i < sc_copy.size(); ++i) { ImGui::TableNextRow(); ImGui::PushID((int)i); ImGui::TableNextColumn(); char fb[128]; std::snprintf(fb, sizeof(fb), "%s", sc_copy[i].field.c_str()); if (ImGui::InputText("##field", fb, sizeof(fb))) { sc_copy[i].field = fb; dirty = true; } ImGui::TableNextColumn(); char sb[512]; std::snprintf(sb, sizeof(sb), "%s", sc_copy[i].selector.c_str()); if (ImGui::InputText("##selector", sb, sizeof(sb))) { sc_copy[i].selector = sb; dirty = true; } ImGui::TableNextColumn(); ImGui::TextWrapped("%s", sc_copy[i].sample.c_str()); ImGui::TableNextColumn(); char tb[32]; std::snprintf(tb, sizeof(tb), "%s", sc_copy[i].type.c_str()); if (ImGui::InputText("##type", tb, sizeof(tb))) { sc_copy[i].type = tb; dirty = true; } ImGui::TableNextColumn(); bool keep = sc_copy[i].keep; if (ImGui::Checkbox("##keep", &keep)) { sc_copy[i].keep = keep; dirty = true; } ImGui::PopID(); } ImGui::EndTable(); if (dirty) { std::lock_guard lk(g_ax.mu); g_ax.schema = sc_copy; } } ImGui::Separator(); std::string tab_id; { std::lock_guard lk(g_ax.mu); tab_id = g_ax.proposed_tab_id; } if (busy) ImGui::BeginDisabled(); if (ImGui::Button(TI_FLASK " Test extraction") && !tab_id.empty()) { run_test_extraction(port, tab_id); } if (busy) ImGui::EndDisabled(); ImGui::SameLine(); ImGui::TextDisabled("Recipe name:"); ImGui::SameLine(); ImGui::SetNextItemWidth(200); ImGui::InputText("##rname", g_ax.recipe_name, sizeof(g_ax.recipe_name)); ImGui::SameLine(); if (ImGui::Button(TI_DEVICE_FLOPPY " Save as recipe")) { if (g_ax.recipe_name[0]) save_recipe(g_ax.recipe_name, g_ax.url_input); } { std::lock_guard lk(g_ax.mu); if (!g_ax.test_output.empty()) { ImGui::Separator(); ImGui::TextDisabled("Test output:"); ImGui::InputTextMultiline("##test_out", (char*)g_ax.test_output.c_str(), g_ax.test_output.size() + 1, ImVec2(-1, 120), ImGuiInputTextFlags_ReadOnly); } } ImGui::End(); } } // namespace navegator