1430039688
Añade fetch_http_fingerprint_cdp_py_browser (domain browser): recoge el HTML renderizado tras ejecutar JavaScript usando un Chrome remoto via CDP, componiendo cdp_open_url_and_wait + cdp_eval. Devuelve la misma estructura que el fetch estático para que detect_web_tech lo consuma sin cambios. Integra use_cdp en el pipeline fingerprint_web_stack (v1.1.0): combina los headers reales del fetch estático con el HTML post-JS del CDP. Detecta frameworks de SPA (React/Vue/Angular/Next) que el fetch estático no ve porque montan el DOM en runtime. Si no hay Chrome en cdp_port, degrada al fetch estático con un warning (no rompe). cdp_port=9333 (Chrome aislado) recomendado para terceros, 9222 diario. Verificado en vivo (Chrome 9333): sobre una SPA cuyo marcador de framework solo aparece tras ejecutar JS, el estático detecta solo nginx; con use_cdp=True detecta además Next.js, React y Node.js. Tests: 48 verdes (error path sin Chrome + happy path mockeado + degradación). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
222 lines
8.0 KiB
Python
222 lines
8.0 KiB
Python
"""Tests para el pipeline fingerprint_web_stack — SIN red externa ni service real.
|
|
|
|
El golden levanta un HTTPServer local efimero en 127.0.0.1 que emite cabeceras
|
|
(Server: nginx, X-Powered-By: PHP) + un HTML con `<meta name=generator>`
|
|
WordPress y marcadores `wp-content`. El pipeline compone fetch_http_fingerprint
|
|
+ detect_web_tech contra ese servidor real, asi se ejercita la composicion
|
|
end-to-end sin tocar internet. Los tests que usan el servidor local pasan
save=False para no escribir en el vault OSINT ni hacer POST al service.

save_scan_to_osint se sustituye a mano sobre los globals del modulo del
pipeline (cargado con importlib) para contar invocaciones: con save=False nunca
debe invocarse, y en el error path (que pasa save=True) tampoco, porque el
fetch falla antes de archivar.
|
|
"""
|
|
|
|
import http.server
|
|
import importlib
|
|
import os
|
|
import socketserver
|
|
import sys
|
|
import threading
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
# Globals del modulo del pipeline (donde viven fetch_http_fingerprint,
|
|
# detect_web_tech, save_scan_to_osint...).
|
|
mod = importlib.import_module("pipelines.fingerprint_web_stack")
|
|
fingerprint_web_stack = mod.fingerprint_web_stack
|
|
|
|
|
|
# Page body served by the local test server. It carries explicit WordPress
# markers (a generator meta tag plus wp-content / wp-includes paths) that the
# tests expect detect_web_tech to pick up.
_WP_HTML = (
    b'<!DOCTYPE html>\n'
    b'<html>\n'
    b'<head>\n'
    b'<meta charset="utf-8">\n'
    b'<meta name="generator" content="WordPress 6.4.2">\n'
    b'<title>Mi Blog WordPress</title>\n'
    b'<link rel="stylesheet" href="/wp-content/themes/twenty/style.css">\n'
    b'</head>\n'
    b'<body>\n'
    b'<script src="/wp-includes/js/jquery/jquery.min.js"></script>\n'
    b'<p>Hola mundo desde wp-content.</p>\n'
    b'</body>\n'
    b'</html>\n'
)
|
|
|
|
|
|
class _WPHandler(http.server.BaseHTTPRequestHandler):
    """Request handler that poses as a WordPress site behind nginx + PHP."""

    def log_message(self, *args, **kwargs):  # noqa: D102
        # Deliberate no-op: keeps the server from logging to stderr during
        # the test run.
        return None

    def do_GET(self):  # noqa: N802 - name imposed by BaseHTTPRequestHandler
        """Answer every GET with a 200, the fake stack headers and the HTML."""
        body = _WP_HTML
        response_headers = (
            ("Server", "nginx/1.24.0"),
            ("X-Powered-By", "PHP/8.2.10"),
            ("Content-Type", "text/html; charset=utf-8"),
            ("Content-Length", str(len(body))),
        )

        self.send_response(200)
        for header_name, header_value in response_headers:
            self.send_header(header_name, header_value)
        self.end_headers()
        self.wfile.write(body)
|
|
|
|
|
|
def _start_wp_server() -> tuple[socketserver.TCPServer, int, threading.Thread]:
    """Start a throwaway HTTP server on 127.0.0.1 serving the WordPress HTML.

    The server is bound to port 0 and the port it actually got is read back
    from ``server_address``. Requests are served from a daemon thread.

    Returns:
        (httpd, port, thread). The caller must call httpd.shutdown() when done.
    """
    server = http.server.HTTPServer(("127.0.0.1", 0), _WPHandler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    bound_port = server.server_address[1]
    return server, bound_port, worker
|
|
|
|
|
|
# --- 1. Golden: fingerprint contra un servidor WordPress/nginx/PHP local ------
|
|
|
|
def test_golden_fingerprint_servidor_local_wordpress_nginx():
    """Golden path: WordPress (CMS) and nginx (web server) are detected locally.

    Runs the pipeline against the local fake server with save=False and checks
    the status fields, the detected technology names, the per-category
    grouping, the server/title fields and the human-readable ``raw`` table.
    """
    server, listen_port, worker = _start_wp_server()
    try:
        outcome = fingerprint_web_stack(
            f"http://127.0.0.1:{listen_port}/",
            timeout_s=5.0,
            save=False,
        )

        assert outcome["status"] == "ok", outcome
        assert outcome["status_code"] == 200, outcome
        # Nothing was archived to OSINT (save=False).
        assert outcome["saved"] is None, outcome
        # At least one technology was detected.
        assert outcome["count"] > 0, outcome

        # WordPress comes from the generator meta tag; nginx from the Server
        # header.
        detected = {tech["name"] for tech in outcome["technologies"]}
        for expected_name in ("WordPress", "nginx"):
            assert expected_name in detected, detected

        # by_category must agree with the detected technologies.
        grouped = outcome["by_category"]
        assert "WordPress" in grouped.get("cms", []), grouped
        assert "nginx" in grouped.get("web-server", []), grouped

        # server and title are taken from the fetch result.
        assert "nginx" in (outcome["server"] or ""), outcome["server"]
        assert "WordPress" in (outcome["title"] or ""), outcome["title"]

        # raw is the readable table, with a header row and columns.
        table = outcome["raw"]
        assert isinstance(table, str)
        assert "TECHNOLOGY" in table
        assert "WordPress" in table
        assert "nginx" in table
        # The requested URL (and therefore its port) shows up in the header.
        assert str(listen_port) in table
    finally:
        server.shutdown()
        server.server_close()
        worker.join(timeout=2.0)
|
|
|
|
|
|
# --- 2. save=False: corre fetch + matching pero NO archiva en OSINT -----------
|
|
|
|
def test_save_false_no_archiva_osint():
    """save=False: technologies are populated but the sink is never invoked.

    The pipeline module's ``save_scan_to_osint`` is replaced by a recording
    stub for the duration of the call. The stub is installed only after the
    server's ``try`` has been entered, so the server is always shut down even
    if looking up or patching the sink attribute fails.
    """
    save_calls = []

    def fake_save(*args, **kwargs):  # pragma: no cover - must never be called
        save_calls.append((args, kwargs))
        return {"status": "ok"}

    httpd, port, thread = _start_wp_server()
    try:
        original_save = mod.save_scan_to_osint
        mod.save_scan_to_osint = fake_save
        try:
            result = fingerprint_web_stack(
                f"http://127.0.0.1:{port}/",
                timeout_s=5.0,
                save=False,
            )
        finally:
            # Always restore the real sink so later tests see the original.
            mod.save_scan_to_osint = original_save
    finally:
        httpd.shutdown()
        httpd.server_close()
        thread.join(timeout=2.0)

    assert result["status"] == "ok", result
    assert result["count"] > 0, result
    assert result["saved"] is None, result
    # The sink was never invoked with save=False.
    assert not save_calls, save_calls
|
|
|
|
|
|
# --- 3. Error path: el fetch HTTP falla -> error sin red externa --------------
|
|
|
|
def test_fetch_fallido_propaga_error_sin_red():
    """Unresolvable host: the fetch-stage error is expected to be propagated.

    The sink is replaced by a counting stub. The call passes save=True, and
    the test asserts that the stub is still never reached when the fetch
    fails.
    """
    sink_hits = {"n": 0}

    def fake_save(*args, **kwargs):  # pragma: no cover - must never be called
        sink_hits["n"] = sink_hits["n"] + 1
        return {"status": "ok"}

    # Swap the sink on the pipeline module: even with save=True it must not be
    # invoked when the fetch fails.
    real_sink = mod.save_scan_to_osint
    mod.save_scan_to_osint = fake_save
    try:
        outcome = fingerprint_web_stack(
            "http://nohost.invalid.tld.example/",
            timeout_s=2.0,
            save=True,
        )
    finally:
        mod.save_scan_to_osint = real_sink

    assert outcome["status"] == "error", outcome
    assert outcome["stage"] == "fetch", outcome
    assert outcome["fetch"]["status"] == "error", outcome
    # Nothing was archived.
    assert sink_hits["n"] == 0, sink_hits
|
|
|
|
|
|
# --- 4. use_cdp sin Chrome: DEGRADA a estatico con warning (no falla) ---------
|
|
|
|
def test_use_cdp_sin_chrome_degrada_a_estatico():
    """use_cdp=True with no Chrome (cdp_port=1) degrades to the static fetch.

    Starts the same local WordPress/nginx/PHP server as the golden test and
    requests use_cdp with cdp_port=1, where no Chrome is expected to be
    listening. The test asserts that the pipeline does not fail: the result
    is "ok", html_source is "static", rendered is False, a degradation
    warning is present, and WordPress/nginx are still detected.
    """
    server, listen_port, worker = _start_wp_server()
    try:
        outcome = fingerprint_web_stack(
            f"http://127.0.0.1:{listen_port}/",
            timeout_s=5.0,
            save=False,
            use_cdp=True,
            cdp_port=1,  # no Chrome on this port: the CDP fetch fails -> degrade
            wait_render_s=0.0,
        )
    finally:
        server.shutdown()
        server.server_close()
        worker.join(timeout=2.0)

    # Degraded, did not fail.
    assert outcome["status"] == "ok", outcome
    # Fell back to the static HTML (CDP unavailable).
    assert outcome["html_source"] == "static", outcome
    assert outcome["rendered"] is False, outcome
    # A degradation warning was recorded.
    warnings_seen = outcome["warnings"]
    assert warnings_seen, outcome
    assert any("cdp no disponible" in w for w in warnings_seen), warnings_seen
    # Static detection keeps working.
    detected = {tech["name"] for tech in outcome["technologies"]}
    for expected_name in ("WordPress", "nginx"):
        assert expected_name in detected, detected
    # Nothing was archived (save=False).
    assert outcome["saved"] is None, outcome
|