935008ec3f
Añade el capability group `recon` (dominio cybersecurity + pipelines, Python),
con la política de archivado OSINT y página madre docs/capabilities/recon.md.
Lookups y sondeo (wrappers de CLI):
- whois_lookup, rdap_lookup, dns_records, ping_host, traceroute_host, nmap_scan
- save_scan_to_osint (sink común) + recon_osint (pipeline one-shot scan+archivado)
Escaneo de puertos/servicios nativo (stdlib, sin nmap ni sudo):
- scan_tcp_ports: connect-scan TCP concurrente (open/closed/filtered)
- grab_service_banner: banner grab + identificación de servicio/versión real
- identify_port_service: puro, puerto -> servicio IANA esperado (~120 puertos)
- scan_port_services: pipeline one-shot (scan -> identify + banner por puerto abierto)
Fingerprint de tecnología web (estilo Wappalyzer), patrón pura/impura:
- fetch_http_fingerprint: GET stdlib, recoge headers/html/cookies (solo nombres)
- detect_web_tech: puro, matchea ~50 firmas regex -> tecnologías por categoría
- fingerprint_web_stack: pipeline one-shot url -> tecnologías
Todas devuelven dict {status} sin lanzar. Tests: 43 verdes, sin red externa.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
432 lines
19 KiB
Python
432 lines
19 KiB
Python
"""Detector de tecnologia web estilo Wappalyzer (pieza pura).
|
|
|
|
Dado el resultado crudo de un fetch HTTP (cabeceras, HTML, cookies, URL final),
|
|
identifica las tecnologias web que usa un sitio matcheando contra una tabla de
|
|
firmas embebida (regex): servidor, lenguaje, CMS, frameworks JS, librerias,
|
|
analytics, CDN, e-commerce, WAF, etc.
|
|
|
|
Esta funcion es PURA: no toca la red ni hace I/O. Recibe las senales ya
|
|
recogidas por la capa impura hermana (`fetch_http_fingerprint_py_cybersecurity`)
|
|
y se limita a aplicar regex deterministas sobre ellas. Separar el matching de la
|
|
recoleccion permite testear las firmas sin red y reutilizar la tabla.
|
|
|
|
La tabla `SIGNATURES` es un subconjunto curado de lo que cubre Wappalyzer (no es
|
|
exhaustiva). Para ampliarla, anadir entradas nuevas a `SIGNATURES` siguiendo el
|
|
formato documentado mas abajo.
|
|
"""
|
|
|
|
import re
|
|
|
|
__all__ = ["detect_web_tech", "SIGNATURES"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Embedded signature table.
#
# Each signature is a dict with a `name`, a `category` and one or more matchers
# (all optional; matcher types are OR-ed: ONE match is enough to detect):
#
#   "headers":        {"<header-lowercase>": r"<regex>"}  -> regex per header
#   "html":           r"<regex>"  -> regex over the HTML
#   "meta_generator": r"<regex>"  -> regex over <meta name=generator content=...>
#   "cookies":        r"<regex>"  -> regex over cookie names
#   "script_src":     r"<regex>"  -> regex over <script> src values
#   "url":            r"<regex>"  -> regex over the final URL
#
# Optional fields:
#   "version_group":  <int>          -> which group of the matcher that fired
#                                       holds the version
#   "implies":        ["Tech", ...]  -> implied technologies (lower confidence)
#
# Only set "version_group" when that group really captures the product's own
# version; a matcher without such a group simply yields an empty version.
#
# Confidence: "high" when a header/meta/cookie/url matcher fires directly;
# "medium" when detected through generic HTML/script_src or through `implies`.
# ---------------------------------------------------------------------------
SIGNATURES = [
    # ---- web-server -------------------------------------------------------
    {"name": "nginx", "category": "web-server",
     "headers": {"server": r"nginx(?:/([\d.]+))?"}, "version_group": 1},
    # "Apache-Coyote" / "Apache Tomcat" are Tomcat banners, not Apache httpd,
    # so the lookahead leaves them to the Tomcat entry.
    {"name": "Apache", "category": "web-server",
     "headers": {"server": r"Apache(?![ -](?:Coyote|Tomcat))(?:/([\d.]+))?"},
     "version_group": 1},
    {"name": "IIS", "category": "web-server",
     "headers": {"server": r"(?:Microsoft-)?IIS(?:/([\d.]+))?"}, "version_group": 1},
    {"name": "LiteSpeed", "category": "web-server",
     "headers": {"server": r"LiteSpeed"}},
    {"name": "Caddy", "category": "web-server",
     "headers": {"server": r"Caddy"}},
    {"name": "Gunicorn", "category": "web-server",
     "headers": {"server": r"gunicorn(?:/([\d.]+))?"}, "version_group": 1,
     "implies": ["Python"]},
    {"name": "Werkzeug", "category": "web-server",
     "headers": {"server": r"Werkzeug(?:/([\d.]+))?"}, "version_group": 1,
     "implies": ["Python"]},
    {"name": "Phusion Passenger", "category": "web-server",
     "headers": {"server": r"Phusion Passenger(?:[ /]([\d.]+))?"}, "version_group": 1,
     "implies": ["Ruby"]},
    # The "1.1" in "Apache-Coyote/1.1" names the HTTP connector, not the Tomcat
    # release, so a version is only captured after a literal "Tomcat/".
    {"name": "Tomcat", "category": "web-server",
     "headers": {"server": r"(?:Apache-Coyote|Tomcat(?:/([\d.]+))?)"}, "version_group": 1,
     "implies": ["Java"]},

    # ---- programming-language --------------------------------------------
    {"name": "PHP", "category": "programming-language",
     "headers": {"x-powered-by": r"PHP(?:/([\d.]+))?"},
     "cookies": r"^PHPSESSID$", "version_group": 1},
    {"name": "ASP.NET", "category": "programming-language",
     "headers": {"x-aspnet-version": r"([\d.]+)", "x-powered-by": r"ASP\.NET"},
     "cookies": r"^ASP\.NET_SessionId$", "version_group": 1},
    {"name": "Java", "category": "programming-language",
     "cookies": r"^JSESSIONID$"},
    {"name": "Python", "category": "programming-language",
     "headers": {"x-powered-by": r"Python(?:/([\d.]+))?"}, "version_group": 1},
    {"name": "Ruby", "category": "programming-language",
     "headers": {"x-powered-by": r"Phusion Passenger|mod_rails"}},

    # ---- web-framework ----------------------------------------------------
    {"name": "Express", "category": "web-framework",
     "headers": {"x-powered-by": r"Express"}, "implies": ["Node.js"]},
    {"name": "Django", "category": "web-framework",
     "cookies": r"^csrftoken$|^django", "implies": ["Python"]},
    {"name": "Flask", "category": "web-framework",
     "cookies": r"^session$", "headers": {"server": r"Werkzeug"},
     "implies": ["Python"]},
    {"name": "Laravel", "category": "web-framework",
     "cookies": r"^laravel_session$|^XSRF-TOKEN$", "implies": ["PHP"]},
    # X-Runtime carries the request duration in seconds (e.g. "0.012345"),
    # not a framework version, so no version is extracted from it.
    {"name": "Ruby on Rails", "category": "web-framework",
     "headers": {"x-runtime": r"[\d.]+"},
     "cookies": r"_session_id$|^_rails", "implies": ["Ruby"]},
    {"name": "ASP.NET MVC", "category": "web-framework",
     "headers": {"x-aspnetmvc-version": r"([\d.]+)"}, "version_group": 1,
     "implies": ["ASP.NET"]},
    {"name": "Next.js", "category": "web-framework",
     "html": r"id=[\"']__NEXT_DATA__[\"']|/_next/static/",
     "headers": {"x-powered-by": r"Next\.js"}, "implies": ["React", "Node.js"]},
    {"name": "Nuxt.js", "category": "web-framework",
     "html": r"window\.__NUXT__|/_nuxt/", "implies": ["Vue.js"]},

    # ---- cms --------------------------------------------------------------
    {"name": "WordPress", "category": "cms",
     "html": r"wp-content|wp-includes",
     "meta_generator": r"WordPress(?:[ /]?([\d.]+))?",
     "cookies": r"^wordpress_|^wp-settings",
     "script_src": r"/wp-includes/js/", "version_group": 1, "implies": ["PHP"]},
    {"name": "Drupal", "category": "cms",
     "headers": {"x-generator": r"Drupal(?:[ /]?([\d.]+))?"},
     "html": r"Drupal\.settings|sites/(?:all|default)/(?:themes|modules)",
     "meta_generator": r"Drupal(?:[ /]?([\d.]+))?", "version_group": 1,
     "implies": ["PHP"]},
    {"name": "Joomla", "category": "cms",
     "meta_generator": r"Joomla!?(?:[ /]?([\d.]+))?",
     "html": r"/media/jui/|com_content", "version_group": 1, "implies": ["PHP"]},
    {"name": "Ghost", "category": "cms",
     "meta_generator": r"Ghost(?:[ /]?([\d.]+))?",
     "html": r"content=[\"']Ghost", "version_group": 1, "implies": ["Node.js"]},
    {"name": "Wix", "category": "cms",
     "headers": {"x-wix-request-id": r".+"},
     "html": r"static\.wixstatic\.com|X-Wix"},
    {"name": "Squarespace", "category": "cms",
     "html": r"static\.squarespace\.com|squarespace\.com",
     "meta_generator": r"Squarespace"},

    # ---- js-framework -----------------------------------------------------
    {"name": "React", "category": "js-framework",
     "html": r"data-reactroot|data-reactid|__REACT_DEVTOOLS"},
    {"name": "Vue.js", "category": "js-framework",
     "html": r"data-v-[0-9a-f]{6,}|__VUE__|id=[\"']app[\"'][^>]*data-v-"},
    {"name": "Angular", "category": "js-framework",
     "html": r"ng-version=[\"']([\d.]+)[\"']|ng-app|<app-root", "version_group": 1},
    {"name": "Svelte", "category": "js-framework",
     "html": r"svelte-[0-9a-z]{6,}|__svelte"},
    {"name": "Ember.js", "category": "js-framework",
     "html": r"ember-application|id=[\"']ember"},
    {"name": "Backbone.js", "category": "js-framework",
     "script_src": r"backbone(?:\.min)?\.js"},

    # ---- js-library -------------------------------------------------------
    {"name": "jQuery", "category": "js-library",
     "script_src": r"jquery[.-]?([\d.]+)?(?:\.min)?\.js", "version_group": 1},
    {"name": "Bootstrap", "category": "js-library",
     "script_src": r"bootstrap[.-]?([\d.]+)?(?:\.min)?\.js",
     "html": r"class=[\"'][^\"']*\b(?:container-fluid|navbar-toggler|col-md-)\b",
     "version_group": 1},
    {"name": "Lodash", "category": "js-library",
     "script_src": r"lodash(?:\.min)?\.js"},
    {"name": "Underscore.js", "category": "js-library",
     "script_src": r"underscore(?:\.min)?\.js"},
    {"name": "Modernizr", "category": "js-library",
     "script_src": r"modernizr(?:[.-]?[\d.]+)?(?:\.min)?\.js",
     "html": r"class=[\"'][^\"']*\bjs\b[^\"']*\bno-js\b"},
    {"name": "Moment.js", "category": "js-library",
     "script_src": r"moment(?:\.min)?\.js"},

    # ---- analytics / tag --------------------------------------------------
    {"name": "Google Analytics", "category": "analytics",
     "html": r"google-analytics\.com/(?:ga|analytics)\.js|gtag\(|googletagmanager\.com/gtag/js",
     "script_src": r"google-analytics\.com|googletagmanager\.com/gtag"},
    {"name": "Google Tag Manager", "category": "analytics",
     "html": r"googletagmanager\.com/gtm\.js|GTM-[A-Z0-9]+",
     "script_src": r"googletagmanager\.com/gtm\.js"},
    {"name": "Facebook Pixel", "category": "analytics",
     "html": r"connect\.facebook\.net/[^/]+/fbevents\.js|fbq\("},
    {"name": "Hotjar", "category": "analytics",
     "html": r"static\.hotjar\.com|hotjar\.com/c/hotjar|hjid:",
     "script_src": r"static\.hotjar\.com"},
    {"name": "Matomo", "category": "analytics",
     "html": r"matomo\.js|piwik\.js|_paq\.push",
     "script_src": r"matomo\.js|piwik\.js"},

    # ---- cdn --------------------------------------------------------------
    {"name": "Cloudflare", "category": "cdn",
     "headers": {"cf-ray": r".+", "server": r"cloudflare"}},
    {"name": "Fastly", "category": "cdn",
     "headers": {"x-served-by": r"cache-.+|.+fastly.*", "x-fastly-request-id": r".+",
                 "via": r".*Fastly.*"}},
    {"name": "Akamai", "category": "cdn",
     "headers": {"x-akamai-transformed": r".+", "server": r"AkamaiGHost"}},
    {"name": "Amazon CloudFront", "category": "cdn",
     "headers": {"x-amz-cf-id": r".+", "via": r".*CloudFront.*",
                 "x-cache": r".*cloudfront.*"}},
    {"name": "jsDelivr", "category": "cdn",
     "script_src": r"cdn\.jsdelivr\.net"},
    {"name": "unpkg", "category": "cdn",
     "script_src": r"unpkg\.com"},

    # ---- ecommerce --------------------------------------------------------
    {"name": "Shopify", "category": "ecommerce",
     "headers": {"x-shopify-stage": r".+", "x-sorting-hat-shopid": r".+"},
     "html": r"cdn\.shopify\.com|Shopify\.theme|shopify\.com/s/",
     "script_src": r"cdn\.shopify\.com"},
    {"name": "WooCommerce", "category": "ecommerce",
     "html": r"woocommerce|wc-(?:cart|checkout)|class=[\"'][^\"']*woocommerce",
     "cookies": r"^woocommerce_", "implies": ["WordPress", "PHP"]},
    {"name": "Magento", "category": "ecommerce",
     "html": r"Mage\.Cookies|/skin/frontend/|Magento_",
     "cookies": r"^frontend$|^X-Magento", "implies": ["PHP"]},
    {"name": "PrestaShop", "category": "ecommerce",
     "html": r"prestashop|/modules/.*prestashop",
     "meta_generator": r"PrestaShop", "cookies": r"^PrestaShop-",
     "implies": ["PHP"]},

    # ---- security / waf ---------------------------------------------------
    {"name": "Sucuri", "category": "security",
     "headers": {"x-sucuri-id": r".+", "x-sucuri-cache": r".+",
                 "server": r"Sucuri/Cloudproxy"}},
    {"name": "Imperva Incapsula", "category": "security",
     "headers": {"x-iinfo": r".+", "x-cdn": r"Incapsula"},
     "cookies": r"^(?:incap_ses|visid_incap)"},
    {"name": "Cloudflare WAF", "category": "security",
     "cookies": r"^__cf(?:duid|_bm)$|^cf_clearance$"},

    # ---- runtime ----------------------------------------------------------
    {"name": "Node.js", "category": "programming-language",
     "headers": {"x-powered-by": r"Express|Node"}},
]
|
|
|
|
|
|
def _compile_signatures(signatures):
|
|
"""Compila los regex de cada firma a nivel modulo (una sola vez).
|
|
|
|
Devuelve una lista paralela a `signatures` donde cada matcher textual ha
|
|
sido reemplazado por un patron compilado con re.IGNORECASE. Es una
|
|
transformacion pura sobre la constante; no muta `signatures`.
|
|
"""
|
|
compiled = []
|
|
for sig in signatures:
|
|
c = {"name": sig["name"], "category": sig["category"]}
|
|
if "version_group" in sig:
|
|
c["version_group"] = sig["version_group"]
|
|
if "implies" in sig:
|
|
c["implies"] = list(sig["implies"])
|
|
if "headers" in sig:
|
|
c["headers"] = {
|
|
k.lower(): re.compile(v, re.IGNORECASE)
|
|
for k, v in sig["headers"].items()
|
|
}
|
|
for key in ("html", "meta_generator", "cookies", "script_src", "url"):
|
|
if key in sig:
|
|
c[key] = re.compile(sig[key], re.IGNORECASE)
|
|
compiled.append(c)
|
|
return compiled
|
|
|
|
|
|
# Signature regexes compiled once at module import; a constant that is treated
# as immutable in practice.
_COMPILED = _compile_signatures(SIGNATURES)
|
|
|
|
# Regex auxiliares para extraer senales del HTML (compilados una vez).
|
|
_META_GENERATOR_RE = re.compile(
|
|
r"<meta[^>]+name=[\"']generator[\"'][^>]+content=[\"']([^\"']*)[\"']",
|
|
re.IGNORECASE,
|
|
)
|
|
_SCRIPT_SRC_RE = re.compile(
|
|
r"<script[^>]+src=[\"']([^\"']+)[\"']", re.IGNORECASE
|
|
)
|
|
|
|
|
|
def _version_from(match, version_group):
|
|
"""Extrae la version de un match dado el group, best-effort.
|
|
|
|
Devuelve "" si no hay group, el group esta vacio o el indice no existe.
|
|
"""
|
|
if not match or not version_group:
|
|
return ""
|
|
try:
|
|
v = match.group(version_group)
|
|
except (IndexError, re.error):
|
|
return ""
|
|
return v or ""
|
|
|
|
|
|
def _coerce_text(value):
    """Return `value` as text: None -> "", bytes decoded leniently, else str()."""
    if value is None:
        return ""
    if isinstance(value, (bytes, bytearray)):
        return bytes(value).decode("utf-8", errors="replace")
    return str(value)


def _normalize_headers(headers):
    """Return a {lowercase-name: text-value} dict, dropping None values."""
    if not headers:
        return {}
    if not hasattr(headers, "items"):
        # Some HTTP clients expose headers as a sequence of (name, value) pairs.
        try:
            headers = dict(headers)
        except (TypeError, ValueError):
            return {}
    # A None value means "header absent"; stringifying it to "None" would make
    # presence-only signatures (regex `.+`) fire spuriously.
    return {
        _coerce_text(key).lower(): _coerce_text(value)
        for key, value in headers.items()
        if value is not None
    }


def _normalize_cookies(cookies):
    """Return the cookie names as a list of strings, dropping None entries."""
    if not cookies:
        return []
    if isinstance(cookies, (str, bytes, bytearray)):
        # A lone name must not be iterated character by character.
        cookies = [cookies]
    return [_coerce_text(name) for name in cookies if name is not None]


def _match_signature(sig, headers, meta_generators, cookies, final_url,
                     script_srcs, html):
    """Return (version, confidence, evidence) for the first matcher that fires.

    Matchers are tried from most to least specific: headers, meta generator,
    cookies and URL ("high" confidence), then script src and generic HTML
    ("medium"). Returns None when nothing in `sig` matches.
    """
    vgroup = sig.get("version_group", 0)

    for hkey, hre in sig.get("headers", {}).items():
        val = headers.get(hkey)
        if val is None:
            continue
        m = hre.search(val)
        if m:
            return _version_from(m, vgroup), "high", f"header {hkey}: {val}"

    if "meta_generator" in sig:
        for gen in meta_generators:
            m = sig["meta_generator"].search(gen)
            if m:
                return _version_from(m, vgroup), "high", f"meta generator: {gen}"

    if "cookies" in sig:
        for ck in cookies:
            # Cookie names never carry a version, so none is extracted.
            if sig["cookies"].search(ck):
                return "", "high", f"cookie: {ck}"

    if final_url and "url" in sig:
        m = sig["url"].search(final_url)
        if m:
            return _version_from(m, vgroup), "high", f"url: {final_url}"

    if "script_src" in sig:
        for src in script_srcs:
            m = sig["script_src"].search(src)
            if m:
                return _version_from(m, vgroup), "medium", f"script src: {src}"

    if html and "html" in sig:
        m = sig["html"].search(html)
        if m:
            return _version_from(m, vgroup), "medium", "html pattern"

    return None


def detect_web_tech(headers, html="", cookies=None, final_url=""):
    """Detect web technologies from the signals of an HTTP fetch.

    PURE piece of a Wappalyzer-style detector: it matches the embedded regex
    signature table against the response headers, the HTML, the cookie names
    and the final URL that were already collected by the impure sibling layer
    (`fetch_http_fingerprint_py_cybersecurity`). It never touches the network
    and performs no I/O.

    Args:
        headers: mapping of response headers (a sequence of (name, value)
            pairs is also accepted). Keys are expected in lowercase, as
            returned by fetch_http_fingerprint in its `headers` field, but are
            lower-cased internally anyway. Values are converted to text;
            entries whose value is None are ignored.
        html: page HTML as a string (bytes are decoded as UTF-8 with
            replacement). Default "" (detection by headers and cookies only).
        cookies: iterable of cookie NAMES (not values). Default None -> [].
            None entries are ignored; other entries are converted to text.
        final_url: final URL after redirects (for host/path based
            signatures). Optional, default "".

    Returns:
        dict with:
          - "technologies": list of dicts
            {name, category, version, confidence, evidence}, sorted
            deterministically by (category, name).
          - "by_category": dict category -> sorted list of names.
          - "count": number of detected technologies.

        For empty input (headers={}, html="") it returns
        {"technologies": [], "by_category": {}, "count": 0}.
    """
    headers = _normalize_headers(headers)
    cookies = _normalize_cookies(cookies)
    html = _coerce_text(html or "")
    final_url = _coerce_text(final_url or "")

    # Derived HTML signals are extracted once and shared by every signature.
    meta_generators = _META_GENERATOR_RE.findall(html)
    script_srcs = _SCRIPT_SRC_RE.findall(html)

    # name -> accumulated detection record.
    detected = {}

    def _record(name, category, version, confidence, evidence):
        prev = detected.get(name)
        if prev is None:
            detected[name] = {
                "name": name,
                "category": category,
                "version": version or "",
                "confidence": confidence,
                "evidence": evidence,
            }
            return
        # Duplicate name: merge. The first non-empty version is kept, and a
        # "high" detection upgrades an earlier "medium" one (with its evidence).
        if not prev["version"] and version:
            prev["version"] = version
        if confidence == "high" and prev["confidence"] != "high":
            prev["confidence"] = "high"
            prev["evidence"] = evidence

    for sig in _COMPILED:
        hit = _match_signature(
            sig, headers, meta_generators, cookies, final_url, script_srcs, html
        )
        if hit is not None:
            version, confidence, evidence = hit
            _record(sig["name"], sig["category"], version, confidence, evidence)

    # Implied technologies ("medium" confidence) are added only when they were
    # not already detected. The table is walked once in order, so an implied
    # technology contributes its own `implies` only if its signature appears
    # later in the table.
    catalog = {sig["name"]: sig["category"] for sig in _COMPILED}
    for sig in _COMPILED:
        if sig["name"] not in detected or "implies" not in sig:
            continue
        for imp_name in sig["implies"]:
            if imp_name in detected:
                continue
            _record(imp_name, catalog.get(imp_name, "unknown"), "", "medium",
                    f"implied by {sig['name']}")

    technologies = sorted(
        detected.values(), key=lambda t: (t["category"], t["name"])
    )

    by_category = {}
    for tech in technologies:
        by_category.setdefault(tech["category"], []).append(tech["name"])

    return {
        "technologies": technologies,
        "by_category": by_category,
        "count": len(technologies),
    }
|