# fn_registry/python/functions/cybersecurity/fetch_http_fingerprint.py

"""GET HTTP(S) que recoge senales crudas para fingerprint de tecnologia web.

Funcion IMPURA: hace una peticion HTTP(S) GET a una URL con User-Agent de
navegador, sigue redirects y recoge TODAS las senales utiles para identificar
el stack tecnologico del sitio (estilo Wappalyzer): cabeceras de respuesta
normalizadas (lowercase), nombres de cookies, el HTML, el titulo y la cadena
del servidor. Es la capa de RECOLECCION del fingerprinting web; el MATCHING de
firmas vive en una funcion pura aparte (`detect_web_tech_py_cybersecurity`)
que consume exactamente lo que esta devuelve.

Devuelve siempre un dict (estilo del grupo recon): nunca lanza excepciones.
Un 403/500 sigue siendo senal util de fingerprint, asi que un HTTPError se
captura y se devuelve con su status_code real, headers y body.

SEGURIDAD: en `cookies` solo se guardan los NOMBRES de las cookies, jamas los
valores (un Set-Cookie lleva tokens de sesion sensibles).

Solo usa stdlib (urllib, ssl, socket, re, gzip, zlib).
"""

import gzip
import re
import socket
import ssl
import urllib.error
import urllib.request
import zlib

# Browser-like User-Agent, used when the caller does not supply one.
_DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
# First <title>...</title> element; DOTALL lets the captured text span lines.
_TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
# charset parameter of a Content-Type value. The value may legally be sent as
# a quoted-string (charset="utf-8"), so an optional opening quote is skipped
# and only the bare charset name is captured.
_CHARSET_RE = re.compile(r"charset=[\"']?([\w-]+)", re.IGNORECASE)
# Cookie name at the start of a Set-Cookie value: everything before the first
# "=", with no whitespace or ";" inside. The value is never captured.
_COOKIE_NAME_RE = re.compile(r"^\s*([^=;\s]+)=")


def _inflate(data: bytes, wbits: int) -> bytes:
    """Inflate `data` with zlib, keeping whatever prefix could be decoded.

    A streaming decompressor is used so that a stream cut short (for example
    because only the first N compressed bytes were read) still yields the
    output decoded so far instead of raising. Concatenated members (multi-member
    gzip) are decoded one after another.

    Args:
        data: Compressed bytes.
        wbits: zlib window/format selector (gzip, zlib-wrapped or raw deflate).

    Returns:
        The decompressed bytes, possibly only a prefix if `data` is truncated.

    Raises:
        zlib.error: If the very first member cannot be decoded at all.
    """
    chunks: list[bytes] = []
    while data:
        inflater = zlib.decompressobj(wbits)
        try:
            chunks.append(inflater.decompress(data))
        except zlib.error:
            if not chunks:
                raise
            # Trailing padding/garbage after a valid member: keep what we have.
            break
        if not inflater.eof:
            # Input ran out before the end-of-stream marker: truncated body.
            break
        data = inflater.unused_data
    return b"".join(chunks)


def _decompress(body: bytes, encoding: str) -> bytes:
    """Decompress `body` according to its Content-Encoding. Best-effort.

    Handles gzip and deflate (with or without the zlib wrapper). Never raises:
    a truncated stream returns the prefix that could be decoded, and a body
    that cannot be decoded at all is returned unchanged.

    Args:
        body: Raw response body bytes.
        encoding: Content-Encoding header value; None or "" means identity.

    Returns:
        The decompressed bytes, or `body` itself when the encoding is not
        gzip/deflate or decompression fails.
    """
    enc = (encoding or "").lower()
    try:
        if "gzip" in enc:
            # MAX_WBITS | 16 selects the gzip container format.
            return _inflate(body, zlib.MAX_WBITS | 16)
        if "deflate" in enc:
            # "deflate" may arrive with or without the zlib header.
            try:
                return _inflate(body, zlib.MAX_WBITS)
            except zlib.error:
                return _inflate(body, -zlib.MAX_WBITS)
    except zlib.error:
        # Undecodable data: hand back the raw body (something beats nothing).
        return body
    return body


def _decode_html(body: bytes, content_type: str) -> str:
    """Decode the HTML body best-effort.

    Candidate encodings are tried in order: the charset declared in
    `content_type` (if any), then utf-8, then latin-1. Each is first tried
    strictly on the whole body. If that fails, the body is retried with an
    incremental decoder that tolerates an incomplete multi-byte sequence at
    the very end, so a body cut at a byte limit in the middle of a character
    keeps its encoding instead of degrading to latin-1.

    Args:
        body: Response body bytes (already decompressed).
        content_type: Content-Type header value; None or "" if absent.

    Returns:
        The decoded text. Never raises for decoding problems.
    """
    import codecs  # local import: only needed for the truncated-tail fallback

    charset = None
    m = _CHARSET_RE.search(content_type or "")
    if m:
        charset = m.group(1).strip()
    for enc in (charset, "utf-8", "latin-1"):
        if not enc:
            continue
        try:
            return body.decode(enc, errors="strict")
        except LookupError:
            # Unknown charset name declared by the server.
            continue
        except UnicodeDecodeError:
            # final=False makes the decoder buffer (rather than reject) an
            # incomplete trailing sequence; genuinely invalid bytes still raise.
            try:
                decoder = codecs.getincrementaldecoder(enc)(errors="strict")
                partial = decoder.decode(body, final=False)
            except (LookupError, UnicodeDecodeError):
                continue
            if partial:
                return partial
    # latin-1 maps every byte, so this is only an explicit last resort.
    return body.decode("latin-1", errors="replace")


def _extract_title(html: str) -> str | None:
    """Extrae el contenido de <title> best-effort, colapsando espacios."""
    m = _TITLE_RE.search(html)
    if not m:
        return None
    title = re.sub(r"\s+", " ", m.group(1)).strip()
    return title or None


def _cookie_names(set_cookie_values: list[str]) -> list[str]:
    """Return only the cookie NAMES found in raw Set-Cookie values.

    Values are never returned. Names are de-duplicated and kept in order of
    first appearance; entries that do not start with ``name=`` are skipped.
    """
    hits = (_COOKIE_NAME_RE.match(value or "") for value in set_cookie_values)
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    return list(dict.fromkeys(hit.group(1) for hit in hits if hit is not None))


def _normalize_headers(headers) -> tuple[dict, list[str]]:
    """Lower-case header names and split out the raw Set-Cookie values.

    When a header repeats, the last value wins. Set-Cookie is the exception:
    every occurrence is collected separately so all cookie names can be
    extracted later.

    Args:
        headers: Any object whose ``.items()`` yields (name, value) pairs,
            repeated names included.

    Returns:
        ``(headers_dict, raw_set_cookie_values)``.
    """
    lowered = [(name.lower(), value) for name, value in headers.items()]
    raw_cookies = [value for name, value in lowered if name == "set-cookie"]
    # Later duplicates overwrite earlier ones while keeping the first position.
    plain = {name: value for name, value in lowered if name != "set-cookie"}
    return plain, raw_cookies


def _build_raw(status_line: str, headers: dict, cookie_names: list[str]) -> str:
    """Render a readable evidence block: status line, headers, cookie names.

    Headers are listed sorted by name, one ``name: value`` per line. The HTML
    body is left out (it can be large), as are cookie values (sensitive).
    """
    parts = [status_line, *(f"{name}: {headers[name]}" for name in sorted(headers))]
    if cookie_names:
        parts.append("set-cookie-names: " + ", ".join(cookie_names))
    return "\n".join(parts)


def _do_get(
    url: str,
    timeout_s: float,
    verify_tls: bool,
    max_html_bytes: int,
    ua: str,
) -> dict:
    """Perform a single GET against `url` and build the success dict.

    An HTTP error status (403/404/500...) is treated as a valid response: its
    status code, headers and body are collected like any other. The response
    object is always closed, including when reading the body fails.

    Args:
        url: Absolute URL to request.
        timeout_s: Timeout in seconds passed to ``urlopen``.
        verify_tls: When False, certificate and hostname checks are disabled.
        max_html_bytes: Upper bound on the body bytes kept; negative values
            are treated as 0.
        ua: User-Agent header value to send.

    Returns:
        dict with keys ``status`` ("ok"), ``url``, ``final_url``,
        ``status_code``, ``headers``, ``cookies`` (names only), ``title``,
        ``server``, ``html``, ``html_len`` and ``raw``.

    Raises:
        Whatever ``urlopen``/``read`` raise for failures other than an HTTP
        error status (e.g. ``urllib.error.URLError``, ``OSError``,
        ``ValueError``); the caller is expected to handle them.
    """
    limit = max(0, max_html_bytes)
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": ua,
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,*/*;q=0.8"
            ),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate",
        },
        method="GET",
    )
    if verify_tls:
        context = None
    else:
        # Public-API equivalent of an unverified context. check_hostname must
        # be switched off before verify_mode can be set to CERT_NONE.
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE

    try:
        # `with` guarantees the connection is released even if read() raises.
        with urllib.request.urlopen(req, timeout=timeout_s, context=context) as resp:
            status_code = resp.getcode()
            final_url = resp.geturl()
            resp_headers = resp.headers
            # One extra byte is requested on top of the limit.
            body = resp.read(limit + 1)
    except urllib.error.HTTPError as e:
        # An HTTP error (403/404/500...) is STILL a useful fingerprint signal:
        # it has headers and often a body. Treat it as a valid response.
        try:
            status_code = e.code
            final_url = e.geturl() or url
            resp_headers = e.headers
            body = e.read(limit + 1) if e.fp is not None else b""
        finally:
            # HTTPError wraps the open response; close it to avoid a leak.
            if e.fp is not None:
                e.close()

    headers, set_cookie_raw = _normalize_headers(
        resp_headers if resp_headers is not None else {}
    )
    cookie_names = _cookie_names(set_cookie_raw)

    content_encoding = headers.get("content-encoding", "")
    body = _decompress(body, content_encoding)
    # Decompression can expand past the limit, so cap again afterwards.
    if len(body) > limit:
        body = body[:limit]

    html = _decode_html(body, headers.get("content-type", ""))
    title = _extract_title(html)
    server = headers.get("server")

    status_line = f"HTTP {status_code} {final_url}"
    raw = _build_raw(status_line, headers, cookie_names)

    return {
        "status": "ok",
        "url": url,
        "final_url": final_url,
        "status_code": status_code,
        "headers": headers,
        "cookies": cookie_names,
        "title": title,
        "server": server,
        "html": html,
        "html_len": len(html),
        "raw": raw,
    }


def fetch_http_fingerprint(
    url: str,
    timeout_s: float = 15.0,
    verify_tls: bool = True,
    max_html_bytes: int = 500_000,
    user_agent: str | None = None,
) -> dict:
    """GET HTTP(S) que recoge senales crudas para fingerprint de tecnologia web.

    Funcion IMPURA: hace red. Manda un GET con User-Agent de navegador, sigue
    redirects (urllib los sigue por defecto) y recoge headers normalizados,
    nombres de cookies, HTML, titulo y servidor. Nunca lanza: cualquier fallo
    de red total devuelve ``{"status": "error", ...}``. Un error HTTP
    (403/500...) se devuelve como ``status: ok`` con su ``status_code`` real,
    porque sigue siendo senal de fingerprint.

    Si `url` no trae esquema, asume ``https://`` y, si la conexion HTTPS falla,
    reintenta con ``http://``.

    Args:
        url: URL objetivo. Sin esquema se asume https:// (fallback a http://).
        timeout_s: Timeout de la peticion en segundos. Default 15.0.
        verify_tls: Si False, crea un ssl context sin verificacion (inseguro,
            solo para recon de hosts propios con cert self-signed). Default True.
        max_html_bytes: Corta el HTML leido a este tamano para no descargar
            megas. Default 500_000 (500 KB).
        user_agent: User-Agent a enviar. Default un UA realista de Chrome.

    Returns:
        dict. En exito::

            {
                "status": "ok",
                "url": <url solicitada>,
                "final_url": <url tras redirects>,
                "status_code": int,
                "headers": {clave_lower: valor_str, ...},  # ultimo valor si repetido
                "cookies": [<nombre_cookie>, ...],          # SOLO nombres, nunca valores
                "title": str | None,
                "server": str | None,                       # atajo a headers["server"]
                "html": str,                                # cortado a max_html_bytes
                "html_len": int,
                "raw": str,                                 # status + headers (sin html)
            }

        En error de red total (host no resuelve / conexion rechazada / timeout)::

            {"status": "error", "error": "<mensaje>", "url": <url>}

    SEGURIDAD: `cookies` lleva SOLO los nombres de las cookies de Set-Cookie,
    jamas los valores (que contienen tokens de sesion).
    """
    if not url or not url.strip():
        return {"status": "error", "error": "fetch_http_fingerprint: url vacia", "url": url}

    url = url.strip()
    ua = user_agent or _DEFAULT_UA

    # Construye la lista de URLs a intentar: si no hay esquema, https:// y luego
    # http:// como fallback. Si ya trae esquema, solo esa.
    if "://" in url:
        candidates = [url]
    else:
        candidates = ["https://" + url, "http://" + url]

    last_error: str | None = None
    for candidate in candidates:
        try:
            return _do_get(candidate, timeout_s, verify_tls, max_html_bytes, ua)
        except urllib.error.URLError as e:
            reason = getattr(e, "reason", e)
            last_error = f"{candidate}: {reason}"
        except socket.timeout:
            last_error = f"{candidate}: timeout tras {timeout_s}s"
        except ssl.SSLError as e:
            last_error = f"{candidate}: SSL error: {e}"
        except (OSError, ValueError) as e:  # conexion rechazada, URL invalida, etc.
            last_error = f"{candidate}: {e}"

    return {
        "status": "error",
        "error": f"fetch_http_fingerprint: {last_error or 'fallo desconocido'}",
        "url": url,
    }


if __name__ == "__main__":
    # Best-effort smoke test against a public site; a missing network is
    # tolerated and only reported.
    result = fetch_http_fingerprint("https://example.com")
    print("status:", result["status"])
    if result["status"] != "ok":
        print("  (red no disponible, tolerado):", result["error"])
    else:
        for field in ("final_url", "status_code", "server", "title", "cookies", "html_len"):
            print(f"  {field}:", result[field])