# fn_registry/python/functions/datascience/scrape_competitor_prices.py

"""Scrape current prices for a list of competitor product pages.

Watches competitor pricing: given a list of targets (product URL + competitor),
fetches each page and extracts the current price using a cascade of strategies
(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
id/snapshot_date/scraped_at columns).
"""

import json
import logging
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

# Desktop Chrome-on-Linux User-Agent string sent with every page fetch.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

# Browser-like headers attached to every request made by _fetch_html.
_REQUEST_HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,*/*;q=0.8"
    ),
    # Prefer Spanish (Spain) content, then any Spanish, then English.
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    # Ask for an uncompressed body: _fetch_html decodes the raw bytes directly
    # and performs no decompression.
    "Accept-Encoding": "identity",
    # One request per connection; no keep-alive is requested.
    "Connection": "close",
}

# Substrings that, when present, signal the product is NOT available.
# _detect_in_stock lower-cases the page text before the substring test, so
# every marker here must be written in lower case. Spanish and English phrases
# are both covered. Note that "unavailable" already matches any text that
# contains "currently unavailable".
_OUT_OF_STOCK_MARKERS = (
    "agotado",
    "sin stock",
    "sin existencias",
    "no disponible",
    "out of stock",
    "sold out",
    "unavailable",
    "currently unavailable",
)

# Common class/attribute patterns used by mainstream e-commerce templates.
# _extract_price tries these CSS selectors in the order listed, as its last
# strategy, and stops at the first one that yields a parseable price. The
# entries run from specific to broad; "[class*=price]" matches any element
# whose class attribute contains "price" and is therefore kept last.
_PRICE_HEURISTIC_SELECTORS = (
    "[itemprop=price]",
    "[data-price]",
    "[data-product-price]",
    ".price",
    ".product-price",
    ".price--current",
    ".current-price",
    ".sale-price",
    ".a-price .a-offscreen",
    "[class*=price]",
)

# A token that looks like a price: optional currency symbol/code, optional
# whitespace, the number, optional whitespace, optional trailing currency.
# Captured group 1 is the numeric part only (separators included).
#
# The numeric part has two alternatives:
#   1. Grouped: 1-3 digits followed by one or more "separator + exactly three
#      digits" groups (separator is dot, comma or whitespace), then optionally
#      a dot/comma and 1-2 decimal digits (e.g. 1.299,99 or 1,299.99).
#   2. Plain: contiguous digits with an optional dot/comma and 1-2 decimal
#      digits (e.g. 1299.99, 29,90).
# The grouped alternative must come first: on "1.299,99" the plain alternative
# would succeed early and capture only "1.29".
#
# re.IGNORECASE lets the currency codes match in any letter case. For str
# patterns \s matches any Unicode whitespace, so the captured token may
# contain non-ASCII spaces (e.g. a no-break space) as group separators.
_PRICE_NUMBER_RE = re.compile(
    r"(?:[€$£]|EUR|USD|GBP)?\s*"
    r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
    r"\s*(?:[€$£]|EUR|USD|GBP)?",
    re.IGNORECASE,
)


def _fetch_html(url: str, timeout: float = 15.0) -> str:
    """Download a page and return its body as text.

    The request carries the module's browser-like headers and is attempted at
    most twice. The body is decoded with the charset announced in the response
    headers (UTF-8 when none is announced or the announced codec is unknown);
    undecodable bytes are replaced instead of raising.

    Args:
        url: Address to fetch.
        timeout: Timeout handed to ``urlopen`` for each attempt.

    Returns:
        The decoded response body.

    Raises:
        Exception: The error raised by the second attempt when both fail.
    """
    failure: Exception | None = None
    remaining = 2
    while remaining > 0:
        remaining -= 1
        try:
            request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
            with urllib.request.urlopen(request, timeout=timeout) as response:
                body = response.read()
            encoding = response.headers.get_content_charset() or "utf-8"
            try:
                return body.decode(encoding, errors="replace")
            except (LookupError, UnicodeDecodeError):
                # The header named a codec Python does not know: use UTF-8.
                return body.decode("utf-8", errors="replace")
        except Exception as exc:  # noqa: BLE001 - retry on any transport error
            failure = exc
    if failure is None:
        raise RuntimeError("fetch failed")
    raise failure


def _normalize_price(raw) -> float | None:
    """Normalize a price token to float, tolerating comma/dot and symbols.

    Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90" etc.
    Returns None if no numeric value can be parsed.
    """
    if raw is None:
        return None
    if isinstance(raw, (int, float)):
        try:
            return float(raw)
        except (ValueError, TypeError):
            return None

    text = str(raw).strip()
    if not text:
        return None

    match = _PRICE_NUMBER_RE.search(text)
    if not match:
        return None

    num = match.group(1).strip().replace(" ", "")

    last_comma = num.rfind(",")
    last_dot = num.rfind(".")

    if last_comma != -1 and last_dot != -1:
        # The right-most separator is the decimal separator.
        if last_comma > last_dot:
            # European: 1.299,99 -> dots are thousands, comma is decimal.
            num = num.replace(".", "").replace(",", ".")
        else:
            # US: 1,299.99 -> commas are thousands, dot is decimal.
            num = num.replace(",", "")
    elif last_comma != -1:
        # Only commas present. Decimal if it looks like "29,90"; else thousands.
        if len(num) - last_comma - 1 == 2:
            num = num.replace(",", ".")
        else:
            num = num.replace(",", "")
    # Only dots (or none): assume dot is already decimal / no separators.

    try:
        return float(num)
    except ValueError:
        return None


def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
    """Read a price from the first element matching ``selector``.

    Price-bearing attributes on the element are consulted first, in a fixed
    order; the element's visible text is the fallback. A selector that the
    parser rejects, or that matches nothing, yields None.
    """
    try:
        element = soup.select_one(selector)
    except Exception:  # noqa: BLE001 - invalid selector should not abort
        element = None
    if element is None:
        return None

    attribute_names = ("content", "data-price", "data-product-price", "value")
    for name in attribute_names:
        if not element.has_attr(name):
            continue
        parsed = _normalize_price(element.get(name))
        if parsed is not None:
            return parsed

    # No usable attribute: parse whatever text the element renders.
    visible_text = element.get_text(" ", strip=True)
    return _normalize_price(visible_text)


def _iter_json_ld_prices(soup: BeautifulSoup):
    """Yield raw ``price`` values found in the page's JSON-LD script blocks.

    Every ``<script type="application/ld+json">`` payload that parses as JSON
    is walked in full. For each object encountered, the ``price`` of each of
    its ``offers`` entries is yielded, followed by the object's own ``price``
    when it has one and its ``offers`` value is not a dict or list. Values are
    yielded as found (not normalized); blocks that are empty or not valid
    JSON are skipped.
    """
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw_json = script.string or script.get_text()
        if not raw_json:
            continue
        try:
            document = json.loads(raw_json)
        except (ValueError, TypeError):
            continue

        for entry in _walk_json(document):
            if not isinstance(entry, dict):
                continue
            offers = entry.get("offers")
            yield from (
                offer.get("price")
                for offer in _as_list(offers)
                if isinstance(offer, dict) and "price" in offer
            )
            # Some schemas place price directly on the node.
            has_nested_offers = isinstance(offers, (dict, list))
            if "price" in entry and not has_nested_offers:
                yield entry.get("price")


def _walk_json(node):
    """Yield every dict inside a nested JSON value, parents before children.

    Dict values and list items are visited in their stored order; anything
    that is neither a dict nor a list is ignored.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            yield current
            # Push children reversed so the first child is popped first.
            pending.extend(reversed(current.values()))
        elif isinstance(current, list):
            pending.extend(reversed(current))


def _as_list(value):
    """Return ``value`` as a list: lists pass through, None becomes empty."""
    if isinstance(value, list):
        return value
    return [] if value is None else [value]


def _extract_from_meta(soup: BeautifulSoup) -> float | None:
    """Return the first parseable price found in well-known ``<meta>`` tags.

    The lookups are tried in priority order; for each one only the first
    matching tag is inspected, and its ``content`` attribute is parsed.
    """
    lookups = (
        {"itemprop": "price"},
        {"property": "og:price:amount"},
        {"property": "product:price:amount"},
        {"name": "twitter:data1"},
    )
    for lookup in lookups:
        meta = soup.find("meta", attrs=lookup)
        if meta is None:
            continue
        parsed = _normalize_price(meta.get("content"))
        if parsed is not None:
            return parsed
    return None


def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
    """Guess availability from the page text.

    Returns None when the page has no text at all, False when any
    out-of-stock marker occurs in the lower-cased text, True otherwise.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    if not page_text:
        return None
    return not any(marker in page_text for marker in _OUT_OF_STOCK_MARKERS)


def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
    """Return the first price produced by the extraction cascade, else None.

    Strategies run lazily in this order, stopping at the first hit:
    caller-supplied CSS selector (if any), JSON-LD offers, meta tags, then
    the common-class heuristic selectors.
    """

    def _attempts():
        # 1. Caller-supplied CSS selector (most robust).
        if price_selector:
            yield _extract_from_selector(soup, str(price_selector))
        # 2. JSON-LD offers.
        for raw_value in _iter_json_ld_prices(soup):
            yield _normalize_price(raw_value)
        # 3. Meta tags.
        yield _extract_from_meta(soup)
        # 4. Common-class heuristics.
        for heuristic in _PRICE_HEURISTIC_SELECTORS:
            yield _extract_from_selector(soup, heuristic)

    # The generator is consumed only until the first non-None result, so later
    # strategies never run once an earlier one succeeds.
    return next((found for found in _attempts() if found is not None), None)


def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
    """Scrape current prices for a list of competitor product pages.

    For each target performs a GET with realistic headers (timeout + 1 retry)
    and extracts the price using a cascade of strategies. Extraction failures
    of a single target never abort the others: that row is returned with
    price=None (and in_stock=None) so the caller still gets one row per target.
    Each such failure is reported as a WARNING on this module's logger, so a
    systemic problem (e.g. every fetch or parse failing) is not silently
    turned into a batch of empty rows.

    Args:
        targets: list of dicts, each with keys:
            - competitor (str): competitor name/id.
            - product_key (str): stable internal product key.
            - product_name (str): human-readable product name.
            - url (str): product page URL to scrape. A missing or empty url
              is not fetched; the row is emitted with price=None.
            - price_selector (str, optional): CSS selector pinpointing the
              price node. Most robust when provided.
            - currency (str, optional): currency code to stamp on the row
              (e.g. "EUR"). Defaults to "EUR".

    Returns:
        list of dicts, one per target, with EXACTLY these keys (1:1 with the
        Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at):
            - competitor (str)
            - product_key (str)
            - product_name (str)
            - url (str)
            - price (float | None)
            - currency (str)
            - in_stock (bool | None)
    """
    logger = logging.getLogger(__name__)
    rows: list[dict] = []

    for target in targets:
        competitor = target.get("competitor")
        url = target.get("url")

        price: float | None = None
        in_stock: bool | None = None

        if url:
            try:
                soup = BeautifulSoup(_fetch_html(url), "lxml")
                price = _extract_price(soup, target.get("price_selector"))
                in_stock = _detect_in_stock(soup)
            except Exception as err:  # noqa: BLE001 - never abort the whole batch
                # Reset both fields so a page that failed half-way never
                # contributes a partial result.
                price = None
                in_stock = None
                logger.warning(
                    "scrape failed for competitor=%r url=%r (%s: %s)",
                    competitor,
                    url,
                    type(err).__name__,
                    err,
                )

        rows.append(
            {
                "competitor": competitor,
                "product_key": target.get("product_key"),
                "product_name": target.get("product_name"),
                "url": url,
                "price": price,
                # An absent or empty currency falls back to EUR.
                "currency": target.get("currency") or "EUR",
                "in_stock": in_stock,
            }
        )

    return rows


if __name__ == "__main__":
    # Reaching this line at all means the module imported cleanly.
    print("self-test: import OK")

    # Offline parser checks: (input, expected value, failure label).
    numeric_cases = (
        ("1.299,99 €", 1299.99, "EU thousands+decimal"),
        ("$1,299.99", 1299.99, "US thousands+decimal"),
        ("29,90", 29.90, "EU decimal only"),
        ("1,299", 1299.0, "US thousands only"),
        ("1299.99", 1299.99, "plain dot decimal"),
        ("Precio: 49,95 EUR hoy", 49.95, "embedded"),
    )
    for raw_text, wanted, label in numeric_cases:
        assert _normalize_price(raw_text) == wanted, label
    for unparseable, label in (
        ("no price here", "no number"),
        (None, "none in -> none out"),
    ):
        assert _normalize_price(unparseable) is None, label
    print("self-test: price normalization OK")

    # A target on an unresolvable host must still produce one well-formed row
    # (exact key set, price None) instead of aborting the batch.
    unreachable_target = {
        "competitor": "demo",
        "product_key": "SKU-1",
        "product_name": "Demo product",
        "url": "http://invalid.localhost.invalid/nope",
        "currency": "EUR",
    }
    expected_keys = {
        "competitor",
        "product_key",
        "product_name",
        "url",
        "price",
        "currency",
        "in_stock",
    }
    sample = scrape_competitor_prices([unreachable_target])
    assert len(sample) == 1, "one row per target"
    only_row = sample[0]
    assert set(only_row.keys()) == expected_keys, "exact keys"
    assert only_row["price"] is None, "failed target -> price None, no abort"
    assert only_row["currency"] == "EUR", "currency default"
    print("self-test: row shape + graceful-failure OK")

    # Best-effort fetch of a public page; any error is reported, never fatal.
    live_target = {
        "competitor": "books-to-scrape",
        "product_key": "light-in-the-attic",
        "product_name": "A Light in the Attic",
        "url": (
            "http://books.toscrape.com/catalogue/"
            "a-light-in-the-attic_1000/index.html"
        ),
        "price_selector": "p.price_color",
        "currency": "GBP",
    }
    try:
        live_row = scrape_competitor_prices([live_target])[0]
        print(f"self-test: live fetch -> price={live_row['price']} "
              f"in_stock={live_row['in_stock']}")
    except Exception as err:  # noqa: BLE001 - network optional
        print(f"self-test: live fetch skipped ({type(err).__name__})")

    print("self-test: ALL OK")