"""Scrape current prices for a list of competitor product pages. Watches competitor pricing: given a list of targets (product URL + competitor), fetches each page and extracts the current price using a cascade of strategies (CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated id/snapshot_date/scraped_at columns). """ import json import re import urllib.request import urllib.error from bs4 import BeautifulSoup _USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" ) _REQUEST_HEADERS = { "User-Agent": _USER_AGENT, "Accept": ( "text/html,application/xhtml+xml,application/xml;q=0.9," "image/avif,image/webp,*/*;q=0.8" ), "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", "Accept-Encoding": "identity", "Connection": "close", } # Substrings that, when present, signal the product is NOT available. _OUT_OF_STOCK_MARKERS = ( "agotado", "sin stock", "sin existencias", "no disponible", "out of stock", "sold out", "unavailable", "currently unavailable", ) # Common class/attribute patterns used by mainstream e-commerce templates. _PRICE_HEURISTIC_SELECTORS = ( "[itemprop=price]", "[data-price]", "[data-product-price]", ".price", ".product-price", ".price--current", ".current-price", ".sale-price", ".a-price .a-offscreen", "[class*=price]", ) # A token that looks like a price: optional currency symbol, digits with # thousands/decimal separators. Captured group is the numeric part. # First alternative requires >=1 explicit thousands group (e.g. 1.299,99); # second alternative covers plain contiguous digits with optional decimals # (e.g. 1299.99, 29,90). Ordering the thousands branch first avoids the # plain-digit branch greedily eating "1299" out of "1299.99". _PRICE_NUMBER_RE = re.compile( r"(?:[€$£]|EUR|USD|GBP)?\s*" r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)" r"\s*(?:[€$£]|EUR|USD|GBP)?", re.IGNORECASE, ) def _fetch_html(url: str, timeout: float = 15.0) -> str: """GET a URL with realistic headers, one retry on failure. Raises the last urllib error if both attempts fail. """ last_err: Exception | None = None for attempt in range(2): try: req = urllib.request.Request(url, headers=_REQUEST_HEADERS) with urllib.request.urlopen(req, timeout=timeout) as resp: raw = resp.read() charset = resp.headers.get_content_charset() or "utf-8" try: return raw.decode(charset, errors="replace") except (LookupError, UnicodeDecodeError): return raw.decode("utf-8", errors="replace") except Exception as err: # noqa: BLE001 - retry on any transport error last_err = err continue raise last_err if last_err is not None else RuntimeError("fetch failed") def _normalize_price(raw) -> float | None: """Normalize a price token to float, tolerating comma/dot and symbols. Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90" etc. Returns None if no numeric value can be parsed. """ if raw is None: return None if isinstance(raw, (int, float)): try: return float(raw) except (ValueError, TypeError): return None text = str(raw).strip() if not text: return None match = _PRICE_NUMBER_RE.search(text) if not match: return None num = match.group(1).strip().replace(" ", "") last_comma = num.rfind(",") last_dot = num.rfind(".") if last_comma != -1 and last_dot != -1: # The right-most separator is the decimal separator. if last_comma > last_dot: # European: 1.299,99 -> dots are thousands, comma is decimal. num = num.replace(".", "").replace(",", ".") else: # US: 1,299.99 -> commas are thousands, dot is decimal. num = num.replace(",", "") elif last_comma != -1: # Only commas present. Decimal if it looks like "29,90"; else thousands. if len(num) - last_comma - 1 == 2: num = num.replace(",", ".") else: num = num.replace(",", "") # Only dots (or none): assume dot is already decimal / no separators. try: return float(num) except ValueError: return None def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None: """Try a single CSS selector and normalize the matched node.""" try: node = soup.select_one(selector) except Exception: # noqa: BLE001 - invalid selector should not abort return None if node is None: return None # Prefer common price-bearing attributes, fall back to text. for attr in ("content", "data-price", "data-product-price", "value"): if node.has_attr(attr): price = _normalize_price(node.get(attr)) if price is not None: return price return _normalize_price(node.get_text(" ", strip=True)) def _iter_json_ld_prices(soup: BeautifulSoup): """Yield candidate prices found inside ld+json offers blocks.""" for tag in soup.find_all("script", attrs={"type": "application/ld+json"}): payload = tag.string or tag.get_text() if not payload: continue try: data = json.loads(payload) except (ValueError, TypeError): continue for node in _walk_json(data): if not isinstance(node, dict): continue offers = node.get("offers") for offer in _as_list(offers): if isinstance(offer, dict) and "price" in offer: yield offer.get("price") # Some schemas place price directly on the node. if "price" in node and not isinstance(node.get("offers"), (dict, list)): yield node.get("price") def _walk_json(node): """Depth-first walk over arbitrarily nested JSON structures.""" if isinstance(node, dict): yield node for value in node.values(): yield from _walk_json(value) elif isinstance(node, list): for item in node: yield from _walk_json(item) def _as_list(value): """Wrap a value in a list unless it already is one.""" if value is None: return [] return value if isinstance(value, list) else [value] def _extract_from_meta(soup: BeautifulSoup) -> float | None: """Try common price meta tags in priority order.""" candidates = ( {"itemprop": "price"}, {"property": "og:price:amount"}, {"property": "product:price:amount"}, {"name": "twitter:data1"}, ) for attrs in candidates: tag = soup.find("meta", attrs=attrs) if tag is not None: price = _normalize_price(tag.get("content")) if price is not None: return price return None def _detect_in_stock(soup: BeautifulSoup) -> bool | None: """Heuristic stock detection: True unless an out-of-stock marker appears.""" text = soup.get_text(" ", strip=True).lower() if not text: return None for marker in _OUT_OF_STOCK_MARKERS: if marker in text: return False return True def _extract_price(soup: BeautifulSoup, price_selector) -> float | None: """Run the extraction cascade and return the first price found.""" # 1. Caller-supplied CSS selector (most robust). if price_selector: price = _extract_from_selector(soup, str(price_selector)) if price is not None: return price # 2. JSON-LD offers. for candidate in _iter_json_ld_prices(soup): price = _normalize_price(candidate) if price is not None: return price # 3. Meta tags. price = _extract_from_meta(soup) if price is not None: return price # 4. Common-class heuristics. for selector in _PRICE_HEURISTIC_SELECTORS: price = _extract_from_selector(soup, selector) if price is not None: return price return None def scrape_competitor_prices(targets: list[dict]) -> list[dict]: """Scrape current prices for a list of competitor product pages. For each target performs a GET with realistic headers (timeout + 1 retry) and extracts the price using a cascade of strategies. Extraction failures of a single target never abort the others: that row is returned with price=None (and in_stock=None) so the caller still gets one row per target. Args: targets: list of dicts, each with keys: - competitor (str): competitor name/id. - product_key (str): stable internal product key. - product_name (str): human-readable product name. - url (str): product page URL to scrape. - price_selector (str, optional): CSS selector pinpointing the price node. Most robust when provided. - currency (str, optional): currency code to stamp on the row (e.g. "EUR"). Defaults to "EUR". Returns: list of dicts, one per target, with EXACTLY these keys (1:1 with the Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at): - competitor (str) - product_key (str) - product_name (str) - url (str) - price (float | None) - currency (str) - in_stock (bool | None) """ rows: list[dict] = [] for target in targets: competitor = target.get("competitor") product_key = target.get("product_key") product_name = target.get("product_name") url = target.get("url") price_selector = target.get("price_selector") currency = target.get("currency") or "EUR" price: float | None = None in_stock: bool | None = None if url: try: html = _fetch_html(url) soup = BeautifulSoup(html, "lxml") price = _extract_price(soup, price_selector) in_stock = _detect_in_stock(soup) except Exception: # noqa: BLE001 - never abort the whole batch price = None in_stock = None rows.append( { "competitor": competitor, "product_key": product_key, "product_name": product_name, "url": url, "price": price, "currency": currency, "in_stock": in_stock, } ) return rows if __name__ == "__main__": # Self-test: import is implicitly OK if we reach this point. print("self-test: import OK") # Pure-logic checks that need no network. assert _normalize_price("1.299,99 €") == 1299.99, "EU thousands+decimal" assert _normalize_price("$1,299.99") == 1299.99, "US thousands+decimal" assert _normalize_price("29,90") == 29.90, "EU decimal only" assert _normalize_price("1,299") == 1299.0, "US thousands only" assert _normalize_price("1299.99") == 1299.99, "plain dot decimal" assert _normalize_price("Precio: 49,95 EUR hoy") == 49.95, "embedded" assert _normalize_price("no price here") is None, "no number" assert _normalize_price(None) is None, "none in -> none out" print("self-test: price normalization OK") # Shape check: one row per target, exact keys, failed target -> price None. sample = scrape_competitor_prices( [ { "competitor": "demo", "product_key": "SKU-1", "product_name": "Demo product", "url": "http://invalid.localhost.invalid/nope", "currency": "EUR", } ] ) expected_keys = { "competitor", "product_key", "product_name", "url", "price", "currency", "in_stock", } assert len(sample) == 1, "one row per target" assert set(sample[0].keys()) == expected_keys, "exact keys" assert sample[0]["price"] is None, "failed target -> price None, no abort" assert sample[0]["currency"] == "EUR", "currency default" print("self-test: row shape + graceful-failure OK") # Optional: best-effort real fetch against a public URL (never fails build). try: live = scrape_competitor_prices( [ { "competitor": "books-to-scrape", "product_key": "light-in-the-attic", "product_name": "A Light in the Attic", "url": ( "http://books.toscrape.com/catalogue/" "a-light-in-the-attic_1000/index.html" ), "price_selector": "p.price_color", "currency": "GBP", } ] ) print(f"self-test: live fetch -> price={live[0]['price']} " f"in_stock={live[0]['in_stock']}") except Exception as err: # noqa: BLE001 - network optional print(f"self-test: live fetch skipped ({type(err).__name__})") print("self-test: ALL OK")