feat(shell): auto-commit con 31 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-14 23:55:16 +02:00
parent 1430039688
commit e1e9bb7499
31 changed files with 3917 additions and 0 deletions
@@ -0,0 +1,389 @@
+"""Scrape current prices for a list of competitor product pages.
+
+Watches competitor pricing: given a list of targets (product URL + competitor),
+fetches each page and extracts the current price using a cascade of strategies
+(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
+map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
+id/snapshot_date/scraped_at columns).
+"""
+
+import json
+import logging
+import re
+import urllib.error
+import urllib.request
+
+from bs4 import BeautifulSoup
+
+# Browser identity presented to the scraped sites (desktop Chrome on Linux),
+# assembled from its three space-separated product tokens.
+_USER_AGENT = " ".join(
+    (
+        "Mozilla/5.0 (X11; Linux x86_64)",
+        "AppleWebKit/537.36 (KHTML, like Gecko)",
+        "Chrome/124.0.0.0 Safari/537.36",
+    )
+)
+
+# Headers attached to every page request made by _fetch_html.
+_REQUEST_HEADERS = {
+    "User-Agent": _USER_AGENT,
+    "Accept": ",".join(
+        (
+            "text/html",
+            "application/xhtml+xml",
+            "application/xml;q=0.9",
+            "image/avif",
+            "image/webp",
+            "*/*;q=0.8",
+        )
+    ),
+    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
+    # Ask for an uncompressed body: the fetch helper decodes the raw bytes
+    # directly and performs no gzip/deflate handling.
+    "Accept-Encoding": "identity",
+    "Connection": "close",
+}
+
+# Lower-case phrases which, when found anywhere in the page text, mark the
+# product as NOT available. They are matched as plain substrings.
+_OUT_OF_STOCK_MARKERS = (
+    # Spanish
+    "agotado", "sin stock", "sin existencias", "no disponible",
+    # English
+    "out of stock", "sold out", "unavailable", "currently unavailable",
+)
+
+# Last-resort CSS selectors tried in this exact order when nothing more
+# specific produced a price; they follow naming patterns that are common in
+# mainstream e-commerce templates.
+_PRICE_HEURISTIC_SELECTORS = (
+    # Attribute-based markers.
+    "[itemprop=price]", "[data-price]", "[data-product-price]",
+    # Conventional class names.
+    ".price", ".product-price", ".price--current", ".current-price",
+    ".sale-price",
+    # Off-screen price node nested inside a price wrapper.
+    ".a-price .a-offscreen",
+    # Catch-all: any class attribute containing "price".
+    "[class*=price]",
+)
+
+# Matches one price-looking token inside free text. Group 1 is the numeric
+# part; an optional currency symbol or code (€, $, £, EUR, USD, GBP, matched
+# case-insensitively) is allowed on either side.
+# First alternative: 1-3 digits followed by at least one three-digit group
+# separated by ".", "," or whitespace, plus an optional 1-2 digit decimal part
+# (e.g. 1.299,99 or 1 299). `\s` also accepts Unicode spaces such as NBSP.
+# Second alternative: plain contiguous digits with an optional 1-2 digit
+# decimal part (e.g. 1299.99, 29,90).
+# The thousands alternative must be tried first: with the order reversed the
+# plain-digit alternative would succeed on just "1.29" of "1.299,99" and the
+# rest of the number would be lost.
+_PRICE_NUMBER_RE = re.compile(
+    r"(?:[€$£]|EUR|USD|GBP)?\s*"
+    r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
+    r"\s*(?:[€$£]|EUR|USD|GBP)?",
+    re.IGNORECASE,
+)
+
+
+def _fetch_html(url: str, timeout: float = 15.0) -> str:
+    """Fetch a page over HTTP(S) and return its body decoded to text.
+
+    The request carries ``_REQUEST_HEADERS`` and is attempted at most twice.
+    The second attempt is skipped when the server answered with a 4xx status
+    other than 408/429, because repeating such a request cannot succeed.
+
+    Args:
+        url: absolute ``http://`` or ``https://`` URL of the page.
+        timeout: per-attempt socket timeout in seconds.
+
+    Returns:
+        The response body decoded with the charset announced in the
+        Content-Type header (UTF-8 when absent or unknown); undecodable bytes
+        are replaced rather than raising.
+
+    Raises:
+        ValueError: if ``url`` does not use the http or https scheme.
+        Exception: the error raised by the last failed attempt.
+    """
+    # urlopen() also understands file:// and ftp://. A scrape target must never
+    # be able to read local files, so only web schemes are let through.
+    if not str(url).strip().lower().startswith(("http://", "https://")):
+        raise ValueError(f"unsupported URL scheme: {url!r}")
+
+    last_err: Exception | None = None
+    for _ in range(2):
+        try:
+            req = urllib.request.Request(url, headers=_REQUEST_HEADERS)
+            with urllib.request.urlopen(req, timeout=timeout) as resp:
+                raw = resp.read()
+                charset = resp.headers.get_content_charset() or "utf-8"
+        except urllib.error.HTTPError as err:
+            last_err = err
+            # Client errors are deterministic; only timeout/throttling codes
+            # are worth a second try.
+            if 400 <= err.code < 500 and err.code not in (408, 429):
+                break
+        except Exception as err:  # noqa: BLE001 - retry on any transport error
+            last_err = err
+        else:
+            try:
+                return raw.decode(charset, errors="replace")
+            except (LookupError, UnicodeError):
+                # Unknown or non-text codec announced by the server.
+                return raw.decode("utf-8", errors="replace")
+    raise last_err if last_err is not None else RuntimeError("fetch failed")
+
+
+def _normalize_price(raw) -> float | None:
+    """Convert a raw price token (number or text) to a float.
+
+    Numbers are converted directly. Text is searched with
+    ``_PRICE_NUMBER_RE`` and the first price-looking token is normalized,
+    resolving the decimal/thousands ambiguity of "," and ".":
+
+    * both kinds present: the right-most one is the decimal separator
+      ("1.299,99 €" and "$1,299.99" both give 1299.99);
+    * one kind only, last group of exactly three digits, and the separator
+      is a comma or occurs more than once: pure thousands grouping
+      ("1,299" -> 1299.0, "1.299.000" -> 1299000.0);
+    * one kind only, otherwise: the last occurrence is the decimal point
+      ("29,90" -> 29.9, "29,9" -> 29.9, "1299.99" -> 1299.99).
+
+    A single dot followed by three digits ("1.299") is ambiguous and is kept
+    as a decimal value. Whitespace used as a thousands separator, including
+    non-breaking spaces, is discarded.
+
+    Args:
+        raw: an int/float, or anything whose ``str()`` may contain a price.
+
+    Returns:
+        The parsed price, or None when no numeric value can be extracted.
+    """
+    if raw is None:
+        return None
+    if isinstance(raw, (int, float)):
+        try:
+            return float(raw)
+        except (ValueError, TypeError, OverflowError):
+            # OverflowError: an int too large to be represented as a float.
+            return None
+
+    text = str(raw).strip()
+    if not text:
+        return None
+
+    match = _PRICE_NUMBER_RE.search(text)
+    if not match:
+        return None
+
+    # The pattern's `\s` accepts any Unicode space between digit groups
+    # (e.g. NBSP in "1\xa0299,99"), so drop every whitespace character and not
+    # just the ASCII space; float() would otherwise reject the token.
+    num = "".join(match.group(1).split())
+
+    last_comma = num.rfind(",")
+    last_dot = num.rfind(".")
+
+    if last_comma != -1 and last_dot != -1:
+        # The right-most separator is the decimal separator.
+        if last_comma > last_dot:
+            # European: 1.299,99 -> dots are thousands, comma is decimal.
+            num = num.replace(".", "").replace(",", ".")
+        else:
+            # US: 1,299.99 -> commas are thousands, dot is decimal.
+            num = num.replace(",", "")
+    elif last_comma != -1 or last_dot != -1:
+        sep = "," if last_comma != -1 else "."
+        head, _, tail = num.rpartition(sep)
+        if len(tail) == 3 and (sep == "," or sep in head):
+            # "1,299" / "1.299.000": every separator is a thousands mark.
+            num = num.replace(sep, "")
+        else:
+            # "29,90", "29,9", "1299.99": the last separator is the decimal
+            # point and any earlier ones are thousands marks.
+            num = head.replace(sep, "") + "." + tail
+
+    try:
+        return float(num)
+    except ValueError:
+        return None
+
+
+def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
+    """Return the price carried by the first node matching ``selector``.
+
+    Price-bearing attributes of the node are consulted before its visible
+    text. None is returned when the selector is invalid, matches nothing, or
+    the matched node holds no parsable price.
+    """
+    try:
+        hit = soup.select_one(selector)
+    except Exception:  # noqa: BLE001 - invalid selector should not abort
+        hit = None
+    if hit is None:
+        return None
+
+    # Lazily normalize each attribute that is present, in priority order.
+    from_attrs = (
+        _normalize_price(hit.get(name))
+        for name in ("content", "data-price", "data-product-price", "value")
+        if hit.has_attr(name)
+    )
+    found = next((value for value in from_attrs if value is not None), None)
+    if found is not None:
+        return found
+
+    # No usable attribute: fall back to the node's text content.
+    return _normalize_price(hit.get_text(" ", strip=True))
+
+
+def _iter_json_ld_prices(soup: BeautifulSoup):
+    """Yield raw price candidates from the page's ``application/ld+json`` data.
+
+    Every dict found anywhere in each JSON-LD payload is inspected: prices of
+    its ``offers`` entry (single offer or list of offers) are yielded first,
+    then the dict's own ``price`` when it has no dict/list ``offers``.
+    Payloads that are empty or not valid JSON are skipped. Values are yielded
+    as found, without normalization.
+    """
+    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
+        raw_json = script.string or script.get_text()
+        if not raw_json:
+            continue
+        try:
+            parsed = json.loads(raw_json)
+        except (ValueError, TypeError):
+            continue
+
+        for entry in _walk_json(parsed):
+            if not isinstance(entry, dict):
+                continue
+            offers = entry.get("offers")
+            yield from (
+                offer.get("price")
+                for offer in _as_list(offers)
+                if isinstance(offer, dict) and "price" in offer
+            )
+            # Some schemas place price directly on the node.
+            if "price" in entry and not isinstance(offers, (dict, list)):
+                yield entry.get("price")
+
+
+def _walk_json(node):
+    """Yield every dict inside a decoded JSON value, depth-first, parents first.
+
+    Lists are traversed but not yielded themselves; scalars yield nothing.
+    """
+    if isinstance(node, dict):
+        yield node
+        children = node.values()
+    elif isinstance(node, list):
+        children = node
+    else:
+        return
+    for child in children:
+        yield from _walk_json(child)
+
+
+def _as_list(value):
+    """Return ``value`` as a list: lists pass through, None becomes empty."""
+    if isinstance(value, list):
+        return value
+    return [] if value is None else [value]
+
+
+def _extract_from_meta(soup: BeautifulSoup) -> float | None:
+    """Return the first parsable price found in well-known ``<meta>`` tags.
+
+    Tags are probed in priority order; a tag that is missing or whose
+    ``content`` does not parse lets the search continue with the next one.
+    """
+    lookups = (
+        ("itemprop", "price"),
+        ("property", "og:price:amount"),
+        ("property", "product:price:amount"),
+        ("name", "twitter:data1"),
+    )
+    for attr_name, attr_value in lookups:
+        meta = soup.find("meta", attrs={attr_name: attr_value})
+        if meta is None:
+            continue
+        parsed = _normalize_price(meta.get("content"))
+        if parsed is not None:
+            return parsed
+    return None
+
+
+def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
+    """Guess availability from the page text.
+
+    Returns None for a page without text, False when any out-of-stock marker
+    occurs in the lower-cased text, and True otherwise.
+    """
+    page_text = soup.get_text(" ", strip=True).lower()
+    if not page_text:
+        return None
+    return not any(marker in page_text for marker in _OUT_OF_STOCK_MARKERS)
+
+
+def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
+    """Return the first price produced by the extraction cascade, else None.
+
+    Strategies are evaluated lazily and in order, stopping at the first hit:
+    caller-supplied CSS selector, JSON-LD offers, meta tags, and finally the
+    common-class heuristic selectors.
+    """
+
+    def _candidates():
+        # 1. Caller-supplied CSS selector (most robust).
+        if price_selector:
+            yield _extract_from_selector(soup, str(price_selector))
+        # 2. JSON-LD offers.
+        for raw in _iter_json_ld_prices(soup):
+            yield _normalize_price(raw)
+        # 3. Meta tags.
+        yield _extract_from_meta(soup)
+        # 4. Common-class heuristics.
+        for selector in _PRICE_HEURISTIC_SELECTORS:
+            yield _extract_from_selector(soup, selector)
+
+    return next((price for price in _candidates() if price is not None), None)
+
+
+def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
+    """Scrape current prices for a list of competitor product pages.
+
+    For each target performs a GET with realistic headers (timeout + 1 retry)
+    and extracts the price using a cascade of strategies. A failure on one
+    target never aborts the others: that row is returned with price=None (and
+    in_stock=None) so the caller still gets one row per target, and the
+    failure is reported through the ``logging`` module at WARNING level
+    instead of being dropped silently.
+
+    Args:
+        targets: list of dicts, each with keys:
+            - competitor (str): competitor name/id.
+            - product_key (str): stable internal product key.
+            - product_name (str): human-readable product name.
+            - url (str): product page URL to scrape. A target without a URL
+              is not fetched and yields price=None, in_stock=None.
+            - price_selector (str, optional): CSS selector pinpointing the
+              price node. Most robust when provided.
+            - currency (str, optional): currency code to stamp on the row
+              (e.g. "EUR"). Defaults to "EUR".
+
+    Returns:
+        list of dicts, one per target and in the same order, with EXACTLY
+        these keys (1:1 with the Postgres `competitor_prices` table, minus
+        id/snapshot_date/scraped_at):
+            - competitor (str)
+            - product_key (str)
+            - product_name (str)
+            - url (str)
+            - price (float | None)
+            - currency (str)
+            - in_stock (bool | None)
+    """
+    log = logging.getLogger(__name__)
+    rows: list[dict] = []
+
+    for target in targets:
+        competitor = target.get("competitor")
+        product_key = target.get("product_key")
+        product_name = target.get("product_name")
+        url = target.get("url")
+        price_selector = target.get("price_selector")
+        currency = target.get("currency") or "EUR"
+
+        price: float | None = None
+        in_stock: bool | None = None
+
+        if url:
+            try:
+                html = _fetch_html(url)
+                soup = BeautifulSoup(html, "lxml")
+                price = _extract_price(soup, price_selector)
+                in_stock = _detect_in_stock(soup)
+            except Exception as err:  # noqa: BLE001 - never abort the whole batch
+                price = None
+                in_stock = None
+                # Surface the cause (network error, missing parser, ...) so a
+                # systematic failure is not mistaken for "no price found".
+                log.warning(
+                    "scrape failed for %s (competitor=%s): %s: %s",
+                    url,
+                    competitor,
+                    type(err).__name__,
+                    err,
+                )
+
+        rows.append(
+            {
+                "competitor": competitor,
+                "product_key": product_key,
+                "product_name": product_name,
+                "url": url,
+                "price": price,
+                "currency": currency,
+                "in_stock": in_stock,
+            }
+        )
+
+    return rows
+
+
+if __name__ == "__main__":
+    # Reaching this line at all proves the module imported cleanly.
+    print("self-test: import OK")
+
+    # Offline checks of the price parser: (input, expected value, label).
+    _parse_cases = (
+        ("1.299,99 €", 1299.99, "EU thousands+decimal"),
+        ("$1,299.99", 1299.99, "US thousands+decimal"),
+        ("29,90", 29.90, "EU decimal only"),
+        ("1,299", 1299.0, "US thousands only"),
+        ("1299.99", 1299.99, "plain dot decimal"),
+        ("Precio: 49,95 EUR hoy", 49.95, "embedded"),
+    )
+    for _raw, _want, _label in _parse_cases:
+        assert _normalize_price(_raw) == _want, _label
+    assert _normalize_price("no price here") is None, "no number"
+    assert _normalize_price(None) is None, "none in -> none out"
+    print("self-test: price normalization OK")
+
+    # Row-shape check against a host that cannot resolve: the target must
+    # still come back as exactly one row, with the exact keys and price None.
+    unreachable_target = {
+        "competitor": "demo",
+        "product_key": "SKU-1",
+        "product_name": "Demo product",
+        "url": "http://invalid.localhost.invalid/nope",
+        "currency": "EUR",
+    }
+    sample = scrape_competitor_prices([unreachable_target])
+    expected_keys = {
+        "competitor", "product_key", "product_name", "url",
+        "price", "currency", "in_stock",
+    }
+    assert len(sample) == 1, "one row per target"
+    first_row = sample[0]
+    assert set(first_row.keys()) == expected_keys, "exact keys"
+    assert first_row["price"] is None, "failed target -> price None, no abort"
+    assert first_row["currency"] == "EUR", "currency default"
+    print("self-test: row shape + graceful-failure OK")
+
+    # Optional: best-effort real fetch against a public URL (never fails build).
+    live_target = {
+        "competitor": "books-to-scrape",
+        "product_key": "light-in-the-attic",
+        "product_name": "A Light in the Attic",
+        "url": (
+            "http://books.toscrape.com/catalogue/"
+            "a-light-in-the-attic_1000/index.html"
+        ),
+        "price_selector": "p.price_color",
+        "currency": "GBP",
+    }
+    try:
+        live = scrape_competitor_prices([live_target])
+        print(f"self-test: live fetch -> price={live[0]['price']} "
+              f"in_stock={live[0]['in_stock']}")
+    except Exception as err:  # noqa: BLE001 - network optional
+        print(f"self-test: live fetch skipped ({type(err).__name__})")
+
+    print("self-test: ALL OK")