e1e9bb7499
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
390 lines
13 KiB
Python
390 lines
13 KiB
Python
"""Scrape current prices for a list of competitor product pages.
|
|
|
|
Watches competitor pricing: given a list of targets (product URL + competitor),
|
|
fetches each page and extracts the current price using a cascade of strategies
|
|
(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
|
|
map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
|
|
id/snapshot_date/scraped_at columns).
|
|
"""
|
|
|
|
import json
import logging
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
|
|
|
|
# Browser identity presented to the scraped sites (desktop Chrome on Linux).
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

# Headers attached to every outgoing request. "identity" asks the server not
# to compress the body, so the raw bytes can be decoded directly.
_REQUEST_HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": ",".join(
        (
            "text/html",
            "application/xhtml+xml",
            "application/xml;q=0.9",
            "image/avif",
            "image/webp",
            "*/*;q=0.8",
        )
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    "Accept-Encoding": "identity",
    "Connection": "close",
}
|
|
|
|
# Lower-case phrases whose presence anywhere in the page text is taken to mean
# the product cannot currently be bought.
_OUT_OF_STOCK_MARKERS = (
    # Spanish
    "agotado",
    "sin stock",
    "sin existencias",
    "no disponible",
    # English
    "out of stock",
    "sold out",
    "unavailable",
    "currently unavailable",
)
|
|
|
|
# Fallback CSS selectors, tried in this order, covering price markup commonly
# emitted by mainstream e-commerce templates.
_PRICE_HEURISTIC_SELECTORS = (
    # Attribute-based hooks.
    "[itemprop=price]",
    "[data-price]",
    "[data-product-price]",
    # Exact class names.
    ".price",
    ".product-price",
    ".price--current",
    ".current-price",
    ".sale-price",
    ".a-price .a-offscreen",
    # Last resort: any class containing "price".
    "[class*=price]",
)
|
|
|
|
# Matches one price-like token: an optional currency marker on either side of
# a number. Group 1 is the numeric part only.
#
# The numeric part has two alternatives, tried in this order:
#   1. 1-3 digits followed by one or more groups of "separator + 3 digits"
#      (separator is dot, comma or whitespace), then optional 1-2 decimals,
#      e.g. "1.299,99".
#   2. A plain run of digits with optional 1-2 decimals, e.g. "1299.99" or
#      "29,90".
# Trying the grouped form first keeps "1.299,99" together as one token.
_PRICE_NUMBER_RE = re.compile(
    r"(?:[€$£]|EUR|USD|GBP)?\s*"
    r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
    r"\s*(?:[€$£]|EUR|USD|GBP)?",
    re.IGNORECASE,
)
|
|
|
|
|
|
def _fetch_html(url: str, timeout: float = 15.0) -> str:
    """Download *url* and return its body decoded to text.

    The request carries ``_REQUEST_HEADERS``. Two attempts are made in total;
    any exception raised by an attempt triggers the next one.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt.

    Returns:
        The response body decoded with the charset announced in the response
        headers (UTF-8 when none is announced or the name is unknown).
        Undecodable bytes are replaced rather than raising.

    Raises:
        Exception: Whatever the final attempt raised, if both attempts fail.
    """
    failure: Exception | None = None
    for _ in (1, 2):
        try:
            request = urllib.request.Request(url, headers=_REQUEST_HEADERS)
            with urllib.request.urlopen(request, timeout=timeout) as response:
                body = response.read()
                encoding = response.headers.get_content_charset() or "utf-8"
                try:
                    text = body.decode(encoding, errors="replace")
                except (LookupError, UnicodeDecodeError):
                    # Unknown charset name announced by the server.
                    text = body.decode("utf-8", errors="replace")
                return text
        except Exception as exc:  # noqa: BLE001 - retry on any transport error
            failure = exc
    if failure is None:
        raise RuntimeError("fetch failed")
    raise failure
|
|
|
|
|
|
def _normalize_price(raw) -> float | None:
    """Normalize a price token to float, tolerating comma/dot and symbols.

    Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90", "1 299,99"
    (including non-breaking spaces as group separators) etc.

    Separator rules applied to the first price-like token found in the text:

    * Both ``,`` and ``.`` present: the right-most one is the decimal
      separator, the other one groups thousands.
    * Only one kind of separator present: it groups thousands when exactly
      three digits follow its last occurrence ("1,299", "1.299",
      "1.299.000"), unless the integer part is zero ("0,999"); otherwise its
      last occurrence is the decimal separator ("29,90", "29,9", "12.5").
      A lone "1.299" is inherently ambiguous; it is read as 1299 so that dots
      and commas are treated the same way.

    Args:
        raw: A number, a string containing a price, or None.

    Returns:
        The parsed value, or None if no numeric value can be parsed.
    """
    if raw is None:
        return None
    if isinstance(raw, (int, float)):
        try:
            return float(raw)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: an int too large to be represented as a float.
            return None

    text = str(raw).strip()
    if not text:
        return None

    match = _PRICE_NUMBER_RE.search(text)
    if not match:
        return None

    # The pattern accepts any whitespace (including NBSP / narrow NBSP) as a
    # thousands separator, so strip every whitespace character, not just " ".
    num = "".join(match.group(1).split())

    last_comma = num.rfind(",")
    last_dot = num.rfind(".")

    if last_comma != -1 and last_dot != -1:
        # The right-most separator is the decimal separator.
        if last_comma > last_dot:
            # European: 1.299,99 -> dots are thousands, comma is decimal.
            num = num.replace(".", "").replace(",", ".")
        else:
            # US: 1,299.99 -> commas are thousands, dot is decimal.
            num = num.replace(",", "")
    elif last_comma != -1 or last_dot != -1:
        # A single kind of separator: decide between grouping and decimal.
        sep = "," if last_comma != -1 else "."
        head, _, tail = num.rpartition(sep)
        is_grouping = len(tail) == 3 and (sep in head or head.lstrip("0") != "")
        if is_grouping:
            num = num.replace(sep, "")
        else:
            # Last occurrence is the decimal point; earlier ones are grouping.
            num = head.replace(sep, "") + "." + tail

    try:
        return float(num)
    except ValueError:
        return None
|
|
|
|
|
|
def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
    """Return the price carried by the first node matching *selector*.

    Only the first matching node is inspected. Its ``content``,
    ``data-price``, ``data-product-price`` and ``value`` attributes are tried
    in that order; if none yields a price, the node's text is parsed instead.

    Returns:
        The parsed price, or None when the selector is invalid, matches
        nothing, or the matched node holds no parsable price.
    """
    try:
        element = soup.select_one(selector)
    except Exception:  # noqa: BLE001 - invalid selector should not abort
        return None
    if element is None:
        return None

    # Lazily parse each price-bearing attribute the element actually has.
    from_attributes = (
        _normalize_price(element.get(name))
        for name in ("content", "data-price", "data-product-price", "value")
        if element.has_attr(name)
    )
    found = next((value for value in from_attributes if value is not None), None)
    if found is not None:
        return found

    return _normalize_price(element.get_text(" ", strip=True))
|
|
|
|
|
|
def _iter_json_ld_prices(soup: BeautifulSoup):
    """Yield raw ``price`` values found in ``application/ld+json`` scripts.

    Every dict nested anywhere in each parsed script is examined. For each
    one, the ``price`` of every dict under its ``offers`` key is yielded,
    followed by the dict's own ``price`` when it has one and its ``offers``
    value is neither a dict nor a list. Scripts that are empty or not valid
    JSON are skipped. Values are yielded as found, without normalization.
    """
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw_json = script.string or script.get_text()
        if not raw_json:
            continue
        try:
            document = json.loads(raw_json)
        except (ValueError, TypeError):
            continue

        for entry in _walk_json(document):
            if not isinstance(entry, dict):
                continue
            offers = entry.get("offers")
            yield from (
                offer.get("price")
                for offer in _as_list(offers)
                if isinstance(offer, dict) and "price" in offer
            )
            # Some schemas place price directly on the node.
            if "price" in entry and not isinstance(offers, (dict, list)):
                yield entry.get("price")
|
|
|
|
|
|
def _walk_json(node):
    """Yield every dict contained in a decoded JSON value, parents first.

    Dicts are yielded in depth-first, pre-order sequence: a dict is yielded
    before anything nested in its values, and siblings keep their original
    order. Lists are descended into but not yielded; scalars are ignored.
    """
    # Explicit stack; children are pushed reversed so they pop in order.
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            yield current
            pending.extend(reversed(list(current.values())))
        elif isinstance(current, list):
            pending.extend(reversed(current))
|
|
|
|
|
|
def _as_list(value):
    """Coerce *value* to a list.

    A list is returned unchanged (same object), None becomes an empty list,
    and anything else is wrapped in a one-element list.
    """
    if isinstance(value, list):
        return value
    return [] if value is None else [value]
|
|
|
|
|
|
def _extract_from_meta(soup: BeautifulSoup) -> float | None:
    """Return the first price found in well-known ``<meta>`` tags.

    Tags are looked up in this order: ``itemprop="price"``,
    ``property="og:price:amount"``, ``property="product:price:amount"``,
    ``name="twitter:data1"``. For each, only the first matching tag is read,
    and its ``content`` attribute is parsed.

    Returns:
        The first parsable price, or None if no tag yields one.
    """
    lookups = (
        ("itemprop", "price"),
        ("property", "og:price:amount"),
        ("property", "product:price:amount"),
        ("name", "twitter:data1"),
    )
    for attr_name, attr_value in lookups:
        meta = soup.find("meta", attrs={attr_name: attr_value})
        if meta is None:
            continue
        parsed = _normalize_price(meta.get("content"))
        if parsed is not None:
            return parsed
    return None
|
|
|
|
|
|
def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
    """Guess availability from the page text.

    The lower-cased text of the whole document is searched for each entry of
    ``_OUT_OF_STOCK_MARKERS`` as a plain substring.

    Returns:
        None when the document has no text at all, False when any marker
        occurs in it, True otherwise.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    if not page_text:
        return None
    return not any(marker in page_text for marker in _OUT_OF_STOCK_MARKERS)
|
|
|
|
|
|
def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
    """Return the first price produced by the extraction cascade.

    Strategies are evaluated lazily, in this order, stopping at the first one
    that yields a price:

    1. The caller-supplied CSS selector, if truthy (converted with ``str``).
    2. Each JSON-LD price candidate.
    3. Price meta tags.
    4. Each selector in ``_PRICE_HEURISTIC_SELECTORS``.

    Returns:
        The first non-None price, or None if every strategy comes up empty.
    """

    def candidates():
        # Each yield is one attempt; later attempts only run if consumed.
        if price_selector:
            yield _extract_from_selector(soup, str(price_selector))
        for raw_value in _iter_json_ld_prices(soup):
            yield _normalize_price(raw_value)
        yield _extract_from_meta(soup)
        for css in _PRICE_HEURISTIC_SELECTORS:
            yield _extract_from_selector(soup, css)

    return next((found for found in candidates() if found is not None), None)
|
|
|
|
|
|
def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
    """Scrape current prices for a list of competitor product pages.

    For each target performs a GET with realistic headers (timeout + 1 retry)
    and extracts the price using a cascade of strategies. Extraction failures
    of a single target never abort the others: that row is returned with
    price=None (and in_stock=None) so the caller still gets one row per target.
    Each such failure is reported as a WARNING on this module's logger so it
    can be told apart from a page that simply carries no price.

    Args:
        targets: list of dicts, each with keys:
            - competitor (str): competitor name/id.
            - product_key (str): stable internal product key.
            - product_name (str): human-readable product name.
            - url (str): product page URL to scrape. A target without a URL
              is not fetched and yields price=None, in_stock=None.
            - price_selector (str, optional): CSS selector pinpointing the
              price node. Most robust when provided.
            - currency (str, optional): currency code to stamp on the row
              (e.g. "EUR"). Defaults to "EUR".

    Returns:
        list of dicts, one per target, with EXACTLY these keys (1:1 with the
        Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at):
            - competitor (str)
            - product_key (str)
            - product_name (str)
            - url (str)
            - price (float | None)
            - currency (str)
            - in_stock (bool | None)
    """
    logger = logging.getLogger(__name__)
    rows: list[dict] = []

    for target in targets:
        competitor = target.get("competitor")
        product_key = target.get("product_key")
        product_name = target.get("product_name")
        url = target.get("url")
        price_selector = target.get("price_selector")
        currency = target.get("currency") or "EUR"

        price: float | None = None
        in_stock: bool | None = None

        if url:
            try:
                html = _fetch_html(url)
                soup = BeautifulSoup(html, "lxml")
                price = _extract_price(soup, price_selector)
                in_stock = _detect_in_stock(soup)
            except Exception as err:  # noqa: BLE001 - never abort the whole batch
                # Best-effort by design: keep the row, but leave a trace so a
                # systematic failure (blocked requests, missing parser, ...)
                # does not look like "no price on the page".
                logger.warning(
                    "scrape failed for %s (competitor=%s, product_key=%s): %s: %s",
                    url,
                    competitor,
                    product_key,
                    type(err).__name__,
                    err,
                )
                price = None
                in_stock = None

        rows.append(
            {
                "competitor": competitor,
                "product_key": product_key,
                "product_name": product_name,
                "url": url,
                "price": price,
                "currency": currency,
                "in_stock": in_stock,
            }
        )

    return rows
|
|
|
|
|
|
if __name__ == "__main__":
|
|
# Self-test: import is implicitly OK if we reach this point.
|
|
print("self-test: import OK")
|
|
|
|
# Pure-logic checks that need no network.
|
|
assert _normalize_price("1.299,99 €") == 1299.99, "EU thousands+decimal"
|
|
assert _normalize_price("$1,299.99") == 1299.99, "US thousands+decimal"
|
|
assert _normalize_price("29,90") == 29.90, "EU decimal only"
|
|
assert _normalize_price("1,299") == 1299.0, "US thousands only"
|
|
assert _normalize_price("1299.99") == 1299.99, "plain dot decimal"
|
|
assert _normalize_price("Precio: 49,95 EUR hoy") == 49.95, "embedded"
|
|
assert _normalize_price("no price here") is None, "no number"
|
|
assert _normalize_price(None) is None, "none in -> none out"
|
|
print("self-test: price normalization OK")
|
|
|
|
# Shape check: one row per target, exact keys, failed target -> price None.
|
|
sample = scrape_competitor_prices(
|
|
[
|
|
{
|
|
"competitor": "demo",
|
|
"product_key": "SKU-1",
|
|
"product_name": "Demo product",
|
|
"url": "http://invalid.localhost.invalid/nope",
|
|
"currency": "EUR",
|
|
}
|
|
]
|
|
)
|
|
expected_keys = {
|
|
"competitor",
|
|
"product_key",
|
|
"product_name",
|
|
"url",
|
|
"price",
|
|
"currency",
|
|
"in_stock",
|
|
}
|
|
assert len(sample) == 1, "one row per target"
|
|
assert set(sample[0].keys()) == expected_keys, "exact keys"
|
|
assert sample[0]["price"] is None, "failed target -> price None, no abort"
|
|
assert sample[0]["currency"] == "EUR", "currency default"
|
|
print("self-test: row shape + graceful-failure OK")
|
|
|
|
# Optional: best-effort real fetch against a public URL (never fails build).
|
|
try:
|
|
live = scrape_competitor_prices(
|
|
[
|
|
{
|
|
"competitor": "books-to-scrape",
|
|
"product_key": "light-in-the-attic",
|
|
"product_name": "A Light in the Attic",
|
|
"url": (
|
|
"http://books.toscrape.com/catalogue/"
|
|
"a-light-in-the-attic_1000/index.html"
|
|
),
|
|
"price_selector": "p.price_color",
|
|
"currency": "GBP",
|
|
}
|
|
]
|
|
)
|
|
print(f"self-test: live fetch -> price={live[0]['price']} "
|
|
f"in_stock={live[0]['in_stock']}")
|
|
except Exception as err: # noqa: BLE001 - network optional
|
|
print(f"self-test: live fetch skipped ({type(err).__name__})")
|
|
|
|
print("self-test: ALL OK")
|