feat(shell): auto-commit con 31 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,389 @@
|
||||
"""Scrape current prices for a list of competitor product pages.
|
||||
|
||||
Watches competitor pricing: given a list of targets (product URL + competitor),
|
||||
fetches each page and extracts the current price using a cascade of strategies
|
||||
(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
|
||||
map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
|
||||
id/snapshot_date/scraped_at columns).
|
||||
"""
|
||||
|
||||
import json
import logging
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
|
||||
|
||||
_USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
_REQUEST_HEADERS = {
|
||||
"User-Agent": _USER_AGENT,
|
||||
"Accept": (
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||||
"image/avif,image/webp,*/*;q=0.8"
|
||||
),
|
||||
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
|
||||
"Accept-Encoding": "identity",
|
||||
"Connection": "close",
|
||||
}
|
||||
|
||||
# Substrings that, when present, signal the product is NOT available.
|
||||
_OUT_OF_STOCK_MARKERS = (
|
||||
"agotado",
|
||||
"sin stock",
|
||||
"sin existencias",
|
||||
"no disponible",
|
||||
"out of stock",
|
||||
"sold out",
|
||||
"unavailable",
|
||||
"currently unavailable",
|
||||
)
|
||||
|
||||
# Common class/attribute patterns used by mainstream e-commerce templates.
|
||||
_PRICE_HEURISTIC_SELECTORS = (
|
||||
"[itemprop=price]",
|
||||
"[data-price]",
|
||||
"[data-product-price]",
|
||||
".price",
|
||||
".product-price",
|
||||
".price--current",
|
||||
".current-price",
|
||||
".sale-price",
|
||||
".a-price .a-offscreen",
|
||||
"[class*=price]",
|
||||
)
|
||||
|
||||
# A token that looks like a price: optional currency symbol, digits with
|
||||
# thousands/decimal separators. Captured group is the numeric part.
|
||||
# First alternative requires >=1 explicit thousands group (e.g. 1.299,99);
|
||||
# second alternative covers plain contiguous digits with optional decimals
|
||||
# (e.g. 1299.99, 29,90). Ordering the thousands branch first avoids the
|
||||
# plain-digit branch greedily eating "1299" out of "1299.99".
|
||||
_PRICE_NUMBER_RE = re.compile(
|
||||
r"(?:[€$£]|EUR|USD|GBP)?\s*"
|
||||
r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
|
||||
r"\s*(?:[€$£]|EUR|USD|GBP)?",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _fetch_html(url: str, timeout: float = 15.0) -> str:
|
||||
"""GET a URL with realistic headers, one retry on failure.
|
||||
|
||||
Raises the last urllib error if both attempts fail.
|
||||
"""
|
||||
last_err: Exception | None = None
|
||||
for attempt in range(2):
|
||||
try:
|
||||
req = urllib.request.Request(url, headers=_REQUEST_HEADERS)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
raw = resp.read()
|
||||
charset = resp.headers.get_content_charset() or "utf-8"
|
||||
try:
|
||||
return raw.decode(charset, errors="replace")
|
||||
except (LookupError, UnicodeDecodeError):
|
||||
return raw.decode("utf-8", errors="replace")
|
||||
except Exception as err: # noqa: BLE001 - retry on any transport error
|
||||
last_err = err
|
||||
continue
|
||||
raise last_err if last_err is not None else RuntimeError("fetch failed")
|
||||
|
||||
|
||||
def _normalize_price(raw) -> float | None:
|
||||
"""Normalize a price token to float, tolerating comma/dot and symbols.
|
||||
|
||||
Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90" etc.
|
||||
Returns None if no numeric value can be parsed.
|
||||
"""
|
||||
if raw is None:
|
||||
return None
|
||||
if isinstance(raw, (int, float)):
|
||||
try:
|
||||
return float(raw)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
text = str(raw).strip()
|
||||
if not text:
|
||||
return None
|
||||
|
||||
match = _PRICE_NUMBER_RE.search(text)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
num = match.group(1).strip().replace(" ", "")
|
||||
|
||||
last_comma = num.rfind(",")
|
||||
last_dot = num.rfind(".")
|
||||
|
||||
if last_comma != -1 and last_dot != -1:
|
||||
# The right-most separator is the decimal separator.
|
||||
if last_comma > last_dot:
|
||||
# European: 1.299,99 -> dots are thousands, comma is decimal.
|
||||
num = num.replace(".", "").replace(",", ".")
|
||||
else:
|
||||
# US: 1,299.99 -> commas are thousands, dot is decimal.
|
||||
num = num.replace(",", "")
|
||||
elif last_comma != -1:
|
||||
# Only commas present. Decimal if it looks like "29,90"; else thousands.
|
||||
if len(num) - last_comma - 1 == 2:
|
||||
num = num.replace(",", ".")
|
||||
else:
|
||||
num = num.replace(",", "")
|
||||
# Only dots (or none): assume dot is already decimal / no separators.
|
||||
|
||||
try:
|
||||
return float(num)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
    """Return the price held by the first node matching ``selector``.

    Price-bearing attributes of the node are tried before its visible text.
    Returns ``None`` when the selector is invalid, matches nothing, or the
    matched node holds no parseable price.
    """
    try:
        element = soup.select_one(selector)
    except Exception:  # noqa: BLE001 - invalid selector should not abort
        element = None
    if element is None:
        return None

    # Lazily parse each attribute that is present, in priority order.
    attribute_prices = (
        _normalize_price(element.get(name))
        for name in ("content", "data-price", "data-product-price", "value")
        if element.has_attr(name)
    )
    from_attribute = next(
        (parsed for parsed in attribute_prices if parsed is not None), None
    )
    if from_attribute is not None:
        return from_attribute

    # No usable attribute: fall back to the node's text content.
    return _normalize_price(element.get_text(" ", strip=True))
|
||||
|
||||
|
||||
def _iter_json_ld_prices(soup: BeautifulSoup):
    """Yield raw price values found in the page's ``ld+json`` script blocks.

    Script blocks that are empty or not valid JSON are skipped. Values are
    yielded unparsed, in document order.
    """
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw_json = script.string or script.get_text()
        if not raw_json:
            continue
        try:
            document = json.loads(raw_json)
        except (ValueError, TypeError):
            continue

        for entry in _walk_json(document):
            if not isinstance(entry, dict):
                continue
            offers = entry.get("offers")
            # Prices nested under "offers" (a single offer or a list of them).
            yield from (
                offer["price"]
                for offer in _as_list(offers)
                if isinstance(offer, dict) and "price" in offer
            )
            # Some schemas place the price directly on the node.
            if "price" in entry and not isinstance(offers, (dict, list)):
                yield entry["price"]
|
||||
|
||||
|
||||
def _walk_json(node):
|
||||
"""Depth-first walk over arbitrarily nested JSON structures."""
|
||||
if isinstance(node, dict):
|
||||
yield node
|
||||
for value in node.values():
|
||||
yield from _walk_json(value)
|
||||
elif isinstance(node, list):
|
||||
for item in node:
|
||||
yield from _walk_json(item)
|
||||
|
||||
|
||||
def _as_list(value):
|
||||
"""Wrap a value in a list unless it already is one."""
|
||||
if value is None:
|
||||
return []
|
||||
return value if isinstance(value, list) else [value]
|
||||
|
||||
|
||||
def _extract_from_meta(soup: BeautifulSoup) -> float | None:
    """Return the first parseable price found in known ``<meta>`` tags.

    Tags are tried in priority order; ``None`` is returned when none of them
    is present with a parseable ``content`` value.
    """
    lookups = (
        ("itemprop", "price"),
        ("property", "og:price:amount"),
        ("property", "product:price:amount"),
        ("name", "twitter:data1"),
    )
    for attribute, expected in lookups:
        meta = soup.find("meta", attrs={attribute: expected})
        if meta is None:
            continue
        parsed = _normalize_price(meta.get("content"))
        if parsed is not None:
            return parsed
    return None
|
||||
|
||||
|
||||
def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
    """Guess availability from the page's visible text.

    Returns ``None`` when the page has no text, ``False`` when any
    out-of-stock marker occurs in the lower-cased text, and ``True`` otherwise.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    if not page_text:
        return None
    return not any(marker in page_text for marker in _OUT_OF_STOCK_MARKERS)
|
||||
|
||||
|
||||
def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
    """Return the first price produced by the extraction cascade.

    Strategies run lazily in this order and stop at the first hit:
    caller-supplied CSS selector, JSON-LD offers, meta tags, and finally the
    common-class heuristic selectors. Returns ``None`` when all of them fail.
    """

    def _candidates():
        # 1. Caller-supplied CSS selector (most robust).
        if price_selector:
            yield _extract_from_selector(soup, str(price_selector))
        # 2. JSON-LD offers.
        yield from map(_normalize_price, _iter_json_ld_prices(soup))
        # 3. Meta tags.
        yield _extract_from_meta(soup)
        # 4. Common-class heuristics.
        for selector in _PRICE_HEURISTIC_SELECTORS:
            yield _extract_from_selector(soup, selector)

    return next((found for found in _candidates() if found is not None), None)
|
||||
|
||||
|
||||
def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
    """Scrape current prices for a list of competitor product pages.

    For each target performs a GET with realistic headers (timeout + 1 retry)
    and extracts the price using a cascade of strategies. A failure on a
    single target never aborts the others: that row is returned with
    price=None (and in_stock=None) so the caller still gets one row per
    target. The failure is logged as a warning with its traceback, so a
    systematic problem (missing parser, DNS, blocked requests) stays visible.

    Args:
        targets: list of dicts, each with keys:
            - competitor (str): competitor name/id.
            - product_key (str): stable internal product key.
            - product_name (str): human-readable product name.
            - url (str): product page URL to scrape.
            - price_selector (str, optional): CSS selector pinpointing the
              price node. Most robust when provided.
            - currency (str, optional): currency code to stamp on the row
              (e.g. "EUR"). Defaults to "EUR".

    Returns:
        list of dicts, one per target, with EXACTLY these keys (1:1 with the
        Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at):
            - competitor (str)
            - product_key (str)
            - product_name (str)
            - url (str)
            - price (float | None)
            - currency (str)
            - in_stock (bool | None)
    """
    logger = logging.getLogger(__name__)
    rows: list[dict] = []

    for target in targets:
        competitor = target.get("competitor")
        product_key = target.get("product_key")
        product_name = target.get("product_name")
        url = target.get("url")
        price_selector = target.get("price_selector")
        currency = target.get("currency") or "EUR"

        price: float | None = None
        in_stock: bool | None = None

        if url:
            try:
                html = _fetch_html(url)
                soup = BeautifulSoup(html, "lxml")
                price = _extract_price(soup, price_selector)
                in_stock = _detect_in_stock(soup)
            except Exception:  # noqa: BLE001 - never abort the whole batch
                # Best-effort by design, but never silent: record the cause.
                logger.warning(
                    "price scrape failed for %r (competitor=%r, product_key=%r)",
                    url,
                    competitor,
                    product_key,
                    exc_info=True,
                )
                price = None
                in_stock = None
        else:
            logger.warning(
                "target without url not scraped (competitor=%r, product_key=%r)",
                competitor,
                product_key,
            )

        rows.append(
            {
                "competitor": competitor,
                "product_key": product_key,
                "product_name": product_name,
                "url": url,
                "price": price,
                "currency": currency,
                "in_stock": in_stock,
            }
        )

    return rows
|
||||
|
||||
|
||||
def _run_self_test() -> None:
    """Run the module's self-test: offline checks, then an optional live fetch."""
    # Reaching this point means the module imported cleanly.
    print("self-test: import OK")

    # Pure-logic checks that need no network: (input, expected, label).
    numeric_cases = (
        ("1.299,99 €", 1299.99, "EU thousands+decimal"),
        ("$1,299.99", 1299.99, "US thousands+decimal"),
        ("29,90", 29.90, "EU decimal only"),
        ("1,299", 1299.0, "US thousands only"),
        ("1299.99", 1299.99, "plain dot decimal"),
        ("Precio: 49,95 EUR hoy", 49.95, "embedded"),
    )
    for raw, expected, label in numeric_cases:
        assert _normalize_price(raw) == expected, label
    assert _normalize_price("no price here") is None, "no number"
    assert _normalize_price(None) is None, "none in -> none out"
    print("self-test: price normalization OK")

    # Shape check: one row per target, exact keys, failed target -> price None.
    unreachable_target = {
        "competitor": "demo",
        "product_key": "SKU-1",
        "product_name": "Demo product",
        "url": "http://invalid.localhost.invalid/nope",
        "currency": "EUR",
    }
    sample = scrape_competitor_prices([unreachable_target])
    expected_keys = {
        "competitor",
        "product_key",
        "product_name",
        "url",
        "price",
        "currency",
        "in_stock",
    }
    assert len(sample) == 1, "one row per target"
    assert set(sample[0].keys()) == expected_keys, "exact keys"
    assert sample[0]["price"] is None, "failed target -> price None, no abort"
    assert sample[0]["currency"] == "EUR", "currency default"
    print("self-test: row shape + graceful-failure OK")

    # Optional: best-effort real fetch against a public URL (never fails build).
    live_target = {
        "competitor": "books-to-scrape",
        "product_key": "light-in-the-attic",
        "product_name": "A Light in the Attic",
        "url": (
            "http://books.toscrape.com/catalogue/"
            "a-light-in-the-attic_1000/index.html"
        ),
        "price_selector": "p.price_color",
        "currency": "GBP",
    }
    try:
        live = scrape_competitor_prices([live_target])
        print(f"self-test: live fetch -> price={live[0]['price']} "
              f"in_stock={live[0]['in_stock']}")
    except Exception as err:  # noqa: BLE001 - network optional
        print(f"self-test: live fetch skipped ({type(err).__name__})")

    print("self-test: ALL OK")


if __name__ == "__main__":
    _run_self_test()
|
||||
Reference in New Issue
Block a user