feat(shell): auto-commit con 31 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-14 23:55:16 +02:00
parent 1430039688
commit e1e9bb7499
31 changed files with 3917 additions and 0 deletions
@@ -0,0 +1,389 @@
"""Scrape current prices for a list of competitor product pages.
Watches competitor pricing: given a list of targets (product URL + competitor),
fetches each page and extracts the current price using a cascade of strategies
(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
id/snapshot_date/scraped_at columns).
"""
import json
import logging
import math
import re
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
_USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
_REQUEST_HEADERS = {
"User-Agent": _USER_AGENT,
"Accept": (
"text/html,application/xhtml+xml,application/xml;q=0.9,"
"image/avif,image/webp,*/*;q=0.8"
),
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
"Accept-Encoding": "identity",
"Connection": "close",
}
# Substrings that, when present, signal the product is NOT available.
_OUT_OF_STOCK_MARKERS = (
"agotado",
"sin stock",
"sin existencias",
"no disponible",
"out of stock",
"sold out",
"unavailable",
"currently unavailable",
)
# Common class/attribute patterns used by mainstream e-commerce templates.
_PRICE_HEURISTIC_SELECTORS = (
"[itemprop=price]",
"[data-price]",
"[data-product-price]",
".price",
".product-price",
".price--current",
".current-price",
".sale-price",
".a-price .a-offscreen",
"[class*=price]",
)
# A token that looks like a price: an optional currency marker (one of the
# symbols € $ £ or the codes EUR/USD/GBP, matched case-insensitively) may sit
# on either side of the number. Capture group 1 is the numeric part only.
#
# The number has two alternatives, tried in this order:
#   1. 1-3 digits followed by one or more 3-digit groups, each introduced by
#      ".", "," or a whitespace character, plus an optional 1-2 digit decimal
#      part (e.g. "1.299,99", "1,299.99", "1 299").
#   2. A plain run of digits with an optional 1-2 digit decimal part
#      (e.g. "1299.99", "29,90").
#
# The grouped alternative has to come first: if the plain one were tried
# first, "1.299,99" would match only "1.29" (the digit "1" followed by what
# looks like a two-digit decimal part) and the price would be truncated.
_PRICE_NUMBER_RE = re.compile(
    r"(?:[€$£]|EUR|USD|GBP)?\s*"
    r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
    r"\s*(?:[€$£]|EUR|USD|GBP)?",
    re.IGNORECASE,
)
def _fetch_html(url: str, timeout: float = 15.0) -> str:
"""GET a URL with realistic headers, one retry on failure.
Raises the last urllib error if both attempts fail.
"""
last_err: Exception | None = None
for attempt in range(2):
try:
req = urllib.request.Request(url, headers=_REQUEST_HEADERS)
with urllib.request.urlopen(req, timeout=timeout) as resp:
raw = resp.read()
charset = resp.headers.get_content_charset() or "utf-8"
try:
return raw.decode(charset, errors="replace")
except (LookupError, UnicodeDecodeError):
return raw.decode("utf-8", errors="replace")
except Exception as err: # noqa: BLE001 - retry on any transport error
last_err = err
continue
raise last_err if last_err is not None else RuntimeError("fetch failed")
def _normalize_price(raw) -> float | None:
"""Normalize a price token to float, tolerating comma/dot and symbols.
Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90" etc.
Returns None if no numeric value can be parsed.
"""
if raw is None:
return None
if isinstance(raw, (int, float)):
try:
return float(raw)
except (ValueError, TypeError):
return None
text = str(raw).strip()
if not text:
return None
match = _PRICE_NUMBER_RE.search(text)
if not match:
return None
num = match.group(1).strip().replace(" ", "")
last_comma = num.rfind(",")
last_dot = num.rfind(".")
if last_comma != -1 and last_dot != -1:
# The right-most separator is the decimal separator.
if last_comma > last_dot:
# European: 1.299,99 -> dots are thousands, comma is decimal.
num = num.replace(".", "").replace(",", ".")
else:
# US: 1,299.99 -> commas are thousands, dot is decimal.
num = num.replace(",", "")
elif last_comma != -1:
# Only commas present. Decimal if it looks like "29,90"; else thousands.
if len(num) - last_comma - 1 == 2:
num = num.replace(",", ".")
else:
num = num.replace(",", "")
# Only dots (or none): assume dot is already decimal / no separators.
try:
return float(num)
except ValueError:
return None
def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
    """Return the price held by the first node matching ``selector``.

    Price-bearing attributes of the node are consulted first, in a fixed
    order; the node's text is the last resort. Returns ``None`` when the
    selector is invalid, matches nothing, or no value parses as a price.
    """
    try:
        element = soup.select_one(selector)
    except Exception:  # noqa: BLE001 - invalid selector should not abort
        return None
    if element is None:
        return None

    # Lazily walk the attributes that are actually present on the node.
    attribute_values = (
        element.get(name)
        for name in ("content", "data-price", "data-product-price", "value")
        if element.has_attr(name)
    )
    for value in attribute_values:
        parsed = _normalize_price(value)
        if parsed is not None:
            return parsed

    # No usable attribute: parse the visible text instead.
    return _normalize_price(element.get_text(" ", strip=True))
def _iter_json_ld_prices(soup: BeautifulSoup):
    """Yield raw price candidates found in the page's JSON-LD blocks.

    Every ``<script type="application/ld+json">`` tag is parsed; blocks that
    are empty or not valid JSON are skipped. For each JSON object found at
    any depth, the ``price`` of each of its ``offers`` is yielded first,
    then the object's own ``price`` when its ``offers`` value is neither an
    object nor a list. Values are yielded as found (not normalized).
    """
    scripts = soup.find_all("script", attrs={"type": "application/ld+json"})
    for script in scripts:
        raw_json = script.string or script.get_text()
        if not raw_json:
            continue
        try:
            document = json.loads(raw_json)
        except (ValueError, TypeError):
            continue

        for entry in _walk_json(document):
            if not isinstance(entry, dict):
                continue
            offers = entry.get("offers")
            yield from (
                offer.get("price")
                for offer in _as_list(offers)
                if isinstance(offer, dict) and "price" in offer
            )
            # Some schemas place price directly on the node.
            has_structured_offers = isinstance(offers, (dict, list))
            if "price" in entry and not has_structured_offers:
                yield entry.get("price")
def _walk_json(node):
"""Depth-first walk over arbitrarily nested JSON structures."""
if isinstance(node, dict):
yield node
for value in node.values():
yield from _walk_json(value)
elif isinstance(node, list):
for item in node:
yield from _walk_json(item)
def _as_list(value):
"""Wrap a value in a list unless it already is one."""
if value is None:
return []
return value if isinstance(value, list) else [value]
def _extract_from_meta(soup: BeautifulSoup) -> float | None:
    """Return the first price found in well-known ``<meta>`` tags.

    The tags are probed in priority order; for each, the first matching tag's
    ``content`` attribute is parsed. Returns ``None`` when no probed tag
    yields a parseable price.
    """
    # (attribute name, attribute value) pairs identifying each meta tag.
    lookups = (
        ("itemprop", "price"),
        ("property", "og:price:amount"),
        ("property", "product:price:amount"),
        ("name", "twitter:data1"),
    )
    for attr_name, attr_value in lookups:
        meta = soup.find("meta", attrs={attr_name: attr_value})
        if meta is None:
            continue
        parsed = _normalize_price(meta.get("content"))
        if parsed is not None:
            return parsed
    return None
def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
    """Guess availability from the page text.

    Returns ``None`` when the page has no text at all, ``False`` when the
    lower-cased text contains any out-of-stock marker, and ``True``
    otherwise.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    if not page_text:
        return None
    return not any(marker in page_text for marker in _OUT_OF_STOCK_MARKERS)
def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
    """Return the first price produced by the extraction cascade.

    Strategies are tried in this order, stopping at the first hit:
    the caller-supplied CSS selector (if any), JSON-LD offers, price meta
    tags, and finally the common-class heuristic selectors. Returns
    ``None`` when every strategy comes up empty.
    """

    def _candidates():
        # Lazily produced, so later strategies only run if earlier ones fail.
        # 1. Caller-supplied CSS selector (most robust).
        if price_selector:
            yield _extract_from_selector(soup, str(price_selector))
        # 2. JSON-LD offers.
        for raw_value in _iter_json_ld_prices(soup):
            yield _normalize_price(raw_value)
        # 3. Meta tags.
        yield _extract_from_meta(soup)
        # 4. Common-class heuristics.
        for heuristic in _PRICE_HEURISTIC_SELECTORS:
            yield _extract_from_selector(soup, heuristic)

    return next((found for found in _candidates() if found is not None), None)
def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
    """Scrape current prices for a list of competitor product pages.

    For each target performs a GET with realistic headers (timeout + 1 retry)
    and extracts the price using a cascade of strategies. A failure on a
    single target never aborts the others: that row is returned with
    price=None (and in_stock=None) so the caller still gets one row per
    target, and the failure is reported as a warning on this module's
    logger instead of being dropped silently. Targets without a URL are
    not fetched and also yield price=None / in_stock=None.

    Args:
        targets: list of dicts, each with keys:
            - competitor (str): competitor name/id.
            - product_key (str): stable internal product key.
            - product_name (str): human-readable product name.
            - url (str): product page URL to scrape.
            - price_selector (str, optional): CSS selector pinpointing the
              price node. Most robust when provided.
            - currency (str, optional): currency code to stamp on the row
              (e.g. "EUR"). Defaults to "EUR".

    Returns:
        list of dicts, one per target, with EXACTLY these keys (1:1 with the
        Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at):
            - competitor (str)
            - product_key (str)
            - product_name (str)
            - url (str)
            - price (float | None)
            - currency (str)
            - in_stock (bool | None)
    """
    logger = logging.getLogger(__name__)
    rows: list[dict] = []
    for target in targets:
        url = target.get("url")
        price: float | None = None
        in_stock: bool | None = None
        if url:
            try:
                html = _fetch_html(url)
                soup = BeautifulSoup(html, "lxml")
                price = _extract_price(soup, target.get("price_selector"))
                in_stock = _detect_in_stock(soup)
            except Exception as err:  # noqa: BLE001 - never abort the whole batch
                # Keep the best-effort contract, but leave a trace: without
                # it a systematic problem (e.g. a missing HTML parser) is
                # indistinguishable from every page simply lacking a price.
                logger.warning(
                    "scrape failed for %s (%s: %s)",
                    url,
                    type(err).__name__,
                    err,
                )
                # Reset both fields so a half-finished target stays empty.
                price = None
                in_stock = None
        rows.append(
            {
                "competitor": target.get("competitor"),
                "product_key": target.get("product_key"),
                "product_name": target.get("product_name"),
                "url": url,
                "price": price,
                "currency": target.get("currency") or "EUR",
                "in_stock": in_stock,
            }
        )
    return rows
if __name__ == "__main__":
    # Self-test. Getting this far already proves the module imports cleanly.
    print("self-test: import OK")

    # Offline checks of the price parser: (input, expected value, label).
    parse_cases = (
        ("1.299,99 €", 1299.99, "EU thousands+decimal"),
        ("$1,299.99", 1299.99, "US thousands+decimal"),
        ("29,90", 29.90, "EU decimal only"),
        ("1,299", 1299.0, "US thousands only"),
        ("1299.99", 1299.99, "plain dot decimal"),
        ("Precio: 49,95 EUR hoy", 49.95, "embedded"),
    )
    for raw_text, expected_value, label in parse_cases:
        assert _normalize_price(raw_text) == expected_value, label
    assert _normalize_price("no price here") is None, "no number"
    assert _normalize_price(None) is None, "none in -> none out"
    print("self-test: price normalization OK")

    # Row-shape check against an unresolvable host: the batch must still
    # return exactly one row with the full key set and an empty price.
    unreachable_target = {
        "competitor": "demo",
        "product_key": "SKU-1",
        "product_name": "Demo product",
        "url": "http://invalid.localhost.invalid/nope",
        "currency": "EUR",
    }
    result_rows = scrape_competitor_prices([unreachable_target])
    required_keys = {
        "competitor",
        "product_key",
        "product_name",
        "url",
        "price",
        "currency",
        "in_stock",
    }
    assert len(result_rows) == 1, "one row per target"
    only_row = result_rows[0]
    assert set(only_row.keys()) == required_keys, "exact keys"
    assert only_row["price"] is None, "failed target -> price None, no abort"
    assert only_row["currency"] == "EUR", "currency default"
    print("self-test: row shape + graceful-failure OK")

    # Optional: best-effort real fetch against a public URL (never fails build).
    live_target = {
        "competitor": "books-to-scrape",
        "product_key": "light-in-the-attic",
        "product_name": "A Light in the Attic",
        "url": (
            "http://books.toscrape.com/catalogue/"
            "a-light-in-the-attic_1000/index.html"
        ),
        "price_selector": "p.price_color",
        "currency": "GBP",
    }
    try:
        live_rows = scrape_competitor_prices([live_target])
        print(f"self-test: live fetch -> price={live_rows[0]['price']} "
              f"in_stock={live_rows[0]['in_stock']}")
    except Exception as err:  # noqa: BLE001 - network optional
        print(f"self-test: live fetch skipped ({type(err).__name__})")

    print("self-test: ALL OK")