763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
348 lines
12 KiB
Python
348 lines
12 KiB
Python
"""Pure HTML parser for Amazon ranking pages (Best Sellers and Movers & Shakers).

This module holds the *pure* DOM-parsing core shared by the HTTP scraper
(``scrape_amazon_bestsellers``) and the CDP/browser scraper
(``scrape_amazon_movers_cdp``). It takes a chunk of already-fetched HTML (from
``requests`` or from a rendered ``outerHTML`` via Chrome DevTools Protocol) and
returns a list of product dicts. No I/O, no network, deterministic for a fixed
input string — so it can be unit-tested with HTML fixtures and reused by any
fetch strategy.

Amazon serves several DOM templates at once (A/B tests) and rotates them often,
so every field is parsed defensively with multiple fallback selectors. A field
that no known template exposes is returned as ``None`` rather than raising.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import urljoin
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Currency guessed from the marketplace domain (used only as a fallback when
# the price string has no recognisable symbol).
_CURRENCY_BY_MARKET = {
    "amazon.es": "EUR",
    "amazon.com": "USD",
    "amazon.co.uk": "GBP",
    "amazon.de": "EUR",
    "amazon.fr": "EUR",
    "amazon.it": "EUR",
    "amazon.com.mx": "MXN",
    "amazon.com.br": "BRL",
}

# Map common currency symbols to ISO codes.
#
# Order matters: lookups scan this mapping in insertion order with a substring
# test, so multi-character symbols must come before the bare "$" they contain.
# With "$" listed first, "R$ 10,00" would be reported as USD and the BRL entry
# could never be reached.
_SYMBOL_TO_CURRENCY = {
    "US$": "USD",
    "R$": "BRL",
    "€": "EUR",
    "£": "GBP",
    "$": "USD",
}

# ASIN embedded in a product link: /dp/<ASIN> or /gp/product/<ASIN>, followed
# by "/", "?" or the end of the string.
_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
# First run of digits in a rank badge, with an optional leading "#".
_RANK_RE = re.compile(r"#?\s*(\d+)")
# First numeric token (optionally signed) with "." / "," separators.
_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
# First run of digits and separators in a review-count label.
_REVIEWS_RE = re.compile(r"[\d.,]+")
# Number preceding a localised "out of" phrase in a star-rating label.
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
# Number immediately preceding a percent sign.
_PCT_RE = re.compile(r"([\d.,]+)\s*%")
|
|
|
|
|
|
def _text(node) -> str:
    """Return the text of ``node``, or ``""`` when ``node`` is ``None``.

    Text fragments are stripped and joined with a single space.
    """
    if node is None:
        return ""
    return node.get_text(" ", strip=True)
|
|
|
|
|
|
def _parse_asin(card) -> str | None:
    """Extract the product ASIN from a card.

    Sources are tried in order: the card's own ``data-asin`` attribute, the
    first descendant carrying ``data-asin``, then the first link whose href
    matches ``_ASIN_RE``. Attribute values are accepted only when they are
    exactly ten characters from ``A-Z0-9``. Returns ``None`` if nothing matches.
    """
    asin_shape = r"[A-Z0-9]{10}"

    root_value = card.get("data-asin")
    if root_value and re.fullmatch(asin_shape, root_value):
        return root_value

    # Some templates put data-asin on a descendant, not the card root.
    nested = card.select_one("[data-asin]")
    nested_value = nested.get("data-asin") if nested is not None else None
    if nested_value and re.fullmatch(asin_shape, nested_value):
        return nested_value

    # Last resort: pull the ASIN out of the first product-style link.
    link_hits = (
        _ASIN_RE.search(anchor["href"]) for anchor in card.find_all("a", href=True)
    )
    return next((hit.group(1) for hit in link_hits if hit), None)
|
|
|
|
|
|
def _parse_url(card, marketplace: str) -> str | None:
    """Build the absolute product URL for a card.

    Prefers the first link whose href matches ``_ASIN_RE``; otherwise falls
    back to the first link of any kind. The query string is dropped and the
    result is resolved against ``https://www.<marketplace>``. Returns ``None``
    when the card contains no link with an href.
    """
    origin = f"https://www.{marketplace}"

    anchors = card.find_all("a", href=True)
    chosen = next((a for a in anchors if _ASIN_RE.search(a["href"])), None)
    if chosen is None:
        # Fall back to the first link at all.
        chosen = card.find("a", href=True)
    if chosen is None:
        return None

    path, _, _ = chosen["href"].partition("?")
    return urljoin(origin, path)
|
|
|
|
|
|
def _parse_rank(card) -> int | None:
    """Read the rank badge of a card (rendered as '#1', '1', etc.).

    Selectors are tried in order and the first one yielding non-empty text
    wins; the first run of digits in that text is returned as an int.
    Returns ``None`` when no selector yields text containing digits.
    """
    badge_selectors = (
        ".zg-bdg-text, .zg-badge-text, [class*='badge']",
        # Alternative templates expose the rank under these classes instead.
        ".a-badge-text",
        "[class*='rank']",
    )

    label = ""
    for selector in badge_selectors:
        label = _text(card.select_one(selector))
        if label:
            break

    found = _RANK_RE.search(label)
    if found is None:
        return None
    return int(found.group(1))
|
|
|
|
|
|
def _parse_title(card) -> str | None:
    """Return the product title, trying several known templates in order.

    For each selector the first matching node is inspected: an ``<img>``
    contributes its stripped ``alt`` text; any other node contributes its
    non-blank ``title`` attribute or, failing that, its visible text.
    Returns ``None`` when no selector produces a value.
    """
    title_selectors = (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    )

    for selector in title_selectors:
        hit = card.select_one(selector)
        if hit is None:
            continue

        if hit.name == "img":
            # Images carry the title only in their alt text.
            alt_text = hit.get("alt")
            if alt_text:
                return alt_text.strip()
        else:
            # The title attribute holds the untruncated name when present.
            tooltip = hit["title"].strip() if hit.has_attr("title") else ""
            if tooltip:
                return tooltip
            visible = _text(hit)
            if visible:
                return visible

    return None
|
|
|
|
|
|
def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Price value (float) and ISO currency, best-effort across templates.

    Selectors are tried in order; the first one whose text contains a
    parseable number wins.

    Currency resolution:
      * Symbols from ``_SYMBOL_TO_CURRENCY`` are matched longest-first, so
        ``"R$"`` and ``"US$"`` are recognised before the bare ``"$"`` they
        contain.
      * A bare ``"$"`` is shared by several currencies. When the marketplace's
        own currency uses the dollar sign (e.g. MXN on ``amazon.com.mx``), the
        marketplace currency is used instead of assuming USD.
      * With no symbol at all, the marketplace currency from
        ``_CURRENCY_BY_MARKET`` is used (``None`` for unknown marketplaces).

    Args:
        card: Parsed product card element.
        marketplace: Amazon domain, e.g. ``"amazon.es"``.

    Returns:
        ``(value, currency)``; ``(None, None)`` when no price could be parsed.
    """
    market_currency = _CURRENCY_BY_MARKET.get(marketplace)
    # Stable sort: symbols of equal length keep their declared order.
    symbols = sorted(
        _SYMBOL_TO_CURRENCY.items(), key=lambda item: len(item[0]), reverse=True
    )
    # Currencies whose marketplaces print prices with a bare "$".
    dollar_sign_currencies = ("USD", "MXN")

    for sel in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue

        m = _PRICE_NUM_RE.search(txt)
        if not m:
            continue
        value = _to_float(m.group(0))
        if value is None:
            continue

        # Default to the marketplace currency; override if a symbol is found.
        currency = market_currency
        for sym, iso in symbols:
            if sym not in txt:
                continue
            ambiguous_dollar = sym == "$" and market_currency in dollar_sign_currencies
            if not ambiguous_dollar:
                currency = iso
            break
        return value, currency

    return None, None
|
|
|
|
|
|
def _parse_rating(card) -> float | None:
    """Star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'.

    For each selector the label is the node's text, else its ``title``
    attribute, else its ``aria-label``. A label matching ``_RATING_RE`` ends
    the search immediately (even if the number fails to parse). Otherwise a
    bare number is accepted only when the label mentions "star"/"estrella".
    Returns ``None`` when no selector yields a rating.
    """
    rating_selectors = (
        "[class*='review-stars']",
        ".a-icon-alt",
        "[title*='star']",
        "[aria-label*='star']",
    )

    for selector in rating_selectors:
        node = card.select_one(selector)
        if node is None:
            continue

        label = _text(node) or node.get("title", "") or node.get("aria-label", "")
        if not label:
            continue

        explicit = _RATING_RE.search(label)
        if explicit:
            return _to_float(explicit.group(1))

        # Some templates only render the number ('4,5').
        bare = _PRICE_NUM_RE.search(label)
        lowered = label.lower()
        if bare and ("star" in lowered or "estrella" in lowered):
            return _to_float(bare.group(0))

    return None
|
|
|
|
|
|
def _parse_reviews(card) -> int | None:
    """Number of ratings/reviews shown next to the stars.

    Every node matched by each selector is scanned in order. The first
    digit/separator token found in a node's text has its "." and ","
    removed; if what remains is purely digits it is returned as an int.
    No further plausibility check is applied, so the first numeric token
    wins. Returns ``None`` when nothing qualifies.
    """
    count_selectors = (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    )

    for selector in count_selectors:
        for candidate in card.select(selector):
            # An empty text simply produces no match here.
            found = _REVIEWS_RE.search(_text(candidate))
            if found is None:
                continue
            # Grouping separators ("1.234" / "1,234") are dropped, not parsed.
            digits = "".join(ch for ch in found.group(0) if ch not in ".,")
            if digits.isdigit():
                return int(digits)

    return None
|
|
|
|
|
|
def _parse_pct_change(card) -> float | None:
    """Movers & Shakers percentage change ('+150%').

    Targets the sales-rank-gain badge specific to the movers grid, NOT the
    generic discount/savings percent (``apex-savings-percent``) that appears on
    bestseller/deal cards — matching those would report a bogus pct_change.

    The value is negated when the badge text starts with "-". Returns ``None``
    when no selector yields a parseable percentage.
    """
    movement_selectors = (
        ".zg-percent-change",
        "[class*='sales-movement']",
        "[class*='percent-change']",
        "[class*='zg_percent']",
    )

    for selector in movement_selectors:
        label = _text(card.select_one(selector))
        if not label:
            continue

        hit = _PCT_RE.search(label)
        if hit is None:
            continue

        magnitude = _to_float(hit.group(1))
        if magnitude is None:
            continue

        # The regex captures only the digits, so the sign is read from the text.
        if label.strip().startswith("-"):
            return -magnitude
        return magnitude

    return None
|
|
|
|
|
|
def _to_float(raw: str) -> float | None:
    """Parse a numeric string with EU or US decimal/grouping conventions.

    Rules, applied after removing spaces (regular, no-break and narrow
    no-break) and any trailing "." / "," picked up from surrounding text:

      * Both "," and "." present: the rightmost one is the decimal
        separator, the other is grouping ("1.234,56" and "1,234.56" both
        give 1234.56).
      * Only one kind of separator, repeated: it is grouping
        ("1,234,567" and "1.234.567" both give 1234567.0).
      * A single ",": decimal separator (EU markets), so "4,5" gives 4.5.
      * A single ".": decimal separator.

    Args:
        raw: Numeric text, or ``None``.

    Returns:
        The parsed float, or ``None`` for ``None``, empty or unparseable input.
    """
    if raw is None:
        return None

    s = raw.strip().replace("\xa0", "").replace("\u202f", "").replace(" ", "")
    # The number regexes are greedy and can capture sentence punctuation
    # ("12.99."), which would otherwise make float() fail.
    s = s.rstrip(".,")
    if not s:
        return None

    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # The rightmost separator is the decimal one.
        if last_comma > last_dot:
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif last_comma != -1:
        if s.count(",") > 1:
            # A decimal separator cannot repeat, so these are grouping commas.
            s = s.replace(",", "")
        else:
            # Treat a single comma as decimal separator (EU markets).
            s = s.replace(",", ".")
    elif s.count(".") > 1:
        # Repeated dots can only be EU-style thousands grouping.
        s = s.replace(".", "")

    try:
        return float(s)
    except ValueError:
        return None
|
|
|
|
|
|
def _select_cards(soup: BeautifulSoup) -> list:
    """Locate the list-item cards across known Amazon templates.

    Prefers the grid *wrapper* (``gridItemRoot``) over the inner faceout: the
    rank badge (``span.zg-bdg-text``) is a sibling of the faceout *inside* the
    wrapper, so selecting the wrapper keeps both rank and product data in the
    same card. Older / alternative templates fall back to their own roots.

    Returns the matches of the first selector that matches anything, or ``[]``
    when none do.
    """
    # Ordered from most to least specific; only the first hit is used.
    card_selectors = (
        'div[id="gridItemRoot"]',
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div.p13n-sc-uncoverable-faceout",
        "div[data-asin]",
    )
    # Lazy generator: later selectors are not evaluated once one matches.
    matches_per_selector = (soup.select(selector) for selector in card_selectors)
    return next((matches for matches in matches_per_selector if matches), [])
|
|
|
|
|
|
def parse_amazon_ranking_html(
    html: str,
    marketplace: str = "amazon.es",
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Parse Amazon ranking HTML into a list of product dicts (pure).

    Pure function: given a fixed HTML string it always returns the same list,
    with no I/O. Used by both the HTTP scraper (``scrape_amazon_bestsellers``)
    and the CDP scraper (``scrape_amazon_movers_cdp``).

    Args:
        html: Raw HTML of an Amazon ranking page (or the rendered ``outerHTML``
            of the grid container). May be the whole document or just the grid.
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``. Used
            to build absolute product URLs and to infer the fallback currency.
        list_type: ``"bestsellers"`` or ``"movers_shakers"``. Only affects
            whether ``pct_change`` is parsed (movers) or forced to ``None``.
        max_items: Maximum number of products returned.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. ``category`` is always ``None`` here —
        the caller (which knows the URL) fills it in. Returns ``[]`` for empty
        or card-less HTML (never raises on missing fields).
    """
    if not html:
        return []

    wants_pct_change = list_type == "movers_shakers"
    products: list[dict] = []

    document = BeautifulSoup(html, "lxml")
    # ``position`` counts every card, including skipped ones, so the positional
    # rank fallback reflects the card's slot in the grid.
    for position, card in enumerate(_select_cards(document), start=1):
        if len(products) >= max_items:
            break

        asin = _parse_asin(card)
        title = _parse_title(card)
        # Skip empty / non-product wrappers.
        if asin is None and title is None:
            continue

        rank = _parse_rank(card)
        if rank is None:
            # Positional fallback when no badge is rendered.
            rank = position

        price, currency = _parse_price(card, marketplace)
        pct_change = _parse_pct_change(card) if wants_pct_change else None

        products.append(
            {
                "marketplace": marketplace,
                "list_type": list_type,
                "category": None,
                "rank": rank,
                "asin": asin,
                "title": title,
                "price": price,
                "currency": currency,
                "rating": _parse_rating(card),
                "reviews": _parse_reviews(card),
                "pct_change": pct_change,
                "url": _parse_url(card, marketplace),
            }
        )

    return products
|