feat(datascience): auto-commit con 7 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,245 @@
|
||||
"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
|
||||
|
||||
Uses Gumroad's verified public JSON search endpoint
|
||||
|
||||
GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
|
||||
|
||||
to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
|
||||
endpoint exposes, besides the product list, the ``total`` count of products in
|
||||
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
|
||||
their own product counts). This scraper focuses on the product list and stamps
|
||||
each product with the taxonomy-level ``total`` so a downstream consumer can
|
||||
reason about saturation without a second request.
|
||||
|
||||
Only stdlib (``urllib``, ``json``, ``gzip``, ``zlib``, ``time``) is used — no heavy dependencies.
|
||||
The function is impure (it performs network I/O) and raises ``RuntimeError`` on
|
||||
HTTP / JSON failures.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import zlib
|
||||
|
||||
# Endpoint queried for every page of Discover search results.
_BASE_URL = "https://gumroad.com/products/search"

# Browser-like User-Agent sent with every request. Without one, Gumroad /
# Cloudflare may reject the request or redirect away from the JSON payload.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

# Values accepted for the ``sort`` argument; anything else is rejected with
# ValueError before a request is made.
_VALID_SORTS = (
    "best_selling",
    "most_reviewed",
    "hot_and_new",
    "highest_rated",
    "newest",
    "price_asc",
    "price_desc",
)
|
||||
|
||||
|
||||
def _build_headers() -> dict:
    """Return the request headers sent with every Gumroad search call."""
    header_pairs = [
        ("User-Agent", _USER_AGENT),
        ("Accept", "application/json"),
        ("Accept-Language", "en-US,en;q=0.9"),
        # urllib never inflates gzip/deflate on its own, and Cloudflare tends
        # to compress when it sees a browser UA, so ask for an uncompressed
        # ("identity") body. _fetch_json still inflates defensively in case
        # this preference is ignored.
        ("Accept-Encoding", "identity"),
        ("Connection", "keep-alive"),
        ("X-Requested-With", "XMLHttpRequest"),
    ]
    return dict(header_pairs)
|
||||
|
||||
|
||||
def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Return the Discover search URL for one pagination window.

    Pagination is driven solely by ``from`` (offset) and ``size``; Gumroad
    ignores ``page``/``per_page`` and always answers from offset 0 for them.
    """
    params = [
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    ]
    encoded = urllib.parse.urlencode(params)
    return f"{_BASE_URL}?{encoded}"
|
||||
|
||||
|
||||
def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET ``url`` and return its body decoded as a JSON object.

    Args:
        url: Fully composed request URL.
        headers: Request headers to send.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The decoded JSON object (always a ``dict``).

    Raises:
        RuntimeError: On any HTTP error status, network/read failure, corrupt
            compressed body, non-JSON body, or JSON that is not an object.
    """
    # Local import: only needed to name the exception raised on truncated
    # bodies (IncompleteRead), which is not an OSError subclass.
    import http.client

    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        # Must come before URLError: HTTPError is a URLError subclass.
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except (OSError, http.client.HTTPException) as exc:
        # Failures while reading the body (socket timeout, connection reset,
        # truncated response) are not wrapped in URLError by urllib.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        elif "deflate" in encoding:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # header; a negative wbits value decodes that variant.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    except (OSError, EOFError, zlib.error) as exc:
        raise RuntimeError(
            f"Gumroad search returned a corrupt compressed body for {url}: {exc}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    # Callers rely on ``.get``; reject valid JSON that is not an object.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned unexpected JSON for {url}: "
            f"expected an object, got {type(payload).__name__}"
        )
    return payload
|
||||
|
||||
|
||||
def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Project a raw Gumroad product onto the flat output record.

    Nested ``seller`` / ``ratings`` objects are flattened, a non-integer
    ``price_cents`` is replaced by 0, and the scrape-level context
    (taxonomy, its total, the sort used and the rank) is attached.
    """
    cents = product.get("price_cents")
    cents = cents if isinstance(cents, int) else 0

    seller_info = product.get("seller") or {}
    rating_info = product.get("ratings") or {}

    flat = {field: product.get(field) for field in ("id", "permalink", "name")}
    flat["seller_name"] = seller_info.get("name")
    flat["ratings_count"] = rating_info.get("count")
    flat["ratings_avg"] = rating_info.get("average")
    flat["price_cents"] = cents
    flat["currency_code"] = product.get("currency_code")
    # Convenience float (cents / 100). The value is NOT converted: for a
    # non-USD product it is still in the product's own currency, which is why
    # currency_code travels alongside it.
    flat["price_usd"] = cents / 100.0
    flat["is_pay_what_you_want"] = bool(product.get("is_pay_what_you_want"))
    flat["is_free"] = cents == 0
    flat["native_type"] = product.get("native_type")
    flat["url"] = product.get("url")
    flat["taxonomy"] = taxonomy
    flat["total_in_taxonomy"] = total_in_taxonomy
    flat["sort_used"] = sort
    flat["rank"] = rank
    return flat
|
||||
|
||||
|
||||
def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the verified Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected or a page returns fewer items than were
    requested (end of window). Each product is normalized to a flat dict
    carrying the taxonomy-level ``total`` (niche saturation), the sort used and
    the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across pages.
            Gumroad's pagination window is finite (from~960 still returns), so
            very high values may hit fewer results than requested.
        page_size: Items requested per page via ``size``. Gumroad accepts at
            least 300. The final page requests only what is still needed; a
            page returning fewer items than requested signals the end.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, a non-JSON body
            (typically a Cloudflare challenge served without a browser UA),
            or a JSON body that is not an object.
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        # Never ask for more than we still need on the last page.
        size = min(page_size, max_products - len(results))
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=20)
        if not isinstance(payload, dict):
            raise RuntimeError(
                f"Gumroad search returned unexpected JSON for {url}: "
                f"expected an object, got {type(payload).__name__}"
            )

        # The taxonomy-level total is stamped on every product; keep the most
        # recent integer value reported by the endpoint.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not products:
            break

        for product in products:
            if len(results) >= max_products:
                break
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    # 0-based position across the whole scrape.
                    rank=len(results),
                )
            )

        # Stop on a short page (window exhausted) or once the quota is met;
        # breaking here also avoids a pointless sleep after the final page.
        if len(products) < size or len(results) >= max_products:
            break

        offset += size
        # Be polite between requests so we don't hammer Gumroad.
        time.sleep(0.4)

    return results
|
||||
Reference in New Issue
Block a user