8a78a70ef6
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
246 lines
9.0 KiB
Python
246 lines
9.0 KiB
Python
"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
|
|
|
|
Uses Gumroad's verified public JSON search endpoint
|
|
|
|
GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
|
|
|
|
to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
|
|
endpoint exposes, besides the product list, the ``total`` count of products in
|
|
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
|
|
their own product counts). This scraper focuses on the product list and stamps
|
|
each product with the taxonomy-level ``total`` so a downstream consumer can
|
|
reason about saturation without a second request.
|
|
|
|
Only stdlib (``urllib``, ``json``, ``time``) is used — no heavy dependencies.
|
|
The function is impure (it performs network I/O) and raises ``RuntimeError`` on
|
|
HTTP / JSON failures.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import gzip
import http.client
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
|
|
|
|
_BASE_URL = "https://gumroad.com/products/search"
|
|
|
|
# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
|
|
# the request or redirect away from the JSON payload.
|
|
_USER_AGENT = (
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
|
)
|
|
|
|
_VALID_SORTS = (
|
|
"best_selling",
|
|
"most_reviewed",
|
|
"hot_and_new",
|
|
"highest_rated",
|
|
"newest",
|
|
"price_asc",
|
|
"price_desc",
|
|
)
|
|
|
|
|
|
def _build_headers() -> dict:
    """Return the HTTP headers sent with every Gumroad search request."""
    header_pairs = [
        ("User-Agent", _USER_AGENT),
        ("Accept", "application/json"),
        ("Accept-Language", "en-US,en;q=0.9"),
        # Ask for an uncompressed body: urllib does not transparently inflate
        # gzip/deflate, and Cloudflare serves gzip when a browser UA is
        # present. "identity" keeps the payload as plain JSON; _fetch_json
        # still inflates defensively in case the request is not honoured.
        ("Accept-Encoding", "identity"),
        ("Connection", "keep-alive"),
        ("X-Requested-With", "XMLHttpRequest"),
    ]
    return dict(header_pairs)
|
|
|
|
|
|
def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Return the Discover search URL for one pagination window.

    Note: Gumroad ignores ``page``/``per_page`` (they always return from 0).
    Only ``from`` (offset) + ``size`` paginate.
    """
    params = [
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    ]
    return _BASE_URL + "?" + urllib.parse.urlencode(params)
|
|
|
|
|
|
def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET ``url`` and return its body decoded as a JSON object.

    Args:
        url: Fully built request URL.
        headers: Request headers to send.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object (always a ``dict``).

    Raises:
        RuntimeError: On any HTTP error status, transport failure (including
            read timeouts and truncated bodies), undecodable compressed body,
            non-JSON body, or JSON that is not an object.
    """
    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except (OSError, http.client.HTTPException) as exc:
        # Failures while reading the body (socket timeout, connection reset,
        # IncompleteRead) are not wrapped in URLError by urllib; convert them
        # so callers only ever have to handle RuntimeError.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc!r}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        elif "deflate" in encoding:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # header; negative wbits tells zlib to expect exactly that.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    except (OSError, EOFError, zlib.error) as exc:
        raise RuntimeError(
            f"Gumroad search returned an undecodable compressed body for "
            f"{url}: {exc!r}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    # Callers use ``.get`` on the result, so anything but an object is an
    # error here rather than an AttributeError later.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned unexpected JSON "
            f"({type(payload).__name__}) for {url}; expected an object."
        )
    return payload
|
|
|
|
|
|
def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Flatten a raw Gumroad product into the flat dict contract.

    Args:
        product: One entry of the search payload's ``products`` list.
        taxonomy: Taxonomy slug the product was found under.
        total_in_taxonomy: Taxonomy-level product count to stamp on the row.
        sort: Sort criterion used for the scrape.
        rank: Position of the product in the returned order.

    Returns:
        A flat dict with the keys ``id, permalink, name, seller_name,
        ratings_count, ratings_avg, price_cents, currency_code, price_usd,
        is_pay_what_you_want, is_free, native_type, url, taxonomy,
        total_in_taxonomy, sort_used, rank``. Fields absent from ``product``
        are ``None``, except ``price_cents`` which falls back to ``0``.
    """
    # Nested objects may be missing, null, or of an unexpected type; treat
    # anything that is not a dict as "no data" instead of crashing on .get.
    seller = product.get("seller")
    if not isinstance(seller, dict):
        seller = {}
    ratings = product.get("ratings")
    if not isinstance(ratings, dict):
        ratings = {}

    # bool is a subclass of int, so exclude it explicitly: a stray ``True``
    # must not be reported as a price of one cent.
    price_cents = product.get("price_cents")
    if isinstance(price_cents, bool) or not isinstance(price_cents, int):
        price_cents = 0

    return {
        "id": product.get("id"),
        "permalink": product.get("permalink"),
        "name": product.get("name"),
        "seller_name": seller.get("name"),
        "ratings_count": ratings.get("count"),
        "ratings_avg": ratings.get("average"),
        "price_cents": price_cents,
        "currency_code": product.get("currency_code"),
        # price_usd is a convenience float (cents/100). If the currency is not
        # USD we keep the numeric value but preserve currency_code so the
        # consumer can convert/decide.
        "price_usd": price_cents / 100.0,
        "is_pay_what_you_want": bool(product.get("is_pay_what_you_want")),
        "is_free": price_cents == 0,
        "native_type": product.get("native_type"),
        "url": product.get("url"),
        "taxonomy": taxonomy,
        "total_in_taxonomy": total_in_taxonomy,
        "sort_used": sort,
        "rank": rank,
    }
|
|
|
|
|
|
def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
    *,
    timeout: int = 20,
    request_delay: float = 0.4,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected or a page returns fewer items than were
    requested (end of window). Each product is normalized to a flat dict
    carrying the taxonomy-level ``total`` (niche saturation), the sort used
    and the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across pages.
            Gumroad's pagination window is finite (from~960 still returns), so
            very high values may hit fewer results than requested.
        page_size: Maximum items requested per page via ``size``. Gumroad
            accepts at least 300. The final page asks only for the number of
            products still missing.
        timeout: Per-request timeout in seconds.
        request_delay: Seconds to wait between consecutive page requests; no
            wait happens after the last page. Values <= 0 disable the wait.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, a non-JSON body
            (typically a Cloudflare challenge served without a browser UA),
            or a JSON payload that is not an object.
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        # Never ask for more than we still need on the last page.
        size = min(page_size, max_products - len(results))
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=timeout)
        if not isinstance(payload, dict):
            raise RuntimeError(
                f"Gumroad search returned unexpected JSON "
                f"({type(payload).__name__}) for {url}; expected an object."
            )

        # The taxonomy-level total is stamped on every product; keep the most
        # recent integer value reported (bool is excluded on purpose).
        total_val = payload.get("total")
        if isinstance(total_val, int) and not isinstance(total_val, bool):
            total_in_taxonomy = total_val

        products = payload.get("products")
        if not isinstance(products, list) or not products:
            break

        for product in products:
            if len(results) >= max_products:
                break
            if not isinstance(product, dict):
                # Malformed entry: skip it rather than abort the scrape.
                continue
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    rank=len(results),  # 0-based position across the scrape
                )
            )

        # Stop when we have enough, or when a short page shows the window is
        # exhausted; checking first avoids a pointless sleep before returning.
        if len(results) >= max_products or len(products) < size:
            break

        # Advance by what the server actually returned for this window.
        offset += len(products)
        # Be polite between requests so we don't hammer Gumroad.
        if request_delay > 0:
            time.sleep(request_delay)

    return results
|