"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.

Uses Gumroad's verified public JSON search endpoint

    GET https://gumroad.com/products/search?taxonomy=&sort=&from=&size=

to collect the products of a taxonomy (niche) sorted by a chosen criterion.
The endpoint exposes, besides the product list, the ``total`` count of
products in that taxonomy (a proxy for niche saturation) and ``tags_data``
(sub-niches with their own product counts). This scraper focuses on the
product list and stamps each product with the taxonomy-level ``total`` so a
downstream consumer can reason about saturation without a second request.

Only stdlib (``urllib``, ``json``, ``time``, ``gzip``, ``zlib``) is used — no
heavy dependencies. The function is impure (it performs network I/O) and
raises ``RuntimeError`` on HTTP / JSON failures.
"""

from __future__ import annotations

import gzip
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
from collections.abc import Callable

_BASE_URL = "https://gumroad.com/products/search"

# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
# the request or redirect away from the JSON payload.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

_VALID_SORTS = (
    "best_selling",
    "most_reviewed",
    "hot_and_new",
    "highest_rated",
    "newest",
    "price_asc",
    "price_desc",
)


def _build_headers() -> dict:
    """Headers Gumroad needs to serve the JSON search payload."""
    return {
        "User-Agent": _USER_AGENT,
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        # Request an uncompressed body: urllib does not transparently inflate
        # gzip/deflate, and Cloudflare serves gzip when a browser UA is present.
        # Asking for identity keeps the payload as plain JSON. A defensive
        # inflate in _fetch_json covers the case where Cloudflare ignores this.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
        "X-Requested-With": "XMLHttpRequest",
    }


def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Compose the Discover search URL for a page window.

    Note: Gumroad ignores ``page``/``per_page`` (they always return from 0).
    Only ``from`` (offset) + ``size`` paginate.
    """
    query = urllib.parse.urlencode(
        {
            "taxonomy": taxonomy,
            "sort": sort,
            "from": offset,
            "size": size,
        }
    )
    return f"{_BASE_URL}?{query}"


def _inflate(raw: bytes, encoding: str) -> bytes:
    """Return ``raw`` decompressed according to ``encoding`` (lower-cased).

    gzip is detected either from the ``Content-Encoding`` value or from the
    gzip magic bytes ``1f 8b``. For ``deflate`` the zlib-wrapped form is tried
    first, then a raw deflate stream, because servers send either. Bodies with
    no recognised compression are returned unchanged.
    """
    if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
        return gzip.decompress(raw)
    if "deflate" in encoding:
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Negative wbits selects a raw deflate stream (no zlib header).
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw


def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET the URL and decode the JSON object body.

    Args:
        url: Fully composed search URL.
        headers: Request headers (see ``_build_headers``).
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object.

    Raises:
        RuntimeError: On HTTP error status, connection failure or timeout,
            an undecodable compressed body, a non-JSON body, or a JSON body
            that is not an object.
    """
    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except OSError as exc:
        # Timeouts / resets raised while reading the body are plain OSError
        # subclasses, not URLError; keep the RuntimeError contract for them.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        raw = _inflate(raw, encoding)
    except (OSError, EOFError, zlib.error) as exc:
        raise RuntimeError(
            f"Gumroad search returned a corrupt compressed body for {url}: "
            f"{exc}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned unexpected JSON for {url}: expected an "
            f"object, got {type(payload).__name__}"
        )
    return payload


def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Flatten a raw Gumroad product into the flat dict contract.

    Missing or malformed nested objects (``seller``, ``ratings``) yield
    ``None`` fields; a missing or non-integer ``price_cents`` is treated as 0.
    """
    seller = product.get("seller")
    if not isinstance(seller, dict):
        seller = {}
    ratings = product.get("ratings")
    if not isinstance(ratings, dict):
        ratings = {}

    price_cents = product.get("price_cents")
    # bool is a subclass of int; never treat True/False as a price.
    if isinstance(price_cents, bool) or not isinstance(price_cents, int):
        price_cents = 0

    return {
        "id": product.get("id"),
        "permalink": product.get("permalink"),
        "name": product.get("name"),
        "seller_name": seller.get("name"),
        "ratings_count": ratings.get("count"),
        "ratings_avg": ratings.get("average"),
        "price_cents": price_cents,
        "currency_code": product.get("currency_code"),
        # price_usd is a convenience float (cents/100). If the currency is not
        # USD we keep the numeric value but preserve currency_code so the
        # consumer can convert/decide.
        "price_usd": price_cents / 100.0,
        "is_pay_what_you_want": bool(product.get("is_pay_what_you_want")),
        "is_free": price_cents == 0,
        "native_type": product.get("native_type"),
        "url": product.get("url"),
        "taxonomy": taxonomy,
        "total_in_taxonomy": total_in_taxonomy,
        "sort_used": sort,
        "rank": rank,
    }


def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
    *,
    timeout: int = 20,
    request_delay: float = 0.4,
    fetcher: Callable[[str, dict, int], dict] | None = None,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the verified Gumroad search endpoint with ``from``+``size``
    until ``max_products`` are collected or a page returns fewer items than
    were requested (end of window). Each product is normalized to a flat dict
    carrying the taxonomy-level ``total`` (niche saturation), the sort used
    and the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across
            pages. Gumroad's pagination window is finite (from~960 still
            returns), so very high values may hit fewer results than
            requested.
        page_size: Items requested per page via ``size``. Gumroad accepts at
            least 300. The final page only asks for what is still needed.
        timeout: Per-request socket timeout in seconds.
        request_delay: Seconds to sleep between consecutive page requests;
            ``0`` disables the pause. No pause follows the last request.
        fetcher: Optional callable ``(url, headers, timeout) -> dict`` used
            instead of the built-in urllib fetch, e.g. a browser-backed
            fallback when Cloudflare blocks plain HTTP clients.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
            (typically a Cloudflare challenge served without a browser UA).
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    fetch = _fetch_json if fetcher is None else fetcher
    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        # Never ask for more than we still need on the last page.
        size = min(page_size, max_products - len(results))
        url = _build_url(taxonomy, sort, offset, size)
        payload = fetch(url, headers, timeout)

        # The taxonomy-level total is stamped on every product; keep the most
        # recent integer value the endpoint reported.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not products:
            break

        for product in products:
            if len(results) >= max_products:
                break
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    # 0-based position across the whole scrape.
                    rank=len(results),
                )
            )

        # Stop when we have enough, or when a short page shows the window is
        # exhausted. Checking before the sleep avoids a pointless final pause.
        if len(results) >= max_products or len(products) < size:
            break

        # Advance by what was actually returned so no product is skipped or
        # repeated if the server hands back a different count than requested.
        offset += len(products)

        # Be polite between requests so we don't hammer Gumroad.
        if request_delay > 0:
            time.sleep(request_delay)

    return results