# fn_registry/python/functions/datascience/scrape_gumroad_discover.py

"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.

Uses Gumroad's verified public JSON search endpoint

    GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>

to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
endpoint exposes, besides the product list, the ``total`` count of products in
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
their own product counts). This scraper focuses on the product list and stamps
each product with the taxonomy-level ``total`` so a downstream consumer can
reason about saturation without a second request.

Only the standard library is used (chiefly ``urllib`` and ``json``, plus
``gzip``/``zlib`` and ``time``) — no heavy dependencies. The function is
impure (it performs network I/O) and raises ``RuntimeError`` on HTTP / JSON
failures.
"""

from __future__ import annotations

import gzip
import http.client
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib

_BASE_URL = "https://gumroad.com/products/search"

# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
# the request or redirect away from the JSON payload.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

_VALID_SORTS = (
    "best_selling",
    "most_reviewed",
    "hot_and_new",
    "highest_rated",
    "newest",
    "price_asc",
    "price_desc",
)


def _build_headers() -> dict:
    """Return the HTTP headers sent with every Discover search request."""
    headers: dict = {}

    # Present the request as a browser-issued XHR that wants JSON back.
    headers["User-Agent"] = _USER_AGENT
    headers["Accept"] = "application/json"
    headers["Accept-Language"] = "en-US,en;q=0.9"

    # urllib never inflates gzip/deflate on its own, and Cloudflare serves
    # gzip when it sees a browser UA, so explicitly ask for an uncompressed
    # ("identity") body. _fetch_json still inflates defensively in case the
    # preference is ignored.
    headers["Accept-Encoding"] = "identity"

    headers["Connection"] = "keep-alive"
    headers["X-Requested-With"] = "XMLHttpRequest"
    return headers


def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Return the Discover search URL for one pagination window.

    Pagination is driven solely by ``from`` (the offset) and ``size``;
    Gumroad ignores ``page``/``per_page`` and always answers from 0 for them.
    """
    params = [
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    ]
    encoded = urllib.parse.urlencode(params)
    return _BASE_URL + "?" + encoded


def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET ``url`` and return its body decoded as a JSON object.

    The body is inflated first when the response declares a gzip/deflate
    ``Content-Encoding`` or starts with the gzip magic bytes.

    Args:
        url: Fully composed request URL.
        headers: Request headers to send.
        timeout: Timeout in seconds, passed to ``urllib.request.urlopen``.

    Returns:
        The decoded top-level JSON object.

    Raises:
        RuntimeError: On an HTTP error status, a connection or read failure,
            a corrupt compressed body, a body that is not valid UTF-8 JSON,
            or a JSON document whose top level is not an object.
    """
    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except (OSError, http.client.HTTPException) as exc:
        # Failures while reading the body (socket timeout, connection reset,
        # truncated response) are not wrapped in URLError by urllib.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        elif "deflate" in encoding:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw DEFLATE stream without the zlib
                # wrapper; a negative wbits tells zlib to expect that.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    except (OSError, EOFError, zlib.error) as exc:
        raise RuntimeError(
            f"Gumroad search returned an undecodable compressed body for "
            f"{url}: {exc}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    # Callers use dict methods on the payload; reject arrays/scalars here so
    # the failure surfaces as the documented RuntimeError.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned unexpected JSON for {url}: expected an "
            f"object, got {type(payload).__name__}."
        )
    return payload


def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Project one raw Gumroad product onto the scraper's flat record shape.

    Nested ``seller`` / ``ratings`` objects are flattened, the price is
    coerced to integer cents (anything that is not an ``int`` becomes 0), and
    the scrape-level context (taxonomy, total, sort, rank) is attached.
    """
    raw_price = product.get("price_cents")
    cents = raw_price if isinstance(raw_price, int) else 0

    # Missing or null nested objects are treated as empty.
    seller_info = product.get("seller") or {}
    rating_info = product.get("ratings") or {}

    flat: dict = {}
    for key in ("id", "permalink", "name"):
        flat[key] = product.get(key)

    flat["seller_name"] = seller_info.get("name")
    flat["ratings_count"] = rating_info.get("count")
    flat["ratings_avg"] = rating_info.get("average")

    flat["price_cents"] = cents
    flat["currency_code"] = product.get("currency_code")
    # price_usd is simply cents / 100 as a float. For non-USD products the
    # number is kept as-is; currency_code stays alongside it so the consumer
    # can convert or decide.
    flat["price_usd"] = cents / 100.0
    flat["is_pay_what_you_want"] = bool(product.get("is_pay_what_you_want"))
    flat["is_free"] = cents == 0

    for key in ("native_type", "url"):
        flat[key] = product.get(key)

    flat.update(
        taxonomy=taxonomy,
        total_in_taxonomy=total_in_taxonomy,
        sort_used=sort,
        rank=rank,
    )
    return flat


def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the verified Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected, a page comes back empty, or a page returns
    fewer items than were requested (end of window). The final request only
    asks for as many items as are still needed. Each product is normalized to
    a flat dict carrying the taxonomy-level ``total`` (niche saturation), the
    sort used and the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across pages.
            Gumroad's pagination window is finite (from~960 still returns), so
            very high values may hit fewer results than requested.
        page_size: Maximum items requested per page via ``size``. Gumroad
            accepts at least 300; a page returning fewer items than requested
            signals the end.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
            (typically a Cloudflare challenge served without a browser UA).
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        remaining = max_products - len(results)
        # Never ask for more than we still need on the last page.
        size = min(page_size, remaining)
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=20)

        # The taxonomy-level total is stamped on every product. It is
        # refreshed from each page that reports an integer total, so products
        # carry the value known at the time their page was fetched.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not products:
            break

        # Cap at `remaining` in case the server returns more than requested.
        for product in products[:remaining]:
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    # 0-based position across the whole scrape.
                    rank=len(results),
                )
            )

        # Stop when the quota is filled, or when a short page shows that the
        # window is exhausted. Exiting here also skips a pointless sleep
        # after the final request.
        if len(results) >= max_products or len(products) < size:
            break

        # Advance past everything this page actually returned.
        offset += len(products)
        # Be polite between requests so we don't hammer Gumroad.
        time.sleep(0.4)

    return results