feat(datascience): auto-commit con 7 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-03 00:48:43 +02:00
parent 5a4f82cf76
commit 8a78a70ef6
7 changed files with 817 additions and 8 deletions
@@ -0,0 +1,245 @@
+"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
+
+Uses Gumroad's verified public JSON search endpoint
+
+    GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
+
+to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
+endpoint exposes, besides the product list, the ``total`` count of products in
+that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
+their own product counts). This scraper focuses on the product list and stamps
+each product with the taxonomy-level ``total`` so a downstream consumer can
+reason about saturation without a second request.
+
+Only stdlib (``urllib``, ``json``, ``gzip``, ``zlib``, ``time``) is used — no
+heavy dependencies. The public function is impure (it performs network I/O) and
+raises ``RuntimeError`` on HTTP / JSON failures.
+"""
+
+from __future__ import annotations
+
+import gzip
+import json
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+import zlib
+
+# Gumroad Discover's public JSON search endpoint.
+_BASE_URL = "https://gumroad.com/products/search"
+
+# Gumroad / Cloudflare may reject the request, or redirect away from the JSON
+# payload, unless a browser-like User-Agent is sent.
+_USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/124.0.0.0 Safari/537.36"
+)
+
+# Sort criteria accepted by scrape_gumroad_discover; anything else is rejected
+# there with ValueError.
+_VALID_SORTS = (
+    "best_selling", "most_reviewed", "hot_and_new", "highest_rated",
+    "newest", "price_asc", "price_desc",
+)
+
+
+def _build_headers() -> dict:
+    """Return the request headers sent with every Discover search call."""
+    # Ask for a plain, uncompressed JSON body. urllib does not inflate
+    # gzip/deflate on its own and Cloudflare serves gzip when it sees a browser
+    # UA, so "identity" keeps the payload as plain JSON; _fetch_json still
+    # inflates defensively in case Cloudflare ignores the header.
+    content_negotiation = {
+        "Accept": "application/json",
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": "identity",
+    }
+    return {
+        "User-Agent": _USER_AGENT,
+        **content_negotiation,
+        "Connection": "keep-alive",
+        "X-Requested-With": "XMLHttpRequest",
+    }
+
+
+def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
+    """Return the Discover search URL for one pagination window.
+
+    Pagination is expressed with ``from`` (offset) and ``size`` only: Gumroad
+    ignores ``page``/``per_page`` and always answers from offset 0 for them.
+    """
+    params = [
+        ("taxonomy", taxonomy),
+        ("sort", sort),
+        ("from", offset),
+        ("size", size),
+    ]
+    return _BASE_URL + "?" + urllib.parse.urlencode(params)
+
+
+def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
+    """GET ``url`` and return its body decoded as a JSON object.
+
+    Args:
+        url: Fully composed request URL.
+        headers: Request headers to send.
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        The decoded JSON object (a ``dict``).
+
+    Raises:
+        RuntimeError: On any failure — non-2xx HTTP status, connection or
+            read error (including timeouts), a corrupt compressed body, a
+            body that is not valid UTF-8 JSON, or JSON that is not an object.
+    """
+    req = urllib.request.Request(url, headers=headers, method="GET")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+            encoding = (resp.headers.get("Content-Encoding") or "").lower()
+    except urllib.error.HTTPError as exc:
+        raise RuntimeError(
+            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
+            "Cloudflare may be blocking the request; ensure a browser "
+            "User-Agent is sent, or fall back to the browser MCP/CDP path."
+        ) from exc
+    except urllib.error.URLError as exc:
+        raise RuntimeError(
+            f"Gumroad search request to {url} failed: {exc.reason}"
+        ) from exc
+    except OSError as exc:
+        # Timeouts / resets while reading the body are raised as plain OSError
+        # (not URLError); keep them inside the RuntimeError contract.
+        raise RuntimeError(
+            f"Gumroad search request to {url} failed: {exc}"
+        ) from exc
+
+    # Defensive inflate: Cloudflare may still return a gzip/deflate body
+    # (magic bytes 1f 8b for gzip) even when we ask for identity.
+    try:
+        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
+            raw = gzip.decompress(raw)
+        elif "deflate" in encoding:
+            try:
+                raw = zlib.decompress(raw)
+            except zlib.error:
+                # Some servers send a raw deflate stream without the zlib
+                # header; negative wbits tells zlib to expect exactly that.
+                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
+    except (OSError, EOFError, zlib.error) as exc:
+        raise RuntimeError(
+            f"Gumroad search returned a corrupt compressed body for {url}: "
+            f"{exc}"
+        ) from exc
+
+    try:
+        payload = json.loads(raw.decode("utf-8"))
+    except (ValueError, UnicodeDecodeError) as exc:
+        raise RuntimeError(
+            f"Gumroad search returned non-JSON body for {url}: {exc}. "
+            "A browser User-Agent is required; a Cloudflare challenge page "
+            "is returned as HTML, not JSON."
+        ) from exc
+
+    # Callers use dict access on the result; reject arrays/scalars here rather
+    # than letting them fail later with an unrelated AttributeError.
+    if not isinstance(payload, dict):
+        raise RuntimeError(
+            f"Gumroad search returned JSON that is not an object for {url}: "
+            f"got {type(payload).__name__}"
+        )
+    return payload
+
+
+def _normalize_product(
+    product: dict,
+    taxonomy: str,
+    total_in_taxonomy: int,
+    sort: str,
+    rank: int,
+) -> dict:
+    """Project one raw Gumroad product onto the flat output record.
+
+    Missing ``seller`` / ``ratings`` sub-objects yield ``None`` for the fields
+    derived from them; a ``price_cents`` that is not an ``int`` is treated
+    as 0.
+    """
+    seller_info = product.get("seller") or {}
+    rating_info = product.get("ratings") or {}
+
+    raw_price = product.get("price_cents")
+    cents = raw_price if isinstance(raw_price, int) else 0
+
+    # Fields copied straight from the product, then the derived ones, in the
+    # documented key order.
+    record = {key: product.get(key) for key in ("id", "permalink", "name")}
+    record["seller_name"] = seller_info.get("name")
+    record["ratings_count"] = rating_info.get("count")
+    record["ratings_avg"] = rating_info.get("average")
+    record["price_cents"] = cents
+    record["currency_code"] = product.get("currency_code")
+    # price_usd is just cents/100 as a float. For a non-USD currency the
+    # number is kept as-is and currency_code lets the consumer convert/decide.
+    record["price_usd"] = cents / 100.0
+    record["is_pay_what_you_want"] = bool(product.get("is_pay_what_you_want"))
+    record["is_free"] = cents == 0
+    record["native_type"] = product.get("native_type")
+    record["url"] = product.get("url")
+    record["taxonomy"] = taxonomy
+    record["total_in_taxonomy"] = total_in_taxonomy
+    record["sort_used"] = sort
+    record["rank"] = rank
+    return record
+
+
+def scrape_gumroad_discover(
+    taxonomy: str,
+    sort: str = "best_selling",
+    max_products: int = 300,
+    page_size: int = 100,
+) -> list[dict]:
+    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).
+
+    Paginates the verified Gumroad search endpoint with ``from``+``size`` until
+    ``max_products`` are collected or a page returns fewer items than were
+    requested (end of window). Each product is normalized to a flat dict
+    carrying the taxonomy-level ``total`` (niche saturation), the sort used and
+    the 0-based rank in the returned order.
+
+    Args:
+        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
+            ``"business-and-money"``, ``"3d"``. Determines the market segment
+            scraped and the ``total_in_taxonomy`` reported on every product.
+        sort: One of ``best_selling, most_reviewed, hot_and_new,
+            highest_rated, newest, price_asc, price_desc``. Any other value
+            raises ``ValueError``.
+        max_products: Upper bound on how many products to collect across pages.
+            Gumroad's pagination window is finite (from~960 still returns), so
+            very high values may hit fewer results than requested.
+        page_size: Maximum items requested per page via ``size``; the last
+            page asks only for what is still needed. Gumroad accepts at least
+            300; a page returning fewer items than requested signals the end.
+
+    Returns:
+        A list of flat dicts, one per product, with exactly these keys:
+        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
+        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
+        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.
+
+    Raises:
+        ValueError: If ``sort`` is not one of the allowed values, or if
+            ``max_products``/``page_size`` are not positive.
+        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
+            (typically a Cloudflare challenge served without a browser UA).
+    """
+    if sort not in _VALID_SORTS:
+        raise ValueError(
+            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
+        )
+    if max_products <= 0:
+        raise ValueError(f"max_products must be positive, got {max_products}")
+    if page_size <= 0:
+        raise ValueError(f"page_size must be positive, got {page_size}")
+
+    headers = _build_headers()
+    results: list[dict] = []
+    total_in_taxonomy = 0
+    offset = 0
+
+    while len(results) < max_products:
+        # Never ask for more than we still need on the last page.
+        size = min(page_size, max_products - len(results))
+        url = _build_url(taxonomy, sort, offset, size)
+        payload = _fetch_json(url, headers, timeout=20)
+
+        # The taxonomy-level total is stamped on every product; refresh it
+        # whenever a page reports an integer value.
+        total_val = payload.get("total")
+        if isinstance(total_val, int):
+            total_in_taxonomy = total_val
+
+        products = payload.get("products") or []
+
+        for product in products:
+            # Guard against a server that returns more than was asked for.
+            if len(results) >= max_products:
+                break
+            rank = len(results)  # 0-based position across the whole scrape
+            results.append(
+                _normalize_product(
+                    product,
+                    taxonomy=taxonomy,
+                    total_in_taxonomy=total_in_taxonomy,
+                    sort=sort,
+                    rank=rank,
+                )
+            )
+
+        # Stop on a short (or empty) page — the window is exhausted — or once
+        # the quota is met, so no pointless sleep follows the final request.
+        if len(products) < size or len(results) >= max_products:
+            break
+
+        offset += size
+        # Be polite between requests so we don't hammer Gumroad.
+        time.sleep(0.4)
+
+    return results