feat(datascience): auto-commit con 7 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-03 00:48:43 +02:00
parent 5a4f82cf76
commit 8a78a70ef6
7 changed files with 817 additions and 8 deletions
@@ -0,0 +1,245 @@
"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
Uses Gumroad's verified public JSON search endpoint
GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
endpoint exposes, besides the product list, the ``total`` count of products in
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
their own product counts). This scraper focuses on the product list and stamps
each product with the taxonomy-level ``total`` so a downstream consumer can
reason about saturation without a second request.
Only the standard library (``urllib``, ``json``, ``gzip``, ``zlib``, ``time``)
is used — no heavy dependencies. ``scrape_gumroad_discover`` is impure (it
performs network I/O) and raises ``RuntimeError`` on HTTP / JSON failures.
"""
from __future__ import annotations
import gzip
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
_BASE_URL = "https://gumroad.com/products/search"
# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
# the request or redirect away from the JSON payload.
_USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
_VALID_SORTS = (
"best_selling",
"most_reviewed",
"hot_and_new",
"highest_rated",
"newest",
"price_asc",
"price_desc",
)
def _build_headers() -> dict:
    """Return the HTTP headers sent with every Discover search request."""
    header_pairs = (
        ("User-Agent", _USER_AGENT),
        ("Accept", "application/json"),
        ("Accept-Language", "en-US,en;q=0.9"),
        # urllib does not inflate gzip/deflate on its own, so ask for an
        # uncompressed (identity) body to keep the payload as plain JSON.
        # _fetch_json still inflates defensively in case the server ignores
        # this preference.
        ("Accept-Encoding", "identity"),
        ("Connection", "keep-alive"),
        ("X-Requested-With", "XMLHttpRequest"),
    )
    return dict(header_pairs)
def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Build the Discover search URL for one pagination window.

    The window is expressed with ``from`` (offset) and ``size``; per the
    original author's note, Gumroad ignores ``page``/``per_page`` and always
    answers those from offset 0.
    """
    params = (
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    )
    encoded = urllib.parse.urlencode(params)
    return _BASE_URL + "?" + encoded
def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
"""GET the URL and decode the JSON body. Raises RuntimeError on failure."""
req = urllib.request.Request(url, headers=headers, method="GET")
try:
with urllib.request.urlopen(req, timeout=timeout) as resp:
raw = resp.read()
# Defensive inflate: Cloudflare may still return a gzip/deflate body
# (magic bytes 1f 8b for gzip) even when we ask for identity.
encoding = (resp.headers.get("Content-Encoding") or "").lower()
if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
raw = gzip.decompress(raw)
elif "deflate" in encoding:
raw = zlib.decompress(raw)
except urllib.error.HTTPError as exc:
raise RuntimeError(
f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
"Cloudflare may be blocking the request; ensure a browser "
"User-Agent is sent, or fall back to the browser MCP/CDP path."
) from exc
except urllib.error.URLError as exc:
raise RuntimeError(
f"Gumroad search request to {url} failed: {exc.reason}"
) from exc
try:
return json.loads(raw.decode("utf-8"))
except (ValueError, UnicodeDecodeError) as exc:
raise RuntimeError(
f"Gumroad search returned non-JSON body for {url}: {exc}. "
"A browser User-Agent is required; a Cloudflare challenge page "
"is returned as HTML, not JSON."
) from exc
def _normalize_product(
product: dict,
taxonomy: str,
total_in_taxonomy: int,
sort: str,
rank: int,
) -> dict:
"""Flatten a raw Gumroad product into the flat dict contract."""
seller = product.get("seller") or {}
ratings = product.get("ratings") or {}
price_cents = product.get("price_cents")
if not isinstance(price_cents, int):
price_cents = 0
currency_code = product.get("currency_code")
return {
"id": product.get("id"),
"permalink": product.get("permalink"),
"name": product.get("name"),
"seller_name": seller.get("name"),
"ratings_count": ratings.get("count"),
"ratings_avg": ratings.get("average"),
"price_cents": price_cents,
"currency_code": currency_code,
# price_usd is a convenience float (cents/100). If the currency is not
# USD we keep the numeric value but preserve currency_code so the
# consumer can convert/decide.
"price_usd": price_cents / 100.0,
"is_pay_what_you_want": bool(product.get("is_pay_what_you_want")),
"is_free": price_cents == 0,
"native_type": product.get("native_type"),
"url": product.get("url"),
"taxonomy": taxonomy,
"total_in_taxonomy": total_in_taxonomy,
"sort_used": sort,
"rank": rank,
}
def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected or a page returns fewer items than were
    requested (end of window). Each product is normalized to a flat dict
    carrying the taxonomy-level ``total`` (niche saturation), the sort used
    and the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across
            pages. Gumroad's pagination window is finite (from~960 still
            returns), so very high values may hit fewer results than
            requested.
        page_size: Maximum items requested per page via ``size``. The last
            page requests only what is still needed. Gumroad accepts at
            least 300; a page returning fewer items than requested signals
            the end.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, a non-JSON body
            (typically a Cloudflare challenge served without a browser UA),
            or a JSON payload that does not have the expected shape.
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while True:
        # Never ask for more than we still need on the last page.
        remaining = max_products - len(results)
        size = min(page_size, remaining)
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=20)
        if not isinstance(payload, dict):
            raise RuntimeError(
                f"Gumroad search returned an unexpected payload for {url}: "
                f"expected a JSON object, got {type(payload).__name__}"
            )

        # The taxonomy-level total is stamped on every product; refresh it
        # from each page that reports a usable value.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not isinstance(products, list):
            raise RuntimeError(
                f"Gumroad search returned an unexpected 'products' value "
                f"for {url}: expected a list, got {type(products).__name__}"
            )

        # Slice defensively in case the server returns more than requested.
        for product in products[:remaining]:
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    # 0-based position across the whole scrape.
                    rank=len(results),
                )
            )

        # Stop when we have enough, or when a short (or empty) page shows the
        # window is exhausted. Checked before sleeping so the final page
        # does not incur a pointless delay.
        if len(results) >= max_products or len(products) < size:
            break

        offset += size
        # Be polite between requests so we don't hammer Gumroad.
        time.sleep(0.4)

    return results