feat(datascience): auto-commit con 7 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-03 00:48:43 +02:00
parent 5a4f82cf76
commit 8a78a70ef6
7 changed files with 817 additions and 8 deletions
@@ -17,7 +17,7 @@ from __future__ import annotations
from .. import model
CHAPTER_VERSION = "1.1.0"
CHAPTER_VERSION = "1.1.1"
CHAPTER_ID = "glosario"
CHAPTER_TITLE = "Glosario"
@@ -89,14 +89,19 @@ def build_glosario(profile: dict, ctx: dict):
"Cada término va resaltado en el texto y, al pulsarlo, salta a su "
"definición en esta sección.")),
]
# One clickable destination per term, alphabetically by visible label. A term
# registered without a definition is completed from the canonical baseline.
for term in glossary.terms(by="label"):
# One clickable destination per term, alphabetically by *visible* label. The
# baseline resolution must happen BEFORE sorting: a term registered bare (no
# label) carries its key as label in the collector, so ordering by the
# collector's label would place it by its key instead of by the human label
# supplied by the baseline catalog. Resolve first, then sort by the final label.
resolved = []
for term in glossary.terms(by="order"):
label, definition = _resolve_term(term)
resolved.append((label, definition, model._safe_str(term.get("key"))))
resolved.sort(key=lambda e: model._safe_str(e[0]).lower())
for label, definition, key in resolved:
blocks.append(model.GlossaryEntry(
key=model._safe_str(term.get("key")),
label=label,
definition=definition))
key=key, label=label, definition=definition))
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,181 @@
"""Tests for the GLOSARIO chapter — DoD: golden + edges + degradation + no-cut render.
The glossary is the last chapter of every AutomaticEDA document. It does not read
the profile: it turns the terms that the other chapters registered on the shared
``GlossaryCollector`` (``ctx['glossary']``) into one clickable ``GlossaryEntry``
destination each, alphabetically by visible label.
Covered here:
- **Golden**: a collector with three terms (one carrying its own definition, two
registered bare and completed from the canonical baseline catalog) builds a
``Chapter`` with three ``GlossaryEntry`` blocks, alphabetically ordered, and
renders to PDF and PPTX with nothing cut.
- **Baseline resolution** (``_resolve_term``): a bare term whose key is in the
baseline gets its label *and* definition filled in; a term that already carries
its own definition is never overwritten.
- **Edges**: ``None`` / ``{}`` ctx, an empty collector and a non-collector value in
``ctx['glossary']`` all return ``None`` (the chapter simply disappears) and never
raise, even with a ``None`` profile.
- **Click target**: every emitted entry carries the registered ``key`` so each
in-text ``[[term:key]]`` appearance resolves to a real jump.
"""
import os
import tempfile
from pptx import Presentation
from pypdf import PdfReader
from datascience.automatic_eda.chapters.glosario import (
_BASELINE_TERMS,
_resolve_term,
build_glosario,
)
from datascience.automatic_eda.model import (
Chapter,
GlossaryCollector,
GlossaryEntry,
)
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
# --------------------------------------------------------------------------- #
# Helpers.
# --------------------------------------------------------------------------- #
def _entries(chapter: Chapter) -> list:
    """Return the chapter's ``GlossaryEntry`` blocks, preserving document order."""
    return list(filter(lambda block: isinstance(block, GlossaryEntry), chapter.blocks))
def _render_both(chapter: Chapter, tag: str):
    """Render the chapter to PDF and PPTX; return (pdf_text, n_slides).

    Both files are written into a throwaway directory whose name embeds
    ``tag``. The directory is removed before returning, so repeated test runs
    do not accumulate ``glosario_*`` folders in the system temp location.

    Args:
        chapter: The built chapter to render (passed as a one-item list).
        tag: Short label used in the temp-dir prefix and the document title.

    Returns:
        ``(pdf_text, n_slides)``: the concatenated text extracted from every
        PDF page and the number of slides in the PPTX.
    """
    meta = {"title": f"EDA — {tag}"}
    # TemporaryDirectory cleans up on exit; the previous mkdtemp() call left
    # one directory behind per invocation.
    with tempfile.TemporaryDirectory(prefix=f"glosario_{tag}_") as tmp:
        pdf_path = os.path.join(tmp, "out.pdf")
        pptx_path = os.path.join(tmp, "out.pptx")
        render_automatic_eda_pdf([chapter], pdf_path, meta)
        render_automatic_eda_pptx([chapter], pptx_path, meta)
        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
        # Read both artifacts back while the directory still exists.
        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
        n_slides = len(Presentation(pptx_path).slides)
    return text, n_slides
def _collector_three_terms() -> GlossaryCollector:
    """Build a collector holding three terms, registered out of alphabetical order.

    - ``entropia`` is registered with its own label and definition, so the
      baseline must not overwrite it.
    - ``pagina_categorica`` and ``histograma_boxplot`` are registered bare
      (key only); the tests expect the chapter to complete them from the
      baseline catalog.
    """
    collector = GlossaryCollector()
    collector.add(
        "entropia",
        "Entropía",
        "Medida de la incertidumbre o dispersión de una variable categórica.",
    )
    # Bare registrations: only the key is supplied.
    for bare_key in ("pagina_categorica", "histograma_boxplot"):
        collector.add(bare_key)
    return collector
# --------------------------------------------------------------------------- #
# Golden.
# --------------------------------------------------------------------------- #
def test_golden_terms_render_clickable_entries():
    """Golden path: three registered terms become three ordered, renderable entries."""
    chapter = build_glosario({"table": "x"}, {"glossary": _collector_three_terms()})

    # Chapter envelope.
    assert isinstance(chapter, Chapter)
    assert chapter.id == "glosario"
    assert chapter.title == "Glosario"
    assert chapter.version == "1.1.1"

    glossary_entries = _entries(chapter)
    assert len(glossary_entries) == 3
    assert all(isinstance(entry, GlossaryEntry) for entry in glossary_entries)

    # Ordering is alphabetical by the visible label:
    # "Cómo leer…" < "Cómo se organiza…" < "Entropía".
    visible_labels = [entry.label for entry in glossary_entries]
    assert visible_labels == sorted(visible_labels, key=str.lower)
    assert visible_labels[0] == "Cómo leer el histograma y el boxplot"
    assert visible_labels[-1] == "Entropía"

    # Bare terms carry the baseline definition; the self-defined term keeps its own.
    definition_of = {entry.key: entry.definition for entry in glossary_entries}
    assert "boxplot de Tukey" in definition_of["histograma_boxplot"]
    assert "identificador" in definition_of["pagina_categorica"]
    assert definition_of["entropia"].startswith("Medida de la incertidumbre")

    # Both renderers produce output, and a label reaches the PDF text.
    pdf_text, n_slides = _render_both(chapter, "golden")
    assert "Entropía" in pdf_text
    assert n_slides >= 1
# --------------------------------------------------------------------------- #
# Baseline resolution (_resolve_term).
# --------------------------------------------------------------------------- #
def test_resolve_term_completes_label_and_definition_from_baseline():
    """A bare term (label == key, empty definition) is filled in from the baseline."""
    key = "histograma_boxplot"
    bare_term = {"key": key, "label": key, "definition": ""}

    resolved_label, resolved_definition = _resolve_term(bare_term)

    assert resolved_label == _BASELINE_TERMS[key]["label"]
    assert "boxplot de Tukey" in resolved_definition
def test_resolve_term_keeps_own_definition_over_baseline():
    """A term with its own label and definition wins even if its key is in the baseline."""
    own_label = "Mi etiqueta"
    own_def = "Definición propia que no debe pisarse."
    term = {"key": "pagina_categorica", "label": own_label, "definition": own_def}

    resolved_label, resolved_definition = _resolve_term(term)

    assert resolved_label == own_label
    assert resolved_definition == own_def
def test_resolve_term_unknown_key_returns_as_is():
    """A key absent from the baseline passes through with its own label and definition."""
    free_term = {"key": "sin_baseline", "label": "Término libre", "definition": "Texto."}

    resolved_label, resolved_definition = _resolve_term(free_term)

    assert resolved_label == "Término libre"
    assert resolved_definition == "Texto."
# --------------------------------------------------------------------------- #
# Edges / degradation — the chapter disappears instead of raising.
# --------------------------------------------------------------------------- #
def test_none_when_no_glossary():
    """An empty ctx and a ``None`` ctx both make the chapter disappear."""
    for ctx in ({}, None):
        assert build_glosario({"table": "x"}, ctx) is None
def test_none_when_empty_collector():
    """A collector with no registered terms yields no chapter."""
    empty_ctx = {"glossary": GlossaryCollector()}
    assert build_glosario({"table": "x"}, empty_ctx) is None
def test_none_when_glossary_is_not_a_collector():
    """A stray value in ``ctx['glossary']`` must not be treated as a collector."""
    stray_values = (
        ["not", "a", "collector"],
        {"entropia": "x"},
    )
    for stray in stray_values:
        assert build_glosario({"table": "x"}, {"glossary": stray}) is None
def test_none_profile_does_not_raise():
    """The profile is ignored: ``None`` neither blocks a valid build nor crashes an empty one."""
    collector = GlossaryCollector()
    collector.add("entropia", "Entropía", "def")

    # None profile + valid collector still builds a chapter.
    built = build_glosario(None, {"glossary": collector})
    assert isinstance(built, Chapter)

    # None profile + no glossary returns None instead of raising.
    assert build_glosario(None, None) is None
# --------------------------------------------------------------------------- #
# Click target — each entry carries its registration key.
# --------------------------------------------------------------------------- #
def test_entries_carry_registered_key_as_click_target():
    """Every emitted entry exposes the key it was registered under."""
    chapter = build_glosario({}, {"glossary": _collector_three_terms()})
    expected_keys = {"entropia", "pagina_categorica", "histograma_boxplot"}
    assert {entry.key for entry in _entries(chapter)} == expected_keys
@@ -0,0 +1,65 @@
---
name: scrape_gumroad_discover
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: impure
signature: "def scrape_gumroad_discover(taxonomy: str, sort: str = 'best_selling', max_products: int = 300, page_size: int = 100) -> list[dict]"
description: "Scrapea el marketplace publico de Gumroad Discover usando el endpoint JSON verificado gumroad.com/products/search (taxonomy+sort+from+size). Recolecta los productos de una taxonomy (nicho) ordenados por el criterio elegido y estampa en cada producto el total de la taxonomy (saturacion del nicho). Normaliza cada producto a un dict plano con id, seller_name, ratings, precio (cents/usd), pay-what-you-want/free, native_type, url y metadatos de scrape (taxonomy, total_in_taxonomy, sort_used, rank 0-based). Solo stdlib (urllib+json+time+gzip+zlib)."
tags: [gumroad, scraping, market-intel, trends, datascience]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: []
tested: true
tests: ["test_normaliza_producto_a_dict_plano", "test_paginacion_para_al_agotar_ventana", "test_sort_invalido_lanza_valueerror", "test_body_gzip_se_descomprime", "test_body_no_json_lanza_runtimeerror"]
test_file_path: "python/functions/datascience/scrape_gumroad_discover_test.py"
file_path: "python/functions/datascience/scrape_gumroad_discover.py"
params:
- name: taxonomy
desc: "Slug de taxonomy / nicho de Gumroad (ej. 'design', 'business-and-money', '3d'). Determina el segmento de mercado scrapeado y el valor total_in_taxonomy (numero total de productos = saturacion del nicho) que se estampa en cada producto."
- name: sort
desc: "Criterio de orden. Uno de: best_selling, most_reviewed, hot_and_new, highest_rated, newest, price_asc, price_desc. Cualquier otro valor lanza ValueError. Default 'best_selling'."
- name: max_products
desc: "Cota superior de productos a recolectar entre paginas. Default 300. La ventana de paginacion de Gumroad es finita (from~960 aun devuelve datos), asi que valores muy altos pueden recibir menos productos de los pedidos."
- name: page_size
desc: "Numero de productos pedidos por pagina via 'size'. Gumroad admite al menos 300. Una pagina que devuelve menos de page_size items señala el fin de la ventana y detiene la paginacion. Default 100."
output: "Lista de dicts planos, uno por producto, con exactamente estas claves: id, permalink, name, seller_name, ratings_count, ratings_avg, price_cents, currency_code, price_usd (float = price_cents/100), is_pay_what_you_want (bool), is_free (bool = price_cents==0), native_type, url, taxonomy (el arg), total_in_taxonomy (el 'total' del JSON = saturacion del nicho), sort_used (el arg sort), rank (posicion 0-based en el orden devuelto)."
---
## Ejemplo
```python
import sys, os
sys.path.insert(0, os.path.join("python", "functions"))
from datascience.scrape_gumroad_discover import scrape_gumroad_discover
# Top best-sellers del nicho "design" en Gumroad Discover
rows = scrape_gumroad_discover(taxonomy="design", sort="best_selling", max_products=300, page_size=100)
print(len(rows), "productos")
print("saturacion del nicho:", rows[0]["total_in_taxonomy"])
print(rows[0])
# {'id': '...', 'permalink': '...', 'name': '...', 'seller_name': '...',
# 'ratings_count': 128, 'ratings_avg': 4.9, 'price_cents': 2900,
# 'currency_code': 'usd', 'price_usd': 29.0, 'is_pay_what_you_want': False,
# 'is_free': False, 'native_type': 'digital', 'url': 'https://...',
# 'taxonomy': 'design', 'total_in_taxonomy': 4213, 'sort_used': 'best_selling', 'rank': 0}
# Productos mas nuevos de un nicho concreto
nuevos = scrape_gumroad_discover(taxonomy="3d", sort="newest", max_products=50)
```
## Cuando usarla
Usala cuando quieras hacer market intelligence sobre productos digitales: descubrir que se vende mas en un nicho de Gumroad, medir la saturacion del nicho (`total_in_taxonomy`) y capturar precios, valoraciones y vendedores para decidir si un nicho merece la pena o esta saturado. Es la fuente de un pipeline de deteccion de oportunidades de producto digital (grupo `market-intel`): scrapea varias taxonomies/sorts, cruza los snapshots y prioriza nichos con demanda alta y competencia manejable. Llamala antes de cualquier analisis de catalogo digital; el dict devuelto es plano y esta listo para insertar en una tabla tras añadir `snapshot_date`/`scraped_at`.
## Gotchas
- **ratings.count son REVIEWS, no ventas**: `ratings_count` cuenta valoraciones dejadas, NO unidades vendidas. Como proxy de ventas hay que multiplicar por un factor incierto (solo una fraccion de compradores valora, y esa fraccion varia por nicho/precio). Trata `ratings_count` como un limite inferior ruidoso de la demanda, no como ventas reales.
- **price=0 no siempre significa gratis util**: `price_cents==0` marca `is_free=True`, pero puede tratarse de un producto pay-what-you-want (`is_pay_what_you_want=True`) con minimo 0, no de un regalo. Cruza siempre `is_free` con `is_pay_what_you_want` antes de sacar conclusiones de precio.
- **Ventana de paginacion finita**: `page`/`per_page` se IGNORAN (siempre devuelven desde 0); solo `from`+`size` paginan. La ventana es amplia pero finita (from~960 aun devuelve, mas alla se agota). Pedir `max_products` muy alto puede recibir menos productos de los pedidos: la funcion para cuando una pagina devuelve menos de `page_size` items.
- **Cloudflare bloquea sin UA de navegador**: el endpoint exige `Accept: application/json` y un `User-Agent` de navegador. Sin ello Gumroad/Cloudflare puede devolver una pagina de challenge en HTML (no JSON) o redirigir. La funcion ya envia un UA de Chrome; si aun asi recibe un body no-JSON lanza `RuntimeError` claro — en ese caso cae al navegador del ecosistema (browser MCP / CDP).
- **Moneda no siempre USD**: `price_usd` es solo `price_cents/100` por conveniencia; si `currency_code != 'usd'` el valor NO esta convertido a dolares. Conserva y usa `currency_code` para convertir tu mismo.
@@ -0,0 +1,245 @@
"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
Uses Gumroad's verified public JSON search endpoint
GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
endpoint exposes, besides the product list, the ``total`` count of products in
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
their own product counts). This scraper focuses on the product list and stamps
each product with the taxonomy-level ``total`` so a downstream consumer can
reason about saturation without a second request.
Only stdlib (``urllib``, ``json``, ``time``, ``gzip``, ``zlib``) is used — no heavy dependencies.
The function is impure (it performs network I/O) and raises ``RuntimeError`` on
HTTP / JSON failures.
"""
from __future__ import annotations
import gzip
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
# Gumroad Discover's public JSON search endpoint.
_BASE_URL = "https://gumroad.com/products/search"

# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
# the request or redirect away from the JSON payload.
_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/124.0.0.0",
        "Safari/537.36",
    )
)

# Sort criteria accepted by scrape_gumroad_discover; anything else is rejected
# with ValueError before any request is made.
_VALID_SORTS = (
    "best_selling", "most_reviewed", "hot_and_new", "highest_rated",
    "newest", "price_asc", "price_desc",
)
def _build_headers() -> dict:
    """Return the request headers sent with every Gumroad search call."""
    header_pairs = [
        ("User-Agent", _USER_AGENT),
        ("Accept", "application/json"),
        ("Accept-Language", "en-US,en;q=0.9"),
        # urllib does not transparently inflate gzip/deflate, and Cloudflare
        # serves gzip when a browser UA is present, so ask for an uncompressed
        # ("identity") body to keep the payload as plain JSON. _fetch_json
        # still inflates defensively in case this header is ignored.
        ("Accept-Encoding", "identity"),
        ("Connection", "keep-alive"),
        ("X-Requested-With", "XMLHttpRequest"),
    ]
    return dict(header_pairs)
def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Build the Discover search URL for one pagination window.

    Pagination is expressed with ``from`` (offset) and ``size`` only: Gumroad
    ignores ``page``/``per_page`` and always answers from 0 for those.
    """
    params = [
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    ]
    return _BASE_URL + "?" + urllib.parse.urlencode(params)
def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET ``url`` and return its body decoded as a JSON object.

    Every failure mode is reported as ``RuntimeError`` so callers only have to
    handle one exception type: HTTP errors, transport errors (including
    timeouts raised while reading the body), a compressed body that cannot be
    inflated, a body that is not valid UTF-8 JSON, and a JSON document whose
    top level is not an object.

    Args:
        url: Fully built request URL.
        headers: Request headers to send.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        RuntimeError: On any of the failures listed above; the original
            exception is chained as ``__cause__`` when there is one.
    """
    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except OSError as exc:
        # Timeouts and connection resets raised while reading the body are
        # plain OSError subclasses, not URLError, so they need their own arm.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        elif "deflate" in encoding:
            raw = zlib.decompress(raw)
    except (OSError, EOFError, zlib.error) as exc:
        # BadGzipFile is an OSError; truncated streams raise EOFError.
        raise RuntimeError(
            f"Gumroad search returned an undecodable compressed body for {url}: {exc}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    # Callers use dict access on the result; reject arrays/scalars here with a
    # clear message instead of failing later with AttributeError.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned a JSON {type(payload).__name__} for {url}; "
            "expected a JSON object."
        )
    return payload
def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Project one raw Gumroad product onto the flat row contract.

    The nested ``seller`` and ``ratings`` objects are flattened into
    ``seller_name``, ``ratings_count`` and ``ratings_avg``. A missing or
    non-integer ``price_cents`` is treated as 0. The scrape metadata supplied
    by the caller (``taxonomy``, ``total_in_taxonomy``, ``sort``, ``rank``) is
    copied onto the row unchanged.
    """
    seller_info = product.get("seller") or {}
    rating_info = product.get("ratings") or {}

    raw_cents = product.get("price_cents")
    cents = raw_cents if isinstance(raw_cents, int) else 0

    row = {field: product.get(field) for field in ("id", "permalink", "name")}
    row.update(
        seller_name=seller_info.get("name"),
        ratings_count=rating_info.get("count"),
        ratings_avg=rating_info.get("average"),
        price_cents=cents,
        currency_code=product.get("currency_code"),
        # Convenience float (cents / 100) with no currency conversion; the
        # currency_code is kept alongside so the consumer can convert.
        price_usd=cents / 100.0,
        is_pay_what_you_want=bool(product.get("is_pay_what_you_want")),
        is_free=cents == 0,
        native_type=product.get("native_type"),
        url=product.get("url"),
        taxonomy=taxonomy,
        total_in_taxonomy=total_in_taxonomy,
        sort_used=sort,
        rank=rank,
    )
    return row
def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected, a page comes back empty, or a page returns
    fewer items than were requested (end of window). The last request is
    trimmed so it never asks for more items than are still needed. Each
    product is normalized to a flat dict carrying the taxonomy-level ``total``
    (niche saturation), the sort used and the 0-based rank in the returned
    order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across pages.
            Gumroad's pagination window is finite (from~960 still returns), so
            very high values may hit fewer results than requested.
        page_size: Maximum items requested per page via ``size``. Gumroad
            accepts at least 300.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
            (typically a Cloudflare challenge served without a browser UA).
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        # Never ask for more than we still need on the last page. The trimmed
        # size must actually be sent in the URL (it used to be computed and
        # then ignored in favor of page_size).
        size = min(page_size, max_products - len(results))
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=20)

        # The taxonomy-level total is stamped on every product; keep the most
        # recent integer value the endpoint reported.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not products:
            break

        for product in products:
            # Guard against a server that returns more items than requested.
            if len(results) >= max_products:
                break
            rank = len(results)  # 0-based position across the whole scrape
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    rank=rank,
                )
            )

        # Stop on a short page (window exhausted) or once the cap is reached;
        # checking the cap here avoids a pointless sleep after the final page.
        if len(products) < size or len(results) >= max_products:
            break

        offset += size
        # Be polite between requests so we don't hammer Gumroad.
        time.sleep(0.4)

    return results
@@ -0,0 +1,177 @@
"""Tests para scrape_gumroad_discover.
Mockean urllib.request.urlopen para NO hacer red: se inyecta un cuerpo JSON de
Gumroad con productos de ejemplo y se verifica la normalizacion a dict plano, el
corte de la paginacion, la validacion de sort y el manejo de un body no-JSON
(escenario tipico de challenge de Cloudflare). El scrape real no se testea aqui.
"""
import json
import pytest
from scrape_gumroad_discover import scrape_gumroad_discover
class _FakeResponse:
"""Context manager que imita la respuesta de urllib.request.urlopen."""
def __init__(self, raw: bytes, headers: dict | None = None):
self._raw = raw
# urllib response exposes .headers; the scraper reads Content-Encoding
# from it to decide whether to inflate the body.
self.headers = headers or {}
def __enter__(self):
return self
def __exit__(self, *exc):
return False
def read(self):
return self._raw
def _json_response(payload: dict) -> _FakeResponse:
    """Wrap ``payload`` as a fake response whose body is its UTF-8 JSON encoding."""
    body = json.dumps(payload).encode("utf-8")
    return _FakeResponse(body)
# Two sample products following the verified real Gumroad payload structure.
# The first is a paid USD product with nested seller/ratings objects; the
# second is a zero-price, pay-what-you-want product priced in EUR.
_SAMPLE_PRODUCTS = [
{
"id": "prod_1",
"permalink": "coolkit",
"name": "Cool Design Kit",
"seller": {"id": "s1", "name": "Alice Design", "avatar_url": "http://a"},
"ratings": {"count": 128, "average": 4.9},
"thumbnail_url": "http://thumb1",
"native_type": "digital",
"price_cents": 2900,
"currency_code": "usd",
"is_pay_what_you_want": False,
"url": "https://alice.gumroad.com/l/coolkit",
"description": "A kit",
},
{
"id": "prod_2",
"permalink": "freebie",
"name": "Free Font Pack",
"seller": {"id": "s2", "name": "Bob Type"},
"ratings": {"count": 0, "average": 0.0},
"native_type": "digital",
"price_cents": 0,
"currency_code": "eur",
"is_pay_what_you_want": True,
"url": "https://bob.gumroad.com/l/freebie",
},
]
def test_normaliza_producto_a_dict_plano(monkeypatch):
    """Each raw product is flattened to the exact flat-dict contract."""
    canned = {"total": 4213, "tags_data": [], "products": _SAMPLE_PRODUCTS}
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _json_response(canned),
    )

    rows = scrape_gumroad_discover(
        taxonomy="design", sort="best_selling", max_products=10, page_size=100
    )
    assert len(rows) == 2
    first, second = rows

    # Exact flat structure.
    expected_keys = {
        "id", "permalink", "name", "seller_name", "ratings_count", "ratings_avg",
        "price_cents", "currency_code", "price_usd", "is_pay_what_you_want",
        "is_free", "native_type", "url", "taxonomy", "total_in_taxonomy",
        "sort_used", "rank",
    }
    assert set(first.keys()) == expected_keys

    expected_first = {
        "id": "prod_1",
        "name": "Cool Design Kit",
        "seller_name": "Alice Design",  # nested -> flat
        "ratings_count": 128,
        "ratings_avg": 4.9,
        "price_cents": 2900,
        "price_usd": 29.0,  # cents / 100
        "currency_code": "usd",
        "native_type": "digital",
        "taxonomy": "design",  # the argument
        "total_in_taxonomy": 4213,  # the JSON total
        "sort_used": "best_selling",
        "rank": 0,
    }
    for field, value in expected_first.items():
        assert first[field] == value, field
    assert first["is_pay_what_you_want"] is False
    assert first["is_free"] is False

    # Second product: free / pay-what-you-want, non-USD currency preserved.
    expected_second = {
        "price_cents": 0,
        "price_usd": 0.0,
        "currency_code": "eur",  # kept as-is, not converted
        "rank": 1,
    }
    for field, value in expected_second.items():
        assert second[field] == value, field
    assert second["is_free"] is True
    assert second["is_pay_what_you_want"] is True
def test_paginacion_para_al_agotar_ventana(monkeypatch):
    """A full first page triggers a second request; an empty second page stops the scrape."""
    # page_size=2 and a first page with exactly 2 products: the page is not
    # short, so another page is requested; it comes back empty and the loop ends.
    seen_requests = []

    def fake_urlopen(req, timeout=None):
        seen_requests.append(req)
        page_products = _SAMPLE_PRODUCTS if len(seen_requests) == 1 else []
        return _json_response({"total": 2, "products": page_products})

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
    monkeypatch.setattr("time.sleep", lambda *_: None)  # do not sleep in tests

    rows = scrape_gumroad_discover(taxonomy="design", max_products=100, page_size=2)

    assert len(rows) == 2
    assert len(seen_requests) == 2  # asked for a second page, got none, stopped
    assert [row["rank"] for row in rows] == [0, 1]
def test_sort_invalido_lanza_valueerror(monkeypatch):
    """An unknown sort is rejected during validation, before any network call."""

    def forbidden_urlopen(req, timeout=None):
        # Reaching the network layer here would mean validation was skipped.
        raise AssertionError("no deberia hacer red con sort invalido")

    monkeypatch.setattr("urllib.request.urlopen", forbidden_urlopen)

    with pytest.raises(ValueError, match="sort must be one of"):
        scrape_gumroad_discover(taxonomy="design", sort="trending")
def test_body_gzip_se_descomprime(monkeypatch):
    """A gzip-compressed JSON body is inflated and parsed like a plain one."""
    # Cloudflare may serve the JSON gzip-compressed even when identity is
    # requested; the scraper must inflate it (magic bytes 1f 8b) and parse it.
    import gzip as _gzip

    document = {"total": 7, "products": _SAMPLE_PRODUCTS}
    compressed_body = _gzip.compress(json.dumps(document).encode("utf-8"))

    def fake_urlopen(req, timeout=None):
        return _FakeResponse(compressed_body, headers={"Content-Encoding": "gzip"})

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
    monkeypatch.setattr("time.sleep", lambda *_: None)

    rows = scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)

    assert len(rows) == 2
    head = rows[0]
    assert head["name"] == "Cool Design Kit"
    assert head["total_in_taxonomy"] == 7
def test_body_no_json_lanza_runtimeerror(monkeypatch):
    """An HTML body (Cloudflare challenge page) surfaces as a ``RuntimeError``."""
    challenge_page = b"<html><body>Just a moment...</body></html>"
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _FakeResponse(challenge_page),
    )

    with pytest.raises(RuntimeError, match="non-JSON"):
        scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)