feat(datascience): auto-commit con 7 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-03 00:48:43 +02:00
parent 5a4f82cf76
commit 8a78a70ef6
7 changed files with 817 additions and 8 deletions
@@ -17,7 +17,7 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.1.1"
 CHAPTER_ID = "glosario"
 CHAPTER_TITLE = "Glosario"

@@ -89,14 +89,19 @@ def build_glosario(profile: dict, ctx: dict):
            "Cada término va resaltado en el texto y, al pulsarlo, salta a su "
            "definición en esta sección.")),
    ]
-    # One clickable destination per term, alphabetically by visible label. A term
-    # registered without a definition is completed from the canonical baseline.
-    for term in glossary.terms(by="label"):
+    # One clickable destination per term, alphabetically by *visible* label. The
+    # baseline resolution must happen BEFORE sorting: a term registered bare (no
+    # label) carries its key as label in the collector, so ordering by the
+    # collector's label would place it by its key instead of by the human label
+    # supplied by the baseline catalog. Resolve first, then sort by the final label.
+    resolved = []
+    for term in glossary.terms(by="order"):
        label, definition = _resolve_term(term)
+        resolved.append((label, definition, model._safe_str(term.get("key"))))
+    resolved.sort(key=lambda e: model._safe_str(e[0]).lower())
+    for label, definition, key in resolved:
        blocks.append(model.GlossaryEntry(
-            key=model._safe_str(term.get("key")),
-            label=label,
-            definition=definition))
+            key=key, label=label, definition=definition))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,181 @@
+"""Tests for the GLOSARIO chapter — DoD: golden + edges + degradation + no-cut render.
+
+The glossary is the last chapter of every AutomaticEDA document. It does not read
+the profile: it turns the terms that the other chapters registered on the shared
+``GlossaryCollector`` (``ctx['glossary']``) into one clickable ``GlossaryEntry``
+destination each, alphabetically by visible label.
+
+Covered here:
+
+- **Golden**: a collector with three terms (one carrying its own definition, two
+  registered bare and completed from the canonical baseline catalog) builds a
+  ``Chapter`` with three ``GlossaryEntry`` blocks, alphabetically ordered, and
+  renders to PDF and PPTX with nothing cut.
+- **Baseline resolution** (``_resolve_term``): a bare term whose key is in the
+  baseline gets its label *and* definition filled in; a term that already carries
+  its own definition is never overwritten.
+- **Edges**: ``None`` / ``{}`` ctx, an empty collector and a non-collector value in
+  ``ctx['glossary']`` all return ``None`` (the chapter simply disappears) and never
+  raise, even with a ``None`` profile.
+- **Click target**: every emitted entry carries the registered ``key`` so each
+  in-text ``[[term:key]]`` appearance resolves to a real jump.
+"""
+
+import os
+import tempfile
+
+from pptx import Presentation
+from pypdf import PdfReader
+
+from datascience.automatic_eda.chapters.glosario import (
+    _BASELINE_TERMS,
+    _resolve_term,
+    build_glosario,
+)
+from datascience.automatic_eda.model import (
+    Chapter,
+    GlossaryCollector,
+    GlossaryEntry,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+# --------------------------------------------------------------------------- #
+# Helpers.
+# --------------------------------------------------------------------------- #
+def _entries(chapter: Chapter) -> list:
+    """Return the chapter's ``GlossaryEntry`` blocks, keeping their document order."""
+    return list(filter(lambda block: isinstance(block, GlossaryEntry), chapter.blocks))
+
+
+def _render_both(chapter: Chapter, tag: str):
+    """Render the chapter to PDF and PPTX; return (pdf_text, n_slides).
+
+    Both files are written into a temporary directory that is deleted when the
+    helper returns, so repeated test runs do not leave ``glosario_*`` folders
+    behind. Everything that needs the files on disk (size checks, PDF text
+    extraction, slide count) therefore happens inside the ``with`` block.
+    """
+    meta = {"title": f"EDA — {tag}"}
+    with tempfile.TemporaryDirectory(prefix=f"glosario_{tag}_") as tmp:
+        pdf_path = os.path.join(tmp, "out.pdf")
+        pptx_path = os.path.join(tmp, "out.pptx")
+        render_automatic_eda_pdf([chapter], pdf_path, meta)
+        render_automatic_eda_pptx([chapter], pptx_path, meta)
+        # A renderer that silently wrote nothing would show up as a 0-byte file.
+        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
+        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
+        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
+        n_slides = len(Presentation(pptx_path).slides)
+    return text, n_slides
+
+
+def _collector_three_terms() -> GlossaryCollector:
+    """Build a collector holding three terms, registered out of alphabetical order.
+
+    - ``entropia`` is registered with its own label and definition, so the
+      baseline must not overwrite it.
+    - ``pagina_categorica`` and ``histograma_boxplot`` are registered bare and
+      are expected to be completed from the baseline.
+    """
+    collector = GlossaryCollector()
+    collector.add(
+        "entropia",
+        "Entropía",
+        "Medida de la incertidumbre o dispersión de una variable categórica.",
+    )
+    # Bare registrations -> baseline label + definition.
+    for bare_key in ("pagina_categorica", "histograma_boxplot"):
+        collector.add(bare_key)
+    return collector
+
+
+# --------------------------------------------------------------------------- #
+# Golden.
+# --------------------------------------------------------------------------- #
+def test_golden_terms_render_clickable_entries():
+    """Golden path: three registered terms become three ordered, renderable entries."""
+    g = _collector_three_terms()
+    chapter = build_glosario({"table": "x"}, {"glossary": g})
+
+    assert isinstance(chapter, Chapter)
+    assert chapter.id == "glosario"
+    assert chapter.title == "Glosario"
+    assert chapter.version == "1.1.1"
+
+    entries = _entries(chapter)
+    assert len(entries) == 3
+    assert all(isinstance(e, GlossaryEntry) for e in entries)
+
+    # Alphabetical by visible label: "Cómo leer…" < "Cómo se organiza…" < "Entropía".
+    labels = [e.label for e in entries]
+    assert labels == sorted(labels, key=str.lower)
+    assert labels[0] == "Cómo leer el histograma y el boxplot"
+    assert labels[-1] == "Entropía"
+
+    # Bare terms were completed from the baseline; the own-definition term survived.
+    by_key = {e.key: e for e in entries}
+    assert "boxplot de Tukey" in by_key["histograma_boxplot"].definition
+    assert "identificador" in by_key["pagina_categorica"].definition
+    assert by_key["entropia"].definition.startswith("Medida de la incertidumbre")
+
+    # Both renderers must produce a non-empty file (asserted inside _render_both).
+    # Only the "Entropía" label is checked in the extracted PDF text; the other
+    # labels and the definition fragments are not asserted here.
+    pdf_text, n_slides = _render_both(chapter, "golden")
+    assert "Entropía" in pdf_text
+    assert n_slides >= 1
+
+
+# --------------------------------------------------------------------------- #
+# Baseline resolution (_resolve_term).
+# --------------------------------------------------------------------------- #
+def test_resolve_term_completes_label_and_definition_from_baseline():
+    """A bare term (label == key, empty definition) is completed from ``_BASELINE_TERMS``."""
+    # A bare registration keeps label == key and an empty definition; the resolver
+    # fills both from the canonical catalog.
+    key = "histograma_boxplot"
+    label, definition = _resolve_term({"key": key, "label": key, "definition": ""})
+    assert label == _BASELINE_TERMS[key]["label"]
+    assert "boxplot de Tukey" in definition
+
+
+def test_resolve_term_keeps_own_definition_over_baseline():
+    """A term with its own label and definition is returned unchanged by the resolver."""
+    # Even when the key is in the baseline, a term that already carries its own
+    # definition (and a real label) must not be overwritten.
+    key = "pagina_categorica"
+    own_def = "Definición propia que no debe pisarse."
+    label, definition = _resolve_term(
+        {"key": key, "label": "Mi etiqueta", "definition": own_def})
+    assert label == "Mi etiqueta"
+    assert definition == own_def
+
+
+def test_resolve_term_unknown_key_returns_as_is():
+    """A fully specified term comes back with the same label and definition.
+
+    NOTE(review): the key ``sin_baseline`` is presumably absent from the
+    baseline catalog — confirm against ``_BASELINE_TERMS``.
+    """
+    label, definition = _resolve_term(
+        {"key": "sin_baseline", "label": "Término libre", "definition": "Texto."})
+    assert label == "Término libre"
+    assert definition == "Texto."
+
+
+# --------------------------------------------------------------------------- #
+# Edges / degradation — the chapter disappears instead of raising.
+# --------------------------------------------------------------------------- #
+def test_none_when_no_glossary():
+    """An empty ctx dict and a ``None`` ctx both make the builder return ``None``."""
+    assert build_glosario({"table": "x"}, {}) is None
+    assert build_glosario({"table": "x"}, None) is None
+
+
+def test_none_when_empty_collector():
+    """A collector with no registered terms makes the builder return ``None``."""
+    assert build_glosario({"table": "x"}, {"glossary": GlossaryCollector()}) is None
+
+
+def test_none_when_glossary_is_not_a_collector():
+    """A list or a plain dict under ``ctx['glossary']`` makes the builder return ``None``."""
+    # A stray value in ctx['glossary'] must not be treated as a collector.
+    assert build_glosario({"table": "x"}, {"glossary": ["not", "a", "collector"]}) is None
+    assert build_glosario({"table": "x"}, {"glossary": {"entropia": "x"}}) is None
+
+
+def test_none_profile_does_not_raise():
+    """A ``None`` profile neither prevents building nor causes an exception."""
+    # The glossary ignores the profile; a None profile with a valid collector still
+    # builds, and a None profile with no glossary still returns None (no crash).
+    g = GlossaryCollector()
+    g.add("entropia", "Entropía", "def")
+    chapter = build_glosario(None, {"glossary": g})
+    assert isinstance(chapter, Chapter)
+    assert build_glosario(None, None) is None
+
+
+# --------------------------------------------------------------------------- #
+# Click target — each entry carries its registration key.
+# --------------------------------------------------------------------------- #
+def test_entries_carry_registered_key_as_click_target():
+    """The set of entry keys equals the set of keys registered on the collector."""
+    g = _collector_three_terms()
+    chapter = build_glosario({}, {"glossary": g})
+    keys = {e.key for e in _entries(chapter)}
+    assert keys == {"entropia", "pagina_categorica", "histograma_boxplot"}
@@ -0,0 +1,65 @@
+---
+name: scrape_gumroad_discover
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_gumroad_discover(taxonomy: str, sort: str = 'best_selling', max_products: int = 300, page_size: int = 100) -> list[dict]"
+description: "Scrapea el marketplace publico de Gumroad Discover usando el endpoint JSON verificado gumroad.com/products/search (taxonomy+sort+from+size). Recolecta los productos de una taxonomy (nicho) ordenados por el criterio elegido y estampa en cada producto el total de la taxonomy (saturacion del nicho). Normaliza cada producto a un dict plano con id, seller_name, ratings, precio (cents/usd), pay-what-you-want/free, native_type, url y metadatos de scrape (taxonomy, total_in_taxonomy, sort_used, rank 0-based). Solo stdlib (urllib+json+time+gzip+zlib)."
+tags: [gumroad, scraping, market-intel, trends, datascience]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+tested: true
+tests: ["test_normaliza_producto_a_dict_plano", "test_paginacion_para_al_agotar_ventana", "test_sort_invalido_lanza_valueerror", "test_body_gzip_se_descomprime", "test_body_no_json_lanza_runtimeerror"]
+test_file_path: "python/functions/datascience/scrape_gumroad_discover_test.py"
+file_path: "python/functions/datascience/scrape_gumroad_discover.py"
+params:
+  - name: taxonomy
+    desc: "Slug de taxonomy / nicho de Gumroad (ej. 'design', 'business-and-money', '3d'). Determina el segmento de mercado scrapeado y el valor total_in_taxonomy (numero total de productos = saturacion del nicho) que se estampa en cada producto."
+  - name: sort
+    desc: "Criterio de orden. Uno de: best_selling, most_reviewed, hot_and_new, highest_rated, newest, price_asc, price_desc. Cualquier otro valor lanza ValueError. Default 'best_selling'."
+  - name: max_products
+    desc: "Cota superior de productos a recolectar entre paginas. Default 300. La ventana de paginacion de Gumroad es finita (from~960 aun devuelve datos), asi que valores muy altos pueden recibir menos productos de los pedidos."
+  - name: page_size
+    desc: "Numero de productos pedidos por pagina via 'size'. Gumroad admite al menos 300. Una pagina que devuelve menos de page_size items señala el fin de la ventana y detiene la paginacion. Default 100."
+output: "Lista de dicts planos, uno por producto, con exactamente estas claves: id, permalink, name, seller_name, ratings_count, ratings_avg, price_cents, currency_code, price_usd (float = price_cents/100), is_pay_what_you_want (bool), is_free (bool = price_cents==0), native_type, url, taxonomy (el arg), total_in_taxonomy (el 'total' del JSON = saturacion del nicho), sort_used (el arg sort), rank (posicion 0-based en el orden devuelto)."
+---
+
+## Ejemplo
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience.scrape_gumroad_discover import scrape_gumroad_discover
+
+# Top best-sellers del nicho "design" en Gumroad Discover
+rows = scrape_gumroad_discover(taxonomy="design", sort="best_selling", max_products=300, page_size=100)
+print(len(rows), "productos")
+print("saturacion del nicho:", rows[0]["total_in_taxonomy"])
+print(rows[0])
+# {'id': '...', 'permalink': '...', 'name': '...', 'seller_name': '...',
+#  'ratings_count': 128, 'ratings_avg': 4.9, 'price_cents': 2900,
+#  'currency_code': 'usd', 'price_usd': 29.0, 'is_pay_what_you_want': False,
+#  'is_free': False, 'native_type': 'digital', 'url': 'https://...',
+#  'taxonomy': 'design', 'total_in_taxonomy': 4213, 'sort_used': 'best_selling', 'rank': 0}
+
+# Productos mas nuevos de un nicho concreto
+nuevos = scrape_gumroad_discover(taxonomy="3d", sort="newest", max_products=50)
+```
+
+## Cuando usarla
+
+Usala cuando quieras hacer market intelligence sobre productos digitales: descubrir que se vende mas en un nicho de Gumroad, medir la saturacion del nicho (`total_in_taxonomy`) y capturar precios, valoraciones y vendedores para decidir si un nicho merece la pena o esta saturado. Es la fuente de un pipeline de deteccion de oportunidades de producto digital (grupo `market-intel`): scrapea varias taxonomies/sorts, cruza los snapshots y prioriza nichos con demanda alta y competencia manejable. Llamala antes de cualquier analisis de catalogo digital; el dict devuelto es plano y esta listo para insertar en una tabla tras añadir `snapshot_date`/`scraped_at`.
+
+## Gotchas
+
+- **ratings.count son REVIEWS, no ventas**: `ratings_count` cuenta valoraciones dejadas, NO unidades vendidas. Como proxy de ventas hay que multiplicar por un factor incierto (solo una fraccion de compradores valora, y esa fraccion varia por nicho/precio). Trata `ratings_count` como un limite inferior ruidoso de la demanda, no como ventas reales.
+- **price=0 no siempre significa gratis util**: `price_cents==0` marca `is_free=True`, pero puede tratarse de un producto pay-what-you-want (`is_pay_what_you_want=True`) con minimo 0, no de un regalo. Cruza siempre `is_free` con `is_pay_what_you_want` antes de sacar conclusiones de precio.
+- **Ventana de paginacion finita**: `page`/`per_page` se IGNORAN (siempre devuelven desde 0); solo `from`+`size` paginan. La ventana es amplia pero finita (from~960 aun devuelve, mas alla se agota). Pedir `max_products` muy alto puede recibir menos productos de los pedidos: la funcion para cuando una pagina devuelve menos de `page_size` items.
+- **Cloudflare bloquea sin UA de navegador**: el endpoint exige `Accept: application/json` y un `User-Agent` de navegador. Sin ello Gumroad/Cloudflare puede devolver una pagina de challenge en HTML (no JSON) o redirigir. La funcion ya envia un UA de Chrome; si aun asi recibe un body no-JSON lanza un `RuntimeError` claro. La funcion no hace fallback automatico: en ese caso recurre tu mismo al navegador del ecosistema (browser MCP / CDP).
+- **Moneda no siempre USD**: `price_usd` es solo `price_cents/100` por conveniencia; si `currency_code != 'usd'` el valor NO esta convertido a dolares. Conserva y usa `currency_code` para convertir tu mismo.
@@ -0,0 +1,245 @@
+"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
+
+Uses Gumroad's verified public JSON search endpoint
+
+    GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
+
+to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
+endpoint exposes, besides the product list, the ``total`` count of products in
+that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
+their own product counts). This scraper focuses on the product list and stamps
+each product with the taxonomy-level ``total`` so a downstream consumer can
+reason about saturation without a second request.
+
+Only stdlib (``urllib``, ``json``, ``time``, ``gzip``, ``zlib``) is used — no
+heavy dependencies. The function is impure (it performs network I/O) and raises
+``RuntimeError`` on HTTP / network / JSON failures.
+"""
+
+from __future__ import annotations
+
+import gzip
+import json
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+import zlib
+
+_BASE_URL = "https://gumroad.com/products/search"
+
+# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
+# the request or redirect away from the JSON payload.
+_USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+
+_VALID_SORTS = (
+    "best_selling",
+    "most_reviewed",
+    "hot_and_new",
+    "highest_rated",
+    "newest",
+    "price_asc",
+    "price_desc",
+)
+
+
+def _build_headers() -> dict:
+    """Return the request headers sent with every Gumroad search call."""
+    headers: dict = {}
+    headers["User-Agent"] = _USER_AGENT
+    headers["Accept"] = "application/json"
+    headers["Accept-Language"] = "en-US,en;q=0.9"
+    # urllib never inflates gzip/deflate on its own, and Cloudflare tends to
+    # compress when it sees a browser UA. Asking for "identity" keeps the body
+    # as plain JSON; _fetch_json still inflates defensively in case this
+    # preference is ignored.
+    headers["Accept-Encoding"] = "identity"
+    headers["Connection"] = "keep-alive"
+    headers["X-Requested-With"] = "XMLHttpRequest"
+    return headers
+
+
+def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
+    """Return the Discover search URL for one page window.
+
+    Pagination is driven by ``from`` (offset) and ``size`` only; Gumroad
+    ignores ``page``/``per_page`` and always answers from position 0 for them.
+    """
+    params = [
+        ("taxonomy", taxonomy),
+        ("sort", sort),
+        ("from", offset),
+        ("size", size),
+    ]
+    return _BASE_URL + "?" + urllib.parse.urlencode(params)
+
+
+def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
+    """GET ``url`` and return the decoded JSON object.
+
+    Args:
+        url: Fully composed search URL.
+        headers: Request headers to send.
+        timeout: Socket timeout in seconds, passed to ``urlopen``.
+
+    Returns:
+        The parsed JSON payload, always a ``dict``.
+
+    Raises:
+        RuntimeError: On an HTTP error status, a network/socket failure
+            (including a timeout while reading the body), a compressed body
+            that cannot be inflated, a non-JSON body, or a JSON document that
+            is not an object.
+    """
+    req = urllib.request.Request(url, headers=headers, method="GET")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+            encoding = (resp.headers.get("Content-Encoding") or "").lower()
+    except urllib.error.HTTPError as exc:
+        raise RuntimeError(
+            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
+            "Cloudflare may be blocking the request; ensure a browser "
+            "User-Agent is sent, or fall back to the browser MCP/CDP path."
+        ) from exc
+    except urllib.error.URLError as exc:
+        raise RuntimeError(
+            f"Gumroad search request to {url} failed: {exc.reason}"
+        ) from exc
+    except OSError as exc:
+        # Timeouts / connection resets raised while reading the body are plain
+        # OSError subclasses rather than URLError, so they need their own clause.
+        raise RuntimeError(
+            f"Gumroad search request to {url} failed: {exc}"
+        ) from exc
+
+    # Defensive inflate: Cloudflare may still return a gzip/deflate body
+    # (magic bytes 1f 8b for gzip) even when we ask for identity.
+    try:
+        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
+            raw = gzip.decompress(raw)
+        elif "deflate" in encoding:
+            raw = zlib.decompress(raw)
+    except (OSError, EOFError, zlib.error) as exc:
+        # BadGzipFile (an OSError), a truncated stream (EOFError) or a corrupt
+        # deflate stream (zlib.error) must surface as the documented error type.
+        raise RuntimeError(
+            f"Gumroad search returned a corrupt compressed body for {url}: {exc}"
+        ) from exc
+
+    try:
+        payload = json.loads(raw.decode("utf-8"))
+    except (ValueError, UnicodeDecodeError) as exc:
+        raise RuntimeError(
+            f"Gumroad search returned non-JSON body for {url}: {exc}. "
+            "A browser User-Agent is required; a Cloudflare challenge page "
+            "is returned as HTML, not JSON."
+        ) from exc
+
+    # Callers use payload.get(...); reject valid JSON that is not an object.
+    if not isinstance(payload, dict):
+        raise RuntimeError(
+            f"Gumroad search returned unexpected JSON for {url}: "
+            f"expected an object, got {type(payload).__name__}"
+        )
+    return payload
+
+
+def _normalize_product(
+    product: dict,
+    taxonomy: str,
+    total_in_taxonomy: int,
+    sort: str,
+    rank: int,
+) -> dict:
+    """Turn one raw Gumroad product into the flat output dict.
+
+    Nested ``seller`` / ``ratings`` objects are flattened, and the scrape
+    metadata (taxonomy, its total, the sort used and the rank) is attached.
+    """
+    seller_info = product.get("seller") or {}
+    rating_info = product.get("ratings") or {}
+
+    # Anything that is not an int (missing, None, str, float) is priced at 0.
+    cents = product.get("price_cents")
+    if not isinstance(cents, int):
+        cents = 0
+
+    flat = {field: product.get(field) for field in ("id", "permalink", "name")}
+    flat.update(
+        seller_name=seller_info.get("name"),
+        ratings_count=rating_info.get("count"),
+        ratings_avg=rating_info.get("average"),
+        price_cents=cents,
+        currency_code=product.get("currency_code"),
+        # Convenience float (cents/100) with no currency conversion: for a
+        # non-USD product the number is kept and currency_code tells the
+        # consumer what it really is.
+        price_usd=cents / 100.0,
+        is_pay_what_you_want=bool(product.get("is_pay_what_you_want")),
+        is_free=cents == 0,
+        native_type=product.get("native_type"),
+        url=product.get("url"),
+        taxonomy=taxonomy,
+        total_in_taxonomy=total_in_taxonomy,
+        sort_used=sort,
+        rank=rank,
+    )
+    return flat
+
+
+def scrape_gumroad_discover(
+    taxonomy: str,
+    sort: str = "best_selling",
+    max_products: int = 300,
+    page_size: int = 100,
+) -> list[dict]:
+    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).
+
+    Paginates the verified Gumroad search endpoint with ``from``+``size`` until
+    ``max_products`` are collected, a page comes back empty, or a page returns
+    fewer items than were requested (end of window). Each product is
+    normalized to a flat dict carrying the taxonomy-level ``total`` (niche
+    saturation), the sort used and the 0-based rank in the returned order.
+
+    Args:
+        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
+            ``"business-and-money"``, ``"3d"``. Determines the market segment
+            scraped and the ``total_in_taxonomy`` reported on every product.
+        sort: One of ``best_selling, most_reviewed, hot_and_new,
+            highest_rated, newest, price_asc, price_desc``. Any other value
+            raises ``ValueError``.
+        max_products: Upper bound on how many products to collect across pages.
+            Gumroad's pagination window is finite (from~960 still returns), so
+            very high values may hit fewer results than requested.
+        page_size: Maximum items requested per page via ``size``. The last
+            page asks only for what is still missing to reach ``max_products``.
+            Gumroad accepts at least 300.
+
+    Returns:
+        A list of flat dicts, one per product, with exactly these keys:
+        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
+        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
+        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.
+
+    Raises:
+        ValueError: If ``sort`` is not one of the allowed values, or if
+            ``max_products``/``page_size`` are not positive.
+        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
+            (typically a Cloudflare challenge served without a browser UA).
+    """
+    if sort not in _VALID_SORTS:
+        raise ValueError(
+            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
+        )
+    if max_products <= 0:
+        raise ValueError(f"max_products must be positive, got {max_products}")
+    if page_size <= 0:
+        raise ValueError(f"page_size must be positive, got {page_size}")
+
+    headers = _build_headers()
+    results: list[dict] = []
+    total_in_taxonomy = 0
+    offset = 0
+
+    while len(results) < max_products:
+        # Never ask for more than we still need on the last page.
+        size = min(page_size, max_products - len(results))
+        url = _build_url(taxonomy, sort, offset, size)
+        payload = _fetch_json(url, headers, timeout=20)
+
+        # The taxonomy-level total is stamped on every product. It is refreshed
+        # from each page that reports an int; otherwise the last value is kept.
+        total_val = payload.get("total")
+        if isinstance(total_val, int):
+            total_in_taxonomy = total_val
+
+        products = payload.get("products") or []
+        if not products:
+            break
+
+        for product in products:
+            # Guard against a server that returns more items than requested.
+            if len(results) >= max_products:
+                break
+            rank = len(results)  # 0-based position across the whole scrape
+            results.append(
+                _normalize_product(
+                    product,
+                    taxonomy=taxonomy,
+                    total_in_taxonomy=total_in_taxonomy,
+                    sort=sort,
+                    rank=rank,
+                )
+            )
+
+        # A page shorter than what we asked for means the window is exhausted.
+        if len(products) < size:
+            break
+
+        offset += size
+        # Be polite between requests so we don't hammer Gumroad.
+        time.sleep(0.4)
+
+    return results
@@ -0,0 +1,177 @@
+"""Tests para scrape_gumroad_discover.
+
+Mockean urllib.request.urlopen para NO hacer red: se inyecta un cuerpo JSON de
+Gumroad con productos de ejemplo y se verifica la normalizacion a dict plano, el
+corte de la paginacion, la validacion de sort y el manejo de un body no-JSON
+(escenario tipico de challenge de Cloudflare). El scrape real no se testea aqui.
+"""
+
+import json
+
+import pytest
+
+from scrape_gumroad_discover import scrape_gumroad_discover
+
+
+class _FakeResponse:
+    """Context manager that mimics the response returned by urllib.request.urlopen."""
+
+    def __init__(self, raw: bytes, headers: dict | None = None):
+        self._raw = raw
+        # urllib response exposes .headers; the scraper reads Content-Encoding
+        # from it to decide whether to inflate the body.
+        self.headers = headers or {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        # Returning False lets an exception raised inside the ``with`` body propagate.
+        return False
+
+    def read(self):
+        # Hands back the whole canned body in one call.
+        return self._raw
+
+
+def _json_response(payload: dict) -> _FakeResponse:
+    """Wrap ``payload`` as a fake response whose body is its UTF-8 encoded JSON."""
+    return _FakeResponse(json.dumps(payload).encode("utf-8"))
+
+
+# Dos productos de ejemplo con la estructura real verificada de Gumroad.
+_SAMPLE_PRODUCTS = [
+    {
+        "id": "prod_1",
+        "permalink": "coolkit",
+        "name": "Cool Design Kit",
+        "seller": {"id": "s1", "name": "Alice Design", "avatar_url": "http://a"},
+        "ratings": {"count": 128, "average": 4.9},
+        "thumbnail_url": "http://thumb1",
+        "native_type": "digital",
+        "price_cents": 2900,
+        "currency_code": "usd",
+        "is_pay_what_you_want": False,
+        "url": "https://alice.gumroad.com/l/coolkit",
+        "description": "A kit",
+    },
+    {
+        "id": "prod_2",
+        "permalink": "freebie",
+        "name": "Free Font Pack",
+        "seller": {"id": "s2", "name": "Bob Type"},
+        "ratings": {"count": 0, "average": 0.0},
+        "native_type": "digital",
+        "price_cents": 0,
+        "currency_code": "eur",
+        "is_pay_what_you_want": True,
+        "url": "https://bob.gumroad.com/l/freebie",
+    },
+]
+
+
+def test_normaliza_producto_a_dict_plano(monkeypatch):
+    """Each raw product is flattened to the exact flat-dict contract."""
+    payload = {"total": 4213, "tags_data": [], "products": _SAMPLE_PRODUCTS}
+
+    def fake_urlopen(req, timeout=None):
+        return _json_response(payload)
+
+    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
+
+    rows = scrape_gumroad_discover(taxonomy="design", sort="best_selling", max_products=10, page_size=100)
+
+    assert len(rows) == 2
+
+    first = rows[0]
+    # Exact flat structure.
+    assert set(first.keys()) == {
+        "id", "permalink", "name", "seller_name", "ratings_count", "ratings_avg",
+        "price_cents", "currency_code", "price_usd", "is_pay_what_you_want",
+        "is_free", "native_type", "url", "taxonomy", "total_in_taxonomy",
+        "sort_used", "rank",
+    }
+    assert first["id"] == "prod_1"
+    assert first["name"] == "Cool Design Kit"
+    assert first["seller_name"] == "Alice Design"      # nested -> flat
+    assert first["ratings_count"] == 128
+    assert first["ratings_avg"] == 4.9
+    assert first["price_cents"] == 2900
+    assert first["price_usd"] == 29.0                  # cents/100
+    assert first["currency_code"] == "usd"
+    assert first["is_pay_what_you_want"] is False
+    assert first["is_free"] is False
+    assert first["native_type"] == "digital"
+    assert first["taxonomy"] == "design"               # the argument
+    assert first["total_in_taxonomy"] == 4213          # the total from the JSON
+    assert first["sort_used"] == "best_selling"
+    assert first["rank"] == 0
+
+    # Second product: free / pay-what-you-want, non-USD currency preserved.
+    second = rows[1]
+    assert second["price_cents"] == 0
+    assert second["price_usd"] == 0.0
+    assert second["is_free"] is True
+    assert second["is_pay_what_you_want"] is True
+    assert second["currency_code"] == "eur"            # preserved, not converted
+    assert second["rank"] == 1
+
+
+def test_paginacion_para_al_agotar_ventana(monkeypatch):
+    """Pagination stops when a follow-up page comes back with no products."""
+    # page_size=2 and a single page holding 2 products: since len(products) ==
+    # page_size another page is requested; the second returns no products -> stop.
+    call_count = {"n": 0}
+
+    def fake_urlopen(req, timeout=None):
+        call_count["n"] += 1
+        if call_count["n"] == 1:
+            return _json_response({"total": 2, "products": _SAMPLE_PRODUCTS})
+        return _json_response({"total": 2, "products": []})
+
+    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
+    monkeypatch.setattr("time.sleep", lambda *_: None)  # do not sleep in tests
+
+    rows = scrape_gumroad_discover(taxonomy="design", max_products=100, page_size=2)
+
+    assert len(rows) == 2
+    assert call_count["n"] == 2  # asked for a second page, it was empty, stopped
+    assert [r["rank"] for r in rows] == [0, 1]
+
+
+def test_sort_invalido_lanza_valueerror(monkeypatch):
+    """An unknown ``sort`` raises ``ValueError`` before any request is made."""
+    # Must never reach the network: validation fails first.
+    def fake_urlopen(req, timeout=None):
+        raise AssertionError("no deberia hacer red con sort invalido")
+
+    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
+
+    with pytest.raises(ValueError, match="sort must be one of"):
+        scrape_gumroad_discover(taxonomy="design", sort="trending")
+
+
+def test_body_gzip_se_descomprime(monkeypatch):
+    """A gzip-compressed body flagged with ``Content-Encoding: gzip`` is inflated and parsed."""
+    # Cloudflare may serve the JSON gzip-compressed even when identity is requested.
+    # The scraper must inflate the body (magic bytes 1f 8b) and parse the JSON.
+    import gzip as _gzip
+
+    payload = {"total": 7, "products": _SAMPLE_PRODUCTS}
+    gz = _gzip.compress(json.dumps(payload).encode("utf-8"))
+
+    def fake_urlopen(req, timeout=None):
+        return _FakeResponse(gz, headers={"Content-Encoding": "gzip"})
+
+    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
+    monkeypatch.setattr("time.sleep", lambda *_: None)
+
+    rows = scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)
+
+    assert len(rows) == 2
+    assert rows[0]["name"] == "Cool Design Kit"
+    assert rows[0]["total_in_taxonomy"] == 7
+
+
+def test_body_no_json_lanza_runtimeerror(monkeypatch):
+    """An HTML body instead of JSON raises ``RuntimeError`` mentioning "non-JSON"."""
+    # Cloudflare challenge: returns HTML, not JSON.
+    def fake_urlopen(req, timeout=None):
+        return _FakeResponse(b"<html><body>Just a moment...</body></html>")
+
+    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
+
+    with pytest.raises(RuntimeError, match="non-JSON"):
+        scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)