"""Tests for scrape_gumroad_discover.

``urllib.request.urlopen`` is mocked so that NO network traffic happens: a
Gumroad JSON body with sample products is injected, and the tests verify the
normalization into a flat dict, the pagination cut-off, the validation of
``sort``, the decompression of a gzip body and the handling of a non-JSON body
(the typical Cloudflare challenge scenario). The real scrape is not tested
here.
"""

import json

import pytest

from scrape_gumroad_discover import scrape_gumroad_discover


class _FakeResponse:
    """Context manager that imitates the response of urllib.request.urlopen."""

    def __init__(self, raw: bytes, headers: dict | None = None):
        # urllib responses expose .headers; the scraper reads Content-Encoding
        # from it to decide whether to inflate the body.
        self.headers = headers if headers else {}
        self._body = raw

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        # Returning False means exceptions raised inside the ``with`` block
        # are never suppressed.
        return False

    def read(self):
        """Return the canned body bytes given at construction time."""
        return self._body


def _json_response(payload: dict) -> _FakeResponse:
    """Build a fake response whose body is *payload* as UTF-8 encoded JSON."""
    encoded = json.dumps(payload).encode("utf-8")
    return _FakeResponse(encoded)


# Two sample products with the real, verified Gumroad structure.
_SAMPLE_PRODUCTS = [
    {
        "id": "prod_1",
        "permalink": "coolkit",
        "name": "Cool Design Kit",
        "seller": {"id": "s1", "name": "Alice Design", "avatar_url": "http://a"},
        "ratings": {"count": 128, "average": 4.9},
        "thumbnail_url": "http://thumb1",
        "native_type": "digital",
        "price_cents": 2900,
        "currency_code": "usd",
        "is_pay_what_you_want": False,
        "url": "https://alice.gumroad.com/l/coolkit",
        "description": "A kit",
    },
    {
        "id": "prod_2",
        "permalink": "freebie",
        "name": "Free Font Pack",
        "seller": {"id": "s2", "name": "Bob Type"},
        "ratings": {"count": 0, "average": 0.0},
        "native_type": "digital",
        "price_cents": 0,
        "currency_code": "eur",
        "is_pay_what_you_want": True,
        "url": "https://bob.gumroad.com/l/freebie",
    },
]


def test_normaliza_producto_a_dict_plano(monkeypatch):
    """Each product is flattened into a plain dict with an exact key set."""
    payload = {"total": 4213, "tags_data": [], "products": _SAMPLE_PRODUCTS}
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _json_response(payload),
    )

    rows = scrape_gumroad_discover(
        taxonomy="design",
        sort="best_selling",
        max_products=10,
        page_size=100,
    )

    assert len(rows) == 2
    first, second = rows[0], rows[1]

    # Exact flat structure: no extra keys and none missing.
    expected_keys = {
        "id",
        "permalink",
        "name",
        "seller_name",
        "ratings_count",
        "ratings_avg",
        "price_cents",
        "currency_code",
        "price_usd",
        "is_pay_what_you_want",
        "is_free",
        "native_type",
        "url",
        "taxonomy",
        "total_in_taxonomy",
        "sort_used",
        "rank",
    }
    assert set(first) == expected_keys

    expected_first = {
        "id": "prod_1",
        "name": "Cool Design Kit",
        "seller_name": "Alice Design",  # nested -> flat
        "ratings_count": 128,
        "ratings_avg": 4.9,
        "price_cents": 2900,
        "price_usd": 29.0,  # cents / 100
        "currency_code": "usd",
        "native_type": "digital",
        "taxonomy": "design",  # the argument passed in
        "total_in_taxonomy": 4213,  # the "total" field of the JSON
        "sort_used": "best_selling",
        "rank": 0,
    }
    for key, expected in expected_first.items():
        assert first[key] == expected, key
    # Booleans are checked by identity, not mere truthiness.
    assert first["is_pay_what_you_want"] is False
    assert first["is_free"] is False

    # Second product: free / pay-what-you-want, non-usd currency preserved.
    expected_second = {
        "price_cents": 0,
        "price_usd": 0.0,
        "currency_code": "eur",  # kept as is, not converted
        "rank": 1,
    }
    for key, expected in expected_second.items():
        assert second[key] == expected, key
    assert second["is_free"] is True
    assert second["is_pay_what_you_want"] is True


def test_paginacion_para_al_agotar_ventana(monkeypatch):
    """Pagination stops as soon as a page comes back with no products."""
    # page_size=2 and a first page holding exactly 2 products: since
    # len(products) == page_size another page is requested; that second page
    # returns an empty product list, which ends the loop.
    requests_seen = []

    def fake_urlopen(req, timeout=None):
        requests_seen.append(req)
        page = _SAMPLE_PRODUCTS if len(requests_seen) == 1 else []
        return _json_response({"total": 2, "products": page})

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
    monkeypatch.setattr("time.sleep", lambda *_: None)  # do not sleep in tests

    rows = scrape_gumroad_discover(
        taxonomy="design", max_products=100, page_size=2
    )

    assert len(rows) == 2
    # Asked for a second page, it came back empty, and the scraper stopped.
    assert len(requests_seen) == 2
    ranks = [row["rank"] for row in rows]
    assert ranks == [0, 1]


def test_sort_invalido_lanza_valueerror(monkeypatch):
    """An unknown ``sort`` value is rejected with ValueError."""

    # Validation must fail before any network access is attempted.
    def fail_if_called(req, timeout=None):
        raise AssertionError("no deberia hacer red con sort invalido")

    monkeypatch.setattr("urllib.request.urlopen", fail_if_called)

    with pytest.raises(ValueError, match="sort must be one of"):
        scrape_gumroad_discover(taxonomy="design", sort="trending")


def test_body_gzip_se_descomprime(monkeypatch):
    """A gzip-compressed body is inflated and then parsed as JSON."""
    # Cloudflare may serve the JSON gzip-compressed even when identity was
    # requested. The scraper must inflate the body (magic bytes 1f 8b) and
    # parse the JSON.
    import gzip

    payload = {"total": 7, "products": _SAMPLE_PRODUCTS}
    compressed = gzip.compress(json.dumps(payload).encode("utf-8"))
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _FakeResponse(
            compressed, headers={"Content-Encoding": "gzip"}
        ),
    )
    monkeypatch.setattr("time.sleep", lambda *_: None)

    rows = scrape_gumroad_discover(
        taxonomy="design", max_products=10, page_size=100
    )

    assert len(rows) == 2
    head = rows[0]
    assert head["name"] == "Cool Design Kit"
    assert head["total_in_taxonomy"] == 7


def test_body_no_json_lanza_runtimeerror(monkeypatch):
    """A body that is not JSON is reported as RuntimeError."""
    # Cloudflare challenge: the server answers with HTML instead of JSON.
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _FakeResponse(b"Just a moment..."),
    )

    with pytest.raises(RuntimeError, match="non-JSON"):
        scrape_gumroad_discover(
            taxonomy="design", max_products=10, page_size=100
        )