feat(datascience): auto-commit con 7 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,177 @@
|
||||
"""Tests para scrape_gumroad_discover.
|
||||
|
||||
Mockean urllib.request.urlopen para NO hacer red: se inyecta un cuerpo JSON de
|
||||
Gumroad con productos de ejemplo y se verifica la normalizacion a dict plano, el
|
||||
corte de la paginacion, la validacion de sort y el manejo de un body no-JSON
|
||||
(escenario tipico de challenge de Cloudflare). El scrape real no se testea aqui.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from scrape_gumroad_discover import scrape_gumroad_discover
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
"""Context manager que imita la respuesta de urllib.request.urlopen."""
|
||||
|
||||
def __init__(self, raw: bytes, headers: dict | None = None):
|
||||
self._raw = raw
|
||||
# urllib response exposes .headers; the scraper reads Content-Encoding
|
||||
# from it to decide whether to inflate the body.
|
||||
self.headers = headers or {}
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc):
|
||||
return False
|
||||
|
||||
def read(self):
|
||||
return self._raw
|
||||
|
||||
|
||||
def _json_response(payload: dict) -> _FakeResponse:
    """Build a fake response whose body is ``payload`` as UTF-8 encoded JSON."""
    body = json.dumps(payload).encode("utf-8")
    return _FakeResponse(body)
|
||||
|
||||
|
||||
# Dos productos de ejemplo con la estructura real verificada de Gumroad.
|
||||
_SAMPLE_PRODUCTS = [
|
||||
{
|
||||
"id": "prod_1",
|
||||
"permalink": "coolkit",
|
||||
"name": "Cool Design Kit",
|
||||
"seller": {"id": "s1", "name": "Alice Design", "avatar_url": "http://a"},
|
||||
"ratings": {"count": 128, "average": 4.9},
|
||||
"thumbnail_url": "http://thumb1",
|
||||
"native_type": "digital",
|
||||
"price_cents": 2900,
|
||||
"currency_code": "usd",
|
||||
"is_pay_what_you_want": False,
|
||||
"url": "https://alice.gumroad.com/l/coolkit",
|
||||
"description": "A kit",
|
||||
},
|
||||
{
|
||||
"id": "prod_2",
|
||||
"permalink": "freebie",
|
||||
"name": "Free Font Pack",
|
||||
"seller": {"id": "s2", "name": "Bob Type"},
|
||||
"ratings": {"count": 0, "average": 0.0},
|
||||
"native_type": "digital",
|
||||
"price_cents": 0,
|
||||
"currency_code": "eur",
|
||||
"is_pay_what_you_want": True,
|
||||
"url": "https://bob.gumroad.com/l/freebie",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def test_normaliza_producto_a_dict_plano(monkeypatch):
    """Raw Gumroad products are normalised into flat dicts with an exact key set.

    A single JSON page containing both sample products is served through a
    patched ``urllib.request.urlopen``. The first row is checked field by
    field; the second covers the free / pay-what-you-want / non-USD case.
    """
    payload = {"total": 4213, "tags_data": [], "products": _SAMPLE_PRODUCTS}

    def fake_urlopen(req, timeout=None):
        return _json_response(payload)

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
    # Same guard the pagination and gzip tests use: never really sleep in a
    # unit test, whatever pacing the scraper applies between requests.
    monkeypatch.setattr("time.sleep", lambda *_: None)

    rows = scrape_gumroad_discover(
        taxonomy="design", sort="best_selling", max_products=10, page_size=100
    )

    assert len(rows) == 2

    first = rows[0]
    # Exact flat structure: no extra keys, none missing.
    assert set(first.keys()) == {
        "id", "permalink", "name", "seller_name", "ratings_count", "ratings_avg",
        "price_cents", "currency_code", "price_usd", "is_pay_what_you_want",
        "is_free", "native_type", "url", "taxonomy", "total_in_taxonomy",
        "sort_used", "rank",
    }
    assert first["id"] == "prod_1"
    assert first["name"] == "Cool Design Kit"
    assert first["seller_name"] == "Alice Design"  # nested -> flat
    assert first["ratings_count"] == 128
    assert first["ratings_avg"] == 4.9
    assert first["price_cents"] == 2900
    assert first["price_usd"] == 29.0  # cents / 100
    assert first["currency_code"] == "usd"
    assert first["is_pay_what_you_want"] is False
    assert first["is_free"] is False
    assert first["native_type"] == "digital"
    assert first["taxonomy"] == "design"  # echoes the taxonomy argument
    assert first["total_in_taxonomy"] == 4213  # the "total" from the JSON
    assert first["sort_used"] == "best_selling"
    assert first["rank"] == 0

    # Second product: free / pay-what-you-want, non-USD currency preserved.
    second = rows[1]
    assert second["price_cents"] == 0
    assert second["price_usd"] == 0.0
    assert second["is_free"] is True
    assert second["is_pay_what_you_want"] is True
    assert second["currency_code"] == "eur"  # kept as-is, not converted
    assert second["rank"] == 1
|
||||
|
||||
|
||||
def test_paginacion_para_al_agotar_ventana(monkeypatch):
    """Pagination stops once a page comes back with no products.

    With ``page_size=2`` the first page is full (2 products), so a second page
    is requested; that one is empty and the scrape ends after two requests.
    """
    seen_requests = []

    def fake_urlopen(req, timeout=None):
        seen_requests.append(req)
        # Only the very first request gets products; later pages are empty.
        products = _SAMPLE_PRODUCTS if len(seen_requests) == 1 else []
        return _json_response({"total": 2, "products": products})

    monkeypatch.setattr("time.sleep", lambda *_: None)  # do not sleep in tests
    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)

    rows = scrape_gumroad_discover(taxonomy="design", max_products=100, page_size=2)

    assert len(rows) == 2
    # Asked for a second page, it came back empty, then stopped.
    assert len(seen_requests) == 2
    assert [row["rank"] for row in rows] == [0, 1]
|
||||
|
||||
|
||||
def test_sort_invalido_lanza_valueerror(monkeypatch):
    """An unsupported ``sort`` raises ValueError before any request is made."""

    def forbidden_urlopen(req, timeout=None):
        # Reaching this point means validation did not run before the network
        # call, which is exactly what this test guards against.
        raise AssertionError("no deberia hacer red con sort invalido")

    monkeypatch.setattr("urllib.request.urlopen", forbidden_urlopen)

    expected_message = "sort must be one of"
    with pytest.raises(ValueError, match=expected_message):
        scrape_gumroad_discover(taxonomy="design", sort="trending")
|
||||
|
||||
|
||||
def test_body_gzip_se_descomprime(monkeypatch):
    """A gzip-compressed JSON body is inflated and parsed like a plain one."""
    # Cloudflare may serve the JSON gzip-compressed even when identity was
    # requested. The scraper must inflate the body (magic bytes 1f 8b) and
    # then parse the JSON.
    import gzip

    raw_json = json.dumps({"total": 7, "products": _SAMPLE_PRODUCTS}).encode("utf-8")
    compressed = gzip.compress(raw_json)

    def fake_urlopen(req, timeout=None):
        return _FakeResponse(compressed, headers={"Content-Encoding": "gzip"})

    monkeypatch.setattr("time.sleep", lambda *_: None)
    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)

    rows = scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)

    assert len(rows) == 2
    top = rows[0]
    assert top["name"] == "Cool Design Kit"
    assert top["total_in_taxonomy"] == 7
|
||||
|
||||
|
||||
def test_body_no_json_lanza_runtimeerror(monkeypatch):
    """A non-JSON body (e.g. a Cloudflare challenge page) raises RuntimeError."""
    # Cloudflare challenge: the server answers with HTML instead of JSON.
    challenge_html = b"<html><body>Just a moment...</body></html>"

    def fake_urlopen(req, timeout=None):
        return _FakeResponse(challenge_html)

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)

    with pytest.raises(RuntimeError, match="non-JSON"):
        scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)
|
||||
Reference in New Issue
Block a user