feat(datascience): auto-commit con 7 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,11 @@
|
||||
|
||||
Operar **ONLYOFFICE Desktop Editors** (binario `/usr/bin/onlyoffice-desktopeditors`) en Linux/X11 desde terminal, gestionando la **ventana** de los archivos sin perturbar la instancia personal del usuario.
|
||||
|
||||
Este grupo NO es el ONLYOFFICE **Document Server** (web/Docker) — para eso ver `start_documentserver_bash_infra`, `documentserver_health_go_infra`, `onlyoffice_command_service_go_infra` y compañia. Este grupo es el editor de **escritorio**.
|
||||
Este grupo NO es el ONLYOFFICE **Document Server** (web/Docker/co-editing por navegador): a día de hoy el registry NO tiene funciones de Document Server (las que antes citaba esta página — `start_documentserver_*`, `documentserver_health_*`, `onlyoffice_command_service_*` — nunca se implementaron). Este grupo es el editor de **escritorio**.
|
||||
|
||||
### Edición en vivo desde Claude — app `onlyoffice_bridge`
|
||||
|
||||
Para que Claude **lea y edite el documento abierto en tiempo real** (Word/Cell/Slide) sin cerrar/reabrir, existe la app `apps/onlyoffice_bridge/`: un plugin de sistema instalado dentro de OnlyOffice + un server loopback con long-poll. Es la alternativa "in-place" al ciclo cerrar+reabrir de `reload_onlyoffice_file` (Issue #2313). Ver su `app.md` para instalación, protocolo y limitaciones (el foco de la ventana condiciona el arranque del plugin). Este grupo (`open`/`reload`/`close`/`save`) sigue siendo la vía para gestionar la **ventana**; `onlyoffice_bridge` es la vía para editar el **contenido** en vivo.
|
||||
|
||||
## Convencion de instancia aislada (slot)
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -17,7 +17,7 @@ from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.1.1"
|
||||
CHAPTER_ID = "glosario"
|
||||
CHAPTER_TITLE = "Glosario"
|
||||
|
||||
@@ -89,14 +89,19 @@ def build_glosario(profile: dict, ctx: dict):
|
||||
"Cada término va resaltado en el texto y, al pulsarlo, salta a su "
|
||||
"definición en esta sección.")),
|
||||
]
|
||||
# One clickable destination per term, alphabetically by visible label. A term
|
||||
# registered without a definition is completed from the canonical baseline.
|
||||
for term in glossary.terms(by="label"):
|
||||
# One clickable destination per term, alphabetically by *visible* label. The
|
||||
# baseline resolution must happen BEFORE sorting: a term registered bare (no
|
||||
# label) carries its key as label in the collector, so ordering by the
|
||||
# collector's label would place it by its key instead of by the human label
|
||||
# supplied by the baseline catalog. Resolve first, then sort by the final label.
|
||||
resolved = []
|
||||
for term in glossary.terms(by="order"):
|
||||
label, definition = _resolve_term(term)
|
||||
resolved.append((label, definition, model._safe_str(term.get("key"))))
|
||||
resolved.sort(key=lambda e: model._safe_str(e[0]).lower())
|
||||
for label, definition, key in resolved:
|
||||
blocks.append(model.GlossaryEntry(
|
||||
key=model._safe_str(term.get("key")),
|
||||
label=label,
|
||||
definition=definition))
|
||||
key=key, label=label, definition=definition))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
"""Tests for the GLOSARIO chapter — DoD: golden + edges + degradation + no-cut render.
|
||||
|
||||
The glossary is the last chapter of every AutomaticEDA document. It does not read
|
||||
the profile: it turns the terms that the other chapters registered on the shared
|
||||
``GlossaryCollector`` (``ctx['glossary']``) into one clickable ``GlossaryEntry``
|
||||
destination each, alphabetically by visible label.
|
||||
|
||||
Covered here:
|
||||
|
||||
- **Golden**: a collector with three terms (one carrying its own definition, two
|
||||
registered bare and completed from the canonical baseline catalog) builds a
|
||||
``Chapter`` with three ``GlossaryEntry`` blocks, alphabetically ordered, and
|
||||
renders to PDF and PPTX with nothing cut.
|
||||
- **Baseline resolution** (``_resolve_term``): a bare term whose key is in the
|
||||
baseline gets its label *and* definition filled in; a term that already carries
|
||||
its own definition is never overwritten.
|
||||
- **Edges**: ``None`` / ``{}`` ctx, an empty collector and a non-collector value in
|
||||
``ctx['glossary']`` all return ``None`` (the chapter simply disappears) and never
|
||||
raise, even with a ``None`` profile.
|
||||
- **Click target**: every emitted entry carries the registered ``key`` so each
|
||||
in-text ``[[term:key]]`` appearance resolves to a real jump.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from pptx import Presentation
|
||||
from pypdf import PdfReader
|
||||
|
||||
from datascience.automatic_eda.chapters.glosario import (
|
||||
_BASELINE_TERMS,
|
||||
_resolve_term,
|
||||
build_glosario,
|
||||
)
|
||||
from datascience.automatic_eda.model import (
|
||||
Chapter,
|
||||
GlossaryCollector,
|
||||
GlossaryEntry,
|
||||
)
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Helpers.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _entries(chapter: Chapter) -> list:
    """Return the chapter's ``GlossaryEntry`` blocks, keeping block order."""
    found = []
    for block in chapter.blocks:
        if isinstance(block, GlossaryEntry):
            found.append(block)
    return found
|
||||
|
||||
|
||||
def _render_both(chapter: Chapter, tag: str):
    """Render the chapter to PDF and PPTX and return ``(pdf_text, n_slides)``.

    Both files are written into a temporary directory that is removed before
    the function returns, so repeated test runs do not leave ``glosario_*``
    directories behind on disk.

    Args:
        chapter: The built chapter to render (passed as a one-element list to
            both renderers).
        tag: Short label used in the temp-directory prefix and document title.

    Returns:
        A ``(pdf_text, n_slides)`` tuple: the concatenated text extracted from
        every PDF page, and the number of slides in the PPTX.
    """
    meta = {"title": f"EDA — {tag}"}
    with tempfile.TemporaryDirectory(prefix=f"glosario_{tag}_") as tmp:
        pdf_path = os.path.join(tmp, "out.pdf")
        pptx_path = os.path.join(tmp, "out.pptx")
        render_automatic_eda_pdf([chapter], pdf_path, meta)
        render_automatic_eda_pptx([chapter], pptx_path, meta)
        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
        # Extract everything we need while the files still exist; the
        # directory (and both files) disappear when the with-block exits.
        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
        n_slides = len(Presentation(pptx_path).slides)
    return text, n_slides
|
||||
|
||||
|
||||
def _collector_three_terms() -> GlossaryCollector:
    """Build a collector holding three terms registered out of alphabetical order.

    - ``entropia``: registered with its own label and definition (must not be
      overwritten by the baseline).
    - ``pagina_categorica``: registered bare, to be completed from the baseline.
    - ``histograma_boxplot``: registered bare, to be completed from the baseline.
    """
    collector = GlossaryCollector()
    collector.add(
        "entropia",
        "Entropía",
        "Medida de la incertidumbre o dispersión de una variable categórica.",
    )
    # Bare registrations: label and definition are expected from the baseline.
    for bare_key in ("pagina_categorica", "histograma_boxplot"):
        collector.add(bare_key)
    return collector
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_terms_render_clickable_entries():
    """Golden path: three terms build three ordered entries and render cleanly."""
    collector = _collector_three_terms()
    built = build_glosario({"table": "x"}, {"glossary": collector})

    assert isinstance(built, Chapter)
    assert built.id == "glosario"
    assert built.title == "Glosario"
    assert built.version == "1.1.1"

    glossary_blocks = _entries(built)
    assert len(glossary_blocks) == 3
    for block in glossary_blocks:
        assert isinstance(block, GlossaryEntry)

    # Alphabetical by visible label: "Cómo leer…" < "Cómo se organiza…" < "Entropía".
    visible = [block.label for block in glossary_blocks]
    assert visible == sorted(visible, key=str.lower)
    assert visible[0] == "Cómo leer el histograma y el boxplot"
    assert visible[-1] == "Entropía"

    # Bare terms were completed from the baseline; the term that brought its
    # own definition kept it.
    entry_for = {block.key: block for block in glossary_blocks}
    assert "boxplot de Tukey" in entry_for["histograma_boxplot"].definition
    assert "identificador" in entry_for["pagina_categorica"].definition
    assert entry_for["entropia"].definition.startswith("Medida de la incertidumbre")

    # Both renderers produce output; a label reaches the extracted PDF text.
    rendered_text, slide_count = _render_both(built, "golden")
    assert "Entropía" in rendered_text
    assert slide_count >= 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Baseline resolution (_resolve_term).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_resolve_term_completes_label_and_definition_from_baseline():
    """A bare term (label == key, empty definition) is filled from the baseline."""
    bare_key = "histograma_boxplot"
    bare_term = {"key": bare_key, "label": bare_key, "definition": ""}

    resolved_label, resolved_definition = _resolve_term(bare_term)

    assert resolved_label == _BASELINE_TERMS[bare_key]["label"]
    assert "boxplot de Tukey" in resolved_definition
|
||||
|
||||
|
||||
def test_resolve_term_keeps_own_definition_over_baseline():
    """A term with its own label and definition wins over the baseline entry."""
    custom_definition = "Definición propia que no debe pisarse."
    # The key exists in the baseline, yet the term's own values must survive.
    term = {
        "key": "pagina_categorica",
        "label": "Mi etiqueta",
        "definition": custom_definition,
    }

    label, definition = _resolve_term(term)

    assert label == "Mi etiqueta"
    assert definition == custom_definition
|
||||
|
||||
|
||||
def test_resolve_term_unknown_key_returns_as_is():
    """A term whose key is not in the baseline comes back unchanged."""
    free_term = {
        "key": "sin_baseline",
        "label": "Término libre",
        "definition": "Texto.",
    }

    label, definition = _resolve_term(free_term)

    assert label == "Término libre"
    assert definition == "Texto."
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges / degradation — the chapter disappears instead of raising.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_none_when_no_glossary():
    """An empty or missing ctx yields no chapter."""
    for ctx in ({}, None):
        assert build_glosario({"table": "x"}, ctx) is None
|
||||
|
||||
|
||||
def test_none_when_empty_collector():
    """A collector with no registered terms yields no chapter."""
    ctx = {"glossary": GlossaryCollector()}
    built = build_glosario({"table": "x"}, ctx)
    assert built is None
|
||||
|
||||
|
||||
def test_none_when_glossary_is_not_a_collector():
    """A stray value in ``ctx['glossary']`` must not be treated as a collector."""
    stray_values = (["not", "a", "collector"], {"entropia": "x"})
    for stray in stray_values:
        assert build_glosario({"table": "x"}, {"glossary": stray}) is None
|
||||
|
||||
|
||||
def test_none_profile_does_not_raise():
    """A ``None`` profile neither blocks a valid build nor crashes an empty one."""
    collector = GlossaryCollector()
    collector.add("entropia", "Entropía", "def")

    # With a valid collector the chapter is still built.
    built = build_glosario(None, {"glossary": collector})
    assert isinstance(built, Chapter)

    # With no ctx at all the builder returns None instead of raising.
    assert build_glosario(None, None) is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Click target — each entry carries its registration key.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_entries_carry_registered_key_as_click_target():
    """Every emitted entry exposes the key it was registered under."""
    built = build_glosario({}, {"glossary": _collector_three_terms()})
    emitted_keys = set()
    for entry in _entries(built):
        emitted_keys.add(entry.key)
    assert emitted_keys == {"entropia", "pagina_categorica", "histograma_boxplot"}
|
||||
@@ -0,0 +1,65 @@
|
||||
---
|
||||
name: scrape_gumroad_discover
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def scrape_gumroad_discover(taxonomy: str, sort: str = 'best_selling', max_products: int = 300, page_size: int = 100) -> list[dict]"
|
||||
description: "Scrapea el marketplace publico de Gumroad Discover usando el endpoint JSON verificado gumroad.com/products/search (taxonomy+sort+from+size). Recolecta los productos de una taxonomy (nicho) ordenados por el criterio elegido y estampa en cada producto el total de la taxonomy (saturacion del nicho). Normaliza cada producto a un dict plano con id, seller_name, ratings, precio (cents/usd), pay-what-you-want/free, native_type, url y metadatos de scrape (taxonomy, total_in_taxonomy, sort_used, rank 0-based). Solo stdlib (urllib+json+time+gzip+zlib)."
|
||||
tags: [gumroad, scraping, market-intel, trends, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_normaliza_producto_a_dict_plano", "test_paginacion_para_al_agotar_ventana", "test_sort_invalido_lanza_valueerror", "test_body_gzip_se_descomprime", "test_body_no_json_lanza_runtimeerror"]
|
||||
test_file_path: "python/functions/datascience/scrape_gumroad_discover_test.py"
|
||||
file_path: "python/functions/datascience/scrape_gumroad_discover.py"
|
||||
params:
|
||||
- name: taxonomy
|
||||
desc: "Slug de taxonomy / nicho de Gumroad (ej. 'design', 'business-and-money', '3d'). Determina el segmento de mercado scrapeado y el valor total_in_taxonomy (numero total de productos = saturacion del nicho) que se estampa en cada producto."
|
||||
- name: sort
|
||||
desc: "Criterio de orden. Uno de: best_selling, most_reviewed, hot_and_new, highest_rated, newest, price_asc, price_desc. Cualquier otro valor lanza ValueError. Default 'best_selling'."
|
||||
- name: max_products
|
||||
desc: "Cota superior de productos a recolectar entre paginas. Default 300. La ventana de paginacion de Gumroad es finita (from~960 aun devuelve datos), asi que valores muy altos pueden recibir menos productos de los pedidos."
|
||||
- name: page_size
|
||||
desc: "Numero de productos pedidos por pagina via 'size'. Gumroad admite al menos 300. Una pagina que devuelve menos de page_size items señala el fin de la ventana y detiene la paginacion. Default 100."
|
||||
output: "Lista de dicts planos, uno por producto, con exactamente estas claves: id, permalink, name, seller_name, ratings_count, ratings_avg, price_cents, currency_code, price_usd (float = price_cents/100), is_pay_what_you_want (bool), is_free (bool = price_cents==0), native_type, url, taxonomy (el arg), total_in_taxonomy (el 'total' del JSON = saturacion del nicho), sort_used (el arg sort), rank (posicion 0-based en el orden devuelto)."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.scrape_gumroad_discover import scrape_gumroad_discover
|
||||
|
||||
# Top best-sellers del nicho "design" en Gumroad Discover
|
||||
rows = scrape_gumroad_discover(taxonomy="design", sort="best_selling", max_products=300, page_size=100)
|
||||
print(len(rows), "productos")
|
||||
print("saturacion del nicho:", rows[0]["total_in_taxonomy"])
|
||||
print(rows[0])
|
||||
# {'id': '...', 'permalink': '...', 'name': '...', 'seller_name': '...',
|
||||
# 'ratings_count': 128, 'ratings_avg': 4.9, 'price_cents': 2900,
|
||||
# 'currency_code': 'usd', 'price_usd': 29.0, 'is_pay_what_you_want': False,
|
||||
# 'is_free': False, 'native_type': 'digital', 'url': 'https://...',
|
||||
# 'taxonomy': 'design', 'total_in_taxonomy': 4213, 'sort_used': 'best_selling', 'rank': 0}
|
||||
|
||||
# Productos mas nuevos de un nicho concreto
|
||||
nuevos = scrape_gumroad_discover(taxonomy="3d", sort="newest", max_products=50)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Usala cuando quieras hacer market intelligence sobre productos digitales: descubrir que se vende mas en un nicho de Gumroad, medir la saturacion del nicho (`total_in_taxonomy`) y capturar precios, valoraciones y vendedores para decidir si un nicho merece la pena o esta saturado. Es la fuente de un pipeline de deteccion de oportunidades de producto digital (grupo `market-intel`): scrapea varias taxonomies/sorts, cruza los snapshots y prioriza nichos con demanda alta y competencia manejable. Llamala antes de cualquier analisis de catalogo digital; el dict devuelto es plano y esta listo para insertar en una tabla tras añadir `snapshot_date`/`scraped_at`.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **ratings.count son REVIEWS, no ventas**: `ratings_count` cuenta valoraciones dejadas, NO unidades vendidas. Como proxy de ventas hay que multiplicar por un factor incierto (solo una fraccion de compradores valora, y esa fraccion varia por nicho/precio). Trata `ratings_count` como un limite inferior ruidoso de la demanda, no como ventas reales.
|
||||
- **price=0 no siempre significa gratis util**: `price_cents==0` marca `is_free=True`, pero puede tratarse de un producto pay-what-you-want (`is_pay_what_you_want=True`) con minimo 0, no de un regalo. Cruza siempre `is_free` con `is_pay_what_you_want` antes de sacar conclusiones de precio.
|
||||
- **Ventana de paginacion finita**: `page`/`per_page` se IGNORAN (siempre devuelven desde 0); solo `from`+`size` paginan. La ventana es amplia pero finita (from~960 aun devuelve, mas alla se agota). Pedir `max_products` muy alto puede recibir menos productos de los pedidos: la funcion para cuando una pagina devuelve menos de `page_size` items.
|
||||
- **Cloudflare bloquea sin UA de navegador**: el endpoint exige `Accept: application/json` y un `User-Agent` de navegador. Sin ello Gumroad/Cloudflare puede devolver una pagina de challenge en HTML (no JSON) o redirigir. La funcion ya envia un UA de Chrome; si aun asi recibe un body no-JSON lanza `RuntimeError` claro — en ese caso cae al navegador del ecosistema (browser MCP / CDP).
|
||||
- **Moneda no siempre USD**: `price_usd` es solo `price_cents/100` por conveniencia; si `currency_code != 'usd'` el valor NO esta convertido a dolares. Conserva y usa `currency_code` para convertir tu mismo.
|
||||
@@ -0,0 +1,245 @@
|
||||
"""Scrape the public Gumroad Discover marketplace for niche/market intelligence.
|
||||
|
||||
Uses Gumroad's verified public JSON search endpoint
|
||||
|
||||
GET https://gumroad.com/products/search?taxonomy=<taxonomy>&sort=<sort>&from=<offset>&size=<n>
|
||||
|
||||
to collect the products of a taxonomy (niche) sorted by a chosen criterion. The
|
||||
endpoint exposes, besides the product list, the ``total`` count of products in
|
||||
that taxonomy (a proxy for niche saturation) and ``tags_data`` (sub-niches with
|
||||
their own product counts). This scraper focuses on the product list and stamps
|
||||
each product with the taxonomy-level ``total`` so a downstream consumer can
|
||||
reason about saturation without a second request.
|
||||
|
||||
Only stdlib (``urllib``, ``json``, ``time``, ``gzip``, ``zlib``) is used — no heavy dependencies.
|
||||
The function is impure (it performs network I/O) and raises ``RuntimeError`` on
|
||||
HTTP / JSON failures.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import zlib
|
||||
|
||||
_BASE_URL = "https://gumroad.com/products/search"
|
||||
|
||||
# A browser User-Agent is required: without it Gumroad / Cloudflare may reject
|
||||
# the request or redirect away from the JSON payload.
|
||||
_USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
_VALID_SORTS = (
|
||||
"best_selling",
|
||||
"most_reviewed",
|
||||
"hot_and_new",
|
||||
"highest_rated",
|
||||
"newest",
|
||||
"price_asc",
|
||||
"price_desc",
|
||||
)
|
||||
|
||||
|
||||
def _build_headers() -> dict:
    """Return the HTTP headers sent with every Gumroad search request.

    A fresh dict is built on each call.
    """
    headers = {}
    headers["User-Agent"] = _USER_AGENT
    headers["Accept"] = "application/json"
    headers["Accept-Language"] = "en-US,en;q=0.9"
    # Ask for an uncompressed body: urllib does not inflate gzip/deflate on
    # its own, and Cloudflare serves gzip when a browser UA is present.
    # "identity" keeps the payload as plain JSON; _fetch_json still inflates
    # defensively in case Cloudflare ignores the request.
    headers["Accept-Encoding"] = "identity"
    headers["Connection"] = "keep-alive"
    headers["X-Requested-With"] = "XMLHttpRequest"
    return headers
|
||||
|
||||
|
||||
def _build_url(taxonomy: str, sort: str, offset: int, size: int) -> str:
    """Return the Discover search URL for one page window.

    Pagination is expressed with ``from`` (the offset) and ``size`` only:
    Gumroad ignores ``page``/``per_page`` and always answers from 0 for them.
    """
    params = (
        ("taxonomy", taxonomy),
        ("sort", sort),
        ("from", offset),
        ("size", size),
    )
    return _BASE_URL + "?" + urllib.parse.urlencode(params)
|
||||
|
||||
|
||||
def _fetch_json(url: str, headers: dict, timeout: int) -> dict:
    """GET ``url`` and return its JSON body decoded into a dict.

    Args:
        url: Fully composed request URL.
        headers: Request headers to send.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object.

    Raises:
        RuntimeError: On an HTTP error status, on a network failure (including
            a timeout or reset while reading the body), when a compressed body
            cannot be inflated, when the body is not valid UTF-8 JSON, or when
            the JSON document is not an object.
    """
    req = urllib.request.Request(url, headers=headers, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    except urllib.error.HTTPError as exc:
        raise RuntimeError(
            f"Gumroad search HTTP {exc.code} for {url}: {exc.reason}. "
            "Cloudflare may be blocking the request; ensure a browser "
            "User-Agent is sent, or fall back to the browser MCP/CDP path."
        ) from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc.reason}"
        ) from exc
    except OSError as exc:
        # Read timeouts and connection resets during resp.read() surface as
        # plain OSError subclasses, not URLError; keep the RuntimeError contract.
        raise RuntimeError(
            f"Gumroad search request to {url} failed: {exc}"
        ) from exc

    # Defensive inflate: Cloudflare may still return a gzip/deflate body
    # (magic bytes 1f 8b for gzip) even when we ask for identity.
    try:
        if "gzip" in encoding or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        elif "deflate" in encoding:
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # header; negative wbits selects that format.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    except (OSError, EOFError, zlib.error) as exc:
        raise RuntimeError(
            f"Gumroad search returned a compressed body that could not be "
            f"inflated for {url}: {exc}"
        ) from exc

    try:
        payload = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError) as exc:
        raise RuntimeError(
            f"Gumroad search returned non-JSON body for {url}: {exc}. "
            "A browser User-Agent is required; a Cloudflare challenge page "
            "is returned as HTML, not JSON."
        ) from exc

    # The caller reads keys with .get(); anything but an object is unusable.
    if not isinstance(payload, dict):
        raise RuntimeError(
            f"Gumroad search returned unexpected JSON for {url}: expected an "
            f"object, got {type(payload).__name__}."
        )
    return payload
|
||||
|
||||
|
||||
def _normalize_product(
    product: dict,
    taxonomy: str,
    total_in_taxonomy: int,
    sort: str,
    rank: int,
) -> dict:
    """Flatten one raw Gumroad product into the flat output dict.

    Nested ``seller`` and ``ratings`` objects are reduced to ``seller_name``,
    ``ratings_count`` and ``ratings_avg``. A ``price_cents`` that is missing or
    not an ``int`` is replaced by 0. The scrape metadata (``taxonomy``,
    ``total_in_taxonomy``, ``sort_used``, ``rank``) is copied from the arguments.
    """
    seller_info = product.get("seller") or {}
    rating_info = product.get("ratings") or {}

    cents = product.get("price_cents")
    cents = cents if isinstance(cents, int) else 0

    flat = {field: product.get(field) for field in ("id", "permalink", "name")}
    flat["seller_name"] = seller_info.get("name")
    flat["ratings_count"] = rating_info.get("count")
    flat["ratings_avg"] = rating_info.get("average")
    flat["price_cents"] = cents
    flat["currency_code"] = product.get("currency_code")
    # price_usd is only cents/100; it is NOT converted when the currency is
    # not USD, which is why currency_code is carried alongside it.
    flat["price_usd"] = cents / 100.0
    flat["is_pay_what_you_want"] = bool(product.get("is_pay_what_you_want"))
    flat["is_free"] = cents == 0
    flat["native_type"] = product.get("native_type")
    flat["url"] = product.get("url")
    flat["taxonomy"] = taxonomy
    flat["total_in_taxonomy"] = total_in_taxonomy
    flat["sort_used"] = sort
    flat["rank"] = rank
    return flat
|
||||
|
||||
|
||||
def scrape_gumroad_discover(
    taxonomy: str,
    sort: str = "best_selling",
    max_products: int = 300,
    page_size: int = 100,
) -> list[dict]:
    """Scrape the public Gumroad Discover marketplace for a taxonomy (niche).

    Paginates the Gumroad search endpoint with ``from``+``size`` until
    ``max_products`` are collected, a page comes back empty, or a page returns
    fewer items than were requested (end of window). Each product is
    normalized to a flat dict carrying the taxonomy-level ``total`` (niche
    saturation), the sort used and the 0-based rank in the returned order.

    Args:
        taxonomy: Gumroad taxonomy slug / niche, e.g. ``"design"``,
            ``"business-and-money"``, ``"3d"``. Determines the market segment
            scraped and the ``total_in_taxonomy`` reported on every product.
        sort: One of ``best_selling, most_reviewed, hot_and_new,
            highest_rated, newest, price_asc, price_desc``. Any other value
            raises ``ValueError``.
        max_products: Upper bound on how many products to collect across pages.
            Gumroad's pagination window is finite (from~960 still returns), so
            very high values may hit fewer results than requested.
        page_size: Items requested per page via ``size``. Gumroad accepts at
            least 300. The final page requests only what is still needed.

    Returns:
        A list of flat dicts, one per product, with exactly these keys:
        ``id, permalink, name, seller_name, ratings_count, ratings_avg,
        price_cents, currency_code, price_usd, is_pay_what_you_want, is_free,
        native_type, url, taxonomy, total_in_taxonomy, sort_used, rank``.

    Raises:
        ValueError: If ``sort`` is not one of the allowed values, or if
            ``max_products``/``page_size`` are not positive.
        RuntimeError: On network failure, non-2xx HTTP, or a non-JSON body
            (typically a Cloudflare challenge served without a browser UA).
    """
    if sort not in _VALID_SORTS:
        raise ValueError(
            f"sort must be one of {_VALID_SORTS}, got {sort!r}"
        )
    if max_products <= 0:
        raise ValueError(f"max_products must be positive, got {max_products}")
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size}")

    headers = _build_headers()
    results: list[dict] = []
    total_in_taxonomy = 0
    offset = 0

    while len(results) < max_products:
        # Never ask for more than we still need on the last page.
        size = min(page_size, max_products - len(results))
        url = _build_url(taxonomy, sort, offset, size)
        payload = _fetch_json(url, headers, timeout=20)

        # The taxonomy-level total is stamped on every product; refresh it
        # whenever a page reports an integer value.
        total_val = payload.get("total")
        if isinstance(total_val, int):
            total_in_taxonomy = total_val

        products = payload.get("products") or []
        if not products:
            break

        for product in products:
            if len(results) >= max_products:
                break
            rank = len(results)  # 0-based position across the whole scrape
            results.append(
                _normalize_product(
                    product,
                    taxonomy=taxonomy,
                    total_in_taxonomy=total_in_taxonomy,
                    sort=sort,
                    rank=rank,
                )
            )

        # A page shorter than what we asked for means the window is exhausted.
        if len(products) < size:
            break

        offset += size
        # Be polite between requests so we don't hammer Gumroad.
        time.sleep(0.4)

    return results
|
||||
@@ -0,0 +1,177 @@
|
||||
"""Tests para scrape_gumroad_discover.
|
||||
|
||||
Mockean urllib.request.urlopen para NO hacer red: se inyecta un cuerpo JSON de
|
||||
Gumroad con productos de ejemplo y se verifica la normalizacion a dict plano, el
|
||||
corte de la paginacion, la validacion de sort y el manejo de un body no-JSON
|
||||
(escenario tipico de challenge de Cloudflare). El scrape real no se testea aqui.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from scrape_gumroad_discover import scrape_gumroad_discover
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
"""Context manager que imita la respuesta de urllib.request.urlopen."""
|
||||
|
||||
def __init__(self, raw: bytes, headers: dict | None = None):
|
||||
self._raw = raw
|
||||
# urllib response exposes .headers; the scraper reads Content-Encoding
|
||||
# from it to decide whether to inflate the body.
|
||||
self.headers = headers or {}
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc):
|
||||
return False
|
||||
|
||||
def read(self):
|
||||
return self._raw
|
||||
|
||||
|
||||
def _json_response(payload: dict) -> _FakeResponse:
    """Wrap ``payload`` in a fake response whose body is its UTF-8 JSON text."""
    body = json.dumps(payload).encode("utf-8")
    return _FakeResponse(body)
|
||||
|
||||
|
||||
# Dos productos de ejemplo con la estructura real verificada de Gumroad.
|
||||
_SAMPLE_PRODUCTS = [
|
||||
{
|
||||
"id": "prod_1",
|
||||
"permalink": "coolkit",
|
||||
"name": "Cool Design Kit",
|
||||
"seller": {"id": "s1", "name": "Alice Design", "avatar_url": "http://a"},
|
||||
"ratings": {"count": 128, "average": 4.9},
|
||||
"thumbnail_url": "http://thumb1",
|
||||
"native_type": "digital",
|
||||
"price_cents": 2900,
|
||||
"currency_code": "usd",
|
||||
"is_pay_what_you_want": False,
|
||||
"url": "https://alice.gumroad.com/l/coolkit",
|
||||
"description": "A kit",
|
||||
},
|
||||
{
|
||||
"id": "prod_2",
|
||||
"permalink": "freebie",
|
||||
"name": "Free Font Pack",
|
||||
"seller": {"id": "s2", "name": "Bob Type"},
|
||||
"ratings": {"count": 0, "average": 0.0},
|
||||
"native_type": "digital",
|
||||
"price_cents": 0,
|
||||
"currency_code": "eur",
|
||||
"is_pay_what_you_want": True,
|
||||
"url": "https://bob.gumroad.com/l/freebie",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def test_normaliza_producto_a_dict_plano(monkeypatch):
    """Each raw product is flattened to the exact flat-dict contract."""
    canned = {"total": 4213, "tags_data": [], "products": _SAMPLE_PRODUCTS}
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _json_response(canned),
    )

    rows = scrape_gumroad_discover(
        taxonomy="design", sort="best_selling", max_products=10, page_size=100
    )
    assert len(rows) == 2
    first, second = rows[0], rows[1]

    # Exact flat structure: no extra keys, none missing.
    assert set(first) == {
        "id", "permalink", "name", "seller_name", "ratings_count", "ratings_avg",
        "price_cents", "currency_code", "price_usd", "is_pay_what_you_want",
        "is_free", "native_type", "url", "taxonomy", "total_in_taxonomy",
        "sort_used", "rank",
    }

    expected_first = {
        "id": "prod_1",
        "name": "Cool Design Kit",
        "seller_name": "Alice Design",   # nested -> flat
        "ratings_count": 128,
        "ratings_avg": 4.9,
        "price_cents": 2900,
        "price_usd": 29.0,               # cents / 100
        "currency_code": "usd",
        "native_type": "digital",
        "taxonomy": "design",            # the argument
        "total_in_taxonomy": 4213,       # the JSON "total"
        "sort_used": "best_selling",
        "rank": 0,
    }
    for field, value in expected_first.items():
        assert first[field] == value, field
    assert first["is_pay_what_you_want"] is False
    assert first["is_free"] is False

    # Second product: free / pay-what-you-want, non-USD currency preserved.
    assert second["price_cents"] == 0
    assert second["price_usd"] == 0.0
    assert second["is_free"] is True
    assert second["is_pay_what_you_want"] is True
    assert second["currency_code"] == "eur"  # kept as-is, not converted
    assert second["rank"] == 1
|
||||
|
||||
|
||||
def test_paginacion_para_al_agotar_ventana(monkeypatch):
    """Pagination stops when the follow-up page comes back empty."""
    # page_size=2 and a first page with exactly 2 products: the page is full,
    # so a second page is requested; it returns no products and the loop ends.
    calls = []

    def fake_urlopen(req, timeout=None):
        calls.append(req)
        page = _SAMPLE_PRODUCTS if len(calls) == 1 else []
        return _json_response({"total": 2, "products": page})

    monkeypatch.setattr("urllib.request.urlopen", fake_urlopen)
    monkeypatch.setattr("time.sleep", lambda *_: None)  # do not sleep in tests

    rows = scrape_gumroad_discover(taxonomy="design", max_products=100, page_size=2)

    assert len(rows) == 2
    assert len(calls) == 2  # asked for a second page, got it empty, stopped
    assert [row["rank"] for row in rows] == [0, 1]
|
||||
|
||||
|
||||
def test_sort_invalido_lanza_valueerror(monkeypatch):
    """An unknown sort is rejected by validation before any request is made."""

    def network_must_not_be_used(req, timeout=None):
        raise AssertionError("no deberia hacer red con sort invalido")

    monkeypatch.setattr("urllib.request.urlopen", network_must_not_be_used)

    with pytest.raises(ValueError, match="sort must be one of"):
        scrape_gumroad_discover(taxonomy="design", sort="trending")
|
||||
|
||||
|
||||
def test_body_gzip_se_descomprime(monkeypatch):
    """A gzip-compressed JSON body is inflated and parsed."""
    # Cloudflare may serve the JSON gzip-compressed even when identity is
    # requested; the scraper must inflate it (magic bytes 1f 8b) and parse it.
    import gzip as _gzip

    plain = json.dumps({"total": 7, "products": _SAMPLE_PRODUCTS}).encode("utf-8")
    compressed = _gzip.compress(plain)

    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _FakeResponse(
            compressed, headers={"Content-Encoding": "gzip"}
        ),
    )
    monkeypatch.setattr("time.sleep", lambda *_: None)

    rows = scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)

    assert len(rows) == 2
    head = rows[0]
    assert head["name"] == "Cool Design Kit"
    assert head["total_in_taxonomy"] == 7
|
||||
|
||||
|
||||
def test_body_no_json_lanza_runtimeerror(monkeypatch):
    """An HTML body (Cloudflare challenge) surfaces as a clear RuntimeError."""
    challenge_page = b"<html><body>Just a moment...</body></html>"
    monkeypatch.setattr(
        "urllib.request.urlopen",
        lambda req, timeout=None: _FakeResponse(challenge_page),
    )

    with pytest.raises(RuntimeError, match="non-JSON"):
        scrape_gumroad_discover(taxonomy="design", max_products=10, page_size=100)
|
||||
Reference in New Issue
Block a user