763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
100 lines
3.4 KiB
Python
100 lines
3.4 KiB
Python
"""fetch_reddit_search — busca posts en Reddit via la API JSON publica (sin auth).
|
|
|
|
Funcion impura: hace peticiones HTTP a www.reddit.com. Tolera errores por
|
|
subreddit y normaliza cada post a un shape comun de market intelligence.
|
|
"""
|
|
|
|
import logging

import requests
|
|
|
|
# User-Agent header value sent by fetch_reddit_search with each Reddit request.
_UA = "demand_radar/0.1 (registry market-intel)"
|
|
# Timeout handed to requests.get for each search call (seconds, per the requests API).
_TIMEOUT = 15
|
|
|
|
|
|
def _parse_children(children: list, query: str) -> list[dict]:
|
|
"""Normaliza la lista children de la respuesta de Reddit al shape comun."""
|
|
rows = []
|
|
for child in children:
|
|
data = child.get("data", {}) if isinstance(child, dict) else {}
|
|
permalink = data.get("permalink", "") or ""
|
|
rows.append({
|
|
"source": "reddit",
|
|
"platform_id": str(data.get("id", "")),
|
|
"title": data.get("title", "") or "",
|
|
"body": data.get("selftext", "") or "",
|
|
"url": "https://www.reddit.com" + permalink,
|
|
"author": data.get("author", "") or "",
|
|
"channel": data.get("subreddit", "") or "",
|
|
"created_utc": float(data.get("created_utc") or 0.0),
|
|
"platform_score": int(data.get("ups") or 0),
|
|
"query": query,
|
|
})
|
|
return rows
|
|
|
|
|
|
def fetch_reddit_search(
    query: str,
    subreddits: "list[str] | None" = None,
    limit: int = 50,
    sort: str = "new",
) -> list[dict]:
    """Search Reddit posts through the public JSON API (no authentication).

    One request is made per entry in ``subreddits``, restricted to that
    subreddit. When ``subreddits`` is None or empty a single global search is
    made instead. Every request sends ``t="year"``. A failure for one target
    (network error, HTTP error status such as 429, malformed JSON) is logged
    as a warning and skipped; the remaining targets are still queried.

    Args:
        query: Search term.
        subreddits: Subreddit names without the ``r/`` prefix. A single
            string is treated as a one-element list. None or empty means a
            global search across all of Reddit.
        limit: Maximum number of results per subreddit (or for the global
            search).
        sort: Reddit sort order: "new", "relevance", "top", "comments", "hot".

    Returns:
        List of normalized dicts (possibly empty), as produced by
        ``_parse_children``: source, platform_id, title, body, url, author,
        channel, created_utc, platform_score, query.
    """
    headers = {"User-Agent": _UA}
    results: list[dict] = []

    # A bare string would otherwise be iterated character by character.
    if isinstance(subreddits, str):
        subreddits = [subreddits]
    targets = subreddits if subreddits else [None]

    for sub in targets:
        params = {"q": query, "sort": sort, "limit": limit, "t": "year"}
        if sub:
            url = f"https://www.reddit.com/r/{sub}/search.json"
            params["restrict_sr"] = 1
        else:
            url = "https://www.reddit.com/search.json"

        try:
            resp = requests.get(
                url, params=params, headers=headers, timeout=_TIMEOUT
            )
            resp.raise_for_status()
            payload = resp.json()
            # Unwrap defensively: any unexpected shape yields no rows.
            listing = payload.get("data") if isinstance(payload, dict) else None
            children = (
                listing.get("children") if isinstance(listing, dict) else None
            )
            results.extend(_parse_children(children or [], query))
        except Exception as exc:
            # Deliberate best-effort boundary: one failing target must not
            # abort the others, but the failure should not vanish silently.
            logging.getLogger(__name__).warning(
                "Reddit search failed for %s (query=%r): %s",
                f"r/{sub}" if sub else "global search",
                query,
                exc,
            )
            continue

    return results
|