fn_registry/python/functions/datascience/fetch_reddit_search.py

"""fetch_reddit_search — busca posts en Reddit via la API JSON publica (sin auth).

Funcion impura: hace peticiones HTTP a www.reddit.com. Tolera errores por
subreddit y normaliza cada post a un shape comun de market intelligence.
"""

import logging

import requests

# Value sent as the User-Agent header by fetch_reddit_search.
_UA = "demand_radar/0.1 (registry market-intel)"
# Passed as `timeout=` to requests.get for every request (seconds, per the
# requests API).
_TIMEOUT = 15


def _parse_children(children: list, query: str) -> list[dict]:
    """Normaliza la lista children de la respuesta de Reddit al shape comun."""
    rows = []
    for child in children:
        data = child.get("data", {}) if isinstance(child, dict) else {}
        permalink = data.get("permalink", "") or ""
        rows.append({
            "source": "reddit",
            "platform_id": str(data.get("id", "")),
            "title": data.get("title", "") or "",
            "body": data.get("selftext", "") or "",
            "url": "https://www.reddit.com" + permalink,
            "author": data.get("author", "") or "",
            "channel": data.get("subreddit", "") or "",
            "created_utc": float(data.get("created_utc") or 0.0),
            "platform_score": int(data.get("ups") or 0),
            "query": query,
        })
    return rows


def _normalize_subreddits(subreddits) -> list:
    """Return the cleaned subreddit names to query, or ``[None]`` for a global search."""
    if isinstance(subreddits, str):
        # A bare string would otherwise be iterated character by character.
        subreddits = [subreddits]

    names = []
    for raw in subreddits or []:
        if not isinstance(raw, str):
            continue
        name = raw.strip().lstrip("/")
        if name.lower().startswith("r/"):
            # Accept "r/python" and "/r/python" as well as plain "python".
            name = name[2:]
        name = name.strip("/")
        if name:
            names.append(name)
    return names or [None]


def fetch_reddit_search(
    query: str,
    subreddits: list[str] = None,
    limit: int = 50,
    sort: str = "new",
) -> list[dict]:
    """Search Reddit posts through the public JSON API (no authentication).

    One request is made per subreddit, restricted to that subreddit. When
    `subreddits` is None, empty, or contains only blank entries, a single
    site-wide search is made instead. The `t` (time window) parameter is
    always sent as "year".

    The function is best-effort: any failure for one target (network error,
    HTTP error status such as 429, malformed JSON, unexpected payload shape)
    is logged as a warning and that target is skipped; the remaining targets
    are still queried. No exception is raised for request failures.

    Args:
        query: Search term.
        subreddits: Subreddit names to search. A leading "r/" or "/r/" and
            surrounding whitespace are tolerated; blank or non-string entries
            are ignored. A single string is treated as a one-element list.
            If None or empty, all of Reddit is searched.
        limit: Maximum number of results per subreddit (or for the global
            search); sent unchanged as the `limit` query parameter.
            NOTE(review): Reddit listings are reported to cap at 100 and this
            function does not paginate -- confirm before relying on more.
        sort: Reddit sort order: "new", "relevance", "top", "comments", "hot".

    Returns:
        List of normalized dicts (possibly empty). Each dict has the keys:
        source, platform_id, title, body, url, author, channel, created_utc,
        platform_score, query.
    """
    logger = logging.getLogger(__name__)
    headers = {"User-Agent": _UA}
    results: list[dict] = []

    for sub in _normalize_subreddits(subreddits):
        params = {"q": query, "sort": sort, "limit": limit, "t": "year"}
        if sub:
            url = f"https://www.reddit.com/r/{sub}/search.json"
            params["restrict_sr"] = 1
        else:
            url = "https://www.reddit.com/search.json"

        try:
            resp = requests.get(
                url, params=params, headers=headers, timeout=_TIMEOUT
            )
            resp.raise_for_status()
            payload = resp.json()
            listing = payload.get("data") if isinstance(payload, dict) else None
            children = (
                listing.get("children") if isinstance(listing, dict) else None
            )
            if not isinstance(children, list):
                children = []
            results.extend(_parse_children(children, query))
        except Exception as exc:
            # Deliberately broad: one failing target must not abort the rest.
            # Log it so failures are visible instead of silently dropped.
            logger.warning(
                "Reddit search failed for %s (query=%r): %s",
                f"r/{sub}" if sub else "global search",
                query,
                exc,
            )
            continue

    return results