# gnula_grabber/crawl.py

#!/usr/bin/env python3
"""PARTE 1 — Crawler gnula: detecta pelis en ESPAÑOL (bandera es.png) y las cataloga.

Usa FlareSolverr (Chrome NordVPN+CF, headless) para fetchear las páginas listado
(gnula está ISP-bloqueado + Cloudflare). Parsea los <article class="item movies">,
filtra los que tienen flag es.png (Español/castellano), dedup vs catálogo + Radarr,
y guarda en SQLite. NO descarga nada (eso es Parte 2: grab_stream.py).

Uso: python gnula_crawl.py [base_path] [max_pages]
  base_path: ej 'peliculas/estrenos' (default), 'release/2025', 'peliculas/accion'...
"""
import json, re, sqlite3, sys, time, urllib.request

# FlareSolverr HTTP endpoint; page fetches are POSTed to it as JSON commands.
FLARE = "http://localhost:8191/v1"
# Base URL of the site whose listing pages are crawled.
SITE = "https://www.gnularetro.cc"
# SQLite file that holds the movie catalog.
DB = "/home/lucas/.config/popelis/gnula_catalog.db"
# (base URL, API key) used to query the local Radarr instance.
# NOTE(review): the API key is stored in plain text in the source — consider
# loading it from the environment or a config file instead.
RADARR = ("http://localhost:7878", "63fb51c8c95746e2a327740baac02f5e")

def flare_get(url, timeout=150, retries=2):
    """Fetch ``url`` through FlareSolverr, retrying on any exception.

    Args:
        url: Page to fetch.
        timeout: Timeout forwarded to ``_flare_get``.
        retries: Number of extra attempts after the first one fails.

    Returns:
        Whatever ``_flare_get`` returns for the first attempt that succeeds.

    Raises:
        Exception: the error of the final attempt, once all retries are used.
    """
    final_attempt = retries
    for attempt in range(retries + 1):
        try:
            result = _flare_get(url, timeout)
        except Exception:
            # Out of attempts: let the caller see the last failure.
            if attempt == final_attempt:
                raise
            # Fixed pause before trying again.
            time.sleep(5)
        else:
            return result
def _flare_get(url, timeout=150):
    """Fetch ``url`` once through FlareSolverr and return ``(status, html)``.

    Args:
        url: Page to fetch.
        timeout: Socket timeout, in seconds, for the HTTP call to FlareSolverr.
            FlareSolverr itself is given a ``maxTimeout`` of 60000 for the
            page load.

    Returns:
        Tuple ``(status, response)`` taken from the reply's ``solution``
        object; ``(None, "")`` when the reply carries no solution.

    Raises:
        OSError: (including ``urllib.error.URLError``) when FlareSolverr
            cannot be reached or answers with an HTTP error.
        ValueError: when the reply is not valid JSON.
    """
    payload = json.dumps({"cmd": "request.get", "url": url, "maxTimeout": 60000}).encode()
    req = urllib.request.Request(FLARE, data=payload, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving the socket
    # open until garbage collection.
    with urllib.request.urlopen(req, timeout=timeout) as http_resp:
        reply = json.load(http_resp)
    # Tolerate a reply whose "solution" is missing or null.
    sol = reply.get("solution") or {}
    # Always hand back a string body so callers can parse it unconditionally.
    return sol.get("status"), sol.get("response") or ""

# Patterns used by parse_cards, compiled once at import time.
_CARD_RE = re.compile(r'<article[^>]*class="[^"]*item movies[^"]*".*?</article>', re.S)
_CARD_HREF_RE = re.compile(r'href="([^"]*ver-pelicula[^"]*)"')
_CARD_H3_TITLE_RE = re.compile(r'<h3[^>]*>.*?<a[^>]*>([^<]+)', re.S)
_CARD_ATTR_TITLE_RE = re.compile(r'title="([^"]+)"')
_CARD_FLAG_RE = re.compile(r'/flags/(\w+\.png)')
_CARD_YEAR_SPAN_RE = re.compile(r'<span>(\d{4})</span>')
_CARD_YEAR_ANY_RE = re.compile(r'\b(20\d{2})\b')


def parse_cards(html):
    """Extract movie cards from a listing page.

    Every ``<article class="... item movies ...">`` block that has both a
    ``ver-pelicula`` link and a title becomes one dict; blocks missing either
    are skipped.

    Args:
        html: Listing page markup.

    Returns:
        List of dicts with keys ``href``, ``title`` (HTML entities decoded,
        surrounding whitespace stripped), ``year`` (int or ``None``),
        ``flags`` (sorted unique flag image names) and ``lang_es`` (1 when
        ``es.png`` is among the flags, else 0).
    """
    # Function-scope import: the ``html`` parameter shadows the module name.
    from html import unescape

    cards = []
    for blk in _CARD_RE.findall(html):
        href = _CARD_HREF_RE.search(blk)
        # Prefer the link text inside <h3>; fall back to any title="" attribute.
        title = _CARD_H3_TITLE_RE.search(blk) or _CARD_ATTR_TITLE_RE.search(blk)
        if not (href and title):
            continue
        flags = sorted(set(_CARD_FLAG_RE.findall(blk)))
        # Prefer an explicit <span>YYYY</span>; otherwise take the first 20xx token.
        yr = _CARD_YEAR_SPAN_RE.search(blk) or _CARD_YEAR_ANY_RE.search(blk)
        cards.append({
            "href": href.group(1),
            # Decode entities such as &amp; so the stored title is the real
            # text and can be compared against titles from other sources.
            "title": unescape(title.group(1)).strip(),
            "year": int(yr.group(1)) if yr else None,
            "flags": flags,
            "lang_es": 1 if "es.png" in flags else 0,
        })
    return cards

def db_init():
    """Open the catalog database, creating its directory and table if needed.

    Returns:
        An open ``sqlite3.Connection`` to ``DB`` with the ``movies`` table
        present.
    """
    import os

    # Derive the directory from DB so the two cannot drift apart when the
    # database path is changed.
    parent = os.path.dirname(DB)
    if parent:  # a bare file name has no directory to create
        os.makedirs(parent, exist_ok=True)
    conn = sqlite3.connect(DB)
    conn.execute("""CREATE TABLE IF NOT EXISTS movies(
        href TEXT PRIMARY KEY, title TEXT, year INTEGER, flags TEXT,
        lang_es INTEGER, status TEXT DEFAULT 'pending', in_library INTEGER DEFAULT 0,
        detected_at TEXT, downloaded_at TEXT)""")
    conn.commit()
    return conn

def radarr_titles():
    """Return the lower-cased titles of all movies listed by Radarr.

    This is best-effort: on any failure an empty set is returned so the crawl
    can still run. A warning is written to stderr in that case, because an
    empty set makes every crawled movie look absent from the library.

    Returns:
        Set of lower-cased title strings; empty when Radarr could not be read.
    """
    try:
        req = urllib.request.Request(f"{RADARR[0]}/api/v3/movie", headers={"X-Api-Key": RADARR[1]})
        # Close the HTTP response deterministically once the JSON is read.
        with urllib.request.urlopen(req, timeout=20) as resp:
            movies = json.load(resp)
        return {m["title"].lower() for m in movies}
    except Exception as exc:
        print(f"[crawl] warning: could not read Radarr library ({exc!r}); treating it as empty",
              file=sys.stderr)
        return set()

def _crawl_and_report(c, base, max_pages):
    """Crawl up to ``max_pages`` listing pages into ``c`` and print a JSON summary."""
    have = radarr_titles()
    now = time.strftime("%Y-%m-%dT%H:%M:%S")
    total_es = 0
    for p in range(1, max_pages + 1):
        # The first page lives at the base path; later ones under /page/N/.
        url = f"{SITE}/{base}/" if p == 1 else f"{SITE}/{base}/page/{p}/"
        st, html = flare_get(url)
        if st != 200:
            print(f"[crawl] page {p}: HTTP {st} -> stop")
            break
        cards = parse_cards(html)
        if not cards:
            print(f"[crawl] page {p}: 0 cards -> stop")
            break
        es = [x for x in cards if x["lang_es"]]
        for x in es:
            title_lc = x["title"].lower()
            # Loose match: either title contains the other.
            inlib = 1 if any(t in title_lc or title_lc in t for t in have) else 0
            # Existing rows keep their title/status; only flags and
            # in_library are refreshed.
            c.execute("""INSERT INTO movies(href,title,year,flags,lang_es,status,in_library,detected_at)
                VALUES(?,?,?,?,?,?,?,?)
                ON CONFLICT(href) DO UPDATE SET flags=excluded.flags, in_library=excluded.in_library""",
                (x["href"], x["title"], x["year"], ",".join(x["flags"]), 1,
                 "have" if inlib else "pending", inlib, now))
        # Commit once per page so progress survives a later failure.
        c.commit()
        total_es += len(es)
        print(f"[crawl] page {p}: {len(cards)} pelis, {len(es)} en español")
        time.sleep(1)
    # Summary over the whole catalog, not just this run.
    cur = c.execute("SELECT COUNT(*),SUM(in_library),SUM(status='pending') FROM movies WHERE lang_es=1")
    tot, in_library, pend = cur.fetchone()
    # SUM() yields NULL on an empty table, hence the "or 0" fallbacks.
    print(json.dumps({"crawled_pages_upto": max_pages, "es_seen": total_es,
                      "catalog_total_es": tot, "in_library": in_library or 0,
                      "pending_download": pend or 0, "db": DB}))


def main():
    """Crawl listing pages and upsert the Spanish-flagged movies into the catalog.

    Command-line arguments (both optional): ``base_path`` (default
    ``peliculas/estrenos``) and ``max_pages`` (default 8). Crawling stops
    early at the first page that is not HTTP 200 or that yields no cards.
    A one-line JSON summary is printed at the end.

    Raises:
        ValueError: if ``max_pages`` is not an integer.
    """
    base = sys.argv[1] if len(sys.argv) > 1 else "peliculas/estrenos"
    max_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 8
    c = db_init()
    try:
        _crawl_and_report(c, base, max_pages)
    finally:
        # Always release the SQLite connection, even when a fetch fails.
        c.close()

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()