#!/usr/bin/env python3
"""PART 1 - gnula crawler: detect movies in SPANISH (es.png flag) and catalog them.

Uses FlareSolverr (Chrome NordVPN+CF, headless) to fetch the listing pages
(gnula is ISP-blocked + behind Cloudflare). Parses the movie cards, keeps the
ones that carry the es.png flag (Spanish/Castilian), dedups against the
catalog + Radarr, and stores them in SQLite.
Downloads NOTHING (that is Part 2: grab_stream.py).

Usage: python gnula_crawl.py [base_path] [max_pages]
  base_path: e.g. 'peliculas/estrenos' (default), 'release/2025',
             'peliculas/accion'...
"""
import json
import os
import re
import sqlite3
import sys
import time
import urllib.request
from html import unescape as html_unescape

FLARE = "http://localhost:8191/v1"
SITE = "https://www.gnularetro.cc"
DB = "/home/lucas/.config/popelis/gnula_catalog.db"
# NOTE(review): the Radarr API key is hard-coded in source; consider loading
# it from the environment or a config file that is not committed.
RADARR = ("http://localhost:7878", "63fb51c8c95746e2a327740baac02f5e")

# NOTE(review): the copy of this file that was reviewed had every "<...>"
# fragment stripped out of its string literals. The tag names below
# (<article>, <h3>, <a>, <span>) are reconstructed from what survived and
# must be confirmed against the live site markup.
_CARD_RE = re.compile(
    r'<article[^>]*class="[^"]*item movies[^"]*".*?</article>', re.S
)
_HREF_RE = re.compile(r'href="([^"]*ver-pelicula[^"]*)"')
_TITLE_RE = re.compile(r'<h3[^>]*>.*?<a[^>]*>([^<]+)', re.S)
_TITLE_ATTR_RE = re.compile(r'title="([^"]+)"')
_FLAG_RE = re.compile(r'/flags/(\w+\.png)')
_YEAR_TAG_RE = re.compile(r'<span>(\d{4})</span>')
_YEAR_ANY_RE = re.compile(r'\b(20\d{2})\b')

_CREATE_SQL = """CREATE TABLE IF NOT EXISTS movies(
    href TEXT PRIMARY KEY, title TEXT, year INTEGER, flags TEXT,
    lang_es INTEGER, status TEXT DEFAULT 'pending',
    in_library INTEGER DEFAULT 0, detected_at TEXT, downloaded_at TEXT)"""

# On conflict only flags and in_library are refreshed; title, year, status and
# detected_at keep the values from the first insert.
# NOTE(review): a row first stored as 'pending' therefore stays 'pending' even
# if in_library later becomes 1 - confirm that is what Part 2 expects.
_UPSERT_SQL = """INSERT INTO movies(href,title,year,flags,lang_es,status,in_library,detected_at)
    VALUES(?,?,?,?,?,?,?,?)
    ON CONFLICT(href) DO UPDATE SET flags=excluded.flags, in_library=excluded.in_library"""


def flare_get(url: str, timeout: int = 150, retries: int = 2) -> tuple:
    """Fetch *url* through FlareSolverr, retrying on any failure.

    Args:
        url: page to fetch.
        timeout: socket timeout (seconds) for the call to FlareSolverr.
        retries: number of extra attempts after the first one fails.

    Returns:
        The ``(status, html)`` tuple produced by ``_flare_get``.

    Raises:
        Whatever the last attempt raised, once all attempts are used up.
    """
    for attempt in range(retries + 1):
        try:
            return _flare_get(url, timeout)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(5)  # brief pause before the next attempt


def _flare_get(url: str, timeout: int = 150) -> tuple:
    """Issue a single ``request.get`` command to FlareSolverr.

    Returns:
        ``(status, html)`` taken from the reply's ``solution`` object. When
        the reply carries no solution, status is ``None`` and html is ``""``.
    """
    payload = json.dumps(
        {"cmd": "request.get", "url": url, "maxTimeout": 60000}
    ).encode()
    req = urllib.request.Request(
        FLARE, data=payload, headers={"Content-Type": "application/json"}
    )
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        reply = json.load(resp)
    # "solution" may be absent or null; treat both the same way.
    solution = reply.get("solution") or {}
    return solution.get("status"), solution.get("response") or ""


def parse_cards(html: str) -> list:
    """Extract movie cards from a listing page.

    Args:
        html: raw HTML of a listing page.

    Returns:
        One dict per card that has both a ``ver-pelicula`` link and a title,
        with keys ``href``, ``title``, ``year`` (int or None), ``flags``
        (sorted unique flag file names) and ``lang_es`` (1 if ``es.png`` is
        among the flags, else 0).
    """
    cards = []
    for block in _CARD_RE.findall(html):
        href = _HREF_RE.search(block)
        # Prefer the heading link text; fall back to any title="..." attribute.
        title = _TITLE_RE.search(block) or _TITLE_ATTR_RE.search(block)
        if not (href and title):
            continue
        flags = sorted(set(_FLAG_RE.findall(block)))
        # Prefer the dedicated year tag; otherwise take any 20xx in the card.
        year = _YEAR_TAG_RE.search(block) or _YEAR_ANY_RE.search(block)
        cards.append({
            "href": href.group(1),
            # Decode entities (&amp;, &#8217;, ...) so titles compare cleanly
            # against Radarr's plain-text titles.
            "title": html_unescape(title.group(1)).strip(),
            "year": int(year.group(1)) if year else None,
            "flags": flags,
            "lang_es": 1 if "es.png" in flags else 0,
        })
    return cards


def db_init() -> sqlite3.Connection:
    """Open the catalog database, creating its directory and table if needed.

    Returns:
        An open ``sqlite3`` connection; the caller is responsible for closing it.
    """
    os.makedirs(os.path.dirname(DB), exist_ok=True)
    conn = sqlite3.connect(DB)
    conn.execute(_CREATE_SQL)
    conn.commit()
    return conn


def radarr_titles() -> set:
    """Return the lower-cased titles of every movie known to Radarr.

    Best effort: if Radarr cannot be queried the problem is reported on
    stderr and an empty set is returned, so the crawl still runs (without
    library dedup).
    """
    try:
        req = urllib.request.Request(
            f"{RADARR[0]}/api/v3/movie", headers={"X-Api-Key": RADARR[1]}
        )
        with urllib.request.urlopen(req, timeout=20) as resp:
            movies = json.load(resp)
        return {m["title"].lower() for m in movies}
    except Exception as exc:
        # stderr keeps stdout clean for the JSON summary printed by main().
        print(f"[crawl] radarr unavailable ({exc!r}) -> no library dedup",
              file=sys.stderr)
        return set()


def _in_library(title: str, library_titles: set) -> bool:
    """Tell whether *title* loosely matches a library title.

    A match is a substring relation in either direction, case-insensitive on
    *title* (library titles are expected to be lower-cased already). Empty
    strings never match: ``"" in s`` is always true and would otherwise flag
    every movie as already owned.
    """
    needle = title.lower()
    if not needle:
        return False
    return any(t and (t in needle or needle in t) for t in library_titles)


def main() -> None:
    """Crawl the listing pages and upsert Spanish-language movies into SQLite.

    Reads ``base_path`` and ``max_pages`` from ``sys.argv``, stops at the
    first page that is not HTTP 200 or yields no cards, and finally prints a
    one-line JSON summary to stdout.
    """
    base = sys.argv[1] if len(sys.argv) > 1 else "peliculas/estrenos"
    max_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 8
    conn = db_init()
    try:
        have = radarr_titles()
        now = time.strftime("%Y-%m-%dT%H:%M:%S")
        total_es = 0
        for p in range(1, max_pages + 1):
            # Page 1 lives at the base path; later pages under /page/N/.
            url = f"{SITE}/{base}/" if p == 1 else f"{SITE}/{base}/page/{p}/"
            st, page_html = flare_get(url)
            if st != 200:
                print(f"[crawl] page {p}: HTTP {st} -> stop")
                break
            cards = parse_cards(page_html)
            if not cards:
                print(f"[crawl] page {p}: 0 cards -> stop")
                break
            es = [x for x in cards if x["lang_es"]]
            for x in es:
                owned = 1 if _in_library(x["title"], have) else 0
                conn.execute(_UPSERT_SQL, (
                    x["href"], x["title"], x["year"], ",".join(x["flags"]),
                    1, "have" if owned else "pending", owned, now,
                ))
            conn.commit()  # one transaction per page
            total_es += len(es)
            print(f"[crawl] page {p}: {len(cards)} pelis, {len(es)} en español")
            time.sleep(1)  # pause between listing pages

        # summary
        cur = conn.execute(
            "SELECT COUNT(*),SUM(in_library),SUM(status='pending') "
            "FROM movies WHERE lang_es=1"
        )
        tot, in_lib, pend = cur.fetchone()
        # SUM() yields NULL on an empty table, hence the "or 0".
        print(json.dumps({
            "crawled_pages_upto": max_pages,
            "es_seen": total_es,
            "catalog_total_es": tot,
            "in_library": in_lib or 0,
            "pending_download": pend or 0,
            "db": DB,
        }))
    finally:
        conn.close()


if __name__ == "__main__":
    main()