Files
gnula_grabber/crawl.py
T

101 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""PARTE 1 — Crawler gnula: detecta pelis en ESPAÑOL (bandera es.png) y las cataloga.
Usa FlareSolverr (Chrome NordVPN+CF, headless) para fetchear las páginas listado
(gnula está ISP-bloqueado + Cloudflare). Parsea los <article class="item movies">,
filtra los que tienen flag es.png (Español/castellano), dedup vs catálogo + Radarr,
y guarda en SQLite. NO descarga nada (eso es Parte 2: grab_stream.py).
Uso: python crawl.py [base_path] [max_pages]
base_path: ej 'peliculas/estrenos' (default), 'release/2025', 'peliculas/accion'...
"""
import json, re, sqlite3, sys, time, urllib.request
# FlareSolverr endpoint; _flare_get() POSTs its "request.get" commands here.
FLARE = "http://localhost:8191/v1"
# Site base URL; main() appends the listing path (and /page/N/) to it.
SITE = "https://www.gnularetro.cc"
# SQLite catalog file opened by db_init().
DB = "/home/lucas/.config/popelis/gnula_catalog.db"
# (base URL, API key) of the Radarr instance queried by radarr_titles().
# NOTE(review): the API key is committed in source — consider loading it
# from the environment or a config file instead.
RADARR = ("http://localhost:7878", "63fb51c8c95746e2a327740baac02f5e")
def flare_get(url, timeout=150, retries=2):
    """Fetch ``url`` through FlareSolverr, retrying on any exception.

    Makes up to ``retries + 1`` attempts via ``_flare_get``, sleeping five
    seconds between failed attempts. The exception raised by the final
    attempt is propagated to the caller.

    Args:
        url: Page URL to fetch.
        timeout: Timeout value forwarded to ``_flare_get``.
        retries: Number of additional attempts after the first failure.

    Returns:
        Whatever ``_flare_get`` returns on the first successful attempt.
    """
    # Count down so that reaching 0 marks the last permitted attempt.
    for attempts_left in range(retries, -1, -1):
        try:
            return _flare_get(url, timeout)
        except Exception:
            if attempts_left == 0:
                raise
            time.sleep(5)
def _flare_get(url, timeout=150):
    """Perform a single FlareSolverr ``request.get`` for ``url``.

    Args:
        url: Page URL that FlareSolverr should load.
        timeout: Timeout in seconds for the HTTP call to FlareSolverr
            itself (the ``maxTimeout`` sent in the command is fixed at
            60000).

    Returns:
        A ``(status, response)`` tuple taken from the ``solution`` object of
        the FlareSolverr reply; ``status`` is ``None`` and ``response`` is
        ``""`` when those keys are absent.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``json.load``.
    """
    body = json.dumps({"cmd": "request.get", "url": url, "maxTimeout": 60000}).encode()
    req = urllib.request.Request(FLARE, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaking the socket
    # until garbage collection.
    with urllib.request.urlopen(req, timeout=timeout) as reply:
        resp = json.load(reply)
    sol = resp.get("solution", {})
    return sol.get("status"), sol.get("response", "")
def parse_cards(html):
    """Extract movie cards from the HTML of a listing page.

    Every ``<article>`` whose class attribute contains ``item movies`` is
    inspected. Cards lacking either a ``ver-pelicula`` link or a title are
    skipped.

    Args:
        html: Full HTML text of the listing page.

    Returns:
        A list of dicts, one per card, with the keys:
        ``href`` (link exactly as it appears in the markup),
        ``title`` (stripped, with HTML entities decoded),
        ``year`` (int or ``None``),
        ``flags`` (sorted unique flag image file names) and
        ``lang_es`` (1 if ``es.png`` is among the flags, else 0).
    """
    # Function-scope import: the ``html`` parameter shadows the stdlib module
    # name, so only the needed function is brought in.
    from html import unescape

    out = []
    for blk in re.findall(r'<article[^>]*class="[^"]*item movies[^"]*".*?</article>', html, re.S):
        href = re.search(r'href="([^"]*ver-pelicula[^"]*)"', blk)
        title = (re.search(r'<h3[^>]*>.*?<a[^>]*>([^<]+)', blk, re.S)
                 or re.search(r'title="([^"]+)"', blk))
        if not (href and title):
            continue
        flags = sorted(set(re.findall(r'/flags/(\w+\.png)', blk)))
        # Prefer the explicit <span>YYYY</span>; otherwise fall back to any
        # standalone 20xx number found inside the card.
        yr = re.search(r'<span>(\d{4})</span>', blk) or re.search(r'\b(20\d{2})\b', blk)
        out.append({
            # href is left exactly as found because it is the catalog key.
            "href": href.group(1),
            # Decode entities (&amp;, &#8217;, ...) so stored titles are
            # readable and comparable with titles from other sources.
            "title": unescape(title.group(1)).strip(),
            "year": int(yr.group(1)) if yr else None,
            "flags": flags,
            "lang_es": 1 if "es.png" in flags else 0,
        })
    return out
def db_init():
    """Open the catalog database, creating its directory and table if needed.

    Returns:
        An open ``sqlite3.Connection`` to ``DB``. The ``movies`` table is
        created when it does not exist yet; closing the connection is the
        caller's responsibility.
    """
    import os

    # Derive the directory from DB itself so the two paths cannot drift apart
    # if the constant is ever changed.
    parent = os.path.dirname(DB)
    if parent:
        os.makedirs(parent, exist_ok=True)
    c = sqlite3.connect(DB)
    c.execute("""CREATE TABLE IF NOT EXISTS movies(
        href TEXT PRIMARY KEY, title TEXT, year INTEGER, flags TEXT,
        lang_es INTEGER, status TEXT DEFAULT 'pending', in_library INTEGER DEFAULT 0,
        detected_at TEXT, downloaded_at TEXT)""")
    c.commit()
    return c
def radarr_titles():
    """Return the lower-cased titles of the movies reported by Radarr.

    The lookup is best-effort: if the request, the JSON decoding or the
    title extraction fails, a warning is written to stderr and an empty set
    is returned so the caller can carry on without library information.

    Returns:
        A set of lower-cased title strings (possibly empty).
    """
    try:
        req = urllib.request.Request(f"{RADARR[0]}/api/v3/movie", headers={"X-Api-Key": RADARR[1]})
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(req, timeout=20) as reply:
            movies = json.load(reply)
        return {m["title"].lower() for m in movies}
    except Exception as exc:
        # Keep the deliberate fallback, but make the failure visible; stderr
        # keeps stdout limited to the regular crawl output.
        print(f"[crawl] radarr lookup failed ({exc!r}); library treated as empty",
              file=sys.stderr)
        return set()
def main():
    """Crawl listing pages and upsert the Spanish-flagged movies into the catalog.

    Command-line arguments (both optional):
        argv[1]: listing path appended to ``SITE``
            (default ``peliculas/estrenos``).
        argv[2]: maximum number of listing pages to fetch (default 8).

    Crawling stops at the first page whose status is not 200 or that yields
    no cards. One progress line is printed per page, followed by a JSON
    summary computed from the catalog.
    """
    base = sys.argv[1] if len(sys.argv) > 1 else "peliculas/estrenos"
    max_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 8
    c = db_init()
    try:
        have = radarr_titles()
        now = time.strftime("%Y-%m-%dT%H:%M:%S")
        total_es = 0
        for p in range(1, max_pages + 1):
            url = f"{SITE}/{base}/" if p == 1 else f"{SITE}/{base}/page/{p}/"
            st, html = flare_get(url)
            if st != 200:
                print(f"[crawl] page {p}: HTTP {st} -> stop")
                break
            cards = parse_cards(html)
            if not cards:
                print(f"[crawl] page {p}: 0 cards -> stop")
                break
            es = [x for x in cards if x["lang_es"]]
            for x in es:
                # Lower-case once per card rather than once per Radarr title.
                title_lc = x["title"].lower()
                # Loose match: either title containing the other counts as
                # "already in the library".
                inlib = 1 if any(t in title_lc or title_lc in t for t in have) else 0
                # NOTE(review): on conflict only flags and in_library are
                # refreshed; status keeps its stored value — confirm intended.
                c.execute("""INSERT INTO movies(href,title,year,flags,lang_es,status,in_library,detected_at)
                    VALUES(?,?,?,?,?,?,?,?)
                    ON CONFLICT(href) DO UPDATE SET flags=excluded.flags, in_library=excluded.in_library""",
                          (x["href"], x["title"], x["year"], ",".join(x["flags"]), 1,
                           "have" if inlib else "pending", inlib, now))
            # Commit after each page so earlier pages survive a later failure.
            c.commit()
            total_es += len(es)
            print(f"[crawl] page {p}: {len(cards)} pelis, {len(es)} en español")
            # Pause one second before requesting the next page.
            time.sleep(1)
        # Summary over the whole catalog, not only this run.
        cur = c.execute("SELECT COUNT(*),SUM(in_library),SUM(status='pending') FROM movies WHERE lang_es=1")
        tot, inlib, pend = cur.fetchone()
        print(json.dumps({"crawled_pages_upto": max_pages, "es_seen": total_es,
                          "catalog_total_es": tot, "in_library": inlib or 0,
                          "pending_download": pend or 0, "db": DB}))
    finally:
        # Release the SQLite connection even when a fetch or insert raises.
        c.close()
if __name__ == "__main__":
main()