feat: gnula_grabber 2-part pipeline (crawl ES + CDP stream download)
This commit is contained in:
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""PARTE 1 — Crawler gnula: detecta pelis en ESPAÑOL (bandera es.png) y las cataloga.
|
||||
|
||||
Usa FlareSolverr (Chrome NordVPN+CF, headless) para fetchear las páginas listado
|
||||
(gnula está ISP-bloqueado + Cloudflare). Parsea los <article class="item movies">,
|
||||
filtra los que tienen flag es.png (Español/castellano), dedup vs catálogo + Radarr,
|
||||
y guarda en SQLite. NO descarga nada (eso es Parte 2: grab_stream.py).
|
||||
|
||||
Uso: python gnula_crawl.py [base_path] [max_pages]
|
||||
base_path: ej 'peliculas/estrenos' (default), 'release/2025', 'peliculas/accion'...
|
||||
"""
|
||||
import json, re, sqlite3, sys, time, urllib.request
|
||||
|
||||
import os

# Service endpoints and paths. Each value can be overridden through the
# environment; the defaults are the values that were previously hard-coded,
# so running with no variables set behaves exactly as before.
FLARE = os.environ.get("GNULA_FLARE_URL", "http://localhost:8191/v1")
SITE = os.environ.get("GNULA_SITE", "https://www.gnularetro.cc")
DB = os.environ.get("GNULA_DB", "/home/lucas/.config/popelis/gnula_catalog.db")
# (base URL, API key) for Radarr.
# NOTE(review): the default API key is committed to source control; rotate it
# and supply the real one via RADARR_API_KEY instead.
RADARR = (
    os.environ.get("RADARR_URL", "http://localhost:7878"),
    os.environ.get("RADARR_API_KEY", "63fb51c8c95746e2a327740baac02f5e"),
)
|
||||
|
||||
def flare_get(url, timeout=150, retries=2):
    """Fetch *url* via ``_flare_get``, retrying when an attempt raises.

    Args:
        url: Page URL to fetch.
        timeout: Timeout in seconds forwarded to ``_flare_get``.
        retries: Extra attempts made after the first one fails. Negative
            values are treated as 0 so at least one attempt always happens
            (previously a negative value silently returned ``None``).

    Returns:
        Whatever ``_flare_get`` returns for the first successful attempt.

    Raises:
        Exception: The error of the last attempt once all attempts failed.
    """
    retries = max(retries, 0)
    for attempt in range(retries + 1):
        try:
            return _flare_get(url, timeout)
        except Exception:
            if attempt == retries:
                raise
            # Short pause before the next attempt.
            time.sleep(5)
|
||||
def _flare_get(url, timeout=150):
    """POST a single ``request.get`` command to the FlareSolverr endpoint.

    Args:
        url: Page URL the solver should load.
        timeout: Socket timeout in seconds for the HTTP call to ``FLARE``.

    Returns:
        A ``(status, html)`` tuple taken from the ``solution`` object of the
        JSON reply. ``status`` is ``None`` and ``html`` is ``""`` when the
        reply carries no usable solution.

    Raises:
        urllib.error.URLError: If the endpoint cannot be reached.
        ValueError: If the reply body is not valid JSON.
    """
    payload = json.dumps({"cmd": "request.get", "url": url, "maxTimeout": 60000}).encode()
    req = urllib.request.Request(FLARE, data=payload, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req, timeout=timeout) as http_resp:
        reply = json.load(http_resp)
    # Tolerate a missing or null "solution"/"response" in the reply.
    solution = reply.get("solution") or {}
    return solution.get("status"), solution.get("response") or ""
|
||||
|
||||
def parse_cards(html):
    """Extract movie cards from the HTML of a listing page.

    Every ``<article>`` whose class contains ``item movies`` is inspected.
    Cards lacking a ``ver-pelicula`` link or a title are skipped.

    Args:
        html: Full HTML text of the listing page.

    Returns:
        A list of dicts with the keys ``href`` (str), ``title`` (str, HTML
        entities decoded), ``year`` (int or None), ``flags`` (sorted list of
        unique flag image names such as ``es.png``) and ``lang_es`` (1 when
        ``es.png`` is among the flags, otherwise 0).
    """
    # Imported under an alias because the ``html`` parameter shadows the module name.
    from html import unescape as _unescape

    cards = []
    for blk in re.findall(r'<article[^>]*class="[^"]*item movies[^"]*".*?</article>', html, re.S):
        href = re.search(r'href="([^"]*ver-pelicula[^"]*)"', blk)
        # Prefer the link text inside <h3>; fall back to any title="" attribute.
        title = (re.search(r'<h3[^>]*>.*?<a[^>]*>([^<]+)', blk, re.S)
                 or re.search(r'title="([^"]+)"', blk))
        if not (href and title):
            continue
        flags = re.findall(r'/flags/(\w+\.png)', blk)
        # Prefer an explicit <span>YYYY</span>; otherwise any standalone 20xx number.
        yr = re.search(r'<span>(\d{4})</span>', blk) or re.search(r'\b(20\d{2})\b', blk)
        cards.append({
            "href": href.group(1),
            # Decode entities (&amp;, &#8217;, ...) so stored titles are real text.
            "title": _unescape(title.group(1)).strip(),
            "year": int(yr.group(1)) if yr else None,
            "flags": sorted(set(flags)),
            "lang_es": 1 if "es.png" in flags else 0,
        })
    return cards
|
||||
|
||||
def db_init():
    """Open the catalog database, creating its directory and table if needed.

    Returns:
        An open ``sqlite3.Connection`` to the file at ``DB`` in which the
        ``movies`` table is guaranteed to exist.
    """
    import os

    # Derive the directory from DB so the two can never drift apart.
    db_dir = os.path.dirname(DB)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
    conn = sqlite3.connect(DB)
    conn.execute("""CREATE TABLE IF NOT EXISTS movies(
        href TEXT PRIMARY KEY, title TEXT, year INTEGER, flags TEXT,
        lang_es INTEGER, status TEXT DEFAULT 'pending', in_library INTEGER DEFAULT 0,
        detected_at TEXT, downloaded_at TEXT)""")
    conn.commit()
    return conn
|
||||
|
||||
def radarr_titles():
    """Return the lowercase titles of all movies known to Radarr.

    The lookup is best-effort: on any failure a warning is written to stderr
    and an empty set is returned, so the crawl proceeds without library
    de-duplication.

    Returns:
        A set of lowercase title strings; entries without a title are skipped.
    """
    try:
        req = urllib.request.Request(f"{RADARR[0]}/api/v3/movie", headers={"X-Api-Key": RADARR[1]})
        # Close the HTTP response deterministically instead of leaking the socket.
        with urllib.request.urlopen(req, timeout=20) as resp:
            movies = json.load(resp)
        # Skip title-less entries so one bad record does not discard the whole
        # set and no empty string ends up in it.
        return {m["title"].lower() for m in movies if m.get("title")}
    except Exception as exc:
        # Deliberately broad: Radarr being unavailable must not abort the crawl.
        print(f"[crawl] radarr lookup failed ({exc!r}); continuing without library dedup",
              file=sys.stderr)
        return set()
|
||||
|
||||
def main():
    """Crawl listing pages and upsert the Spanish-language movies into SQLite.

    Command line: ``[base_path] [max_pages]``; defaults are
    ``peliculas/estrenos`` and ``8``. Crawling stops at the first page that
    does not return HTTP 200 or yields no cards. A one-line JSON summary is
    printed to stdout at the end.

    Raises:
        ValueError: If ``max_pages`` is not an integer.
        Exception: Whatever ``flare_get`` raises once its retries are exhausted.
    """
    base = sys.argv[1] if len(sys.argv) > 1 else "peliculas/estrenos"
    max_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 8
    conn = db_init()
    try:
        have = radarr_titles()
        now = time.strftime("%Y-%m-%dT%H:%M:%S")
        total_es = 0
        for page in range(1, max_pages + 1):
            url = f"{SITE}/{base}/" if page == 1 else f"{SITE}/{base}/page/{page}/"
            status, html = flare_get(url)
            if status != 200:
                print(f"[crawl] page {page}: HTTP {status} -> stop")
                break
            cards = parse_cards(html)
            if not cards:
                print(f"[crawl] page {page}: 0 cards -> stop")
                break
            spanish = [card for card in cards if card["lang_es"]]
            for card in spanish:
                title_lc = card["title"].lower()
                # Substring match in either direction; empty strings are
                # excluded because "" is a substring of everything.
                # NOTE(review): very short library titles (e.g. "it", "up")
                # will still match many unrelated movies — consider exact or
                # title+year matching.
                in_lib = 1 if title_lc and any(
                    t and (t in title_lc or title_lc in t) for t in have
                ) else 0
                # On re-detection also promote a still-pending row to 'have'
                # when the movie is now in the library; any other status is
                # left untouched.
                conn.execute(
                    """INSERT INTO movies(href,title,year,flags,lang_es,status,in_library,detected_at)
                       VALUES(?,?,?,?,?,?,?,?)
                       ON CONFLICT(href) DO UPDATE SET
                           flags=excluded.flags,
                           in_library=excluded.in_library,
                           status=CASE WHEN excluded.in_library=1 AND movies.status='pending'
                                       THEN 'have' ELSE movies.status END""",
                    (card["href"], card["title"], card["year"], ",".join(card["flags"]), 1,
                     "have" if in_lib else "pending", in_lib, now),
                )
            conn.commit()
            total_es += len(spanish)
            print(f"[crawl] page {page}: {len(cards)} pelis, {len(spanish)} en español")
            # Brief pause between listing pages.
            time.sleep(1)
        # Summary over the whole catalog, not just this run.
        cur = conn.execute(
            "SELECT COUNT(*),SUM(in_library),SUM(status='pending') FROM movies WHERE lang_es=1"
        )
        tot, in_library, pending = cur.fetchone()
        # SUM() yields NULL on an empty table, hence the `or 0` fallbacks.
        print(json.dumps({"crawled_pages_upto": max_pages, "es_seen": total_es,
                          "catalog_total_es": tot, "in_library": in_library or 0,
                          "pending_download": pending or 0, "db": DB}))
    finally:
        conn.close()
|
||||
|
||||
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user