feat: gnula_grabber 2-part pipeline (crawl ES + CDP stream download)

This commit is contained in:
agent
2026-05-30 13:54:57 +02:00
commit 79c2c8a804
6 changed files with 497 additions and 0 deletions
+119
View File
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""PARTE 2 — Downloader: consume el catálogo (gnula_crawl.py) y descarga las pelis
español pendientes via la sesión Chrome+CDP+NordVPN.
Por cada peli pending: navega a su página, clica el server Español (luluvid/luluvdo),
quita el overlay de ad + play (jwplayer), espera master.m3u8, y delega a grab_stream
(streaming + descifrado AES local) → manual/movies. Marca downloaded en el catálogo.
El captcha (si sale) lo resuelve el humano: el script espera a que aparezca master.
Uso: python gnula_download.py [limit] (default 1 peli; usar N para varias)
Requiere: Chrome con --remote-debugging-port=9222 (NordVPN proxy) abierto.
"""
import json, re, sqlite3, subprocess, sys, time, urllib.request
import websocket
# Local Chrome remote-debugging port; tabs() queries http://127.0.0.1:PORT/json/list.
PORT = 9222
# SQLite catalog whose `movies` table main() reads and updates.
DB = "/home/lucas/.config/popelis/gnula_catalog.db"
# Script that download_one() launches as a subprocess to grab the stream.
GRAB = "/home/lucas/fn_registry/apps/gnula_grabber/grab_stream.py"
def tabs():
    """Return the list of DevTools targets exposed by the local Chrome.

    Fetches ``/json/list`` from the remote-debugging endpoint on ``PORT`` and
    decodes the JSON body.

    Returns:
        The decoded JSON document (callers iterate it as a list of dicts).

    Raises:
        urllib.error.URLError: If the endpoint is unreachable or times out (8 s).
    """
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(f"http://127.0.0.1:{PORT}/json/list", timeout=8) as resp:
        return json.load(resp)
def conn(t):
    """Open a websocket to the DevTools target ``t`` and return it.

    Args:
        t: Target dict from the tab list; its ``webSocketDebuggerUrl`` entry is used.

    Returns:
        The connected websocket, with its socket timeout lowered to 3 s after
        the 20 s connect timeout so later ``recv`` calls poll in short slices.
    """
    # NOTE(review): the explicit Origin header is presumably needed to pass
    # Chrome's remote-debugging origin check — confirm.
    ws = websocket.create_connection(
        t["webSocketDebuggerUrl"],
        timeout=20,
        header=["Origin: http://localhost"],
    )
    ws.settimeout(3)
    return ws
def ev(w, expr, to=15):
    """Evaluate a JavaScript expression over CDP and return its value.

    Sends ``Runtime.evaluate`` (with ``returnByValue``) using message id 1 and
    polls the socket until the reply carrying that id arrives, skipping any
    other frames received in the meantime.

    Args:
        w: Open CDP websocket exposing ``send`` and ``recv``.
        expr: JavaScript source to evaluate.
        to: Overall wait budget in seconds.

    Returns:
        The ``result.result.value`` field of the reply, or ``None`` when the
        reply carries no value, the budget runs out, or the socket fails
        while waiting.
    """
    w.send(json.dumps({
        "id": 1,
        "method": "Runtime.evaluate",
        "params": {"expression": expr, "returnByValue": True},
    }))
    deadline = time.monotonic() + to
    while time.monotonic() < deadline:
        try:
            raw = w.recv()
        except (websocket.WebSocketTimeoutException, TimeoutError):
            # Nothing arrived within the socket timeout: keep polling.
            continue
        except Exception:
            # Closed/broken socket: recv() would fail instantly on every
            # retry, so stop instead of busy-spinning until the deadline.
            return None
        try:
            msg = json.loads(raw)
        except (ValueError, TypeError):
            continue  # skip frames that are not valid JSON
        if isinstance(msg, dict) and msg.get("id") == 1:
            return msg.get("result", {}).get("result", {}).get("value")
    return None
def cmd(w, method, params, to=8):
    """Send an arbitrary CDP command and return its ``result`` object.

    Uses message id 2 and polls the socket until the reply carrying that id
    arrives, skipping any other frames received in the meantime.

    Args:
        w: Open CDP websocket exposing ``send`` and ``recv``.
        method: CDP method name, e.g. ``"Input.dispatchMouseEvent"``.
        params: Parameter dict for the method.
        to: Overall wait budget in seconds.

    Returns:
        The reply's ``result`` dict (``{}`` when the reply has none), or
        ``None`` when the budget runs out or the socket fails while waiting.
    """
    w.send(json.dumps({"id": 2, "method": method, "params": params}))
    deadline = time.monotonic() + to
    while time.monotonic() < deadline:
        try:
            raw = w.recv()
        except (websocket.WebSocketTimeoutException, TimeoutError):
            # Nothing arrived within the socket timeout: keep polling.
            continue
        except Exception:
            # Closed/broken socket: recv() would fail instantly on every
            # retry, so stop instead of busy-spinning until the deadline.
            return None
        try:
            msg = json.loads(raw)
        except (ValueError, TypeError):
            continue  # skip frames that are not valid JSON
        if isinstance(msg, dict) and msg.get("id") == 2:
            return msg.get("result", {})
    return None
def page_target(sub):
    """Return the first target of type ``"page"`` whose URL contains ``sub``.

    Args:
        sub: Substring to look for in each target's ``url``.

    Returns:
        The matching target dict, or ``None`` if no target matches.
    """
    for target in tabs():
        address = target.get("url", "")
        if sub not in address:
            continue
        if target.get("type") == "page":
            return target
    return None
def player_target():
    """Return the first target whose URL mentions a known player host.

    Any target type is accepted; the URL only has to contain ``"luluvdo"``
    or ``"lulustream"``.

    Returns:
        The matching target dict, or ``None`` if no target matches.
    """
    player_hosts = ["luluvdo", "lulustream"]
    for target in tabs():
        address = target.get("url", "")
        for host in player_hosts:
            if host in address:
                return target
    return None
def safe_name(title, year):
    """Build a filesystem-friendly name from a catalog title and year.

    Keeps the part of ``title`` before the first ``|``, drops a parenthesised
    19xx/20xx year already present in it (so the year is not repeated when it
    is appended), appends ``year`` when given, removes every character other
    than word characters, space, ``.`` and ``-``, and truncates to 90 chars.

    Args:
        title: Raw page title; ``None`` is treated as an empty string.
        year: Release year (str or int); falsy values append nothing.

    Returns:
        The sanitised name, e.g. ``"Dune 2021"``.
    """
    base = re.split(r"\s*\|", title or "")[0].strip()
    # Match 19xx as well as 20xx so pre-2000 films do not get the year twice.
    base = re.sub(r"\((?:19|20)\d{2}\)", "", base).strip()
    label = f"{base} ({year})" if year else base
    # NOTE(review): parentheses are not in the allowed set, so "(year)" ends
    # up as a bare "year" in the result — confirm this is the intended naming.
    return re.sub(r"[^\w .-]", "", label)[:90].strip()
# JS run in the movie page: click the Spanish server entry (flag es.png),
# preferring one whose label matches "luluv"; falls back to the first option.
_JS_CLICK_ES_SERVER = r'''(()=>{
const lis=[...document.querySelectorAll('li[data-nume]')];
// grupo Español: li cuyo flag es es.png; preferir luluvid
const esLis=lis.filter(l=>[...l.querySelectorAll('img')].some(i=>/es\.png/.test(i.src||'')));
const pick=esLis.find(l=>/luluv/i.test(l.textContent||''))||esLis[0]||lis[0];
if(!pick)return"no-options"; pick.click(); return"clicked:"+(pick.textContent||'').trim().slice(0,15);
})()'''


def _eval_and_close(target, expr):
    """Evaluate ``expr`` in ``target`` over a short-lived socket that is always closed."""
    w = conn(target)
    try:
        return ev(w, expr)
    finally:
        w.close()


def _start_playback(w):
    """Remove high z-index overlays in the player frame and try to start the video."""
    ev(w, r'''(()=>{let n=0;document.querySelectorAll('div,iframe').forEach(e=>{if(parseInt(getComputedStyle(e).zIndex||0)>=1000){e.remove();n++;}});return n;})()''')
    rect = ev(w, r'''(()=>{const b=document.querySelector('.jw-icon-display,.jw-display-icon-container,video');if(!b)return null;const r=b.getBoundingClientRect();return Math.round(r.left+r.width/2)+","+Math.round(r.top+r.height/2);})()''')
    if rect and "," in rect:
        cx, cy = [int(x) for x in rect.split(",")]
        # Two real (CDP-dispatched) clicks on the centre of the play control.
        for _ in range(2):
            cmd(w, "Input.dispatchMouseEvent", {"type": "mousePressed", "x": cx, "y": cy, "button": "left", "clickCount": 1})
            cmd(w, "Input.dispatchMouseEvent", {"type": "mouseReleased", "x": cx, "y": cy, "button": "left", "clickCount": 1})
            time.sleep(2)
    ev(w, r'''(()=>{try{jwplayer().play(true);}catch(e){}})()''')


def _wait_for_master(w):
    """Poll up to 40 times (3 s apart) until a master.m3u8 resource was requested."""
    for _ in range(40):
        # Keep removing overlays that reappear while waiting.
        ev(w, r'''(()=>{document.querySelectorAll('div,iframe').forEach(e=>{if(parseInt(getComputedStyle(e).zIndex||0)>=1000)e.remove();});})()''')
        if ev(w, r'''performance.getEntriesByType('resource').some(e=>/master\.m3u8/.test(e.name))'''):
            return True
        time.sleep(3)
    return False


def _run_grabber(name):
    """Run the grab script for ``name`` and report whether it printed an ok status."""
    r = subprocess.run(["uv", "run", "--with", "websocket-client", "--with", "pycryptodome",
                        "python", GRAB, name], cwd="/home/lucas/fn_registry/apps/gnula_grabber",
                       capture_output=True, text=True, timeout=3600)
    print(r.stdout[-400:])
    ok = '"status": "ok"' in r.stdout
    if not ok and r.stderr:
        # Surface the grabber's own error output instead of discarding it.
        print(r.stderr[-400:], file=sys.stderr)
    return ok


def download_one(href, title, year):
    """Drive Chrome through one movie page and hand the stream to the grabber.

    Steps: navigate the first page tab to ``href``; click the Spanish server
    entry in the movie page; remove overlays and start playback in the player
    frame; wait until a ``master.m3u8`` request shows up (a human may have to
    solve a captcha in the meantime); then run the grab script.

    Args:
        href: URL of the movie page.
        title: Catalog title, used to build the output name.
        year: Release year, used to build the output name.

    Returns:
        True if the grab script reported ``"status": "ok"``; False if any
        step could not be completed.

    Raises:
        subprocess.TimeoutExpired: If the grab script runs longer than 3600 s.
        Exception: Errors from the DevTools HTTP/websocket calls propagate.
    """
    name = safe_name(title, year)
    print(f"[dl] {name} -> {href}", flush=True)

    # 1. Navigate the first page tab to the movie page.
    first_page = next((t for t in tabs() if t.get("type") == "page"), None)
    if not first_page:
        # Without this guard conn(None) would fail with an opaque TypeError.
        print("[dl] no page tab in Chrome")
        return False
    _eval_and_close(first_page, f"location.href={json.dumps(href)}")
    time.sleep(7)

    # 2. Click the Spanish server entry.
    pg = page_target("/ver-pelicula/")
    if not pg:
        print("[dl] no movie page")
        return False
    clicked = _eval_and_close(pg, _JS_CLICK_ES_SERVER)
    print("[dl] server:", clicked)
    time.sleep(6)

    # 3. Locate the player frame, retrying up to 4 more times, 3 s apart.
    pl = player_target()
    for _ in range(4):
        if pl:
            break
        time.sleep(3)
        pl = player_target()
    if not pl:
        print("[dl] no player iframe (captcha?)")
        return False

    # 3b/4. Start playback and wait for the master playlist; the socket is
    # closed even if a CDP call raises.
    w = conn(pl)
    try:
        _start_playback(w)
        got = _wait_for_master(w)
    finally:
        w.close()
    if not got:
        print("[dl] master no apareció (captcha/no play). Skip.")
        return False

    # 5. Delegate the actual download to the grab script.
    return _run_grabber(name)
def main():
    """Download up to ``limit`` pending Spanish movies and update the catalog.

    ``limit`` comes from ``sys.argv[1]`` (default 1). Each selected row is
    passed to ``download_one``; the row is then marked ``downloaded`` (with a
    local timestamp) or ``failed``, committing after every movie. A JSON
    summary with the attempted/downloaded counts is printed at the end.

    Raises:
        ValueError: If ``sys.argv[1]`` is not an integer.
    """
    # NOTE: SQLite treats a negative LIMIT as "no upper bound".
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 1
    c = sqlite3.connect(DB)
    try:
        rows = c.execute("SELECT href,title,year FROM movies WHERE lang_es=1 AND status='pending' ORDER BY detected_at LIMIT ?", (limit,)).fetchall()
        print(f"[dl] {len(rows)} pelis pending a descargar (limit {limit})")
        done = 0
        for href, title, year in rows:
            # One broken movie must not abort the batch: log it and mark it failed.
            try:
                ok = download_one(href, title, year)
            except Exception as e:
                print("[dl] error:", e)
                ok = False
            if ok:
                c.execute("UPDATE movies SET status='downloaded', downloaded_at=? WHERE href=?",
                          (time.strftime("%Y-%m-%dT%H:%M:%S"), href))
                c.commit()
                done += 1
            else:
                c.execute("UPDATE movies SET status='failed' WHERE href=?", (href,))
                c.commit()
        print(json.dumps({"attempted": len(rows), "downloaded": done}))
    finally:
        # Release the database handle even if the run is interrupted.
        c.close()
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()