feat: Implement cookie extraction script for Chrome v20 and enhance browser interaction

2025-06-01 15:31:13 +02:00
parent 628cddc3ae
commit e1b756ac99
8 changed files with 717 additions and 64 deletions
@@ -0,0 +1,122 @@
+import asyncio
+import os
+import re
+from src.ScrappingWeb.Navegador import Navegador
+from src.ScrappingWeb.Scrapper import Scrapper
+from src.ScrappingWeb.Tab import Tab
+import aiohttp
+import csv
+
+
+async def esperar_chrome_listo(port, timeout=10):
+    """Wait until the HTTP endpoint on ``port`` answers with status 200.
+
+    Polls ``http://127.0.0.1:{port}/json`` about every 0.5 s until it
+    returns HTTP 200 or the time budget is used up.
+
+    Args:
+        port: Local TCP port to poll.
+        timeout: Overall budget in seconds; may be an int or a float. The
+            call can overshoot it by at most one request timeout (2 s).
+
+    Raises:
+        TimeoutError: If no 200 response arrived within ``timeout`` seconds.
+    """
+    url = f"http://127.0.0.1:{port}/json"
+    loop = asyncio.get_running_loop()
+    deadline = loop.time() + timeout
+
+    # Bound every single request: without this a hung connection would wait
+    # for aiohttp's default timeout and blow far past the overall budget.
+    request_timeout = aiohttp.ClientTimeout(total=2)
+
+    # One session for the whole polling loop instead of one per attempt.
+    async with aiohttp.ClientSession(timeout=request_timeout) as session:
+        while loop.time() < deadline:
+            try:
+                async with session.get(url) as resp:
+                    if resp.status == 200:
+                        return
+            except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
+                # Connection failures are expected until the endpoint is
+                # up; anything else is a real bug and should propagate.
+                pass
+            await asyncio.sleep(0.5)
+
+    raise TimeoutError(f"Chrome en puerto {port} no respondió dentro del tiempo esperado.")
+
+# Path of the Chrome executable handed to Navegador (a Windows-style path).
+chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
+
+def sanitizar_nombre(nombre: str) -> str:
+    """Return ``nombre`` turned into a string usable as a file name.
+
+    Each of the characters ``\\ / * ? : " < > |`` is replaced by ``_``,
+    surrounding whitespace is stripped, and the result is cut to at most
+    100 characters.
+    """
+    sin_invalidos = re.sub(r'[\\/*?:"<>|]', "_", nombre)
+    recortado = sin_invalidos.strip()
+    return recortado[:100]
+
+
+async def iniciar_y_scrapear(id: int):
+    """Start one browser instance for profile ``id`` and print basic page info.
+
+    Builds a ``Navegador`` with the profile directory
+    ``./Perfiles_usuario/chrome_profile_{id}`` and debugging port
+    ``9222 + id``, starts it as a background task, waits for the debugging
+    endpoint to respond, then opens a tab through ``Scrapper`` and prints
+    the tab's user agent and document title.
+
+    Args:
+        id: Profile index; selects the profile directory, the debugging
+            port and the Chrome version advertised in the user agent.
+
+    Raises:
+        TimeoutError: If the debugging endpoint never became ready. When the
+            launch task itself failed, that failure is attached as the
+            exception's ``__cause__``.
+    """
+    user_data_dir = os.path.abspath(f"./Perfiles_usuario/chrome_profile_{id}")
+    port = 9222 + id
+    navegador = Navegador(
+        chrome_path=chrome_path,
+        user_data_dir=user_data_dir,
+        id=id,
+        download_dir=os.path.join(user_data_dir, "downloads"),
+        debugging_port=port,
+        headless=False,
+        user_agent=f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/{100+id}.0.0.0 Safari/537.36"
+    )
+
+    # Start the browser in the background. The event loop only keeps weak
+    # references to tasks, so hold a strong one for as long as this
+    # coroutine runs; otherwise the task may be garbage-collected mid-flight.
+    tarea_inicio = asyncio.create_task(navegador.iniciar())
+
+    # Wait until the browser's debugging endpoint is reachable.
+    try:
+        await esperar_chrome_listo(port)
+    except TimeoutError as exc:
+        # If the launch itself blew up, surface that as the cause instead
+        # of leaving it as a never-retrieved task exception.
+        fallo_inicio = None
+        if tarea_inicio.done() and not tarea_inicio.cancelled():
+            fallo_inicio = tarea_inicio.exception()
+        if fallo_inicio is not None:
+            raise exc from fallo_inicio
+        raise
+
+    # Attach the scraper to the running browser and open a tab.
+    scrapper = Scrapper(debugging_url=f"http://127.0.0.1:{port}")
+    tab = await scrapper.nueva_tab("", wait_time=6)
+
+    # Run a couple of actions through the Tab wrapper.
+    ua = await tab.obtener_user_agent()
+    print(f"🧭 [{id}] User-Agent:", ua)
+
+    title = await tab.evaluar_js("document.title")
+    print(f"📄 [{id}] Título:", title)
+
+
+    # botones= await tab.get_elements_by_css_selector("#mw-content-text > div.mw-content-ltr.mw-parser-output > figure:nth-child(27) > a > img")
+
+    # for boton in botones:
+    #     await boton.click()
+
+
+    # # Crear carpeta si no existe
+    # os.makedirs("wikipedia_md", exist_ok=True)
+
+
+    # # Guardar el HTML completo
+    # html = await tab.obtener_html_completo()
+    # with open(f"contenido.html", "w", encoding="utf-8") as f:
+    #     f.write(html)
+
+    # # Leer enlaces del CSV
+    # with open("enlaces_extraidos.csv", "r", encoding="utf-8") as f:
+    #     reader = csv.reader(f)
+    #     next(reader)  # saltar encabezados
+    #     enlaces = list(reader)
+
+    # for texto, enlace in enlaces:
+    #     nombre_archivo = sanitizar_nombre(texto or "sin_titulo") + ".png"
+    #     ruta_archivo = os.path.join("wikipedia", nombre_archivo)
+
+    #     try:
+    #         print(f"🌐 Visitando: {enlace}")
+    #         tab = await scrapper.nueva_tab(enlace, wait_time=6)
+
+    #         await tab.capturar_screenshot(ruta_archivo)
+    #         print(f"📸 Captura guardada: {ruta_archivo}")
+
+    #         await tab.cerrar()
+    #     except Exception as e:
+    #         print(f"❌ Error con {enlace}: {e}")
+
+
+    # await tab.capturar_screenshot(f"screenshot_{id}.png")
+
+    # html = await tab.obtener_html_completo()
+    # print(html)
+
+    # with open("contenido.html", "w", encoding="utf-8") as f:
+    #     f.write(html)
+
+    # Extraer enlaces y guardarlos en CSV
+
+
+
+
+    # # # Cerrar tab y navegador si quieres
+    # await asyncio.sleep(10)
+    # await tab.cerrar()
+    # await navegador.cerrar()
+
+async def main():
+    """Run ``iniciar_y_scrapear`` concurrently for every id in ``range(1)``."""
+    await asyncio.gather(*(iniciar_y_scrapear(perfil) for perfil in range(1)))
+
+# Script entry point: run main() on a fresh event loop when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())