# Fitz_Studio/scrappers/prueba_navegadores.py

import asyncio
import os
import re
from domains.ScrappingWeb.Navegador import Navegador
from domains.ScrappingWeb.Scrapper import Scrapper
from domains.ScrappingWeb.Tab import Tab
import aiohttp
import csv


async def esperar_chrome_listo(port, timeout=10):
    """Wait until the HTTP endpoint ``/json`` on the given local port answers.

    Polls ``http://127.0.0.1:{port}/json`` about every half second, reusing a
    single HTTP session, until a response with status 200 is received.

    Args:
        port: Local port to probe (the browser's remote-debugging port).
        timeout: Maximum number of seconds (int or float) to wait overall.
            A value of zero or less makes no attempt at all.

    Raises:
        TimeoutError: If no HTTP 200 response arrives within ``timeout``
            seconds.
    """
    url = f"http://127.0.0.1:{port}/json"
    mensaje = f"Chrome en puerto {port} no respondió dentro del tiempo esperado."
    if timeout <= 0:
        raise TimeoutError(mensaje)

    pausa = 0.5  # seconds between probes
    tope_por_intento = 2.0  # upper bound for a single probe, in seconds
    loop = asyncio.get_running_loop()
    limite = loop.time() + timeout

    # One session for all probes instead of building a new one per attempt.
    async with aiohttp.ClientSession() as session:
        while True:
            restante = limite - loop.time()
            if restante <= 0:
                break
            # Bound every probe so a hung connection cannot outlive the
            # overall deadline (aiohttp's default total timeout is minutes).
            limite_intento = aiohttp.ClientTimeout(
                total=min(restante, tope_por_intento)
            )
            try:
                async with session.get(url, timeout=limite_intento) as resp:
                    if resp.status == 200:
                        return
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
                # The browser is not accepting connections yet; retry.
                pass
            await asyncio.sleep(min(pausa, max(0.0, limite - loop.time())))
    raise TimeoutError(mensaje)

# Absolute, Windows-style path to the Chrome executable; it is handed to
# Navegador as `chrome_path` when a browser instance is created.
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Characters that cannot appear in a file name: the reserved punctuation
# \ / * ? : " < > | plus ASCII control characters (newlines, tabs, ...).
_CARACTERES_INVALIDOS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')


def sanitizar_nombre(nombre: str) -> str:
    """Return ``nombre`` turned into a safe file name of at most 100 characters.

    Surrounding whitespace is removed, every invalid character is replaced by
    an underscore, and the result is cut to 100 characters. Whitespace left at
    the end by the cut is removed as well, so the name never ends in a space.

    Args:
        nombre: Arbitrary text, e.g. a page or link title.

    Returns:
        The sanitized name; it may be empty if ``nombre`` is empty or blank.
    """
    # Strip first so leading/trailing newlines vanish instead of becoming "_".
    limpio = _CARACTERES_INVALIDOS.sub("_", nombre.strip())
    # Truncation can expose interior whitespace at the end; trim it again.
    return limpio[:100].rstrip()


# Strong references to the background browser tasks. The event loop only keeps
# weak references to tasks, so a task nobody else references could be
# garbage-collected before it finishes.
_tareas_navegador = set()


async def iniciar_y_scrapear(id: int):
    """Start one browser instance, attach a scraper to it and print page info.

    Builds a ``Navegador`` with a per-instance profile directory, download
    directory, debugging port (``9222 + id``) and user agent, schedules its
    ``iniciar()`` coroutine in the background, waits for the debugging port
    to respond, then opens a tab through ``Scrapper`` and prints the tab's
    user agent and document title.

    Args:
        id: Index of the browser instance; it selects the profile directory,
            the debugging port and the Chrome version in the user agent.

    Raises:
        TimeoutError: Propagated from ``esperar_chrome_listo`` when the
            debugging port does not respond in time.
    """
    user_data_dir = os.path.abspath(f"./Perfiles_usuario/chrome_profile_{id}")
    port = 9222 + id
    navegador = Navegador(
        chrome_path=chrome_path,
        user_data_dir=user_data_dir,
        id=id,
        download_dir=os.path.join(user_data_dir, "downloads"),
        debugging_port=port,
        headless=False,
        user_agent=f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/{100+id}.0.0.0 Safari/537.36"
    )

    # Start the browser in the background, keeping a strong reference to the
    # task until it completes so it cannot be garbage-collected mid-run.
    tarea_navegador = asyncio.create_task(navegador.iniciar())
    _tareas_navegador.add(tarea_navegador)
    tarea_navegador.add_done_callback(_tareas_navegador.discard)

    # Wait until the browser is ready
    await esperar_chrome_listo(port)

    # Connect the scraper to the browser
    scrapper = Scrapper(debugging_url=f"http://127.0.0.1:{port}")
    tab = await scrapper.nueva_tab("", wait_time=6)

    # Run actions through the Tab class
    ua = await tab.obtener_user_agent()
    print(f"🧭 [{id}] User-Agent:", ua)

    title = await tab.evaluar_js("document.title")
    print(f"📄 [{id}] Título:", title)


    # botones= await tab.get_elements_by_css_selector("#mw-content-text > div.mw-content-ltr.mw-parser-output > figure:nth-child(27) > a > img")

    # for boton in botones:
    #     await boton.click()


    # # Create the folder if it does not exist
    # os.makedirs("wikipedia_md", exist_ok=True)


    # # Save the full HTML
    # html = await tab.obtener_html_completo()
    # with open(f"contenido.html", "w", encoding="utf-8") as f:
    #     f.write(html)

    # # Read links from the CSV
    # with open("enlaces_extraidos.csv", "r", encoding="utf-8") as f:
    #     reader = csv.reader(f)
    #     next(reader)  # skip headers
    #     enlaces = list(reader)

    # for texto, enlace in enlaces:
    #     nombre_archivo = sanitizar_nombre(texto or "sin_titulo") + ".png"
    #     ruta_archivo = os.path.join("wikipedia", nombre_archivo)

    #     try:
    #         print(f"🌐 Visitando: {enlace}")
    #         tab = await scrapper.nueva_tab(enlace, wait_time=6)

    #         await tab.capturar_screenshot(ruta_archivo)
    #         print(f"📸 Captura guardada: {ruta_archivo}")

    #         await tab.cerrar()
    #     except Exception as e:
    #         print(f"❌ Error con {enlace}: {e}")


    # await tab.capturar_screenshot(f"screenshot_{id}.png")

    # html = await tab.obtener_html_completo()
    # print(html)

    # with open("contenido.html", "w", encoding="utf-8") as f:
    #     f.write(html)

    # Extract links and save them to CSV


    # # # Close the tab and the browser if desired
    # await asyncio.sleep(10)
    # await tab.cerrar()
    # await navegador.cerrar()

async def main():
    """Run the scraping routine for every browser index (currently only 0) concurrently."""
    await asyncio.gather(
        *(iniciar_y_scrapear(indice) for indice in range(1))
    )

# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())