This repository has been archived on 2025-11-27. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Fitz_Studio/scrappers/prueba_navegadores.py
egutierrez aef8791151 feat: Implement main application shell with navigation and color scheme toggle
- Added Appshell component with responsive navbar and main content area
- Integrated ColorSchemeToggle for light/dark mode switching
- Created Welcome component with styled title and introductory text
- Developed ChatPage for LLM interaction with WebSocket support
- Implemented Biblioteca for managing notes with rich text editor
- Added LoginPage for user authentication with error handling
- Introduced MessageList and MessageBubble components for chat messages
- Styled components with CSS modules for consistent design
2025-06-21 02:01:21 +02:00

123 lines
3.7 KiB
Python

import asyncio
import os
import re
from domains.ScrappingWeb.Navegador import Navegador
from domains.ScrappingWeb.Scrapper import Scrapper
from domains.ScrappingWeb.Tab import Tab
import aiohttp
import csv
async def esperar_chrome_listo(port, timeout=10):
    """Wait until the Chrome debugging HTTP endpoint on *port* responds.

    Polls ``http://127.0.0.1:{port}/json`` and returns as soon as one
    request is answered with HTTP 200. ``timeout * 2`` attempts are made,
    with a 0.5 second pause after every unsuccessful attempt.

    Args:
        port: Local TCP port of the endpoint to poll.
        timeout: Approximate number of seconds to keep polling. Fractional
            values are accepted; the attempt count is truncated to an int.

    Raises:
        TimeoutError: If none of the attempts got an HTTP 200 response.
    """
    url = f"http://127.0.0.1:{port}/json"
    pausa = 0.5
    intentos = int(timeout * 2)
    # Bound every single request: without an explicit limit a hung
    # connection would wait for aiohttp's default (several minutes) and
    # stall the caller far beyond `timeout`.
    limite_por_peticion = aiohttp.ClientTimeout(total=2)

    # One session is shared by all attempts instead of building a new
    # session (and connector) on every iteration.
    async with aiohttp.ClientSession(timeout=limite_por_peticion) as session:
        for _ in range(intentos):
            try:
                async with session.get(url) as resp:
                    if resp.status == 200:
                        return
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
                # The endpoint is not accepting connections yet; retry after
                # the pause. Unrelated errors are allowed to propagate.
                pass
            await asyncio.sleep(pausa)

    raise TimeoutError(f"Chrome en puerto {port} no respondió dentro del tiempo esperado.")
# Path to the Chrome executable handed to Navegador. It can be overridden
# with the CHROME_PATH environment variable so the script is not tied to a
# single machine; an unset or empty variable falls back to the default
# Windows install location.
chrome_path = (
    os.environ.get("CHROME_PATH")
    or r"C:\Program Files\Google\Chrome\Application\chrome.exe"
)
def sanitizar_nombre(nombre: str) -> str:
    """Return *nombre* turned into a string usable as a file name.

    Each of the characters ``\\ / * ? : " < > |`` is replaced by an
    underscore, surrounding whitespace is removed and the result is limited
    to 100 characters.

    Args:
        nombre: Arbitrary text to derive a file name from.

    Returns:
        The sanitized name, at most 100 characters long, with no leading or
        trailing whitespace. May be empty if *nombre* was blank.
    """
    limpio = re.sub(r'[\\/*?:"<>|]', "_", nombre).strip()
    # Strip again after truncating: the cut can land right after a space,
    # which would otherwise leave trailing whitespace in the file name.
    return limpio[:100].rstrip()
# Strong references to the browser launcher tasks. The event loop itself only
# keeps weak references to tasks, so a task nobody references may be
# garbage-collected before it finishes.
_tareas_navegador = set()


async def iniciar_y_scrapear(id: int):
    """Launch browser instance *id*, attach a scraper and print basic tab info.

    Steps performed:

    1. Build a ``Navegador`` with a per-id profile directory under
       ``./Perfiles_usuario``, debugging port ``9222 + id``, a non-headless
       window and a user agent whose Chrome version is ``100 + id``.
    2. Start it as a background task and wait until its debugging endpoint
       answers.
    3. Connect a ``Scrapper`` to that endpoint, open a new tab and print the
       tab's user agent and ``document.title``.

    Args:
        id: Index of the browser instance; it selects the profile directory,
            the debugging port and the advertised Chrome version.

    Raises:
        TimeoutError: Propagated from ``esperar_chrome_listo`` when the
            browser does not become reachable in time.
    """
    user_data_dir = os.path.abspath(f"./Perfiles_usuario/chrome_profile_{id}")
    port = 9222 + id
    navegador = Navegador(
        chrome_path=chrome_path,
        user_data_dir=user_data_dir,
        id=id,
        download_dir=os.path.join(user_data_dir, "downloads"),
        debugging_port=port,
        headless=False,
        user_agent=f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/{100+id}.0.0.0 Safari/537.36"
    )

    # Start the browser in the background. The task is stored in a
    # module-level set (and removed again once it is done) so that it cannot
    # be garbage-collected while it is still running.
    tarea_navegador = asyncio.create_task(navegador.iniciar())
    _tareas_navegador.add(tarea_navegador)
    tarea_navegador.add_done_callback(_tareas_navegador.discard)

    # Wait until the browser is ready to accept connections.
    await esperar_chrome_listo(port)

    # Connect the scraper to the running browser.
    scrapper = Scrapper(debugging_url=f"http://127.0.0.1:{port}")
    tab = await scrapper.nueva_tab("", wait_time=6)

    # Run a few actions through the returned tab object.
    ua = await tab.obtener_user_agent()
    print(f"🧭 [{id}] User-Agent:", ua)
    title = await tab.evaluar_js("document.title")
    print(f"📄 [{id}] Título:", title)

    # NOTE(review): earlier experiments that lived here as commented-out code
    # (clicking elements found by CSS selector, dumping the page HTML to
    # "contenido.html", visiting every link listed in "enlaces_extraidos.csv"
    # and saving a screenshot of each one) were removed as dead code; recover
    # them from version control if needed.
    # NOTE(review): the tab and the browser are left open when this function
    # returns; the original cleanup calls (tab.cerrar() / navegador.cerrar())
    # were commented out, presumably on purpose -- confirm before re-enabling.
async def main():
    """Run all scraping jobs concurrently and wait for every one to finish.

    One job is scheduled per id in ``range(1)``, i.e. currently only id 0.
    """
    await asyncio.gather(*map(iniciar_y_scrapear, range(1)))
if __name__ == "__main__":
    # Script entry point: drive the top-level coroutine to completion on a
    # fresh event loop.
    corrutina_principal = main()
    asyncio.run(corrutina_principal)