aef8791151
- Added Appshell component with responsive navbar and main content area - Integrated ColorSchemeToggle for light/dark mode switching - Created Welcome component with styled title and introductory text - Developed ChatPage for LLM interaction with WebSocket support - Implemented Biblioteca for managing notes with rich text editor - Added LoginPage for user authentication with error handling - Introduced MessageList and MessageBubble components for chat messages - Styled components with CSS modules for consistent design
123 lines
3.7 KiB
Python
123 lines
3.7 KiB
Python
import asyncio
|
|
import os
|
|
import re
|
|
from domains.ScrappingWeb.Navegador import Navegador
|
|
from domains.ScrappingWeb.Scrapper import Scrapper
|
|
from domains.ScrappingWeb.Tab import Tab
|
|
import aiohttp
|
|
import csv
|
|
|
|
|
|
async def esperar_chrome_listo(port, timeout=10):
    """Poll Chrome's local debugging HTTP endpoint until it answers.

    Issues ``GET http://127.0.0.1:{port}/json`` every 0.5 seconds and
    returns as soon as one request comes back with HTTP status 200.

    Args:
        port: Local TCP port on which Chrome is expected to listen.
        timeout: Approximate number of seconds to keep polling. Two
            attempts are made per second; may be an int or a float.

    Raises:
        TimeoutError: If none of the attempts received a 200 response.
    """
    url = f"http://127.0.0.1:{port}/json"
    intervalo = 0.5
    intentos = int(timeout / intervalo)
    # Bound every single request: aiohttp's default total timeout is several
    # minutes, which would let one stuck request blow far past ``timeout``.
    limite_peticion = aiohttp.ClientTimeout(total=2)

    # One session is reused for all attempts instead of building a new
    # connection pool on every iteration.
    async with aiohttp.ClientSession(timeout=limite_peticion) as session:
        for _ in range(intentos):
            try:
                async with session.get(url) as resp:
                    if resp.status == 200:
                        return
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
                # Expected while Chrome is still starting up (connection
                # refused / no answer yet); anything else is a real bug and
                # is allowed to propagate.
                pass
            await asyncio.sleep(intervalo)

    raise TimeoutError(f"Chrome en puerto {port} no respondió dentro del tiempo esperado.")
|
|
|
|
# Absolute path of the Chrome executable passed to Navegador(chrome_path=...).
# NOTE(review): this looks like the default Windows install location; it will
# presumably need adjusting on other platforms or custom installs.
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
|
|
|
|
def sanitizar_nombre(nombre: str) -> str:
    """Return *nombre* made safe for use as a file name.

    Each of the characters ``\\ / * ? : " < > |`` is replaced by an
    underscore, surrounding whitespace is stripped, and the result is then
    cut to at most 100 characters.
    """
    tabla = str.maketrans({caracter: "_" for caracter in '\\/*?:"<>|'})
    limpio = nombre.translate(tabla)
    recortado = limpio.strip()
    return recortado[:100]
|
|
|
|
|
|
async def iniciar_y_scrapear(id: int):
    """Launch one Chrome instance and run a short scraping session on it.

    Builds a ``Navegador`` with a per-instance profile directory, download
    directory, debugging port (``9222 + id``) and user-agent string, starts
    it in a background task, waits until the debugging endpoint responds,
    then opens a tab through ``Scrapper`` and prints that tab's user agent
    and document title.

    The tab and the browser are left open: the calls that would close them
    are currently disabled (see the commented-out section at the end).

    Args:
        id: Index of this instance. It selects the profile directory, the
            debugging port and the Chrome version in the user agent. The
            name shadows the builtin but is kept because it is part of the
            function's signature.

    Raises:
        TimeoutError: Propagated from ``esperar_chrome_listo`` when Chrome
            does not answer on its debugging port in time.
    """
    user_data_dir = os.path.abspath(f"./Perfiles_usuario/chrome_profile_{id}")
    port = 9222 + id
    navegador = Navegador(
        chrome_path=chrome_path,
        user_data_dir=user_data_dir,
        id=id,
        download_dir=os.path.join(user_data_dir, "downloads"),
        debugging_port=port,
        headless=False,
        user_agent=f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/{100+id}.0.0.0 Safari/537.36"
    )

    # Start the browser in the background. The task object is kept in a
    # local on purpose: the event loop only holds weak references to tasks,
    # so a task nobody references may be garbage-collected before it is done.
    tarea_navegador = asyncio.create_task(navegador.iniciar())  # noqa: F841

    # Wait until the browser is ready to accept connections.
    await esperar_chrome_listo(port)

    # Connect the scraper to the running browser and open a tab.
    scrapper = Scrapper(debugging_url=f"http://127.0.0.1:{port}")
    tab = await scrapper.nueva_tab("", wait_time=6)

    # Run actions through the Tab object.
    ua = await tab.obtener_user_agent()
    print(f"🧭 [{id}] User-Agent:", ua)

    title = await tab.evaluar_js("document.title")
    print(f"📄 [{id}] Título:", title)

    # ------------------------------------------------------------------
    # Disabled experiments, kept for reference.
    # ------------------------------------------------------------------
    # botones = await tab.get_elements_by_css_selector("#mw-content-text > div.mw-content-ltr.mw-parser-output > figure:nth-child(27) > a > img")
    #
    # for boton in botones:
    #     await boton.click()
    #
    # # Create the folder if it does not exist
    # os.makedirs("wikipedia_md", exist_ok=True)
    #
    # # Save the complete HTML
    # html = await tab.obtener_html_completo()
    # with open(f"contenido.html", "w", encoding="utf-8") as f:
    #     f.write(html)
    #
    # # Read links from the CSV
    # with open("enlaces_extraidos.csv", "r", encoding="utf-8") as f:
    #     reader = csv.reader(f)
    #     next(reader)  # skip the header row
    #     enlaces = list(reader)
    #
    # for texto, enlace in enlaces:
    #     nombre_archivo = sanitizar_nombre(texto or "sin_titulo") + ".png"
    #     ruta_archivo = os.path.join("wikipedia", nombre_archivo)
    #
    #     try:
    #         print(f"🌐 Visitando: {enlace}")
    #         tab = await scrapper.nueva_tab(enlace, wait_time=6)
    #
    #         await tab.capturar_screenshot(ruta_archivo)
    #         print(f"📸 Captura guardada: {ruta_archivo}")
    #
    #         await tab.cerrar()
    #     except Exception as e:
    #         print(f"❌ Error con {enlace}: {e}")
    #
    # await tab.capturar_screenshot(f"screenshot_{id}.png")
    #
    # html = await tab.obtener_html_completo()
    # print(html)
    #
    # with open("contenido.html", "w", encoding="utf-8") as f:
    #     f.write(html)

    # TODO: extract links and save them to a CSV.

    # # Close the tab and the browser if desired
    # await asyncio.sleep(10)
    # await tab.cerrar()
    # await navegador.cerrar()
|
|
|
|
async def main():
    """Run every scraping session concurrently and wait for all of them.

    Currently a single session (index 0) is started.
    """
    await asyncio.gather(*(iniciar_y_scrapear(indice) for indice in range(1)))
|
|
|
|
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    punto_de_entrada = main()
    asyncio.run(punto_de_entrada)
|