def limpiar_base_datos():
    """Delete every row from the webs_por_visitar and webs_visitadas tables.

    Both bulk deletes are issued on the module-level ``session`` and made
    permanent with a single ``session.commit()``. If anything raises, the
    session is rolled back and the error is reported on stdout instead of
    being propagated.

    Returns:
        bool: ``True`` when the commit succeeded, ``False`` when an error was
        caught and the transaction was rolled back.
    """
    try:
        # Remove all rows from WebsPorVisitar, then from WebsVisitadas.
        session.query(WebsPorVisitar).delete()
        session.query(WebsVisitadas).delete()
        session.commit()
    except Exception as e:
        # Roll back BEFORE reporting: the shared session must be reset even
        # if formatting or printing the message fails.
        session.rollback()
        print(f"Error al limpiar la base de datos: {e}")
        return False

    # Reported outside the try block so that a failure while printing is not
    # mistaken for a database error after a successful commit.
    print("La base de datos ha sido limpiada completamente.")
    return True
+ +[alembic] +# path to migration scripts +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can be installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql://postgres:oL2YGslA8BlnDqEzgg1k6fF82kkibA@localhost:5432/webpages + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/README b/alembic/README new file mode 100644 index 
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The migration context is configured from the ``sqlalchemy.url`` option
    of the Alembic config only; no Engine is created here. Literal binds
    and the "named" paramstyle are requested, then the migrations run
    inside ``context.begin_transaction()``.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    An Engine is built from the ``sqlalchemy.``-prefixed keys of the
    config's main ini section, using ``NullPool``. A connection from that
    Engine is bound to the migration context, and the migrations run
    inside ``context.begin_transaction()``.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
        )
        with context.begin_transaction():
            context.run_migrations()


# Pick the runner that matches the mode Alembic was invoked in.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
def upgrade() -> None:
    """Add the nullable JSON column ``detalles_json`` to ``webs_visitadas``
    and drop that table's ``es_dinamico`` column."""
    detalles_json = sa.Column('detalles_json', sa.JSON(), nullable=True)
    op.add_column('webs_visitadas', detalles_json)
    op.drop_column('webs_visitadas', 'es_dinamico')


def downgrade() -> None:
    """Re-add ``es_dinamico`` as a nullable BOOLEAN column on
    ``webs_visitadas`` and drop ``detalles_json``."""
    es_dinamico = sa.Column(
        'es_dinamico',
        sa.BOOLEAN(),
        autoincrement=False,
        nullable=True,
    )
    op.add_column('webs_visitadas', es_dinamico)
    op.drop_column('webs_visitadas', 'detalles_json')
contenido_hash=contenido_hash @@ -133,7 +137,7 @@ def crawl(): if __name__ == "__main__": # URL inicial - url_inicial = "http://example.com" + url_inicial = "https://datos.bancomundial.org/" # Insertar URL inicial si no existe if not session.query(WebsPorVisitar).filter_by(url=url_inicial).first(): diff --git a/crawler_db_model.py b/crawler_db_model.py index 366dd46..bdfa548 100644 --- a/crawler_db_model.py +++ b/crawler_db_model.py @@ -1,6 +1,7 @@ from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker +from sqlalchemy import JSON from datetime import datetime from dotenv import load_dotenv import os @@ -22,12 +23,12 @@ class WebsVisitadas(Base): url = Column(String(2048), unique=True, nullable=False) dominio = Column(String(255), nullable=False) titulo = Column(String(255), nullable=True) - resumen = Column(Text, nullable=True) - ip = Column(String(45), nullable=True) # IPv4 o IPv6 + resumen = Column(Text, nullable=True) # Resumen en texto plano + detalles_json = Column(JSON, nullable=True) # JSON estructurado + ip = Column(String(45), nullable=True) fecha_creacion = Column(DateTime, default=datetime.utcnow, nullable=False) - codigo_http = Column(Integer, nullable=True) # Código de respuesta HTTP - contenido_hash = Column(String(64), nullable=True) # Hash del contenido para detectar cambios - es_dinamico = Column(Boolean, default=False) # Si es una página generada dinámicamente + codigo_http = Column(Integer, nullable=True) + contenido_hash = Column(String(64), nullable=True) # Cargar variables de entorno desde el archivo .env load_dotenv()