Add database migration scripts and update models for JSON support

This commit is contained in:
2024-12-19 23:55:37 +01:00
parent abbc23ae46
commit ed7eb57f6c
11 changed files with 299 additions and 11 deletions
Binary file not shown.
+26
View File
@@ -0,0 +1,26 @@
from sqlalchemy.orm import sessionmaker
from crawler_db_model import WebsPorVisitar, WebsVisitadas, engine
# Create the module-level database session, bound to the engine imported
# from crawler_db_model.
Session = sessionmaker(bind=engine)
session = Session()
def limpiar_base_datos():
    """Delete every row from the webs_por_visitar and webs_visitadas tables.

    Both bulk deletes run on the module-level ``session`` and are committed
    together. If anything raises, the error is printed and the session is
    rolled back; the exception is not re-raised.
    """
    try:
        # Empty the pending queue first, then the visited pages.
        for modelo in (WebsPorVisitar, WebsVisitadas):
            session.query(modelo).delete()
        # Persist both deletions in a single commit.
        session.commit()
        print("La base de datos ha sido limpiada completamente.")
    except Exception as error:
        print(f"Error al limpiar la base de datos: {error}")
        session.rollback()
if __name__ == "__main__":
# Intentionally a no-op: the destructive cleanup call below is left commented
# out, so running this script directly does not delete anything.
# limpiar_base_datos()
pass
+117
View File
@@ -0,0 +1,117 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires python>=3.9 or the backports.zoneinfo library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
# version_path_separator = newline
version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
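# output_encoding = utf-8
# NOTE(review): the sqlalchemy.url below embeds a plaintext database password.
# Avoid committing real credentials; consider supplying the URL from the
# environment in env.py and rotating this password.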
sqlalchemy.url = postgresql://postgres:oL2YGslA8BlnDqEzgg1k6fF82kkibA@localhost:5432/webpages
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
+1
View File
@@ -0,0 +1 @@
Generic single-database configuration.
Binary file not shown.
+81
View File
@@ -0,0 +1,81 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
from crawler_db_model import Base
# Alembic Config object, which provides access to the values within the
# .ini file in use.
config = context.config
# Set up Python logging from the .ini file, if one was given.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# MetaData of the models' Base (imported from crawler_db_model), handed to
# context.configure() as target_metadata for 'autogenerate' support.
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The migration context is configured from the ``sqlalchemy.url`` option
    alone, with no Engine created here, so no DBAPI has to be available.
    Calls to context.execute() in this mode emit the given string to the
    script output.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    An Engine is built from the ``sqlalchemy.``-prefixed options of the
    main .ini section (using NullPool), and a connection from it is
    associated with the migration context before the migrations run.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as db_connection:
        context.configure(
            connection=db_connection,
            target_metadata=target_metadata,
        )
        with context.begin_transaction():
            context.run_migrations()
# Pick the migration runner that matches the mode Alembic was invoked in.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
+26
View File
@@ -0,0 +1,26 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}
@@ -0,0 +1,32 @@
"""Add detalles_json column
Revision ID: 71e5651bf6a8
Revises:
Create Date: 2024-12-19 23:54:41.284998
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '71e5651bf6a8'
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add ``detalles_json`` to ``webs_visitadas`` and drop ``es_dinamico``.

    The new column is a nullable JSON column. Dropping ``es_dinamico``
    discards whatever values that column held.
    """
    # Commands originally auto-generated by Alembic.
    detalles_json = sa.Column('detalles_json', sa.JSON(), nullable=True)
    op.add_column('webs_visitadas', detalles_json)
    op.drop_column('webs_visitadas', 'es_dinamico')
def downgrade() -> None:
    """Revert the upgrade: restore ``es_dinamico`` and drop ``detalles_json``.

    ``es_dinamico`` comes back as a nullable BOOLEAN column added without
    values, so data removed by the upgrade is not recovered.
    """
    # Commands originally auto-generated by Alembic.
    es_dinamico = sa.Column(
        'es_dinamico', sa.BOOLEAN(), autoincrement=False, nullable=True
    )
    op.add_column('webs_visitadas', es_dinamico)
    op.drop_column('webs_visitadas', 'detalles_json')
+10 -6
View File
@@ -62,14 +62,18 @@ def generate_summary(soup):
# Extraer texto de párrafos principales
paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')][:5]
# Generar un resumen en texto plano
resumen_texto = f"Título: {titulo}. Descripción: {descripcion}. Encabezados: {', '.join(headers['h1'][:3])}."
# Resumen estructurado
summary = {
summary_json = {
"titulo": titulo,
"descripcion": descripcion,
"encabezados": headers,
"parrafos": paragraphs
}
return summary
return resumen_texto, summary_json
def process_url(url):
"""Procesa una URL: extrae datos y las inserta en las tablas correspondientes."""
@@ -86,7 +90,7 @@ def process_url(url):
soup = BeautifulSoup(response.text, 'html.parser')
dominio = urlparse(url).netloc
summary = generate_summary(soup)
resumen_texto, resumen_json = generate_summary(soup)
ip = requests.get(f"https://api.ipify.org?domain={dominio}&format=json").json().get('ip', "")
contenido_hash = hash_content(response.text)
@@ -94,8 +98,8 @@ def process_url(url):
visitada = WebsVisitadas(
url=url,
dominio=dominio,
titulo=summary['titulo'],
resumen=f"Descripción: {summary['descripcion']}, Encabezados: {summary['encabezados']}, Párrafos: {summary['parrafos']}",
titulo=resumen_json['titulo'],
resumen=resumen_texto, # Resumen en texto plano
ip=ip,
codigo_http=status_code,
contenido_hash=contenido_hash
@@ -133,7 +137,7 @@ def crawl():
if __name__ == "__main__":
# URL inicial
url_inicial = "http://example.com"
url_inicial = "https://datos.bancomundial.org/"
# Insertar URL inicial si no existe
if not session.query(WebsPorVisitar).filter_by(url=url_inicial).first():
+6 -5
View File
@@ -1,6 +1,7 @@
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import JSON
from datetime import datetime
from dotenv import load_dotenv
import os
@@ -22,12 +23,12 @@ class WebsVisitadas(Base):
url = Column(String(2048), unique=True, nullable=False)
dominio = Column(String(255), nullable=False)
titulo = Column(String(255), nullable=True)
resumen = Column(Text, nullable=True)
ip = Column(String(45), nullable=True) # IPv4 o IPv6
resumen = Column(Text, nullable=True) # Resumen en texto plano
detalles_json = Column(JSON, nullable=True) # JSON estructurado
ip = Column(String(45), nullable=True)
fecha_creacion = Column(DateTime, default=datetime.utcnow, nullable=False)
codigo_http = Column(Integer, nullable=True) # Código de respuesta HTTP
contenido_hash = Column(String(64), nullable=True) # Hash del contenido para detectar cambios
es_dinamico = Column(Boolean, default=False) # Si es una página generada dinámicamente
codigo_http = Column(Integer, nullable=True)
contenido_hash = Column(String(64), nullable=True)
# Cargar variables de entorno desde el archivo .env
load_dotenv()