From ed7eb57f6c82699d57fee82c789c1dea0b62c44f Mon Sep 17 00:00:00 2001 From: egutierrez Date: Thu, 19 Dec 2024 23:55:37 +0100 Subject: [PATCH] Add database migration scripts and update models for JSON support --- __pycache__/crawler_db_model.cpython-310.pyc | Bin 1606 -> 1628 bytes admin_tools.py | 26 ++++ alembic.ini | 117 ++++++++++++++++++ alembic/README | 1 + alembic/__pycache__/env.cpython-310.pyc | Bin 0 -> 1794 bytes alembic/env.py | 81 ++++++++++++ alembic/script.py.mako | 26 ++++ .../71e5651bf6a8_add_detalles_json_column.py | 32 +++++ ...8_add_detalles_json_column.cpython-310.pyc | Bin 0 -> 1098 bytes crawler.py | 16 ++- crawler_db_model.py | 11 +- 11 files changed, 299 insertions(+), 11 deletions(-) create mode 100644 admin_tools.py create mode 100644 alembic.ini create mode 100644 alembic/README create mode 100644 alembic/__pycache__/env.cpython-310.pyc create mode 100644 alembic/env.py create mode 100644 alembic/script.py.mako create mode 100644 alembic/versions/71e5651bf6a8_add_detalles_json_column.py create mode 100644 alembic/versions/__pycache__/71e5651bf6a8_add_detalles_json_column.cpython-310.pyc diff --git a/__pycache__/crawler_db_model.cpython-310.pyc b/__pycache__/crawler_db_model.cpython-310.pyc index 0d9161f3a579e2effbf94461fca0b90d897c2fa2..ff32e78b76554a06863bec525e419207ecb36b50 100644 GIT binary patch delta 622 zcmYL_O=}ZD7{}+??B;c5vw5{?TARjFNJUV*ikMPgs8Ye$3WHE8Gn9r+R%WXw!NYpZ zAGnBbwW8Zf%s_T@Ct*dotO_GG@aMlH&)pq8-LnVq z2U_-#^AfoI>}=+HnA!B~cjtE3M_IuYb3~E3;5&5)pbo~jYq@uig>qDJ=15<=x}Zu4 zT>2{d07KN3r^>UOY@qipYt5{;^1rvGIOY-dXKwvPm9DNM9+(F5*I&c{OpEjlglUty zhu9mmO^0@t30=YpAxCfts|1g5)oQoDVdx=JD1CYqhB`hR$I&!(PF|Y=_0R*&n*HeB zI_QyHV4Z9qL5boe#Ns0Y;5Y-E9GZ}INxn0DKHMIT?jPgih}@T}B{;?&(finXE7HhufW+uiDsT!12^#XFNoy$SVQKo?VSgK@B1`IUdx CrhBCT delta 605 zcmZvYKX21O7{>1$+i~Kv~*# z@Q=An6jw>fWm`p~&e0jwKIVAqt2T%!mtEDxKbOGWWodANVP>aKO~$|S#Ozsn+&Ay7 z^X@vH>vi+R+FS`xQs_b;JZAIIih0Cit^>1Kdv&`Gjgo@}-qezTvQ?;BqX-;mPBAHZ 
z=l5NVx(@!|+C}sU)(L#qh5*xq=Csy-7*gv6#0?!IPkta$t`pLi5V}oj3xTH{-0r3w z-Njum5ta!ngc`vntP&i;n$h;TsIrF7be+6U=$Uu+%HBRGl%`?pzWL=3.9 or backports.zoneinfo library. +# Any required deps can be installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
+ +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 +# NOTE(review): this URL embeds the database password in a committed file - confirm it should not be injected at runtime instead (e.g. from alembic/env.py), and rotate the credential. +sqlalchemy.url = postgresql://postgres:oL2YGslA8BlnDqEzgg1k6fF82kkibA@localhost:5432/webpages + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/README b/alembic/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration.
\ No newline at end of file diff --git a/alembic/__pycache__/env.cpython-310.pyc b/alembic/__pycache__/env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca6eb8f67787e692f3c2650063a2028393c65c3f GIT binary patch literal 1794 zcmZvc%WE4)7{F&%tCh5R*m0YXLkUADI7@5yd*hwK!j7id#3WFH!jHFR#XVsZm zKZ1R7FNK=idX}M={!4r9ssBRL;(jwL*>c-mX7-!c_uAh#dAA!Qc(#9K6V^oNcQ;sn zYB2Z$n)w+zh8UJ8!vl<+9LON64QgJlfgBD(FNd<8MT5x8b=k<8gC<5inzaTk-kQZM zDzJXcv6Ih16K~GigAVJkCTsl|4!R)4tPN6+M~jV1q@VKcIRb6bE5v&%De3+J9-#!6 z@mNT{o2#*yz-WuBiBNnv)_FEeSF_XfY>3<4wr z?Lsr(KxfecGkl59kPT+FQ)ELHuo?>mX7vjQ+6IgIC3@TSQgjKdAq z$b5JN!MgW5Xl^cP&@aK%%R-S%Of|%Ng&-F@fjxR2WhPRA_F9 zopLvH(U2oyr({+bODOsJZ0maA<`_#szBan8VtXFE^#Ul5y<5vV() zLOKh}NOq}|;D5eyzt4|(T3G%lA>KUCg!R@Y!f7;Cd-J)Q^A=62#oUs-u=Bz$mA>hj zA0(COlxHXXLd(}>Jg1sw#-2!C))mb-OTw}SOXY^8dcy56h_gAU8v#B zPfoCQW%nD>@4@HKN6$Y!*w^_9m^25wnjT56hjTjN<^cT2Mj}1n>aagQDev9ZGhBi~ zK#O@Fy0{bKz`qfW{)!thet4zt!on@S1DI$*^L+mTf<+cD0*hxgfi44vZ8)p5;G%Yh zPSFB*e*4iGwvAcy6x$Z7T?D{+V-drA$4l)c@!^ViwF$gGS?3+<;d+&YuUviH8pHl) z>-*iFr2gio|0@plpW1k#fDM4zq+C%ga=^C&(Lv@|1qfGVa9teKm^>AJ&5@`e`|zu( zT3Rcv6<_T&zLKDf=9-ViaoKV=ODaJbOvX2QJPLbBN7o>~x(8j^cJ!FaOu1D-GZad! z=5*+GHTMcB_d%(@tejV#GQH{aIKK29*KI)u7TZgUuDtYpx8R7XItN5~_us_$?0o`D z<^gm;qXW2vIQlz|LifFj!x%rf>b(o<58r9GB?)u`K6}aLpeOSQpgrkV&T9u!8Q&zP z3f70wSd#0VA8e*es@Z7h?lh?*I9jD;bD0TujSpR&md#aRR`EK1y(i^{Fe~J}nzdkk o*goRsXJDa_{Bw7Wn-8Gtpia=i(1NJ$!i%D~8Q*W-i*X$M1DtT-ZvX%Q literal 0 HcmV?d00001 diff --git a/alembic/env.py b/alembic/env.py new file mode 100644 index 0000000..b736f81 --- /dev/null +++ b/alembic/env.py @@ -0,0 +1,81 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +from crawler_db_model import Base + + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. 
+# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/alembic/script.py.mako b/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/alembic/versions/71e5651bf6a8_add_detalles_json_column.py b/alembic/versions/71e5651bf6a8_add_detalles_json_column.py new file mode 100644 index 0000000..342a660 --- /dev/null +++ b/alembic/versions/71e5651bf6a8_add_detalles_json_column.py @@ -0,0 +1,32 @@ +"""Add detalles_json column + +Revision ID: 71e5651bf6a8 +Revises: +Create Date: 2024-12-19 23:54:41.284998 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '71e5651bf6a8' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('webs_visitadas', sa.Column('detalles_json', sa.JSON(), nullable=True)) + op.drop_column('webs_visitadas', 'es_dinamico') + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('webs_visitadas', sa.Column('es_dinamico', sa.BOOLEAN(), autoincrement=False, nullable=True)) + op.drop_column('webs_visitadas', 'detalles_json') + # ### end Alembic commands ### diff --git a/alembic/versions/__pycache__/71e5651bf6a8_add_detalles_json_column.cpython-310.pyc b/alembic/versions/__pycache__/71e5651bf6a8_add_detalles_json_column.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6de46cc1955043de1738accc7578770d7f5ceb3 GIT binary patch literal 1098 zcmZvb&ubGw6vuaGe?}%lU6FefvfcmMn5dW_^F-cWux;NANdvK z!~i2OzyuPRF$qX)1(vCOE3#uJaB|y@+`t1TDDWcp!QC~$4ockOMX*oFs(~>tkCks6HL&|ev;KLs2haeXu_)q_p35|8glRO5 z6W{;DFUN9>fp-Vp#>*Dp+}dmnPq*0iJi}$T;qMF1GTt~q={DNW+nx1Rd%d;OXus%g zcDkL`Mti%nv$O5{zYv1aB&yKo7gL^uTsbER9^Nz+NhiIayj(yP#GWxwwq^;8K&I$5N@8{#3pk<0J?VR!IRq%$67aAFyN0g4Bg!tshdK3|1m3KFlV-b#{JB|xwe<6 zxp&mOp3r{`Q9*foj3ckGX_k(YQ1F;1nJDXI31z`ZRCS=HrH-P!H%CVw4)=QEfezg< znOiG*M^S$h`Bm1?&9CxQU*$1+O(Jyjy=Gl`ltLO(s@!Zg8RH8C6T(7K-h$#{Nb(|L zQFzAV*}M-&@3CYiRE<)WBx%Mn&6-l->N7ko`n$^M<#q3Ew6GheFpc;t6ZcBpyKB_F RGAWp4d&PW$Ki{f)e*nB84Cep< literal 0 HcmV?d00001 diff --git a/crawler.py b/crawler.py index 2c1ff5a..20eb5cd 100644 --- a/crawler.py +++ b/crawler.py @@ -62,14 +62,18 @@ def generate_summary(soup): # Extraer texto de párrafos principales paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')][:5] + # Generar un resumen en texto plano + resumen_texto = f"Título: {titulo}. 
Descripción: {descripcion}. Encabezados: {', '.join(headers['h1'][:3])}." + # Resumen estructurado - summary = { + summary_json = { "titulo": titulo, "descripcion": descripcion, "encabezados": headers, "parrafos": paragraphs } - return summary + + return resumen_texto, summary_json def process_url(url): """Procesa una URL: extrae datos y las inserta en las tablas correspondientes.""" @@ -86,7 +90,7 @@ def process_url(url): soup = BeautifulSoup(response.text, 'html.parser') dominio = urlparse(url).netloc - summary = generate_summary(soup) + resumen_texto, resumen_json = generate_summary(soup) ip = requests.get(f"https://api.ipify.org?domain={dominio}&format=json").json().get('ip', "") contenido_hash = hash_content(response.text) @@ -94,8 +98,8 @@ def process_url(url): visitada = WebsVisitadas( url=url, dominio=dominio, - titulo=summary['titulo'], - resumen=f"Descripción: {summary['descripcion']}, Encabezados: {summary['encabezados']}, Párrafos: {summary['parrafos']}", + titulo=resumen_json['titulo'], + resumen=resumen_texto, # Resumen en texto plano ip=ip, codigo_http=status_code, contenido_hash=contenido_hash @@ -133,7 +137,7 @@ def crawl(): if __name__ == "__main__": # URL inicial - url_inicial = "http://example.com" + url_inicial = "https://datos.bancomundial.org/" # Insertar URL inicial si no existe if not session.query(WebsPorVisitar).filter_by(url=url_inicial).first(): diff --git a/crawler_db_model.py b/crawler_db_model.py index 366dd46..bdfa548 100644 --- a/crawler_db_model.py +++ b/crawler_db_model.py @@ -1,6 +1,7 @@ from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker +from sqlalchemy import JSON from datetime import datetime from dotenv import load_dotenv import os @@ -22,12 +23,12 @@ class WebsVisitadas(Base): url = Column(String(2048), unique=True, nullable=False) dominio = Column(String(255), nullable=False) titulo = 
Column(String(255), nullable=True) - resumen = Column(Text, nullable=True) - ip = Column(String(45), nullable=True) # IPv4 o IPv6 + resumen = Column(Text, nullable=True) # Resumen en texto plano + detalles_json = Column(JSON, nullable=True) # JSON estructurado + ip = Column(String(45), nullable=True) fecha_creacion = Column(DateTime, default=datetime.utcnow, nullable=False) - codigo_http = Column(Integer, nullable=True) # Código de respuesta HTTP - contenido_hash = Column(String(64), nullable=True) # Hash del contenido para detectar cambios - es_dinamico = Column(Boolean, default=False) # Si es una página generada dinámicamente + codigo_http = Column(Integer, nullable=True) + contenido_hash = Column(String(64), nullable=True) # Cargar variables de entorno desde el archivo .env load_dotenv()