"""Tests para extract_pdf_metadata.""" import os import sys from pypdf import PdfWriter sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from cybersecurity.extract_pdf_metadata import extract_pdf_metadata def _make_pdf_with_metadata(path: str) -> None: """Crea un PDF de 2 paginas con doc info (titulo, autor, fechas).""" writer = PdfWriter() writer.add_blank_page(width=200, height=200) writer.add_blank_page(width=200, height=200) writer.add_metadata( { "/Title": "Documento OSINT", "/Author": "Enmanuel G.", "/Creator": "PyTestRig", "/Producer": "pypdf", "/CreationDate": "D:20240311102200+01'00'", "/ModDate": "D:20240312113000+01'00'", } ) with open(path, "wb") as fh: writer.write(fh) def _make_pdf_without_metadata(path: str) -> None: """Crea un PDF de 1 pagina sin doc info.""" writer = PdfWriter() writer.add_blank_page(width=100, height=100) with open(path, "wb") as fh: writer.write(fh) def test_pdf_con_metadatos_devuelve_titulo_autor_paginas(tmp_path): """PDF con metadatos devuelve titulo, autor y num_pages.""" p = str(tmp_path / "withmeta.pdf") _make_pdf_with_metadata(p) meta = extract_pdf_metadata(p) assert meta["error"] is None assert meta["title"] == "Documento OSINT" assert meta["author"] == "Enmanuel G." assert meta["creator"] == "PyTestRig" assert meta["producer"] == "pypdf" assert meta["num_pages"] == 2 assert meta["raw"] # no vacio def test_pdf_sin_doc_info_devuelve_none_sin_petar(tmp_path): """PDF sin doc info devuelve campos None sin petar.""" p = str(tmp_path / "nometa.pdf") _make_pdf_without_metadata(p) meta = extract_pdf_metadata(p) assert meta["error"] is None assert meta["num_pages"] == 1 assert meta["title"] is None assert meta["author"] is None def test_fechas_parseables_en_iso_8601(tmp_path): """fechas parseables se devuelven en ISO 8601.""" p = str(tmp_path / "dates.pdf") _make_pdf_with_metadata(p) meta = extract_pdf_metadata(p) # pypdf parsea D:YYYYMMDDHHmmSS a datetime; isoformat() lleva 'T'. assert meta["creation_date"] is not None assert "2024-03-11" in meta["creation_date"] assert "T" in meta["creation_date"] assert meta["mod_date"] is not None assert "2024-03-12" in meta["mod_date"] if __name__ == "__main__": import tempfile from pathlib import Path with tempfile.TemporaryDirectory() as d: test_pdf_con_metadatos_devuelve_titulo_autor_paginas(Path(d)) test_pdf_sin_doc_info_devuelve_none_sin_petar(Path(d)) test_fechas_parseables_en_iso_8601(Path(d)) print("Todos los tests pasaron.")