eb8dbf66a1
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
93 lines
2.7 KiB
Python
93 lines
2.7 KiB
Python
"""Tests para extract_pdf_metadata."""
|
|
|
|
import os
|
|
import sys
|
|
|
|
from pypdf import PdfWriter
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from cybersecurity.extract_pdf_metadata import extract_pdf_metadata
|
|
|
|
|
|
def _make_pdf_with_metadata(path: str) -> None:
    """Write a two-page PDF with document info to ``path``.

    The document info carries title, author, creator, producer and the
    creation/modification dates (PDF ``D:`` date strings with a +01:00
    offset).
    """
    doc_info = {
        "/Title": "Documento OSINT",
        "/Author": "Enmanuel G.",
        "/Creator": "PyTestRig",
        "/Producer": "pypdf",
        "/CreationDate": "D:20240311102200+01'00'",
        "/ModDate": "D:20240312113000+01'00'",
    }

    pdf = PdfWriter()
    # Two identical blank pages so the page count is distinguishable from 1.
    for _ in range(2):
        pdf.add_blank_page(width=200, height=200)
    pdf.add_metadata(doc_info)

    with open(path, "wb") as out:
        pdf.write(out)
|
|
|
|
|
|
def _make_pdf_without_metadata(path: str) -> None:
    """Write a single-page PDF to ``path`` without adding any metadata."""
    pdf = PdfWriter()
    pdf.add_blank_page(width=100, height=100)

    with open(path, "wb") as out:
        pdf.write(out)
|
|
|
|
|
|
def test_pdf_con_metadatos_devuelve_titulo_autor_paginas(tmp_path):
    """A PDF with metadata yields its title, author and page count."""
    pdf_path = str(tmp_path / "withmeta.pdf")
    _make_pdf_with_metadata(pdf_path)

    result = extract_pdf_metadata(pdf_path)

    assert result["error"] is None

    # String fields must round-trip exactly as written by the helper.
    expected_text_fields = (
        ("title", "Documento OSINT"),
        ("author", "Enmanuel G."),
        ("creator", "PyTestRig"),
        ("producer", "pypdf"),
    )
    for field, value in expected_text_fields:
        assert result[field] == value

    assert result["num_pages"] == 2
    assert result["raw"]  # must not be empty
|
|
|
|
|
|
def test_pdf_sin_doc_info_devuelve_none_sin_petar(tmp_path):
    """A PDF without doc info yields None fields instead of blowing up."""
    pdf_path = str(tmp_path / "nometa.pdf")
    _make_pdf_without_metadata(pdf_path)

    result = extract_pdf_metadata(pdf_path)

    assert result["error"] is None
    assert result["num_pages"] == 1
    # Fields that were never written must be reported as None.
    for field in ("title", "author"):
        assert result[field] is None
|
|
|
|
|
|
def test_fechas_parseables_en_iso_8601(tmp_path):
    """Parseable dates are returned as ISO 8601 strings.

    Both the creation and the modification date are checked for the
    expected calendar day and for the ``T`` date/time separator, so a
    bare date string in either field fails the test.
    """
    p = str(tmp_path / "dates.pdf")
    _make_pdf_with_metadata(p)

    meta = extract_pdf_metadata(p)

    # pypdf parses D:YYYYMMDDHHmmSS into a datetime; isoformat() carries 'T'.
    assert meta["creation_date"] is not None
    assert "2024-03-11" in meta["creation_date"]
    assert "T" in meta["creation_date"]

    assert meta["mod_date"] is not None
    assert "2024-03-12" in meta["mod_date"]
    # Same ISO 8601 separator check as creation_date (previously missing).
    assert "T" in meta["mod_date"]
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this module directly, without pytest: a temporary
    # directory stands in for the tmp_path fixture.
    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        for check in (
            test_pdf_con_metadatos_devuelve_titulo_autor_paginas,
            test_pdf_sin_doc_info_devuelve_none_sin_petar,
            test_fechas_parseables_en_iso_8601,
        ):
            check(scratch_dir)
    print("Todos los tests pasaron.")
|