eb8dbf66a1
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
87 lines
3.0 KiB
Python
87 lines
3.0 KiB
Python
"""Extrae metadatos de un PDF con pypdf (OSINT pasiva sobre documentos propios)."""
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
|
def _iso_or_raw(reader: PdfReader, getter_name: str, raw_value):
    """Return a date as ISO 8601 when pypdf can parse it, else the raw value.

    pypdf exposes `creation_date`/`modification_date` (datetime) in addition
    to the raw `/CreationDate`/`/ModDate` string in `D:YYYYMMDDHHmmSS` format.

    Args:
        reader: object whose `metadata` attribute carries the parsed dates.
        getter_name: name of the attribute to read from `reader.metadata`.
        raw_value: unparsed value used as the fallback.

    Returns:
        The `isoformat()` string of the parsed date when one is available;
        otherwise `str(raw_value)`, or None when `raw_value` is None.
    """
    try:
        parsed = getattr(reader.metadata, getter_name, None)
        if parsed is not None:
            return parsed.isoformat()
    except Exception:
        # Best effort by design: any failure while reading or formatting the
        # parsed date simply falls through to the raw value below.
        pass

    if raw_value is None:
        return None
    return str(raw_value)
|
|
|
|
|
|
def extract_pdf_metadata(pdf_path: str) -> dict:
    """Read the Document Info metadata of a PDF file.

    Opens the PDF with pypdf and collects the standard information-dictionary
    fields (title, author, creator, producer, dates) plus the page count.
    Dates are returned as ISO 8601 when they can be parsed and as their raw
    value otherwise. The function does not raise on ordinary failures: the
    exception is caught, whatever was gathered so far is returned, and the
    `error` field is filled in.

    For an encrypted PDF the empty password is tried. A rejected password is
    reported by `decrypt` through a falsy return value rather than by an
    exception, so that case is recorded explicitly as an `encrypted: ...`
    error instead of surfacing later as an unrelated failure.

    Args:
        pdf_path: path of the PDF file on disk.

    Returns:
        dict with the keys: title, author, creator, producer, creation_date,
        mod_date, num_pages, raw (dict with every Document Info entry as
        strings) and error (None when everything went well, otherwise the
        message of the first failure).
    """
    result = {
        "title": None,
        "author": None,
        "creator": None,
        "producer": None,
        "creation_date": None,
        "mod_date": None,
        "num_pages": None,
        "raw": {},
        "error": None,
    }

    try:
        reader = PdfReader(pdf_path)

        # Encrypted PDF: try to open it with the empty password (common case).
        if getattr(reader, "is_encrypted", False):
            try:
                # decrypt() returns a falsy value when the password is wrong.
                if not reader.decrypt(""):
                    result["error"] = "encrypted: empty password rejected"
            except Exception as exc:
                result["error"] = f"encrypted: {exc}"

        # Keep going even after a decryption problem: return what we can.
        try:
            result["num_pages"] = len(reader.pages)
        except Exception as exc:
            result["error"] = result["error"] or f"pages: {exc}"

        meta = reader.metadata
        if meta is not None:
            for field in ("title", "author", "creator", "producer"):
                value = getattr(meta, field)
                result[field] = str(value) if value is not None else None
            result["creation_date"] = _iso_or_raw(
                reader, "creation_date", meta.get("/CreationDate")
            )
            result["mod_date"] = _iso_or_raw(
                reader, "modification_date", meta.get("/ModDate")
            )
            result["raw"] = {str(k): str(v) for k, v in meta.items()}
    except Exception as exc:
        # Keep the first recorded error; never store an empty message, so
        # callers testing the truthiness of `error` still see the failure.
        result["error"] = result["error"] or str(exc) or type(exc).__name__

    return result
|