eb8dbf66a1
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
108 lines
3.7 KiB
Python
108 lines
3.7 KiB
Python
"""Orquestador OSINT pasivo: escanea metadatos de los attachments de una ficha.
|
|
|
|
Recorre un directorio de attachments (imagenes y PDFs) y extrae sus metadatos
componiendo las funciones atomicas del registro (`extract_exif_metadata`,
`extract_pdf_metadata`). Agrega los puntos GPS y las fechas encontradas para
dar una vista rapida de la huella de metadatos de una persona/organizacion.

Funcion IMPURA: hace I/O sobre el sistema de archivos.
|
|
"""
|
|
|
|
import os
|
|
|
|
from cybersecurity import extract_exif_metadata, extract_pdf_metadata
|
|
|
|
# Lower-case file suffixes (leading dot included) that the scanner treats as
# images and hands to the EXIF extractor.
_IMAGE_EXTS = (
    ".jpg",
    ".jpeg",
    ".png",
    ".heic",
)
# Lower-case file suffixes (leading dot included) that the scanner treats as
# PDFs and hands to the PDF extractor.
_PDF_EXTS = (
    ".pdf",
)
|
|
|
|
|
|
def _iter_files(attachments_dir: str):
    """Yield every file path under ``attachments_dir`` in a deterministic order.

    The tree is walked top-down with ``os.walk``. Inside each directory the
    file names are yielded in sorted order, and sub-directories are descended
    into in sorted order as well, so two scans of the same tree produce the
    same sequence regardless of the order in which the filesystem lists its
    entries.

    Args:
        attachments_dir: root directory to walk recursively.

    Yields:
        ``os.path.join(root, name)`` for each file found.
    """
    for root, dirs, files in os.walk(attachments_dir):
        # A top-down os.walk honours in-place edits of ``dirs``; sorting it
        # here fixes the order in which sub-directories are visited.
        dirs.sort()
        for name in sorted(files):
            yield os.path.join(root, name)
|
|
|
|
|
|
def scan_ficha_attachments_metadata(attachments_dir: str) -> dict:
    """Scan an attachments directory and aggregate the metadata found in it.

    Every file returned by ``_iter_files`` is classified by its lower-cased
    extension: image extensions (``_IMAGE_EXTS``) go through
    ``extract_exif_metadata``, PDF extensions (``_PDF_EXTS``) go through
    ``extract_pdf_metadata``, and anything else is skipped. GPS coordinates
    from images and dates from both images and PDFs are collected into flat
    lists.

    Args:
        attachments_dir: path to an attachments directory, e.g.
            `/home/enmanuel/Obsidian/osint/attachments/personas/<slug>/`.
            It is walked recursively.

    Returns:
        dict with the keys:
          - files: list of {path, type, metadata}, one entry per image/PDF
            file visited. When processing a file raised, its ``metadata`` is
            ``{"error": "<ExceptionName>: <message>"}``.
          - gps_points: list of {file, lat, lon} for images whose metadata
            has both ``gps_lat`` and ``gps_lon`` set (not ``None``).
          - dates: list of dates (as ``str``) taken from the image
            ``datetime`` field and from the PDF creation/modification fields.
          - summary: {n_files, n_images, n_pdfs, n_gps_points, n_dates,
            errors}. ``n_images``/``n_pdfs`` count files whose extractor call
            returned without raising; ``errors`` counts files whose
            processing raised.

    Raises:
        NotADirectoryError: if `attachments_dir` is not an existing directory.
    """
    if not os.path.isdir(attachments_dir):
        raise NotADirectoryError(f"no es un directorio: {attachments_dir}")

    records: list[dict] = []
    coords: list[dict] = []
    found_dates: list[str] = []
    extracted = {"image": 0, "pdf": 0}
    failures = 0

    # Both snake_case and camelCase spellings are probed on PDF metadata.
    pdf_date_keys = ("creation_date", "modification_date", "creationDate", "modDate")

    for file_path in _iter_files(attachments_dir):
        _stem, suffix = os.path.splitext(file_path)
        suffix = suffix.lower()

        if suffix in _IMAGE_EXTS:
            kind = "image"
        elif suffix in _PDF_EXTS:
            kind = "pdf"
        else:
            # Not an attachment type this scanner understands.
            continue

        try:
            if kind == "pdf":
                meta = extract_pdf_metadata(file_path)
                extracted["pdf"] += 1
                for date_key in pdf_date_keys:
                    raw_date = meta.get(date_key)
                    if raw_date:
                        found_dates.append(str(raw_date))
            else:
                meta = extract_exif_metadata(file_path)
                extracted["image"] += 1
                latitude = meta.get("gps_lat")
                longitude = meta.get("gps_lon")
                # Compare against None explicitly so a 0.0 coordinate is kept.
                if not (latitude is None or longitude is None):
                    coords.append(
                        {"file": file_path, "lat": latitude, "lon": longitude}
                    )
                taken_at = meta.get("datetime")
                if taken_at:
                    found_dates.append(str(taken_at))
        except Exception as err:  # noqa: BLE001 - I/O over heterogeneous files
            # Best effort: one unreadable file must not abort the whole scan.
            failures += 1
            meta = {"error": f"{type(err).__name__}: {err}"}

        records.append({"path": file_path, "type": kind, "metadata": meta})

    return {
        "files": records,
        "gps_points": coords,
        "dates": found_dates,
        "summary": {
            "n_files": len(records),
            "n_images": extracted["image"],
            "n_pdfs": extracted["pdf"],
            "n_gps_points": len(coords),
            "n_dates": len(found_dates),
            "errors": failures,
        },
    }
|