5a324f6554
Infra: cache_to_file, cache_to_sqlite, http_download_file, http_get_json, http_post_json, read_file_with_encoding, safe_extract_zip, scan_directory, setup_logger, normalize_zip_filenames. Tipos: 30+ tipos core (agent_action, context, task, message, parse_result...), 6 tipos datascience (entity_candidate, extraction_result...), 2 tipos infra. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
"""Descarga de archivos en streaming — HTTP client sin dependencias externas."""
|
|
|
|
import os
|
|
import urllib.error
|
|
import urllib.request
|
|
|
|
|
|
def http_download_file(
|
|
url: str,
|
|
dest_path: str,
|
|
headers: dict[str, str] | None = None,
|
|
timeout: float = 120.0,
|
|
chunk_size: int = 8192,
|
|
) -> dict:
|
|
"""Descarga un archivo por HTTP en streaming (sin cargar todo en memoria).
|
|
|
|
Crea los directorios intermedios si no existen. Si el archivo destino
|
|
ya existe lo sobreescribe. La descarga se hace en chunks para evitar
|
|
consumo de memoria excesivo con archivos grandes.
|
|
|
|
Args:
|
|
url: URL del archivo a descargar.
|
|
dest_path: Ruta local destino donde guardar el archivo.
|
|
headers: Headers HTTP adicionales.
|
|
timeout: Segundos maximo de espera para la conexion (default 120).
|
|
chunk_size: Tamano de cada chunk en bytes (default 8192).
|
|
|
|
Returns:
|
|
dict con campos ``path`` (str), ``size_bytes`` (int) y
|
|
``content_type`` (str).
|
|
|
|
Raises:
|
|
RuntimeError: Si el status HTTP es >= 400.
|
|
"""
|
|
req = urllib.request.Request(url, headers=headers or {}, method="GET")
|
|
|
|
os.makedirs(os.path.dirname(os.path.abspath(dest_path)), exist_ok=True)
|
|
|
|
try:
|
|
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
content_type: str = resp.headers.get("Content-Type", "")
|
|
size_bytes = 0
|
|
with open(dest_path, "wb") as f:
|
|
while True:
|
|
chunk = resp.read(chunk_size)
|
|
if not chunk:
|
|
break
|
|
f.write(chunk)
|
|
size_bytes += len(chunk)
|
|
except urllib.error.HTTPError as e:
|
|
short_url = url[:100] if len(url) > 100 else url
|
|
raise RuntimeError(
|
|
f"http_download_file: HTTP {e.code} at {short_url!r}"
|
|
) from e
|
|
|
|
return {
|
|
"path": dest_path,
|
|
"size_bytes": size_bytes,
|
|
"content_type": content_type,
|
|
}
|