# fn_registry/python/functions/infra/read_xlsx.py

"""Lee un archivo Excel (.xlsx) a estructuras en memoria con openpyxl.

Funcion impura: abre un libro Excel y devuelve sus hojas como listas de Python
(headers + rows), no como markdown. Es el espejo en lectura de
`write_xlsx_sheets`: lo que aquella escribe desde un dict {hoja: filas}, esta lo
recupera al mismo shape. Maneja los tipos de celda nativos de Excel (fechas a
ISO 8601, numeros int/float, bool, None) y lee el valor calculado de las
formulas con data_only=True.

No lanza: cualquier fallo (archivo inexistente, hoja inexistente, openpyxl
ausente) se devuelve como dict {"status": "error", "error": "..."}.
"""

import datetime
import os


def read_xlsx(
    path: str,
    sheet: str = None,
    max_rows: int = None,
    header: bool = True,
) -> dict:
    """Read an .xlsx workbook into in-memory structures (headers + rows).

    This function never raises: every failure (empty or invalid path, missing
    file, missing openpyxl, unreadable workbook, unknown sheet, error while
    reading a sheet) is reported through the returned dict.

    Args:
        path: Path to the .xlsx file to read.
        sheet: Name of the single sheet to read. If None (default), ALL
            sheets of the workbook are read.
        max_rows: Maximum number of data rows returned per sheet (the header
            row is not counted when header=True). None (default) means no
            limit.
        header: If True (default), the first row of each sheet is returned
            under "headers" and the remaining rows under "rows". If False,
            "headers" is [] and every row goes into "rows".

    Returns:
        On success:
            {"status": "ok",
             "sheets": {sheet_name: {"headers": [...], "rows": [[...], ...]}}}
        On failure:
            {"status": "error", "error": "<message>"}
        Cell values are converted by `_read_sheet` / `_coerce`.
    """
    if not path:
        return {"status": "error", "error": "path no puede estar vacio"}

    # abspath raises TypeError for arguments that are not str/bytes/PathLike;
    # turn that into an error result to honour the no-raise contract.
    try:
        abs_path = os.path.abspath(path)
    except (TypeError, ValueError) as exc:
        return {"status": "error", "error": f"path invalido: {exc}"}

    if not os.path.exists(abs_path):
        return {"status": "error", "error": f"archivo no encontrado: {abs_path}"}

    # Imported lazily so that a missing dependency becomes an error result
    # instead of an ImportError at module import time.
    try:
        from openpyxl import load_workbook
    except ImportError:  # pragma: no cover - environment dependency
        return {
            "status": "error",
            "error": (
                "openpyxl es requerido para read_xlsx. "
                "Instalar con: cd python && uv add openpyxl"
            ),
        }

    try:
        # data_only=True yields the cached result of formulas rather than the
        # formula text; read_only=True streams rows, which is faster and uses
        # less memory on large workbooks.
        wb = load_workbook(abs_path, data_only=True, read_only=True)
    except Exception as exc:  # noqa: BLE001 - contract: never raise
        return {"status": "error", "error": f"no se pudo abrir el libro: {exc}"}

    try:
        if sheet is not None:
            if sheet not in wb.sheetnames:
                return {
                    "status": "error",
                    "error": (
                        f"hoja '{sheet}' no existe. "
                        f"Hojas disponibles: {wb.sheetnames}"
                    ),
                }
            target = [sheet]
        else:
            target = list(wb.sheetnames)

        sheets = {
            name: _read_sheet(wb[name], max_rows, header) for name in target
        }
        return {"status": "ok", "sheets": sheets}
    except Exception as exc:  # noqa: BLE001 - contract: never raise
        # In read-only mode sheets are parsed lazily, so a damaged sheet only
        # fails here, while its rows are being iterated.
        return {"status": "error", "error": f"no se pudo leer el libro: {exc}"}
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        wb.close()


def _read_sheet(ws, max_rows, header) -> dict:
    """Return one worksheet as {"headers": [...], "rows": [[...], ...]}.

    When `header` is truthy the first row of the sheet (if any) becomes
    "headers"; otherwise "headers" stays []. At most `max_rows` data rows are
    collected unless `max_rows` is None. Every cell goes through `_coerce`.
    """
    row_iter = iter(ws.iter_rows(values_only=True))

    header_cells = []
    if header:
        # Consume the first row as the header; an empty sheet has none.
        top = next(row_iter, None)
        if top is not None:
            header_cells = [_coerce(cell) for cell in top]

    data = []
    for cells in row_iter:
        converted = [_coerce(cell) for cell in cells]
        limit_reached = max_rows is not None and len(data) >= max_rows
        if limit_reached:
            break
        data.append(converted)

    return {"headers": header_cells, "rows": data}


def _coerce(value):
    """Convierte un valor de celda openpyxl a un tipo nativo de Python.

    Reglas: None se conserva; bool/int/float/str se conservan; fechas y
    datetimes se serializan a ISO 8601 (date a YYYY-MM-DD, datetime sin
    componente horario a YYYY-MM-DD, con hora a YYYY-MM-DDTHH:MM:SS); cualquier
    otro tipo se serializa a str.
    """
    if value is None:
        return None
    # bool es subclase de int: comprobarlo antes que int.
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float, str)):
        return value
    if isinstance(value, datetime.datetime):
        if value.hour == 0 and value.minute == 0 and value.second == 0:
            return value.date().isoformat()
        return value.isoformat()
    if isinstance(value, datetime.date):
        return value.isoformat()
    return str(value)


if __name__ == "__main__":  # pragma: no cover - manual smoke test
    # Manual smoke test: write a tiny workbook, then read it back twice
    # (all sheets, and a single sheet limited to one data row).
    import tempfile

    from openpyxl import Workbook

    # A TemporaryDirectory removes the demo workbook when the block exits,
    # instead of leaving a fixed-name file behind in the shared temp dir.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = os.path.join(tmp_dir, "read_xlsx_demo.xlsx")
        wb = Workbook()
        ws = wb.active
        ws.title = "Ventas"
        ws.append(["Producto", "Unidades", "Precio", "Activo"])
        ws.append(["Teclado", 12, 29.99, True])
        ws.append(["Raton", 30, 14.5, False])
        wb.save(tmp)

        print(read_xlsx(tmp))
        print(read_xlsx(tmp, sheet="Ventas", max_rows=1))