"""Pure EDA helper: cardinality metrics block from a `summarize_categorical` output.

Part of the `eda` capability group. Consumes the per-column dict produced by
``summarize_categorical`` (for a single categorical/text column) plus the total
row count of the dataset and derives render-ready cardinality metrics: distinct
ratio, normalized entropy, singleton count, and the ``id_like`` / ``dominated``
flags.

It neither recomputes the entropy nor reimplements ``summarize_categorical``;
it only reads that function's output. Dict-no-throw style of the `eda` group:
it never raises. Missing or malformed inputs yield ``None``/``False``/``0`` for
the affected keys, never an exception. Stdlib only (``math``).
"""

from math import isfinite, log2


def _num(value):
    """Return ``value`` unchanged if it is a real (non-bool) number, else ``None``.

    ``bool`` is rejected on purpose: in Python ``True`` is an ``int`` but it is
    never a meaningful count/ratio here. ``NaN``/``inf`` floats are still
    returned as-is (they are ``float`` instances); callers that do arithmetic
    on the result must check finiteness themselves.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    return None


def _is_finite(value):
    """Return ``True`` if ``value`` is a number usable as a finite float.

    ``None``, ``NaN``, ``+/-inf`` and integers too large to be converted to a
    float (``isfinite`` raises ``OverflowError`` for those) all yield ``False``.
    """
    if value is None:
        return False
    try:
        return isfinite(value)
    except (OverflowError, TypeError, ValueError):
        return False


def categorical_cardinality_block(cat: dict, n_rows: int) -> dict:
    """Derive cardinality metrics for one categorical column.

    Args:
        cat: The per-column dict produced by ``summarize_categorical`` for a
            single categorical/text column. Expected (all optional, read
            defensively) keys: ``top`` (list of ``{value, count, pct}``),
            ``mode``, ``mode_pct``, ``n_distinct``, ``entropy`` (Shannon, bits),
            ``imbalance``, ``len_min``, ``len_mean``, ``len_max``. ``None`` or a
            non-dict is treated as ``{}``.
        n_rows: Total number of rows in the dataset (used for ``pct_distinct``).

    Returns:
        Dict with exactly these keys, every one always present:
        ``n_distinct``, ``n_rows``, ``pct_distinct``, ``entropy``,
        ``entropy_max``, ``entropy_norm``, ``mode``, ``mode_pct``,
        ``imbalance``, ``n_singletons``, ``n_singletons_partial``, ``len_min``,
        ``len_mean``, ``len_max``, ``id_like``, ``dominated``. Values are
        ``None``/``False`` when not derivable; the function never raises.

        Passthrough keys keep whatever numeric value was supplied (including
        ``NaN``/``inf``). Derived metrics (``pct_distinct``, ``entropy_max``,
        ``entropy_norm``, a ``mode_pct`` taken from ``top``) and the
        ``id_like``/``dominated`` flags are only computed from finite operands;
        otherwise they are ``None``/``False``.
    """
    cat = cat if isinstance(cat, dict) else {}

    # --- passthroughs (numeric-validated, type preserved) ---
    n_distinct = _num(cat.get("n_distinct"))
    n_rows_out = _num(n_rows)
    entropy = _num(cat.get("entropy"))
    imbalance = _num(cat.get("imbalance"))
    len_min = _num(cat.get("len_min"))
    len_mean = _num(cat.get("len_mean"))
    len_max = _num(cat.get("len_max"))
    mode = cat.get("mode")  # any value (or None); passthrough as-is

    # Finite operands only: NaN/inf would fabricate metrics, and an int too
    # large for a float would raise OverflowError on true division.
    n_distinct_ok = _is_finite(n_distinct)

    # --- pct_distinct ---
    pct_distinct = None
    if n_distinct_ok and _is_finite(n_rows_out) and n_rows_out != 0:
        ratio_pct = n_distinct / n_rows_out * 100.0
        # Float arithmetic overflows silently to inf; do not report that.
        if isfinite(ratio_pct):
            pct_distinct = ratio_pct

    # --- entropy_max = log2(n_distinct) ---
    if not n_distinct_ok:
        entropy_max = None
    elif n_distinct > 1:
        entropy_max = log2(n_distinct)
    else:  # n_distinct <= 1: a single (or no) category carries no information
        entropy_max = 0.0

    # --- entropy_norm = entropy / entropy_max, clipped to [0, 1] ---
    # A NaN entropy must not reach min()/max(): min(1.0, nan) returns 1.0.
    if entropy_max is not None and entropy_max > 0 and _is_finite(entropy):
        entropy_norm = max(0.0, min(1.0, entropy / entropy_max))
    else:
        entropy_norm = None

    # --- mode_pct: prefer cat['mode_pct']; else derive from top[0].pct ---
    # cat['mode_pct'] is taken as-is (assumed to already be on a 0-100 scale;
    # that is not verified here).
    mode_pct = _num(cat.get("mode_pct"))
    top = cat.get("top")
    has_top = isinstance(top, (list, tuple)) and len(top) > 0
    if mode_pct is None and has_top:
        first = top[0]
        if isinstance(first, dict):
            first_pct = _num(first.get("pct"))
            if _is_finite(first_pct):
                # Normalize to 0-100: a fraction (<= 1) becomes a percentage.
                mode_pct = first_pct * 100.0 if first_pct <= 1 else first_pct

    # --- singletons (count == 1) within the visible top ---
    if has_top:
        n_singletons = sum(
            1
            for item in top
            if isinstance(item, dict) and _num(item.get("count")) == 1
        )
    else:
        n_singletons = None

    # The singleton count only covers the visible top; there may be more
    # distinct values (and thus more singletons) outside it.
    top_len = len(top) if has_top else 0
    n_singletons_partial = bool(n_distinct is not None and n_distinct > top_len)

    # --- derived flags ---
    # pct_distinct is None or finite at this point; mode_pct may be a
    # passthrough inf, which must not count as "dominated".
    id_like = pct_distinct is not None and pct_distinct >= 99.0
    dominated = _is_finite(mode_pct) and mode_pct >= 90.0

    return {
        "n_distinct": n_distinct,
        "n_rows": n_rows_out,
        "pct_distinct": pct_distinct,
        "entropy": entropy,
        "entropy_max": entropy_max,
        "entropy_norm": entropy_norm,
        "mode": mode,
        "mode_pct": mode_pct,
        "imbalance": imbalance,
        "n_singletons": n_singletons,
        "n_singletons_partial": n_singletons_partial,
        "len_min": len_min,
        "len_mean": len_mean,
        "len_max": len_max,
        "id_like": id_like,
        "dominated": dominated,
    }