venta_web/notebooks/01_exploracion_venta_web.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e76ae307",
   "metadata": {},
   "source": [
    "# Exploracion Venta Web — Aurgi\n",
    "\n",
    "Objetivo: contrastar las consultas y fuentes de datos entre:\n",
    "\n",
    "- **Documents** Metabase: `2`, `3`, `4`, `8` — https://reports.autingo.es/document/{id}\n",
    "- **Dashboard** `734` — *BI Ventas Portfolio Producto* (centros digital / venta web)\n",
    "\n",
    "Queremos descubrir qué queries ejecutan, qué databases/tablas usan y qué solapamiento hay entre ambas vistas."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0d6c35f",
   "metadata": {},
   "source": [
    "## 1. Setup — cliente Metabase\n",
    "\n",
    "Usamos la funcion `MetabaseClient` del registry (`python/functions/metabase/client.py`).\n",
    "La API key vive en `./.env` (gitignored) y tambien en `pass metabase/aurgi-api-key`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "30c7e035",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Conectado a https://reports.autingo.es como api-key-user-e55e1da7-c8ea-4268-9faf-84343fd72150@api-key.invalid (id=301, admin=True)\n"
     ]
    }
   ],
   "source": [
    "import os, json, re\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "# Cargar .env local\n",
    "env_path = Path('.env') if Path('.env').exists() else Path('../.env')\n",
    "for line in env_path.read_text().splitlines():\n",
    "    if line and not line.startswith('#') and '=' in line:\n",
    "        k, v = line.split('=', 1)\n",
    "        os.environ.setdefault(k.strip(), v.strip())\n",
    "\n",
    "# Importar modulo metabase del registry (el kernel startup ya inyecta el sys.path)\n",
    "from metabase import MetabaseClient, metabase_get_dashboard, metabase_get_document, metabase_get_card, metabase_list_databases\n",
    "\n",
    "URL = os.environ['METABASE_URL']\n",
    "KEY = os.environ['METABASE_API_KEY']\n",
    "client = MetabaseClient(URL, KEY)\n",
    "\n",
    "me = client.request('GET', '/api/user/current')\n",
    "print(f\"Conectado a {URL} como {me.get('email')} (id={me.get('id')}, admin={me.get('is_superuser')})\")\n",
    "pd.set_option('display.max_colwidth', 120)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35825d9c",
   "metadata": {},
   "source": [
    "## 2. Databases disponibles\n",
    "\n",
    "Para entender de donde vienen los datos necesitamos el catalogo de databases."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ce7e91e6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>engine</th>\n",
       "      <th>dbname</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>ANJANA_DB</td>\n",
       "      <td>postgres</td>\n",
       "      <td>production</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38</td>\n",
       "      <td>Call Center Analysis</td>\n",
       "      <td>postgres</td>\n",
       "      <td>call-center-analysis</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>39</td>\n",
       "      <td>CGDATA</td>\n",
       "      <td>postgres</td>\n",
       "      <td>cglass-data</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>CuidamosTuCoche.net</td>\n",
       "      <td>mysql</td>\n",
       "      <td>sanzauto_production_web</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>DATACENTRIC_DB_STAGING</td>\n",
       "      <td>postgres</td>\n",
       "      <td>datacentric_dev</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>DCBigQuery</td>\n",
       "      <td>bigquery-cloud-sdk</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>37</td>\n",
       "      <td>Plaza Pilot</td>\n",
       "      <td>postgres</td>\n",
       "      <td>plaza_pilot_prod</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5</td>\n",
       "      <td>PRE STAGING</td>\n",
       "      <td>postgres</td>\n",
       "      <td>async_datacentric_dev</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>40</td>\n",
       "      <td>QA_DB</td>\n",
       "      <td>postgres</td>\n",
       "      <td>qa_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>Sample Database</td>\n",
       "      <td>h2</td>\n",
       "      <td>file:/plugins/sample-database.db;USER=GUEST;PASSWORD=guest</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                    name              engine  \\\n",
       "0   2               ANJANA_DB            postgres   \n",
       "1  38    Call Center Analysis            postgres   \n",
       "2  39                  CGDATA            postgres   \n",
       "3   3     CuidamosTuCoche.net               mysql   \n",
       "4   4  DATACENTRIC_DB_STAGING            postgres   \n",
       "5   6              DCBigQuery  bigquery-cloud-sdk   \n",
       "6  37             Plaza Pilot            postgres   \n",
       "7   5             PRE STAGING            postgres   \n",
       "8  40                   QA_DB            postgres   \n",
       "9   1         Sample Database                  h2   \n",
       "\n",
       "                                                       dbname  \n",
       "0                                                  production  \n",
       "1                                        call-center-analysis  \n",
       "2                                                 cglass-data  \n",
       "3                                     sanzauto_production_web  \n",
       "4                                             datacentric_dev  \n",
       "5                                                         NaN  \n",
       "6                                            plaza_pilot_prod  \n",
       "7                                       async_datacentric_dev  \n",
       "8                                                        qa_1  \n",
       "9  file:/plugins/sample-database.db;USER=GUEST;PASSWORD=guest  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dbs = metabase_list_databases(client)\n",
    "df_dbs = pd.DataFrame([\n",
    "    {'id': d['id'], 'name': d['name'], 'engine': d['engine'], 'dbname': (d.get('details') or {}).get('db') or (d.get('details') or {}).get('dbname')}\n",
    "    for d in dbs\n",
    "])\n",
    "df_dbs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e42dc954",
   "metadata": {},
   "source": [
    "## 3. Dashboard 734 — *BI Ventas Portfolio Producto*\n",
    "\n",
    "Extraemos metadata, tabs, parametros y todas las dashcards con su query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "beefa761",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dashboard 734: BI - VENTAS - PORTFOLIO PRODUCTO\n",
      "Descripcion: -\n",
      "Coleccion: 500 | Tabs: 11 | Dashcards: 94\n",
      "Parameters: ['fecha', 'compa%C3%B1ia', 'centro', 'zona', 'comunidad_autonoma', 'provincia', 'jefe_regional', 'jefe_de_zona', 'rmo', 'tipo', 'categoria', 'subcategoria', 'producto', 'navid_producto', 'centro_1', 'centro_2', 'fecha_inicio', 'fecha_fin', 'temporada', 'marca', 'velocidad', 'ancho', 'carga', 'perfil', 'diametro', 'activo_%2F_no_activo']\n"
     ]
    }
   ],
   "source": [
    "dash = metabase_get_dashboard(client, 734)\n",
    "print(f\"Dashboard {dash['id']}: {dash['name']}\")\n",
    "print(f\"Descripcion: {dash.get('description') or '-'}\")\n",
    "print(f\"Coleccion: {dash.get('collection_id')} | Tabs: {len(dash.get('tabs') or [])} | Dashcards: {len(dash.get('dashcards') or [])}\")\n",
    "print(f\"Parameters: {[p.get('slug') or p.get('name') for p in (dash.get('parameters') or [])]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8b0feb87",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total dashcards (incluye text/heading): 94 | con card real: 91\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dashcard_id</th>\n",
       "      <th>tab</th>\n",
       "      <th>card_id</th>\n",
       "      <th>name</th>\n",
       "      <th>display</th>\n",
       "      <th>type</th>\n",
       "      <th>database_id</th>\n",
       "      <th>has_sql</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3911</td>\n",
       "      <td>Pelicula</td>\n",
       "      <td>5686.0</td>\n",
       "      <td>pelicula_categorias_semana</td>\n",
       "      <td>area</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3908</td>\n",
       "      <td>Portafolio Producto</td>\n",
       "      <td>5740.0</td>\n",
       "      <td>Ventas Totales</td>\n",
       "      <td>scalar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3906</td>\n",
       "      <td>Foto Centros - Compara Semana</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3907</td>\n",
       "      <td>Foto Categorias - Compara Semana</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3914</td>\n",
       "      <td>Portafolio Producto</td>\n",
       "      <td>5673.0</td>\n",
       "      <td>Evolucion de ventas por producto</td>\n",
       "      <td>bar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>5885</td>\n",
       "      <td>Foto Categorias - Compara Semana</td>\n",
       "      <td>7519.0</td>\n",
       "      <td>Objetivo por Categoría</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>6419</td>\n",
       "      <td>5 Min View</td>\n",
       "      <td>7708.0</td>\n",
       "      <td>Margen € N-1</td>\n",
       "      <td>scalar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>6420</td>\n",
       "      <td>5 Min View</td>\n",
       "      <td>7709.0</td>\n",
       "      <td>% Margen</td>\n",
       "      <td>scalar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>6418</td>\n",
       "      <td>5 Min View</td>\n",
       "      <td>7707.0</td>\n",
       "      <td>Margen €</td>\n",
       "      <td>scalar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>6421</td>\n",
       "      <td>5 Min View</td>\n",
       "      <td>7710.0</td>\n",
       "      <td>% Margen vs N-1</td>\n",
       "      <td>scalar</td>\n",
       "      <td>None</td>\n",
       "      <td>6.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>94 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    dashcard_id                               tab  card_id  \\\n",
       "0          3911                          Pelicula   5686.0   \n",
       "1          3908               Portafolio Producto   5740.0   \n",
       "2          3906     Foto Centros - Compara Semana      NaN   \n",
       "3          3907  Foto Categorias - Compara Semana      NaN   \n",
       "4          3914               Portafolio Producto   5673.0   \n",
       "..          ...                               ...      ...   \n",
       "89         5885  Foto Categorias - Compara Semana   7519.0   \n",
       "90         6419                        5 Min View   7708.0   \n",
       "91         6420                        5 Min View   7709.0   \n",
       "92         6418                        5 Min View   7707.0   \n",
       "93         6421                        5 Min View   7710.0   \n",
       "\n",
       "                                name display  type  database_id  has_sql  \n",
       "0         pelicula_categorias_semana    area  None          6.0    False  \n",
       "1                     Ventas Totales  scalar  None          6.0    False  \n",
       "2                                NaN     NaN  None          NaN    False  \n",
       "3                                NaN     NaN  None          NaN    False  \n",
       "4   Evolucion de ventas por producto     bar  None          6.0    False  \n",
       "..                               ...     ...   ...          ...      ...  \n",
       "89            Objetivo por Categoría   table  None          6.0    False  \n",
       "90                      Margen € N-1  scalar  None          6.0    False  \n",
       "91                          % Margen  scalar  None          6.0    False  \n",
       "92                          Margen €  scalar  None          6.0    False  \n",
       "93                   % Margen vs N-1  scalar  None          6.0    False  \n",
       "\n",
       "[94 rows x 8 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Resumen por tab\n",
    "tabs = {t['id']: t['name'] for t in (dash.get('tabs') or [])}\n",
    "\n",
    "rows = []\n",
    "for dc in dash.get('dashcards') or []:\n",
    "    card = dc.get('card') or {}\n",
    "    dq = card.get('dataset_query') or {}\n",
    "    rows.append({\n",
    "        'dashcard_id': dc['id'],\n",
    "        'tab': tabs.get(dc.get('dashboard_tab_id'), '-'),\n",
    "        'card_id': card.get('id'),\n",
    "        'name': card.get('name'),\n",
    "        'display': card.get('display'),\n",
    "        'type': dq.get('type'),\n",
    "        'database_id': dq.get('database'),\n",
    "        'has_sql': bool((dq.get('native') or {}).get('query')),\n",
    "    })\n",
    "df_cards = pd.DataFrame(rows)\n",
    "print(f\"Total dashcards (incluye text/heading): {len(df_cards)} | con card real: {df_cards['card_id'].notna().sum()}\")\n",
    "df_cards"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a295891",
   "metadata": {},
   "source": [
    "### 3.1 Queries nativas (SQL) del dashboard\n",
    "\n",
    "Mostramos solo las cards que son SQL nativo (no MBQL), porque son las que revelan tablas directamente."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "13cace9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def card_query(dc):\n",
    "    card = dc.get('card') or {}\n",
    "    dq = card.get('dataset_query') or {}\n",
    "    if dq.get('type') == 'native':\n",
    "        return (dq.get('native') or {}).get('query')\n",
    "    return None\n",
    "\n",
    "for dc in dash.get('dashcards') or []:\n",
    "    q = card_query(dc)\n",
    "    if not q:\n",
    "        continue\n",
    "    card = dc['card']\n",
    "    print('='*80)\n",
    "    print(f\"card {card['id']} — {card['name']}  [db={card['dataset_query'].get('database')}]\")\n",
    "    print('-'*80)\n",
    "    print(q.strip()[:2000])\n",
    "    if len(q) > 2000:\n",
    "        print(f'... ({len(q)-2000} chars mas)')\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69f88d1d",
   "metadata": {},
   "source": [
    "### 3.2 Tablas referenciadas\n",
    "\n",
    "Heuristica simple: regex sobre `FROM ...` y `JOIN ...` en las queries nativas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a544ae16",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'table'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 21\u001b[39m\n\u001b[32m     17\u001b[39m     \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;28;01min\u001b[39;00m extract_tables(q):\n\u001b[32m     18\u001b[39m         tbl_rows.append({\u001b[33m'card_id'\u001b[39m: card[\u001b[33m'id'\u001b[39m], \u001b[33m'card_name'\u001b[39m: card[\u001b[33m'name'\u001b[39m], \u001b[33m'db'\u001b[39m: card[\u001b[33m'dataset_query'\u001b[39m].get(\u001b[33m'database'\u001b[39m), \u001b[33m'table'\u001b[39m: t})\n\u001b[32m     19\u001b[39m \n\u001b[32m     20\u001b[39m df_tbls = pd.DataFrame(tbl_rows).drop_duplicates()\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m df_tbls.groupby(\u001b[33m'table'\u001b[39m).agg(cards=(\u001b[33m'card_id'\u001b[39m, \u001b[33m'nunique'\u001b[39m), sample_card=(\u001b[33m'card_name'\u001b[39m, \u001b[33m'first'\u001b[39m)).sort_values(\u001b[33m'cards'\u001b[39m, ascending=\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/fn_registry/projects/aurgi/analysis/venta_web/.venv/lib/python3.13/site-packages/pandas/util/_decorators.py:336\u001b[39m, in \u001b[36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    330\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) > num_allow_args:\n\u001b[32m    331\u001b[39m     warnings.warn(\n\u001b[32m    332\u001b[39m         msg.format(arguments=_format_argument_list(allow_args)),\n\u001b[32m    333\u001b[39m         klass,\n\u001b[32m    334\u001b[39m         stacklevel=find_stack_level(),\n\u001b[32m    335\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/fn_registry/projects/aurgi/analysis/venta_web/.venv/lib/python3.13/site-packages/pandas/core/frame.py:10821\u001b[39m, in \u001b[36mDataFrame.groupby\u001b[39m\u001b[34m(self, by, level, as_index, sort, group_keys, observed, dropna)\u001b[39m\n\u001b[32m  10817\u001b[39m \n\u001b[32m  10818\u001b[39m         \u001b[38;5;28;01mif\u001b[39;00m level \u001b[38;5;28;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mand\u001b[39;00m by \u001b[38;5;28;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m  10819\u001b[39m             \u001b[38;5;28;01mraise\u001b[39;00m TypeError(\u001b[33m\"You have to supply one of 'by' and 'level'\"\u001b[39m)\n\u001b[32m  10820\u001b[39m \n\u001b[32m> \u001b[39m\u001b[32m10821\u001b[39m         return DataFrameGroupBy(\n\u001b[32m  10822\u001b[39m             obj=self,\n\u001b[32m  10823\u001b[39m             keys=by,\n\u001b[32m  10824\u001b[39m             level=level,\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/fn_registry/projects/aurgi/analysis/venta_web/.venv/lib/python3.13/site-packages/pandas/core/groupby/groupby.py:1095\u001b[39m, in \u001b[36mGroupBy.__init__\u001b[39m\u001b[34m(self, obj, keys, level, grouper, exclusions, selection, as_index, sort, group_keys, observed, dropna)\u001b[39m\n\u001b[32m   1092\u001b[39m \u001b[38;5;28mself\u001b[39m.dropna = dropna\n\u001b[32m   1094\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m grouper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1095\u001b[39m     grouper, exclusions, obj = \u001b[43mget_grouper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1096\u001b[39m \u001b[43m        \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1097\u001b[39m \u001b[43m        \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1098\u001b[39m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1099\u001b[39m \u001b[43m        \u001b[49m\u001b[43msort\u001b[49m\u001b[43m=\u001b[49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1100\u001b[39m \u001b[43m        \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m=\u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1101\u001b[39m \u001b[43m        \u001b[49m\u001b[43mdropna\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1102\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1104\u001b[39m \u001b[38;5;28mself\u001b[39m.observed = observed\n\u001b[32m   1105\u001b[39m \u001b[38;5;28mself\u001b[39m.obj = obj\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/fn_registry/projects/aurgi/analysis/venta_web/.venv/lib/python3.13/site-packages/pandas/core/groupby/grouper.py:901\u001b[39m, in \u001b[36mget_grouper\u001b[39m\u001b[34m(obj, key, level, sort, observed, validate, dropna)\u001b[39m\n\u001b[32m    899\u001b[39m         in_axis, level, gpr = \u001b[38;5;28;01mFalse\u001b[39;00m, gpr, \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m    900\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m901\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(gpr)\n\u001b[32m    902\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(gpr, Grouper) \u001b[38;5;129;01mand\u001b[39;00m gpr.key \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    903\u001b[39m     \u001b[38;5;66;03m# Add key to exclusions\u001b[39;00m\n\u001b[32m    904\u001b[39m     exclusions.add(gpr.key)\n",
      "\u001b[31mKeyError\u001b[39m: 'table'"
     ]
    }
   ],
   "source": [
    "TABLE_RE = re.compile(r'\\b(?:FROM|JOIN)\\s+([`\"\\[]?[\\w\\.`\"\\]]+)', re.IGNORECASE)\n",
    "\n",
    "def extract_tables(sql: str) -> set:\n",
    "    if not sql:\n",
    "        return set()\n",
    "    # quitar comentarios de linea y multilinea\n",
    "    sql2 = re.sub(r'--[^\\n]*', ' ', sql)\n",
    "    sql2 = re.sub(r'/\\*.*?\\*/', ' ', sql2, flags=re.S)\n",
    "    return {m.strip('`\"[]') for m in TABLE_RE.findall(sql2)}\n",
    "\n",
    "tbl_rows = []\n",
    "for dc in dash.get('dashcards') or []:\n",
    "    q = card_query(dc)\n",
    "    if not q:\n",
    "        continue\n",
    "    card = dc['card']\n",
    "    for t in extract_tables(q):\n",
    "        tbl_rows.append({'card_id': card['id'], 'card_name': card['name'], 'db': card['dataset_query'].get('database'), 'table': t})\n",
    "\n",
    "df_tbls = pd.DataFrame(tbl_rows).drop_duplicates()\n",
    "df_tbls.groupby('table').agg(cards=('card_id', 'nunique'), sample_card=('card_name', 'first')).sort_values('cards', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9750c24f",
   "metadata": {},
   "source": [
    "## 4. Documents 2, 3, 4, 8\n",
    "\n",
    "Los Metabase documents son notas tipo Notion (arbol ProseMirror). Pueden incluir `cardEmbed` que referencia cards existentes. Extraemos el texto plano y las cards embebidas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7b3d0052",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doc_id</th>\n",
       "      <th>name</th>\n",
       "      <th>collection_id</th>\n",
       "      <th>archived</th>\n",
       "      <th>text_chars</th>\n",
       "      <th>embedded_card_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>SO↔NAV — Resumen ejecutivo</td>\n",
       "      <td>600</td>\n",
       "      <td>False</td>\n",
       "      <td>833</td>\n",
       "      <td>[7749, 7750, 7751]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>SO↔NAV — Análisis de categorías y márgenes</td>\n",
       "      <td>600</td>\n",
       "      <td>False</td>\n",
       "      <td>1456</td>\n",
       "      <td>[7762, 7763, 7764, 7765, 7766]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>SO↔NAV — Diagnóstico de trazabilidad</td>\n",
       "      <td>600</td>\n",
       "      <td>False</td>\n",
       "      <td>1453</td>\n",
       "      <td>[7755, 7756, 7757]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8</td>\n",
       "      <td>WSWEB (Cubo) vs GA4/Supply Orders — ¿son lo mismo? (2025)</td>\n",
       "      <td>600</td>\n",
       "      <td>False</td>\n",
       "      <td>3025</td>\n",
       "      <td>[7845, 7846, 7847, 7848]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   doc_id                                                       name  \\\n",
       "0       2                                 SO↔NAV — Resumen ejecutivo   \n",
       "1       3                 SO↔NAV — Análisis de categorías y márgenes   \n",
       "2       4                       SO↔NAV — Diagnóstico de trazabilidad   \n",
       "3       8  WSWEB (Cubo) vs GA4/Supply Orders — ¿son lo mismo? (2025)   \n",
       "\n",
       "   collection_id  archived  text_chars               embedded_card_ids  \n",
       "0            600     False         833              [7749, 7750, 7751]  \n",
       "1            600     False        1456  [7762, 7763, 7764, 7765, 7766]  \n",
       "2            600     False        1453              [7755, 7756, 7757]  \n",
       "3            600     False        3025        [7845, 7846, 7847, 7848]  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DOC_IDS = [2, 3, 4, 8]\n",
    "\n",
    "def walk_prosemirror(node, out_text, out_embeds):\n",
    "    \"\"\"Recorre recursivamente el arbol ProseMirror.\n",
    "    Acumula texto plano y nodos cardEmbed con su card_id.\n",
    "    \"\"\"\n",
    "    if not isinstance(node, dict):\n",
    "        return\n",
    "    t = node.get('type')\n",
    "    if t == 'text':\n",
    "        out_text.append(node.get('text', ''))\n",
    "    elif t == 'cardEmbed':\n",
    "        attrs = node.get('attrs') or {}\n",
    "        out_embeds.append(attrs.get('id'))\n",
    "    # Tambien `smartLink` apunta a entidades Metabase\n",
    "    elif t == 'smartLink':\n",
    "        attrs = node.get('attrs') or {}\n",
    "        if attrs.get('model') in ('card', 'dataset'):\n",
    "            out_embeds.append(attrs.get('entityId') or attrs.get('id'))\n",
    "    for child in node.get('content') or []:\n",
    "        walk_prosemirror(child, out_text, out_embeds)\n",
    "\n",
    "documents = {}\n",
    "for did in DOC_IDS:\n",
    "    try:\n",
    "        documents[did] = metabase_get_document(client, did)\n",
    "    except Exception as e:\n",
    "        print(f'doc {did}: ERROR {e}')\n",
    "\n",
    "doc_rows = []\n",
    "for did, d in documents.items():\n",
    "    text, embeds = [], []\n",
    "    walk_prosemirror(d.get('document') or {}, text, embeds)\n",
    "    doc_rows.append({\n",
    "        'doc_id': did,\n",
    "        'name': d.get('name'),\n",
    "        'collection_id': d.get('collection_id'),\n",
    "        'archived': d.get('archived'),\n",
    "        'text_chars': sum(len(t) for t in text),\n",
    "        'embedded_card_ids': sorted({e for e in embeds if e}),\n",
    "    })\n",
    "df_docs = pd.DataFrame(doc_rows)\n",
    "df_docs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e8e42ab",
   "metadata": {},
   "source": [
    "### 4.1 Preview del texto de cada document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0db96337",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "================================================================================\n",
      "doc 2 — SO↔NAV — Resumen ejecutivo\n",
      "--------------------------------------------------------------------------------\n",
      "Supply Orders ↔ Facturas Navision Análisis del cruce entre  supply_orders  (pedidos de suministro) y  facturas de Navision . Periodo: enero 2025 en adelante. Datos actualizados automáticamente desde BigQuery. Cifras clave 558.571  supply orders totales 234.625  web (42%) +  300.643  centro (54%) + 23K sin origen 360.928  facturas NAV vinculadas (64.6% trazabilidad global) Revenue web:  27.4M€  | Margen web:  11.3M€  (41.3%) Evolución mensual Tendencia de volumen y trazabilidad Web vs Centro: Desglose por canal aurgi.com domina en revenue (17.8M€, 46.3% margen). Amazon segundo con 3.6M€ pero margen más bajo (34.3%). motortown.es tiene el mejor margen (52.3%) pero volumen pequeño. Revenue semanal por canal Evolución temporal para detectar tendencias y estacionalidad: Documento generado el 2026-04-14. Colección: Claude / Supply Orders ↔ Facturas NAV.\n",
      "\n",
      "================================================================================\n",
      "doc 3 — SO↔NAV — Análisis de categorías y márgenes\n",
      "--------------------------------------------------------------------------------\n",
      "Categorías ecommerce y márgenes Análisis de supply orders web por familia ecommerce, subfamilia y canal. Solo web porque los supply orders de centro  no tienen precios  (sale_price/purchase_price son NULL). Revenue por familia ecommerce El 53% del revenue web (14.4M€) corresponde a productos  sin categoría ecommerce  — probablemente neumáticos y productos legacy sin clasificar. mundo-coche aporta 6M€ con 38.2% de margen. Volumen por familia ecommerce Número de supply orders y facturas NAV por familia. (sin cat) y mundo-coche lideran también en volumen: % Margen por familia ecommerce Ranking de familias por porcentaje de margen. Solo familias con más de 1.000€ de revenue.  mundo-aurgi  tiene margen de solo 2.9% (efecto rappel — el margen real incluye rappel de proveedores que no aparece en sale/purchase price). Drill-down: subfamilias Desglose dentro de cada familia para identificar qué productos específicos generan más valor: Matriz canal x familia Cruce de canal de venta con familia ecommerce. Permite identificar qué categorías rinden mejor en cada marketplace: Hallazgos clave (sin cat)  domina en revenue pero es un hueco de datos — clasificar esos productos mejoraría la visibilid\n",
      "... (283 chars mas)\n",
      "\n",
      "================================================================================\n",
      "doc 4 — SO↔NAV — Diagnóstico de trazabilidad\n",
      "--------------------------------------------------------------------------------\n",
      "Diagnóstico: supply orders sin factura NAV De los 558K supply orders,  ~140K (25%)  no tienen factura Navision vinculada. Este documento analiza por qué y qué se puede hacer. Distribución por status 76.793  pending — pedidos aún en proceso, no esperan factura todavía 31.806  confirmed — deberían tener factura, posible gap 29.652  partial — parcialmente servidos, factura puede llegar 1.803  cancelled — cancelados, nunca tendrán factura 45  returned — devueltos Los  pending  son el grueso (55%): son pedidos en curso que aún no se han facturado. Es esperable. Los  confirmed sin factura  (31.8K) son el gap real a investigar. Supply orders sin factura por canal y status Tipo de envío y cobertura order_id El order_id se recupera con 3 fuentes en cascada: so.tpv_orders_order_id  — directo (centro) precaweb.order_id  — via tpv_precawebs_servicerequestjob (web cita/C&C) tpv_orders_invoice.order_id  — via factura: logistic.invoice_number → inv.nav_id (web envío) Cobertura combinada:  99.8% . Solo el 0.2% no tiene order_id por ninguna vía. Métodos de pago La distribución de métodos de pago NAV confirma los canales de entrada: Acciones recomendadas Investigar los 31.8K confirmed sin factura — \n",
      "... (291 chars mas)\n",
      "\n",
      "================================================================================\n",
      "doc 8 — WSWEB (Cubo) vs GA4/Supply Orders — ¿son lo mismo? (2025)\n",
      "--------------------------------------------------------------------------------\n",
      "WSWEB en Cubo_Ventas vs GA4/Supply Orders: comparativa Cuando el dashboard 734 (BI Ventas Portfolio) filtra por compañía  WSWEB , ¿está mostrando lo mismo que vemos en GA4 y en supply_orders? La respuesta corta:  no exactamente . Aquí se explica por qué y cómo se mapean. Las 3 fuentes de datos de ventas web 1. Cubo_Ventas_Calculado  — tabla:  anjana_bi_datamart.Cubo_Ventas_Calculado    Filtro digital:  Dim_NombreDimGlobal2 = 'WSWEB'    Revenue 2025:  5.879.313€  en 158.617 líneas 2. NAV directo  — tablas:  anjana_sales_invoice_header  +  anjana_sales_invoice_line    Filtro:  location_code IN ('18','19','405','421','422')  (centros web)    Revenue 2025:  15.878.883€  en 315.173 líneas 3. Supply Orders → NAV  — tablas:  supply_orders  →  logistic_orders  → NAV    Filtro:  service_request_id IS NOT NULL    Revenue 2025:  7.235.399€  en ~160K líneas ¿Por qué son diferentes? Cubo (5.88M€)  es el MÁS restrictivo. El Cubo_Ventas_Calculado es un datamart preprocesado que aplica filtros de negocio: solo movimientos tipo 'Venta', excluye devoluciones, ajustes y ciertos tipos de ticket. Además, su campo  Dim_NombreDimGlobal2  agrupa por 'compañía comercial' (quién gestiona la venta), no por c\n",
      "... (1910 chars mas)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for did, d in documents.items():\n",
    "    text, _ = [], []\n",
    "    walk_prosemirror(d.get('document') or {}, text, _)\n",
    "    plain = ' '.join(text).strip()\n",
    "    print('='*80)\n",
    "    print(f\"doc {did} — {d.get('name')}\")\n",
    "    print('-'*80)\n",
    "    print(plain[:1200] or '(sin texto plano, solo embeds)')\n",
    "    if len(plain) > 1200:\n",
    "        print(f'... ({len(plain)-1200} chars mas)')\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff843977",
   "metadata": {},
   "source": [
    "### 4.2 Queries de las cards embebidas en los documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a24ef2bc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>card_ref</th>\n",
       "      <th>docs</th>\n",
       "      <th>name</th>\n",
       "      <th>display</th>\n",
       "      <th>type</th>\n",
       "      <th>database_id</th>\n",
       "      <th>archived</th>\n",
       "      <th>collection_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7749</td>\n",
       "      <td>[2]</td>\n",
       "      <td>SO↔NAV — Evolución mensual por origen</td>\n",
       "      <td>line</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7750</td>\n",
       "      <td>[2]</td>\n",
       "      <td>SO↔NAV — Desglose por canal</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7751</td>\n",
       "      <td>[2]</td>\n",
       "      <td>SO↔NAV — Revenue semanal web por canal</td>\n",
       "      <td>line</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7764</td>\n",
       "      <td>[3]</td>\n",
       "      <td>SO↔NAV — Revenue por familia ecommerce</td>\n",
       "      <td>bar</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7765</td>\n",
       "      <td>[3]</td>\n",
       "      <td>SO↔NAV — Volumen por familia ecommerce</td>\n",
       "      <td>bar</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>7766</td>\n",
       "      <td>[3]</td>\n",
       "      <td>SO↔NAV — % Margen por familia ecommerce</td>\n",
       "      <td>bar</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7762</td>\n",
       "      <td>[3]</td>\n",
       "      <td>SO↔NAV — Subfamilias ecommerce (web)</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7763</td>\n",
       "      <td>[3]</td>\n",
       "      <td>SO↔NAV — Margen canal x familia (web)</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>7757</td>\n",
       "      <td>[4]</td>\n",
       "      <td>SO↔NAV — Sin factura NAV (diagnóstico)</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7756</td>\n",
       "      <td>[4]</td>\n",
       "      <td>SO↔NAV — Tipo de envío</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>7755</td>\n",
       "      <td>[4]</td>\n",
       "      <td>SO↔NAV — Métodos de pago NAV</td>\n",
       "      <td>bar</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>7845</td>\n",
       "      <td>[8]</td>\n",
       "      <td>WSWEB vs GA4→NAV — Totales 2025</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>7847</td>\n",
       "      <td>[8]</td>\n",
       "      <td>WSWEB — Centros web en Cubo vs psql_dcpublic 2025</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7846</td>\n",
       "      <td>[8]</td>\n",
       "      <td>WSWEB — Categorías en Cubo_Ventas 2025</td>\n",
       "      <td>table</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>7848</td>\n",
       "      <td>[8]</td>\n",
       "      <td>WSWEB vs SO→NAV — Evolución mensual 2025</td>\n",
       "      <td>line</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    card_ref docs                                               name display  \\\n",
       "0       7749  [2]              SO↔NAV — Evolución mensual por origen    line   \n",
       "1       7750  [2]                        SO↔NAV — Desglose por canal   table   \n",
       "2       7751  [2]             SO↔NAV — Revenue semanal web por canal    line   \n",
       "3       7764  [3]             SO↔NAV — Revenue por familia ecommerce     bar   \n",
       "4       7765  [3]             SO↔NAV — Volumen por familia ecommerce     bar   \n",
       "5       7766  [3]            SO↔NAV — % Margen por familia ecommerce     bar   \n",
       "6       7762  [3]               SO↔NAV — Subfamilias ecommerce (web)   table   \n",
       "7       7763  [3]              SO↔NAV — Margen canal x familia (web)   table   \n",
       "8       7757  [4]             SO↔NAV — Sin factura NAV (diagnóstico)   table   \n",
       "9       7756  [4]                             SO↔NAV — Tipo de envío   table   \n",
       "10      7755  [4]                       SO↔NAV — Métodos de pago NAV     bar   \n",
       "11      7845  [8]                    WSWEB vs GA4→NAV — Totales 2025   table   \n",
       "12      7847  [8]  WSWEB — Centros web en Cubo vs psql_dcpublic 2025   table   \n",
       "13      7846  [8]             WSWEB — Categorías en Cubo_Ventas 2025   table   \n",
       "14      7848  [8]           WSWEB vs SO→NAV — Evolución mensual 2025    line   \n",
       "\n",
       "    type  database_id  archived  collection_id  \n",
       "0   None            6     False            600  \n",
       "1   None            6     False            600  \n",
       "2   None            6     False            600  \n",
       "3   None            6     False            600  \n",
       "4   None            6     False            600  \n",
       "5   None            6     False            600  \n",
       "6   None            6     False            600  \n",
       "7   None            6     False            600  \n",
       "8   None            6     False            600  \n",
       "9   None            6     False            600  \n",
       "10  None            6     False            600  \n",
       "11  None            6     False            600  \n",
       "12  None            6     False            600  \n",
       "13  None            6     False            600  \n",
       "14  None            6     False            600  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recolectar todos los card_id embebidos\n",
    "embed_map = {}  # card_id -> [doc_id,...]\n",
    "for did, d in documents.items():\n",
    "    text, embeds = [], []\n",
    "    walk_prosemirror(d.get('document') or {}, text, embeds)\n",
    "    for cid in embeds:\n",
    "        if cid is None:\n",
    "            continue\n",
    "        embed_map.setdefault(cid, []).append(did)\n",
    "\n",
    "# Normalizar a int cuando sea posible (cardEmbed usa int, smartLink a veces UUID)\n",
    "cards_info = []\n",
    "for cid, docs_using in embed_map.items():\n",
    "    if not isinstance(cid, int):\n",
    "        cards_info.append({'card_ref': cid, 'docs': docs_using, 'status': 'no-int-ref (posible smartLink UUID)'})\n",
    "        continue\n",
    "    try:\n",
    "        c = metabase_get_card(client, cid)\n",
    "        dq = c.get('dataset_query') or {}\n",
    "        cards_info.append({\n",
    "            'card_ref': cid,\n",
    "            'docs': docs_using,\n",
    "            'name': c.get('name'),\n",
    "            'display': c.get('display'),\n",
    "            'type': dq.get('type'),\n",
    "            'database_id': dq.get('database'),\n",
    "            'archived': c.get('archived'),\n",
    "            'collection_id': c.get('collection_id'),\n",
    "        })\n",
    "    except Exception as e:\n",
    "        cards_info.append({'card_ref': cid, 'docs': docs_using, 'status': f'error {e}'})\n",
    "\n",
    "df_doc_cards = pd.DataFrame(cards_info)\n",
    "df_doc_cards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e58916e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump de queries de las cards embebidas (nativas primero)\n",
    "for cid, docs_using in embed_map.items():\n",
    "    if not isinstance(cid, int):\n",
    "        continue\n",
    "    try:\n",
    "        c = metabase_get_card(client, cid)\n",
    "    except Exception:\n",
    "        continue\n",
    "    dq = c.get('dataset_query') or {}\n",
    "    if dq.get('type') != 'native':\n",
    "        continue\n",
    "    q = (dq.get('native') or {}).get('query', '')\n",
    "    print('='*80)\n",
    "    print(f\"card {cid} — {c['name']}  [db={dq.get('database')}]  (en docs {docs_using})\")\n",
    "    print('-'*80)\n",
    "    print(q.strip()[:2500])\n",
    "    if len(q) > 2500:\n",
    "        print(f'... ({len(q)-2500} chars mas)')\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a1fe9a3",
   "metadata": {},
   "source": [
    "## 5. Comparacion Dashboard 734 vs Documents\n",
    "\n",
    "Overlap de cards y tablas entre ambas vistas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b4fe1167",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cards unicas en dashboard 734: 85\n",
      "Cards unicas en documents [2, 3, 4, 8]: 15\n",
      "Overlap (mismas cards en ambos):  0  -> []\n",
      "Solo en dashboard: 85\n",
      "Solo en documents: 15  -> [7749, 7750, 7751, 7755, 7756, 7757, 7762, 7763, 7764, 7765, 7766, 7845, 7846, 7847, 7848]\n"
     ]
    }
   ],
   "source": [
    "dash_card_ids = set(df_cards['card_id'].dropna().astype(int).tolist())\n",
    "doc_card_ids = {c for c in embed_map if isinstance(c, int)}\n",
    "\n",
    "overlap = dash_card_ids & doc_card_ids\n",
    "only_dash = dash_card_ids - doc_card_ids\n",
    "only_docs = doc_card_ids - dash_card_ids\n",
    "\n",
    "print(f\"Cards unicas en dashboard 734: {len(dash_card_ids)}\")\n",
    "print(f\"Cards unicas en documents {DOC_IDS}: {len(doc_card_ids)}\")\n",
    "print(f\"Overlap (mismas cards en ambos):  {len(overlap)}  -> {sorted(overlap)}\")\n",
    "print(f\"Solo en dashboard: {len(only_dash)}\")\n",
    "print(f\"Solo en documents: {len(only_docs)}  -> {sorted(only_docs)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c62d5192",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Comparar tablas tocadas por ambas vistas\n",
    "def tables_from_card(cid):\n",
    "    try:\n",
    "        c = metabase_get_card(client, cid)\n",
    "    except Exception:\n",
    "        return set()\n",
    "    dq = c.get('dataset_query') or {}\n",
    "    if dq.get('type') != 'native':\n",
    "        return set()\n",
    "    return extract_tables((dq.get('native') or {}).get('query', ''))\n",
    "\n",
    "dash_tables = set(df_tbls['table'])\n",
    "doc_tables = set()\n",
    "for cid in doc_card_ids:\n",
    "    doc_tables |= tables_from_card(cid)\n",
    "\n",
    "print('--- Tablas en dashboard 734 ---')\n",
    "for t in sorted(dash_tables):\n",
    "    print(' ', t)\n",
    "print('--- Tablas en documents ---')\n",
    "for t in sorted(doc_tables):\n",
    "    print(' ', t)\n",
    "print('--- Overlap ---')\n",
    "for t in sorted(dash_tables & doc_tables):\n",
    "    print(' ', t)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8d70438",
   "metadata": {},
   "source": [
    "## 6. Siguientes pasos\n",
    "\n",
    "1. Inspeccionar las queries SQL impresas en secciones 3.1 y 4.2 para entender joins y filtros.\n",
    "2. Para cards MBQL (no nativas) en el dashboard, decodificar `dataset_query.query` para sacar `source-table` y `breakouts`.\n",
    "3. Si hay tablas comunes, correr una query de control contra la DB para validar que ambos lados reportan cifras coherentes.\n",
    "4. Identificar el concepto de **\"centros digital / venta web\"** buscando en `df_cards['name']` y en las clausulas WHERE de las queries."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d70deb35",
   "metadata": {},
   "source": [
    "## 7. Resolver source-table de las cards MBQL\n",
    "\n",
    "Dashboard 734 y cards de documents son todas MBQL sobre DCBigQuery (db=6). Resolvemos los 'source-table' siguiendo 'source-query' o 'source-card' hasta encontrar tabla real, y luego mapeamos table_id -> (schema.name)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "aaa4dcb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DB 6 (DCBigQuery) tablas indexadas: 2830\n",
      "table 184 -> None\n"
     ]
    }
   ],
   "source": [
    "# Cache metadata de DCBigQuery (db 6) para mapear table_id -> schema.name\n",
    "db6 = client.request(\"GET\", \"/api/database/6?include=tables\")\n",
    "table_by_id = {t[\"id\"]: f'{t.get(\"schema\",\"\")}.{t[\"name\"]}'.lstrip(\".\") for t in (db6.get(\"tables\") or [])}\n",
    "print(f'DB 6 ({db6[\"name\"]}) tablas indexadas: {len(table_by_id)}')\n",
    "print(f'table 184 -> {table_by_id.get(184)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3266674a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "table 184 -> db_id=2 schema=public name=supply_orders\n",
      "display_name: Supply Orders\n"
     ]
    }
   ],
   "source": [
    "# La tabla 184 no esta en db 6. Probar endpoint /api/table/184 directamente\n",
    "t184 = client.request(\"GET\", \"/api/table/184\")\n",
    "print(f'table 184 -> db_id={t184[\"db_id\"]} schema={t184.get(\"schema\")} name={t184[\"name\"]}')\n",
    "print(f'display_name: {t184.get(\"display_name\")}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4daaa759",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolver cadena MBQL -> tabla fisica para cada card relevante.\n",
    "# Estrategia: recorrer source-query/source-card hasta hallar source-table,\n",
    "# luego /api/table/:id para obtener (db_id, schema, name).\n",
    "\n",
    "_table_cache = {}\n",
    "_card_cache = {}\n",
    "\n",
    "def get_table(tid):\n",
    "    if tid not in _table_cache:\n",
    "        _table_cache[tid] = client.request(\"GET\", f\"/api/table/{tid}\")\n",
    "    return _table_cache[tid]\n",
    "\n",
    "def get_card(cid):\n",
    "    if cid not in _card_cache:\n",
    "        _card_cache[cid] = client.request(\"GET\", f\"/api/card/{cid}\")\n",
    "    return _card_cache[cid]\n",
    "\n",
    "def db_name(db_id):\n",
    "    for d in dbs:\n",
    "        if d[\"id\"] == db_id:\n",
    "            return d[\"name\"]\n",
    "    return f\"db:{db_id}\"\n",
    "\n",
    "def resolve_source(dataset_query, _depth=0):\n",
    "    \"\"\"Retorna (db_name, schema, table_name) siguiendo la cadena.\"\"\"\n",
    "    if _depth > 10: return (\"?\",\"?\",\"?\")\n",
    "    q = dataset_query.get(\"query\") or {}\n",
    "    # buscar hoja: source-table numerico\n",
    "    cur = q\n",
    "    while isinstance(cur, dict):\n",
    "        if \"source-table\" in cur and isinstance(cur[\"source-table\"], int):\n",
    "            t = get_table(cur[\"source-table\"])\n",
    "            return (db_name(t[\"db_id\"]), t.get(\"schema\",\"\"), t[\"name\"])\n",
    "        if \"source-table\" in cur and isinstance(cur[\"source-table\"], str) and cur[\"source-table\"].startswith(\"card__\"):\n",
    "            cid = int(cur[\"source-table\"].split(\"__\")[1])\n",
    "            return resolve_source(get_card(cid).get(\"dataset_query\") or {}, _depth+1)\n",
    "        if \"source-card-id\" in cur:\n",
    "            return resolve_source(get_card(cur[\"source-card-id\"]).get(\"dataset_query\") or {}, _depth+1)\n",
    "        cur = cur.get(\"source-query\") or {}\n",
    "    return (\"?\",\"?\",\"?\")\n",
    "\n",
    "# Candidatas unicas del dashboard\n",
    "dash_cards = [dc.get(\"card\") for dc in dash.get(\"dashcards\") or [] if dc.get(\"card\") and dc[\"card\"].get(\"id\")]\n",
    "dash_unique = {c[\"id\"]: c for c in dash_cards}.values()\n",
    "\n",
    "rows = []\n",
    "for c in dash_unique:\n",
    "    dbn, sch, tn = resolve_source(c.get(\"dataset_query\") or {})\n",
    "    rows.append({\"card_id\": c[\"id\"], \"name\": c[\"name\"], \"source_db\": dbn, \"source_table\": f\"{sch}.{tn}\".lstrip(\".\")})\n",
    "df_dash_sources = pd.DataFrame(rows)\n",
    "print(f\"Total cards dashboard: {len(df_dash_sources)}\")\n",
    "print()\n",
    "print(\"=== TABLAS FISICAS DEL DASHBOARD 734 ===\")\n",
    "print(df_dash_sources.groupby([\"source_db\",\"source_table\"]).size().sort_values(ascending=False).to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e23ded90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Version rapida: resolver solo las FUENTES TOP del dashboard, no las 85 cards una a una\n",
    "# Las 6 referencias top cubren ~70% del dashboard\n",
    "TOP_REFS = [(\"table\", 184), (\"card\", 4324), (\"card\", 4421), (\"card\", 5305), (\"card\", 4290), (\"card\", 7518)]\n",
    "\n",
    "def db_name(db_id):\n",
    "    for d in dbs:\n",
    "        if d[\"id\"] == db_id:\n",
    "            return d[\"name\"]\n",
    "    return f\"db:{db_id}\"\n",
    "\n",
    "def walk_to_table(card_id, _depth=0, _seen=None):\n",
    "    if _seen is None: _seen = set()\n",
    "    if card_id in _seen or _depth > 8: return None\n",
    "    _seen.add(card_id)\n",
    "    c = client.request(\"GET\", f\"/api/card/{card_id}\")\n",
    "    dq = c.get(\"dataset_query\") or {}\n",
    "    q = dq.get(\"query\") or {}\n",
    "    cur = q\n",
    "    while isinstance(cur, dict):\n",
    "        st = cur.get(\"source-table\")\n",
    "        if isinstance(st, int):\n",
    "            t = client.request(\"GET\", f\"/api/table/{st}\")\n",
    "            return {\"card_chain\": [card_id], \"db\": db_name(t[\"db_id\"]), \"table\": f'{t.get(\"schema\",\"\")}.{t[\"name\"]}'.lstrip(\".\")}\n",
    "        if isinstance(st, str) and st.startswith(\"card__\"):\n",
    "            sub = walk_to_table(int(st.split(\"__\")[1]), _depth+1, _seen)\n",
    "            if sub: sub[\"card_chain\"] = [card_id] + sub[\"card_chain\"]\n",
    "            return sub\n",
    "        if \"source-card-id\" in cur:\n",
    "            sub = walk_to_table(cur[\"source-card-id\"], _depth+1, _seen)\n",
    "            if sub: sub[\"card_chain\"] = [card_id] + sub[\"card_chain\"]\n",
    "            return sub\n",
    "        cur = cur.get(\"source-query\")\n",
    "    return None\n",
    "\n",
    "results = []\n",
    "for kind, ref in TOP_REFS:\n",
    "    if kind == \"table\":\n",
    "        t = client.request(\"GET\", f\"/api/table/{ref}\")\n",
    "        results.append({\"ref\": f\"table:{ref}\", \"db\": db_name(t[\"db_id\"]), \"table\": f'{t.get(\"schema\",\"\")}.{t[\"name\"]}'.lstrip(\".\"), \"chain\": []})\n",
    "    else:\n",
    "        info = walk_to_table(ref)\n",
    "        if info:\n",
    "            results.append({\"ref\": f\"card:{ref}\", \"db\": info[\"db\"], \"table\": info[\"table\"], \"chain\": info[\"card_chain\"]})\n",
    "        else:\n",
    "            results.append({\"ref\": f\"card:{ref}\", \"db\": \"?\", \"table\": \"?\", \"chain\": []})\n",
    "\n",
    "df_top = pd.DataFrame(results)\n",
    "print(\"=== FUENTES TOP DEL DASHBOARD 734 ===\")\n",
    "for _, r in df_top.iterrows():\n",
    "    chain_s = \" -> \".join(str(c) for c in r['chain']) if r['chain'] else \"-\"\n",
    "    print(f\"  {r['ref']:12}  db={r['db']:20}  table={r['table']:50}  chain: {chain_s}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}