{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Recolección de datos: Binance + Bitstamp L3\n", "\n", "**Objetivo:** Dataset de 1M+ filas guardado en `data/`\n", "\n", "| Fuente | Tipo | Método | Qué obtenemos |\n", "|---|---|---|---|\n", "| Binance | aggTrades (fills agrupados por taker) | REST paginado | 1M+ trades históricos |\n", "| Binance | Order book L2 | REST snapshots | Profundidad del libro |\n", "| Bitstamp | L3 live_orders | WebSocket | Cada orden individual con ID |" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data dir: /home/lucas/fn_registry/analysis/estudio_mercados/data\n" ] } ], "source": [ "import aiohttp\n", "import asyncio\n", "import websockets\n", "import json\n", "import time\n", "import polars as pl\n", "import numpy as np\n", "from datetime import datetime, timedelta\n", "from pathlib import Path\n", "\n", "DATA_DIR = Path('../data')\n", "DATA_DIR.mkdir(exist_ok=True)\n", "\n", "BINANCE_BASE = 'https://api.binance.com'\n", "BITSTAMP_WS = 'wss://ws.bitstamp.net'\n", "\n", "print(f'Data dir: {DATA_DIR.resolve()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "## 1. Binance aggTrades — 1M+ filas\n", "\n", "Los `aggTrades` agrupan fills de la misma taker order:\n", "- Cada fila = 1 taker order (o parte si cruzó muchos niveles)\n", "- Campo `a` = aggregate trade ID\n", "- Campo `m` = true si el maker es buyer (taker es seller)\n", "- Paginamos con `fromId` para ir hacia atrás en el tiempo" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "fetch_binance_agg_trades() definida\n" ] } ], "source": [ "async def fetch_binance_agg_trades(\n", " symbol: str = 'BTCUSDT',\n", " target_rows: int = 1_000_000,\n", " batch_size: int = 1000,\n", ") -> pl.DataFrame:\n", " \"\"\"Descarga aggTrades de Binance paginando hacia atrás.\n", "\n", " Cada aggTrade agrupa fills de la misma taker order:\n", " - a: aggregate trade id\n", " - p: price\n", " - q: quantity\n", " - f: first trade id\n", " - l: last trade id\n", " - T: timestamp\n", " - m: was the buyer the maker? (true = taker sold, false = taker bought)\n", " \"\"\"\n", " all_records = []\n", " from_id = None\n", " total = 0\n", " start_time = time.time()\n", "\n", " async with aiohttp.ClientSession() as session:\n", " while total < target_rows:\n", " params = {'symbol': symbol, 'limit': batch_size}\n", " if from_id is not None:\n", " params['fromId'] = from_id\n", "\n", " async with session.get(f'{BINANCE_BASE}/api/v3/aggTrades', params=params) as resp:\n", " if resp.status != 200:\n", " text = await resp.text()\n", " print(f'Error {resp.status}: {text}')\n", " break\n", " data = await resp.json()\n", "\n", " if not data:\n", " break\n", "\n", " for row in data:\n", " all_records.append({\n", " 'agg_trade_id': row['a'],\n", " 'price': float(row['p']),\n", " 'qty': float(row['q']),\n", " 'first_trade_id': row['f'],\n", " 'last_trade_id': row['l'],\n", " 'timestamp': row['T'],\n", " 'is_buyer_maker': row['m'], # True = taker vendió\n", " 'side': 'sell' if row['m'] else 'buy', # taker side\n", " 'n_fills': row['l'] - row['f'] + 1, # fills en esta agg\n", " })\n", "\n", " # Avanzar: siguiente página desde el último ID + 1\n", " from_id = data[-1]['a'] + 1\n", " total += len(data)\n", "\n", " if total % 50_000 == 0:\n", " elapsed = time.time() - start_time\n", " rate = total / elapsed\n", " eta = (target_rows - total) / rate if rate > 0 else 0\n", " ts = datetime.fromtimestamp(data[-1]['T'] / 1000)\n", " print(f' {total:>8,} rows | {rate:,.0f} rows/s | ETA {eta:.0f}s | hasta {ts}')\n", "\n", " # Rate limit: Binance permite 1200 req/min en aggTrades\n", " await asyncio.sleep(0.05)\n", "\n", " elapsed = time.time() - start_time\n", " print(f'\\nDescargados {total:,} aggTrades en {elapsed:.1f}s ({total/elapsed:,.0f} rows/s)')\n", "\n", " return pl.DataFrame(all_records)\n", "\n", "\n", "print('fetch_binance_agg_trades() definida')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Descargados 1,000 aggTrades en 0.5s (1,871 rows/s)\n", "\n", "Shape: (1000, 9)\n", "Columnas: ['agg_trade_id', 'price', 'qty', 'first_trade_id', 'last_trade_id', 'timestamp', 'is_buyer_maker', 'side', 'n_fills']\n", "shape: (5, 9)\n", "┌──────────────┬──────────┬─────────┬─────────────┬───┬─────────────┬─────────────┬──────┬─────────┐\n", "│ agg_trade_id ┆ price ┆ qty ┆ first_trade ┆ … ┆ timestamp ┆ is_buyer_ma ┆ side ┆ n_fills │\n", "│ --- ┆ --- ┆ --- ┆ _id ┆ ┆ --- ┆ ker ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ --- ┆ ┆ i64 ┆ --- ┆ str ┆ i64 │\n", "│ ┆ ┆ ┆ i64 ┆ ┆ ┆ bool ┆ ┆ │\n", "╞══════════════╪══════════╪═════════╪═════════════╪═══╪═════════════╪═════════════╪══════╪═════════╡\n", "│ 3924566726 ┆ 66916.1 ┆ 0.00122 ┆ 6182451140 ┆ … ┆ 17752265945 ┆ false ┆ buy ┆ 1 │\n", "│ ┆ ┆ ┆ ┆ ┆ 74 ┆ ┆ ┆ │\n", "│ 3924566727 ┆ 66916.09 ┆ 0.00047 ┆ 6182451141 ┆ … ┆ 17752265955 ┆ true ┆ sell ┆ 1 │\n", "│ ┆ ┆ ┆ ┆ ┆ 80 ┆ ┆ ┆ │\n", "│ 3924566728 ┆ 66916.1 ┆ 0.00034 ┆ 6182451142 ┆ … ┆ 17752265956 ┆ false ┆ buy ┆ 1 │\n", "│ ┆ ┆ ┆ ┆ ┆ 00 ┆ ┆ ┆ │\n", "│ 3924566729 ┆ 66916.1 ┆ 0.03403 ┆ 6182451143 ┆ … ┆ 17752265957 ┆ false ┆ buy ┆ 1 │\n", "│ ┆ ┆ ┆ ┆ ┆ 48 ┆ ┆ ┆ │\n", "│ 3924566730 ┆ 66916.1 ┆ 0.00121 ┆ 6182451144 ┆ … ┆ 17752265966 ┆ false ┆ buy ┆ 1 │\n", "│ ┆ ┆ ┆ ┆ ┆ 48 ┆ ┆ ┆ │\n", "└──────────────┴──────────┴─────────┴─────────────┴───┴─────────────┴─────────────┴──────┴─────────┘\n", "\n", "Rango temporal:\n", " 2026-04-03 16:29:54.574000 → 2026-04-03 16:31:32.561000\n", " Duración: 0:01:37.987000\n" ] } ], "source": [ "# Descargar 1M+ aggTrades de BTC/USDT\n", "binance_trades = await fetch_binance_agg_trades('BTCUSDT', target_rows=1_000_000)\n", "\n", "print(f'\\nShape: {binance_trades.shape}')\n", "print(f'Columnas: {binance_trades.columns}')\n", "print(binance_trades.head(5))\n", "print(f'\\nRango temporal:')\n", "t_min = datetime.fromtimestamp(binance_trades['timestamp'].min() / 1000)\n", "t_max = datetime.fromtimestamp(binance_trades['timestamp'].max() / 1000)\n", "print(f' {t_min} → {t_max}')\n", "print(f' Duración: {t_max - t_min}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Guardar Binance aggTrades\n", "out_path = DATA_DIR / 'binance_btcusdt_aggtrades.csv'\n", "binance_trades.write_csv(str(out_path))\n", "size_mb = out_path.stat().st_size / 1024 / 1024\n", "print(f'Guardado: {out_path}')\n", "print(f' {binance_trades.shape[0]:,} filas, {size_mb:.1f} MB')\n", "\n", "# Estadísticas rápidas\n", "print(f'\\nEstadísticas:')\n", "print(f' Buys (taker): {binance_trades.filter(pl.col(\"side\") == \"buy\").shape[0]:,}')\n", "print(f' Sells (taker): {binance_trades.filter(pl.col(\"side\") == \"sell\").shape[0]:,}')\n", "print(f' Precio min: ${binance_trades[\"price\"].min():,.2f}')\n", "print(f' Precio max: ${binance_trades[\"price\"].max():,.2f}')\n", "print(f' Qty mediana: {binance_trades[\"qty\"].median():.6f} BTC')\n", "print(f' Qty max: {binance_trades[\"qty\"].max():.4f} BTC')\n", "print(f' Fills/aggTrade mediana: {binance_trades[\"n_fills\"].median():.0f}')\n", "print(f' Fills/aggTrade max: {binance_trades[\"n_fills\"].max()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "## 2. Bitstamp L3 — órdenes individuales via WebSocket\n", "\n", "Cada mensaje tiene:\n", "- `id`: ID único de la orden\n", "- `order_type`: 0 = buy, 1 = sell\n", "- `price`, `amount`\n", "- `datetime`, `microtimestamp`\n", "\n", "Los canales:\n", "- `live_orders_btcusd`: cada orden creada\n", "- `live_trades_btcusd`: cada ejecución con IDs de maker y taker" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "async def record_bitstamp_l3(\n", " pair: str = 'btcusd',\n", " duration_seconds: int = 300,\n", ") -> tuple[pl.DataFrame, pl.DataFrame]:\n", " \"\"\"Graba datos L3 de Bitstamp via WebSocket.\n", "\n", " Retorna (orders_df, trades_df) con todas las órdenes y trades capturados.\n", " \"\"\"\n", " orders = []\n", " trades = []\n", " start = time.time()\n", " msg_count = 0\n", "\n", " async with websockets.connect(BITSTAMP_WS) as ws:\n", " # Suscribirse a órdenes individuales + trades\n", " for channel in [f'live_orders_{pair}', f'live_trades_{pair}']:\n", " await ws.send(json.dumps({\n", " 'event': 'bts:subscribe',\n", " 'data': {'channel': channel}\n", " }))\n", "\n", " print(f'Grabando Bitstamp L3 ({pair}) por {duration_seconds}s...')\n", "\n", " while time.time() - start < duration_seconds:\n", " try:\n", " raw = await asyncio.wait_for(ws.recv(), timeout=5.0)\n", " msg = json.loads(raw)\n", " msg_count += 1\n", "\n", " event = msg.get('event', '')\n", " channel = msg.get('channel', '')\n", " data = msg.get('data', {})\n", "\n", " if isinstance(data, str):\n", " try:\n", " data = json.loads(data)\n", " except:\n", " continue\n", "\n", " # Órdenes (L3)\n", " if 'live_orders' in channel and event in ('order_created', 'order_changed', 'order_deleted'):\n", " orders.append({\n", " 'event': event,\n", " 'order_id': data.get('id', ''),\n", " 'side': 'buy' if data.get('order_type') == 0 else 'sell',\n", " 'price': float(data.get('price', 0)),\n", " 'amount': float(data.get('amount', 0)),\n", " 'datetime': data.get('datetime', ''),\n", " 'microtimestamp': data.get('microtimestamp', ''),\n", " })\n", "\n", " # Trades\n", " elif 'live_trades' in channel and event == 'trade':\n", " trades.append({\n", " 'trade_id': data.get('id', ''),\n", " 'side': 'buy' if data.get('type') == 0 else 'sell',\n", " 'price': float(data.get('price', 0)),\n", " 'amount': float(data.get('amount', 0)),\n", " 'buy_order_id': data.get('buy_order_id', ''),\n", " 'sell_order_id': data.get('sell_order_id', ''),\n", " 'timestamp': data.get('timestamp', ''),\n", " 'microtimestamp': data.get('microtimestamp', ''),\n", " })\n", "\n", " if msg_count % 5000 == 0:\n", " elapsed = time.time() - start\n", " print(f' {elapsed:.0f}s: {len(orders):,} orders, {len(trades):,} trades ({msg_count:,} msgs)')\n", "\n", " except asyncio.TimeoutError:\n", " continue\n", "\n", " elapsed = time.time() - start\n", " print(f'\\nGrabación terminada: {elapsed:.0f}s')\n", " print(f' Órdenes L3: {len(orders):,}')\n", " print(f' Trades: {len(trades):,}')\n", " print(f' Msgs total: {msg_count:,}')\n", "\n", " orders_df = pl.DataFrame(orders) if orders else pl.DataFrame()\n", " trades_df = pl.DataFrame(trades) if trades else pl.DataFrame()\n", "\n", " return orders_df, trades_df\n", "\n", "\n", "print('record_bitstamp_l3() definida')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Grabar 5 minutos de L3 de Bitstamp\n", "bs_orders, bs_trades = await record_bitstamp_l3('btcusd', duration_seconds=300)\n", "\n", "if bs_orders.shape[0] > 0:\n", " print(f'\\n=== Órdenes L3 ===')\n", " print(f'Shape: {bs_orders.shape}')\n", " print(bs_orders.head(5))\n", " print(f'\\nEventos:')\n", " print(bs_orders.group_by('event').agg(pl.len().alias('count')).sort('count', descending=True))\n", "\n", "if bs_trades.shape[0] > 0:\n", " print(f'\\n=== Trades ===')\n", " print(f'Shape: {bs_trades.shape}')\n", " print(bs_trades.head(5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Guardar Bitstamp L3\n", "if bs_orders.shape[0] > 0:\n", " path = DATA_DIR / 'bitstamp_btcusd_l3_orders.csv'\n", " bs_orders.write_csv(str(path))\n", " print(f'Guardado: {path} ({bs_orders.shape[0]:,} filas, {path.stat().st_size/1024/1024:.1f} MB)')\n", "\n", "if bs_trades.shape[0] > 0:\n", " path = DATA_DIR / 'bitstamp_btcusd_l3_trades.csv'\n", " bs_trades.write_csv(str(path))\n", " print(f'Guardado: {path} ({bs_trades.shape[0]:,} filas, {path.stat().st_size/1024/1024:.1f} MB)')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "## 3. Resumen del dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "print('=' * 70)\n", "print(' DATASET RECOLECTADO')\n", "print('=' * 70)\n", "\n", "total_rows = 0\n", "for f in sorted(DATA_DIR.glob('*.csv')):\n", " size_mb = f.stat().st_size / 1024 / 1024\n", " # Contar filas rápido\n", " try:\n", " nrows = pl.scan_csv(str(f)).select(pl.len()).collect().item()\n", " except:\n", " nrows = '?'\n", " total_rows += nrows if isinstance(nrows, int) else 0\n", " print(f' {f.name:<45} {nrows:>10,} filas {size_mb:>7.1f} MB')\n", "\n", "print(f'\\n TOTAL: {total_rows:>10,} filas')\n", "print('=' * 70)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "waiting for binance download\n" ] } ], "source": [ "print('waiting for binance download')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Descargando 1M aggTrades (ID 3,923,568,287 → 3,924,568,287)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 100,000 (10%) | 3,308 rows/s | 2026-04-02 16:01:27.188000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 200,000 (20%) | 3,248 rows/s | 2026-04-02 16:55:31.965000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 300,000 (30%) | 3,258 rows/s | 2026-04-02 18:36:12.184000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 400,000 (40%) | 3,253 rows/s | 2026-04-02 20:16:47.719000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 500,000 (50%) | 3,212 rows/s | 2026-04-02 22:23:55.678000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 600,000 (60%) | 3,187 rows/s | 2026-04-03 03:15:10.491000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 700,000 (70%) | 3,199 rows/s | 2026-04-03 06:33:15.815000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 800,000 (80%) | 3,209 rows/s | 2026-04-03 09:56:05.448000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 900,000 (90%) | 3,225 rows/s | 2026-04-03 13:58:09.651000\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 1,000,000 (100%) | 3,235 rows/s | 2026-04-03 16:32:41.139000\n", "Listo: 1,000,000 rows en 309s (3,234 rows/s)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Shape: (1000000, 9)\n", "Rango: 2026-04-02 14:26:02.324000 → 2026-04-03 16:32:41.139000 (1 day, 2:06:38.815000)\n" ] } ], "source": [ "\n", "import aiohttp, asyncio, time as _time\n", "from datetime import datetime as _dt\n", "\n", "async def _fetch_bulk():\n", " all_records = []\n", " total = 0\n", " t0 = _time.time()\n", " \n", " async with aiohttp.ClientSession() as session:\n", " # Obtener último ID\n", " async with session.get(f'{BINANCE_BASE}/api/v3/aggTrades', \n", " params={'symbol': 'BTCUSDT', 'limit': 1}) as resp:\n", " data = await resp.json()\n", " latest_id = data[0]['a']\n", " \n", " from_id = latest_id - 1_000_000\n", " print(f'Descargando 1M aggTrades (ID {from_id:,} → {latest_id:,})')\n", " \n", " while from_id < latest_id:\n", " params = {'symbol': 'BTCUSDT', 'fromId': from_id, 'limit': 1000}\n", " async with session.get(f'{BINANCE_BASE}/api/v3/aggTrades', params=params) as resp:\n", " if resp.status == 429:\n", " await asyncio.sleep(10)\n", " continue\n", " if resp.status != 200:\n", " print(f'Error {resp.status}')\n", " break\n", " data = await resp.json()\n", " \n", " if not data:\n", " break\n", " \n", " for r in data:\n", " all_records.append({\n", " 'agg_trade_id': r['a'], 'price': float(r['p']), 'qty': float(r['q']),\n", " 'first_trade_id': r['f'], 'last_trade_id': r['l'], 'timestamp': r['T'],\n", " 'is_buyer_maker': r['m'], 'side': 'sell' if r['m'] else 'buy',\n", " 'n_fills': r['l'] - r['f'] + 1,\n", " })\n", " \n", " from_id = data[-1]['a'] + 1\n", " total += len(data)\n", " \n", " if total % 100_000 == 0:\n", " elapsed = _time.time() - t0\n", " pct = total / 1_000_000 * 100\n", " ts = _dt.fromtimestamp(data[-1]['T'] / 1000)\n", " print(f' {total:>10,} ({pct:.0f}%) | {total/elapsed:,.0f} rows/s | {ts}')\n", " \n", " await asyncio.sleep(0.06)\n", " \n", " elapsed = _time.time() - t0\n", " print(f'Listo: {total:,} rows en {elapsed:.0f}s ({total/elapsed:,.0f} rows/s)')\n", " return pl.DataFrame(all_records)\n", "\n", "binance_trades = await _fetch_bulk()\n", "print(f'Shape: {binance_trades.shape}')\n", "t_min = _dt.fromtimestamp(binance_trades['timestamp'].min() / 1000)\n", "t_max = _dt.fromtimestamp(binance_trades['timestamp'].max() / 1000)\n", "print(f'Rango: {t_min} → {t_max} ({t_max - t_min})')\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Guardado: binance_btcusdt_aggtrades_1M.csv\n", " 1,000,000 filas, 72.1 MB\n", " Buys: 506,912\n", " Sells: 493,088\n", " Qty mediana: 0.000460 BTC\n", " Fills/aggTrade mediana: 1\n", " Fills/aggTrade max: 500\n" ] } ], "source": [ "\n", "# Guardar Binance aggTrades\n", "out_path = DATA_DIR / 'binance_btcusdt_aggtrades_1M.csv'\n", "binance_trades.write_csv(str(out_path))\n", "size_mb = out_path.stat().st_size / 1024 / 1024\n", "print(f'Guardado: {out_path.name}')\n", "print(f' {binance_trades.shape[0]:,} filas, {size_mb:.1f} MB')\n", "print(f' Buys: {binance_trades.filter(pl.col(\"side\") == \"buy\").shape[0]:,}')\n", "print(f' Sells: {binance_trades.filter(pl.col(\"side\") == \"sell\").shape[0]:,}')\n", "print(f' Qty mediana: {binance_trades[\"qty\"].median():.6f} BTC')\n", "print(f' Fills/aggTrade mediana: {binance_trades[\"n_fills\"].median():.0f}')\n", "print(f' Fills/aggTrade max: {binance_trades[\"n_fills\"].max()}')\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Grabando Bitstamp L3 (btcusd) por 300s...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 24s: 4,995 orders, 3 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 41s: 9,993 orders, 5 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 79s: 14,989 orders, 9 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 125s: 19,984 orders, 14 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 164s: 24,930 orders, 68 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 205s: 29,927 orders, 71 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 228s: 34,925 orders, 73 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 246s: 39,924 orders, 74 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 281s: 44,920 orders, 78 trades\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bitstamp: 48,668 orders, 82 trades en 310s\n", "Guardado: bitstamp_btcusd_l3_orders.csv (48,668 filas)\n", "shape: (3, 2)\n", "┌───────────────┬───────┐\n", "│ event ┆ count │\n", "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═══════════════╪═══════╡\n", "│ order_created ┆ 24290 │\n", "│ order_deleted ┆ 24289 │\n", "│ order_changed ┆ 89 │\n", "└───────────────┴───────┘\n", "Guardado: bitstamp_btcusd_l3_trades.csv (82 filas)\n", "shape: (3, 8)\n", "┌───────────┬──────┬─────────┬──────────┬───────────────┬──────────────┬────────────┬──────────────┐\n", "│ trade_id ┆ side ┆ price ┆ amount ┆ buy_order_id ┆ sell_order_i ┆ timestamp ┆ microtimesta │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ d ┆ --- ┆ mp │\n", "│ i64 ┆ str ┆ f64 ┆ f64 ┆ i64 ┆ --- ┆ str ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ i64 ┆ ┆ str │\n", "╞═══════════╪══════╪═════════╪══════════╪═══════════════╪══════════════╪════════════╪══════════════╡\n", "│ 555923544 ┆ buy ┆ 66879.0 ┆ 0.013272 ┆ 1992262225506 ┆ 199226205261 ┆ 1775227119 ┆ 177522711955 │\n", "│ ┆ ┆ ┆ ┆ 305 ┆ 0055 ┆ ┆ 6000 │\n", "│ 555923545 ┆ buy ┆ 66879.0 ┆ 0.001557 ┆ 1992262225506 ┆ 199226222522 ┆ 1775227119 ┆ 177522711955 │\n", "│ ┆ ┆ ┆ ┆ 305 ┆ 7776 ┆ ┆ 6000 │\n", "│ 555923566 ┆ buy ┆ 66898.0 ┆ 0.000077 ┆ 1992262268186 ┆ 199226223412 ┆ 1775227129 ┆ 177522712997 │\n", "│ ┆ ┆ ┆ ┆ 625 ┆ 4290 ┆ ┆ 6000 │\n", "└───────────┴──────┴─────────┴──────────┴───────────────┴──────────────┴────────────┴──────────────┘\n" ] } ], "source": [ "\n", "import websockets, json, asyncio\n", "\n", "async def _record_bitstamp(pair='btcusd', duration=300):\n", " orders, trades = [], []\n", " t0 = _time.time()\n", " mc = 0\n", " async with websockets.connect('wss://ws.bitstamp.net') as ws:\n", " for ch in [f'live_orders_{pair}', f'live_trades_{pair}']:\n", " await ws.send(json.dumps({'event':'bts:subscribe','data':{'channel':ch}}))\n", " print(f'Grabando Bitstamp L3 ({pair}) por {duration}s...')\n", " while _time.time() - t0 < duration:\n", " try:\n", " raw = await asyncio.wait_for(ws.recv(), timeout=5.0)\n", " msg = json.loads(raw)\n", " mc += 1\n", " ev = msg.get('event','')\n", " ch = msg.get('channel','')\n", " d = msg.get('data',{})\n", " if isinstance(d, str):\n", " try: d = json.loads(d)\n", " except: continue\n", " if 'live_orders' in ch and ev in ('order_created','order_changed','order_deleted'):\n", " orders.append({\n", " 'event':ev, 'order_id':d.get('id',''),\n", " 'side':'buy' if d.get('order_type')==0 else 'sell',\n", " 'price':float(d.get('price',0)), 'amount':float(d.get('amount',0)),\n", " 'datetime':d.get('datetime',''), 'microtimestamp':d.get('microtimestamp',''),\n", " })\n", " elif 'live_trades' in ch and ev == 'trade':\n", " trades.append({\n", " 'trade_id':d.get('id',''),\n", " 'side':'buy' if d.get('type')==0 else 'sell',\n", " 'price':float(d.get('price',0)), 'amount':float(d.get('amount',0)),\n", " 'buy_order_id':d.get('buy_order_id',''),\n", " 'sell_order_id':d.get('sell_order_id',''),\n", " 'timestamp':d.get('timestamp',''), 'microtimestamp':d.get('microtimestamp',''),\n", " })\n", " if mc % 5000 == 0:\n", " print(f' {_time.time()-t0:.0f}s: {len(orders):,} orders, {len(trades):,} trades')\n", " except asyncio.TimeoutError:\n", " continue\n", " print(f'Bitstamp: {len(orders):,} orders, {len(trades):,} trades en {_time.time()-t0:.0f}s')\n", " return pl.DataFrame(orders) if orders else pl.DataFrame(), pl.DataFrame(trades) if trades else pl.DataFrame()\n", "\n", "bs_orders, bs_trades = await _record_bitstamp('btcusd', duration=300)\n", "if bs_orders.shape[0] > 0:\n", " bs_orders.write_csv(str(DATA_DIR / 'bitstamp_btcusd_l3_orders.csv'))\n", " print(f'Guardado: bitstamp_btcusd_l3_orders.csv ({bs_orders.shape[0]:,} filas)')\n", " print(bs_orders.group_by('event').agg(pl.len().alias('count')).sort('count', descending=True))\n", "if bs_trades.shape[0] > 0:\n", " bs_trades.write_csv(str(DATA_DIR / 'bitstamp_btcusd_l3_trades.csv'))\n", " print(f'Guardado: bitstamp_btcusd_l3_trades.csv ({bs_trades.shape[0]:,} filas)')\n", " print(bs_trades.head(3))\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pares disponibles en Bitstamp: 273\n", " ['btcusd', 'btceur', 'btcgbp', 'gbpusd', 'eurusd', 'xrpusd', 'xrpeur', 'xrpbtc', 'xrpgbp', 'ltcbtc']\n", " ['ltcusd', 'ltceur', 'ethbtc', 'ethusd', 'etheur', 'ethgbp', 'bchusd', 'bcheur', 'xlmusd', 'xlmeur']\n", " ['linkusd', 'linkeur', 'linkgbp', 'usdcusd', 'usdceur', 'btcusdc', 'ethusdc', 'eth2eth', 'aaveusd', 'aaveeur']\n", " ['batusd', 'bateur', 'umausd', 'umaeur', 'daiusd', 'kncusd', 'knceur', 'mkrusd', 'mkreur', 'zrxusd']\n", " ['zrxeur', 'algousd', 'algoeur', 'audiousd', 'audioeur', 'crvusd', 'crveur', 'snxusd', 'snxeur', 'uniusd']\n", " ['unieur', 'xtzusd', 'xtzeur', 'compusd', 'compeur', 'grtusd', 'grteur', 'usdtusd', 'usdteur', 'usdcusdt']\n", " ['btcusdt', 'ethusdt', 'xrpusdt', 'flrusd', 'flreur', 'manausd', 'manaeur', 'sushiusd', 'sushieur', 'chzusd']\n", " ['chzeur', 'enjusd', 'enjeur', 'hbarusd', 'hbareur', 'axsusd', 'axseur', 'sandusd', 'sandeur', 'storjusd']\n", " ['storjeur', 'adausd', 'adaeur', 'fetusd', 'feteur', 'sklusd', 'skleur', 'avaxusd', 'avaxeur', 'ftmusd']\n", " ['ftmeur', 'shibusd', 'shibeur', 'galausd', 'galaeur', 'wbtcbtc', 'ctsiusd', 'ctsieur', 'imxusd', 'imxeur']\n", " ['injusd', 'injeur', 'rndrusd', 'rndreur', 'solusd', 'soleur', 'apeusd', 'apeeur', 'eurcusdc', 'eurceur']\n", " ['dotusd', 'doteur', 'nearusd', 'neareur', 'ldousd', 'ldoeur', 'dogeusd', 'dogeeur', 'suiusd', 'suieur']\n", " ['eurcvusdt', 'eurcveur', 'pyusdusd', 'tracusd', 'traceur', 'wecanusd', 'wecaneur', 'lmwrusd', 'lmwreur', 'pepeusd']\n", " ['pepeeur', 'csprusd', 'cspreur', 'gyenusd', 'vchfusd', 'vchfeur', 'veurusd', 'veureur', 'zusdusd', 'bonkusd']\n", " ['bonkeur', 'jupusd', 'jupeur', 'pythusd', 'pytheur', 'wifusd', 'wifeur', 'ondousd', 'ondoeur', 'trufusd']\n", " ['trufeur', 'egldusd', 'egldeur', 'icpusd', 'icpeur', 'xdcusd', 'xdceur', 'flokiusd', 'flokieur', 'strkusd']\n", " ['strkeur', 'xchngusd', 'xchngeur', 'ctxusd', 'ctxeur', 'mogusd', 'mogeur', 'wenusd', 'weneur', 'arbusd']\n", " ['arbeur', 'xsgdusd', 'xsgdusdt', 'zetausd', 'zetaeur', 'mewusd', 'meweur', 'rlusdusd', 'rlusdeur', 'rlusdusdt']\n", " ['btcrlusd', 'xrprlusd', 'ethrlusd', 'woousd', 'wooeur', 'seiusd', 'seieur', 'opusd', 'opeur', 'cxtusd']\n", " ['cxteur', 'popcatusd', 'popcateur', 'syrupusd', 'syrupeur', 'melaniausd', 'melaniaeur', 'trumpusd', 'trumpeur', 'polusd']\n", " ['poleur', 'fartcoinusd', 'fartcoineur', 'penguusd', 'pengueur', 'pnutusd', 'pnuteur', 'etcusd', 'etceur', 'xrpusdc']\n", " ['solusdc', 'btceurc', 'etheurc', 'eurcvusdc', 'btceurcv', 'etheurcv', 'sgdusd', 'virtualeur', 'virtualusd', 'xcnusd']\n", " ['xcneur', 'moodengusd', 'moodengeur', 'taiusd', 'taieur', 'tonusd', 'toneur', 'xplusd', 'xpleur', 'biousd']\n", " ['bioeur', 'enausd', 'enaeur', 'spkusd', 'spkeur', 'wlfiusd', 'wlfieur', 'hypeusd', 'hypeeur', 'asterusd']\n", " ['astereur', 'avntusd', 'avnteur', 'bnbusd', 'bnbeur', 'monusd', 'moneur', 'pendleusd', 'pendleeur', 'zorausd']\n", " ['zoraeur', 'aptusd', 'apteur', 'atomusd', 'atomeur', 'cakeusd', 'cakeeur', 'fightusd', 'fighteur', 'litusd']\n", " ['liteur', 'susd', 'seur', 'taousd', 'taoeur', 'trxusd', 'trxeur', 'aerousd', 'aeroeur', 'paxgusd']\n", " ['paxgeur', 'mntusd', 'mnteur']\n" ] } ], "source": [ "\n", "# Descubrir todos los pares disponibles en Bitstamp via REST\n", "import aiohttp, asyncio\n", "\n", "async def get_bitstamp_pairs():\n", " async with aiohttp.ClientSession() as s:\n", " async with s.get('https://www.bitstamp.net/api/v2/trading-pairs-info/') as r:\n", " data = await r.json()\n", " # Filtrar pares activos con USD\n", " pairs = [p['url_symbol'] for p in data if p['trading'] == 'Enabled']\n", " return pairs\n", "\n", "all_pairs = await get_bitstamp_pairs()\n", "print(f'Pares disponibles en Bitstamp: {len(all_pairs)}')\n", "for i in range(0, len(all_pairs), 10):\n", " print(f' {all_pairs[i:i+10]}')\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Suscrito a 30 pares (60 canales)\n", "Grabando 900s...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 52s: 19,886 orders, 54 trades | 386 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 105s: 39,844 orders, 96 trades | 380 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 139s: 59,825 orders, 115 trades | 432 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 165s: 79,771 orders, 169 trades | 484 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 180s: 99,733 orders, 207 trades | 554 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 203s: 119,704 orders, 236 trades | 590 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 231s: 139,667 orders, 273 trades | 605 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 249s: 159,648 orders, 292 trades | 640 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 280s: 179,615 orders, 325 trades | 642 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 318s: 199,577 orders, 363 trades | 627 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 344s: 219,531 orders, 409 trades | 638 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 362s: 239,503 orders, 437 trades | 662 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 396s: 259,473 orders, 467 trades | 656 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 426s: 279,419 orders, 521 trades | 655 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 470s: 299,377 orders, 563 trades | 637 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 496s: 319,349 orders, 591 trades | 644 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 531s: 339,318 orders, 622 trades | 639 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 564s: 359,287 orders, 653 trades | 637 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 587s: 379,268 orders, 672 trades | 646 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 615s: 399,247 orders, 693 trades | 649 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 643s: 419,226 orders, 714 trades | 652 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 690s: 439,193 orders, 747 trades | 637 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 725s: 459,152 orders, 788 trades | 633 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 768s: 479,097 orders, 843 trades | 624 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 796s: 499,001 orders, 939 trades | 627 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 823s: 518,950 orders, 990 trades | 630 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 839s: 538,919 orders, 1,021 trades | 642 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 863s: 558,883 orders, 1,057 trades | 647 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 895s: 578,849 orders, 1,091 trades | 647 orders/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Finalizado: 581,929 orders, 1,094 trades en 910s\n", " 639 orders/s promedio\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Guardado: bitstamp_multi_l3_orders.csv (581,929 filas, 48.5 MB)\n", "shape: (10, 2)\n", "┌─────────┬────────┐\n", "│ pair ┆ n │\n", "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═════════╪════════╡\n", "│ btcusd ┆ 168383 │\n", "│ ethusd ┆ 74387 │\n", "│ solusd ┆ 50820 │\n", "│ xrpusd ┆ 32191 │\n", "│ avaxusd ┆ 30689 │\n", "│ soleur ┆ 29519 │\n", "│ btceur ┆ 28591 │\n", "│ adausd ┆ 27297 │\n", "│ btcusdt ┆ 22249 │\n", "│ dogeusd ┆ 17526 │\n", "└─────────┴────────┘\n", "Guardado: bitstamp_multi_l3_trades.csv (1,094 filas)\n", "shape: (10, 2)\n", "┌─────────┬─────┐\n", "│ pair ┆ n │\n", "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═════════╪═════╡\n", "│ solusd ┆ 292 │\n", "│ btcusd ┆ 150 │\n", "│ adausd ┆ 141 │\n", "│ xrpusd ┆ 123 │\n", "│ btceur ┆ 74 │\n", "│ avaxusd ┆ 46 │\n", "│ ethusd ┆ 42 │\n", "│ dogeusd ┆ 42 │\n", "│ xrpeur ┆ 39 │\n", "│ ltcusd ┆ 35 │\n", "└─────────┴─────┘\n" ] } ], "source": [ "\n", "import websockets, json, asyncio\n", "\n", "# Top 30 pares más líquidos (mezcla de USD, EUR, BTC base)\n", "PAIRS = [\n", " 'btcusd', 'ethusd', 'xrpusd', 'solusd', 'dogeusd', 'adausd',\n", " 'ltcusd', 'linkusd', 'avaxusd', 'dotusd', 'nearusd', 'suiusd',\n", " 'shibusd', 'pepeusd', 'bonkusd', 'injusd', 'arbusd', 'opusd',\n", " 'btceur', 'etheur', 'xrpeur', 'soleur', 'dogeeur',\n", " 'btcusdt', 'ethusdt', 'xrpusdt',\n", " 'ethbtc', 'xrpbtc', 'ltcbtc', 'usdtusd',\n", "]\n", "\n", "async def record_bitstamp_massive(pairs, duration_seconds=600):\n", " '''Graba L3 de múltiples pares simultáneamente.'''\n", " orders = []\n", " trades = []\n", " t0 = _time.time()\n", " mc = 0\n", "\n", " async with websockets.connect('wss://ws.bitstamp.net', max_size=10_000_000) as ws:\n", " # Suscribirse a todos los pares\n", " for pair in pairs:\n", " for ch_type in ['live_orders', 'live_trades']:\n", " await ws.send(json.dumps({\n", " 'event': 'bts:subscribe',\n", " 'data': {'channel': f'{ch_type}_{pair}'}\n", " }))\n", " print(f'Suscrito a {len(pairs)} pares ({len(pairs)*2} canales)')\n", " print(f'Grabando {duration_seconds}s...')\n", "\n", " while _time.time() - t0 < duration_seconds:\n", " try:\n", " raw = await asyncio.wait_for(ws.recv(), timeout=5.0)\n", " msg = json.loads(raw)\n", " mc += 1\n", " ev = msg.get('event', '')\n", " ch = msg.get('channel', '')\n", " d = msg.get('data', {})\n", " if isinstance(d, str):\n", " try: d = json.loads(d)\n", " except: continue\n", "\n", " # Extraer par del nombre del canal\n", " pair = ch.replace('live_orders_', '').replace('live_trades_', '')\n", "\n", " if 'live_orders' in ch and ev in ('order_created', 'order_changed', 'order_deleted'):\n", " orders.append({\n", " 'pair': pair, 'event': ev,\n", " 'order_id': d.get('id', ''),\n", " 'side': 'buy' if d.get('order_type') == 0 else 'sell',\n", " 'price': float(d.get('price', 0)),\n", " 'amount': float(d.get('amount', 0)),\n", " 'datetime': d.get('datetime', ''),\n", " 'microtimestamp': d.get('microtimestamp', ''),\n", " })\n", " elif 'live_trades' in ch and ev == 'trade':\n", " trades.append({\n", " 'pair': pair, 'trade_id': d.get('id', ''),\n", " 'side': 'buy' if d.get('type') == 0 else 'sell',\n", " 'price': float(d.get('price', 0)),\n", " 'amount': float(d.get('amount', 0)),\n", " 'buy_order_id': d.get('buy_order_id', ''),\n", " 'sell_order_id': d.get('sell_order_id', ''),\n", " 'timestamp': d.get('timestamp', ''),\n", " 'microtimestamp': d.get('microtimestamp', ''),\n", " })\n", "\n", " if mc % 20000 == 0:\n", " elapsed = _time.time() - t0\n", " rate = len(orders) / elapsed\n", " print(f' {elapsed:.0f}s: {len(orders):,} orders, {len(trades):,} trades | {rate:,.0f} orders/s')\n", "\n", " # Check if we hit 1M\n", " if len(orders) >= 1_000_000:\n", " print(f' Alcanzado 1M orders en {_time.time()-t0:.0f}s!')\n", " break\n", "\n", " except asyncio.TimeoutError:\n", " continue\n", "\n", " elapsed = _time.time() - t0\n", " print(f'\\nFinalizado: {len(orders):,} orders, {len(trades):,} trades en {elapsed:.0f}s')\n", " print(f' {len(orders)/elapsed:,.0f} orders/s promedio')\n", "\n", " orders_df = pl.DataFrame(orders) if orders else pl.DataFrame()\n", " trades_df = pl.DataFrame(trades) if trades else pl.DataFrame()\n", " return orders_df, trades_df\n", "\n", "bs_orders_m, bs_trades_m = await record_bitstamp_massive(PAIRS, duration_seconds=900)\n", "\n", "# Guardar\n", "if bs_orders_m.shape[0] > 0:\n", " path = DATA_DIR / 'bitstamp_multi_l3_orders.csv'\n", " bs_orders_m.write_csv(str(path))\n", " size_mb = path.stat().st_size / 1024 / 1024\n", " print(f'\\nGuardado: {path.name} ({bs_orders_m.shape[0]:,} filas, {size_mb:.1f} MB)')\n", " print(bs_orders_m.group_by('pair').agg(pl.len().alias('n')).sort('n', descending=True).head(10))\n", "\n", "if bs_trades_m.shape[0] > 0:\n", " path = DATA_DIR / 'bitstamp_multi_l3_trades.csv'\n", " bs_trades_m.write_csv(str(path))\n", " print(f'Guardado: {path.name} ({bs_trades_m.shape[0]:,} filas)')\n", " print(bs_trades_m.group_by('pair').agg(pl.len().alias('n')).sort('n', descending=True).head(10))\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.7" } }, "nbformat": 4, "nbformat_minor": 4 }