574 lines
21 KiB
Plaintext
574 lines
21 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Análisis del dataset real: 1M aggTrades + Bitstamp L3\n",
|
||
"\n",
|
||
"Tenemos:\n",
|
||
"- **Binance**: 1M aggTrades de BTC/USDT (~26h de mercado)\n",
|
||
"- **Bitstamp**: L3 orders + trades (5 min de captura)\n",
|
||
"\n",
|
||
"## Objetivos\n",
|
||
"1. Estimar parámetros de microestructura sobre datos reales\n",
|
||
"2. Ver cómo cambian con ventanas deslizantes\n",
|
||
"3. Comparar Binance (aggTrades = órdenes agrupadas) vs Bitstamp (L3 = cada orden)\n",
|
||
"4. Calibrar nuestra simulación para que genere datos similares"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Binance aggTrades: 1,000,000 filas\n",
|
||
"Columnas: ['agg_trade_id', 'price', 'qty', 'first_trade_id', 'last_trade_id', 'timestamp', 'is_buyer_maker', 'side', 'n_fills']\n",
|
||
"Rango: 2026-04-02 14:26:02.324000 → 2026-04-03 16:32:41.139000 (1 day, 2:06:38.815000)\n",
|
||
"\n",
|
||
"Bitstamp L3 aún no disponible (grabando...)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import polars as pl\n",
"from scipy.optimize import curve_fit\n",
"from scipy.stats import norm, probplot\n",
|
||
"\n",
|
"DATA = Path('../data')\n",
"\n",
"# Load the Binance aggTrades dump and report its size and schema.\n",
"trades = pl.read_csv(str(DATA / 'binance_btcusdt_aggtrades_1M.csv'))\n",
"n_rows = trades.shape[0]\n",
"print(f'Binance aggTrades: {n_rows:,} filas')\n",
"print(f'Columnas: {trades.columns}')\n",
"\n",
"# The timestamp column is divided by 1000 before conversion (handled as epoch\n",
"# milliseconds); fromtimestamp() without tz returns naive local-time datetimes.\n",
"t_min, t_max = (\n",
"    datetime.fromtimestamp(ts_ms / 1000)\n",
"    for ts_ms in (trades['timestamp'].min(), trades['timestamp'].max())\n",
")\n",
"print(f'Rango: {t_min} → {t_max} ({t_max - t_min})')\n",
"\n",
"# Optional Bitstamp L3 orders file: load it when present, else keep bs_orders = None.\n",
"bs_path = DATA / 'bitstamp_btcusd_l3_orders.csv'\n",
"bs_orders = pl.read_csv(str(bs_path)) if bs_path.exists() else None\n",
"if bs_orders is None:\n",
"    print('\\nBitstamp L3 aún no disponible (grabando...)')\n",
"else:\n",
"    print(f'\\nBitstamp L3 orders: {bs_orders.shape[0]:,} filas')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 1. Visión general del dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Column expressions reused below.\n",
"ts = pl.col('timestamp')\n",
"price = pl.col('price')\n",
"qty = pl.col('qty')\n",
"\n",
"# Add a datetime column (timestamp * 1000 cast to microsecond datetimes) and a\n",
"# minute bucket (timestamp // 60000).\n",
"trades_dt = trades.with_columns(\n",
"    (ts * 1000).cast(pl.Datetime('us')).alias('datetime'),\n",
"    (ts // 60000).alias('minute'),\n",
")\n",
"\n",
"# One row per minute: trade count, close/low/high, volume, turnover and first\n",
"# timestamp, followed by the log return of consecutive closes (null on the first row).\n",
"log_close = pl.col('close').log()\n",
"per_min = (\n",
"    trades_dt\n",
"    .group_by('minute')\n",
"    .agg(\n",
"        pl.len().alias('n_trades'),\n",
"        price.last().alias('close'),\n",
"        price.min().alias('low'),\n",
"        price.max().alias('high'),\n",
"        qty.sum().alias('volume'),\n",
"        (qty * price).sum().alias('turnover'),\n",
"        ts.min().alias('ts'),\n",
"    )\n",
"    .sort('minute')\n",
"    .with_columns((log_close - log_close.shift(1)).alias('log_return'))\n",
")\n",
"\n",
"trades_per_minute = per_min['n_trades']\n",
"volume_per_minute = per_min['volume']\n",
"print(f'Minutos: {per_min.shape[0]}')\n",
"print(f'Trades/minuto: media={trades_per_minute.mean():.0f}, mediana={trades_per_minute.median():.0f}')\n",
"print(f'Volumen/minuto: media={volume_per_minute.mean():.2f} BTC')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Overview figure: close price, volume and trade count per minute on a shared x axis.\n",
"fig, axes = plt.subplots(3, 1, figsize=(16, 10), gridspec_kw={'height_ratios': [3, 1, 1]}, sharex=True)\n",
"ax_price, ax_volume, ax_count = axes\n",
"\n",
"minutes = np.arange(per_min.shape[0])\n",
"span_label = ' → '.join(t.strftime('%Y-%m-%d %H:%M') for t in (t_min, t_max))\n",
"\n",
"ax_price.plot(minutes, per_min['close'].to_numpy(), linewidth=0.5, color='#3498db')\n",
"ax_price.set_ylabel('Precio (USDT)')\n",
"ax_price.set_title(f'BTC/USDT — 1M aggTrades ({span_label})')\n",
"\n",
"# The two lower panels are bar charts that differ only in column, colour and label.\n",
"for panel, column, colour, ylabel in [\n",
"    (ax_volume, 'volume', '#e67e22', 'Volumen (BTC)'),\n",
"    (ax_count, 'n_trades', '#9b59b6', 'Trades/min'),\n",
"]:\n",
"    panel.bar(minutes, per_min[column].to_numpy(), width=1.0, color=colour, alpha=0.6)\n",
"    panel.set_ylabel(ylabel)\n",
"ax_count.set_xlabel('Minuto')\n",
"\n",
"for panel in axes:\n",
"    panel.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 2. Estimación de parámetros\n",
|
||
"\n",
|
||
"### 2.1 Volatilidad (σ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# One-minute log returns; rows whose log_return is null are dropped.\n",
"returns = per_min.drop_nulls('log_return')['log_return'].to_numpy()\n",
"\n",
"# Per-minute volatility, scaled to longer horizons by sqrt(number of periods).\n",
"sigma_1m = np.std(returns)\n",
"sigma_1h = sigma_1m * np.sqrt(60)\n",
"sigma_1d = sigma_1m * np.sqrt(60 * 24)\n",
"sigma_ann = sigma_1d * np.sqrt(365)\n",
"\n",
"print(f'σ por minuto: {sigma_1m:.6f}')\n",
"print(f'σ por hora: {sigma_1h:.5f}')\n",
"print(f'σ diaria: {sigma_1d:.4f} ({sigma_1d*100:.2f}%)')\n",
"print(f'σ anualizada: {sigma_ann:.2f} ({sigma_ann*100:.0f}%)')\n",
"\n",
"# Rolling sigma over a 60-minute window.\n",
"# The range runs to len(returns) + 1 so the last full window (the most recent\n",
"# `window` returns) is included; stopping at len(returns) silently dropped it.\n",
"# No lower clamp is needed because i never falls below `window`.\n",
"window = 60\n",
"rolling_sigma = np.array([np.std(returns[i - window:i]) for i in range(window, len(returns) + 1)])\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(14, 8))\n",
"\n",
"# Histogram of returns with a zero-mean normal density of the same sigma on top.\n",
"ax = axes[0][0]\n",
"ax.hist(returns, bins=100, density=True, color='#3498db', alpha=0.6)\n",
"x = np.linspace(returns.min(), returns.max(), 200)\n",
"ax.plot(x, norm.pdf(x, 0, sigma_1m), 'r-', linewidth=1.5, label=f'Normal σ={sigma_1m:.5f}')\n",
"ax.set_title('Distribución de retornos 1m')\n",
"ax.legend(fontsize=8)\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"# QQ plot against the normal distribution.\n",
"probplot(returns, dist='norm', plot=axes[0][1])\n",
"axes[0][1].set_title('QQ-Plot vs Normal')\n",
"axes[0][1].grid(True, alpha=0.3)\n",
"\n",
"# Rolling sigma with the global sigma as a reference line.\n",
"ax = axes[1][0]\n",
"ax.fill_between(range(len(rolling_sigma)), rolling_sigma, color='#e74c3c', alpha=0.5)\n",
"ax.axhline(y=sigma_1m, color='black', linestyle='--', linewidth=0.8, label=f'σ global={sigma_1m:.5f}')\n",
"ax.set_title(f'σ rolling (ventana {window}m)')\n",
"ax.set_ylabel('σ por minuto')\n",
"ax.legend(fontsize=8)\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"# Absolute returns (volatility clustering).\n",
"ax = axes[1][1]\n",
"ax.plot(np.abs(returns), linewidth=0.3, color='#e74c3c', alpha=0.6)\n",
"ax.set_title('|Retornos| — clustering de volatilidad')\n",
"ax.set_ylabel('|log-return|')\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Standardised 4th and 3rd central moments (non-excess kurtosis: a normal gives 3).\n",
"centered = returns - np.mean(returns)\n",
"kurtosis = float(np.mean(centered**4) / sigma_1m**4)\n",
"skew = float(np.mean(centered**3) / sigma_1m**3)\n",
"print(f'\\nKurtosis: {kurtosis:.1f} (Normal=3)')\n",
"print(f'Skewness: {skew:.3f} (Normal=0)')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 2.2 Arrival rate (λ) y Hawkes clustering"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Trades per second. group_by only returns seconds that contain at least one trade.\n",
"trades_per_sec = trades.with_columns(\n",
"    (pl.col('timestamp') // 1000).alias('second')\n",
").group_by('second').agg(\n",
"    pl.len().alias('n_trades'),\n",
"    pl.col('qty').sum().alias('volume'),\n",
").sort('second')\n",
"\n",
"# Put the counts on a contiguous one-second grid with explicit zeros for empty\n",
"# seconds. Without this the mean rate is computed over active seconds only and\n",
"# the ACF lags below would count rows, not elapsed seconds.\n",
"active_seconds = trades_per_sec['second'].to_numpy()\n",
"first_second = active_seconds[0]\n",
"arrivals = np.zeros(int(active_seconds[-1] - first_second) + 1, dtype=np.int64)\n",
"arrivals[active_seconds - first_second] = trades_per_sec['n_trades'].to_numpy()\n",
"\n",
"lambda_mean = np.mean(arrivals)\n",
"var_mean = np.var(arrivals) / np.mean(arrivals)\n",
"\n",
"print(f'Trades/segundo: media={lambda_mean:.1f}, mediana={np.median(arrivals):.0f}')\n",
"print(f'Var/Mean ratio: {var_mean:.1f} (=1 si Poisson, >1 = clustering)')\n",
"\n",
"# Autocorrelation of the per-second counts for lags 0 .. max_lag - 1.\n",
"max_lag = 60\n",
"mean_a = np.mean(arrivals)\n",
"var_a = np.var(arrivals)\n",
"centered = arrivals - mean_a\n",
"acf = np.array(\n",
"    [1.0] + [np.mean(centered[lag:] * centered[:-lag]) / var_a for lag in range(1, max_lag)]\n",
")\n",
"\n",
"\n",
"def exp_fn(x, a, b):\n",
"    \"\"\"Exponential decay a * exp(-b * x) used as the model for the ACF.\"\"\"\n",
"    return a * np.exp(-b * x)\n",
"\n",
"\n",
"# Fit the exponential to the positive ACF values (lags >= 1).\n",
"# NOTE: alpha/beta here are the amplitude and decay rate of that ACF fit, so the\n",
"# 'branching ratio' below is a heuristic proxy, not a likelihood-based Hawkes estimate.\n",
"lags = np.arange(1, max_lag)\n",
"acf_vals = acf[1:]\n",
"positive_mask = acf_vals > 0\n",
"hawkes_a, hawkes_b, branching = 0, 1, 0  # fallback: too few points or failed fit\n",
"if np.sum(positive_mask) > 5:\n",
"    try:\n",
"        popt, _ = curve_fit(exp_fn, lags[positive_mask], acf_vals[positive_mask], p0=[0.3, 0.1], maxfev=5000)\n",
"    except (RuntimeError, ValueError) as fit_error:\n",
"        # curve_fit raises RuntimeError when the minimisation fails and ValueError\n",
"        # on invalid input; anything else (e.g. KeyboardInterrupt) must propagate.\n",
"        print(f'Ajuste exponencial fallido: {fit_error}')\n",
"    else:\n",
"        hawkes_a, hawkes_b = abs(popt[0]), abs(popt[1])\n",
"        branching = hawkes_a / hawkes_b\n",
"\n",
"print(f'\\nHawkes (ajuste exp a ACF):')\n",
"print(f' α ≈ {hawkes_a:.4f}')\n",
"print(f' β ≈ {hawkes_b:.4f}')\n",
"print(f' Branching ratio η = α/β = {branching:.3f} (< 1 = estacionario)')\n",
"\n",
"fig, axes = plt.subplots(1, 3, figsize=(16, 4))\n",
"\n",
"# ACF bars, fitted curve (when available) and the +/- 1.96/sqrt(n) band.\n",
"ax = axes[0]\n",
"ax.bar(range(max_lag), acf, color='#e67e22', alpha=0.6)\n",
"ax.axhline(y=0, color='black', linewidth=0.5)\n",
"ci = 1.96 / np.sqrt(len(arrivals))\n",
"ax.axhline(y=ci, color='blue', linestyle='--', linewidth=0.8, alpha=0.5)\n",
"ax.axhline(y=-ci, color='blue', linestyle='--', linewidth=0.8, alpha=0.5)\n",
"if hawkes_a > 0:\n",
"    ax.plot(lags, exp_fn(lags, hawkes_a, hawkes_b), 'r-', linewidth=2, label=f'Exp fit: α={hawkes_a:.3f}, β={hawkes_b:.3f}')\n",
"    # Only draw a legend when there is a labelled artist to show.\n",
"    ax.legend(fontsize=7)\n",
"ax.set_title('ACF trades/segundo')\n",
"ax.set_xlabel('Lag (s)')\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"# Distribution of per-second arrivals.\n",
"ax = axes[1]\n",
"ax.hist(arrivals, bins=50, density=True, color='#3498db', alpha=0.6)\n",
"ax.set_title(f'Trades/segundo (media={lambda_mean:.1f}, V/M={var_mean:.1f})')\n",
"ax.set_xlabel('Trades/s')\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"# Rolling lambda: moving average of the counts over a 300-second window.\n",
"w = 300\n",
"rolling_lambda = np.convolve(arrivals, np.ones(w)/w, mode='valid')\n",
"ax = axes[2]\n",
"ax.plot(rolling_lambda, linewidth=0.5, color='#9b59b6')\n",
"ax.axhline(y=lambda_mean, color='black', linestyle='--', linewidth=0.8)\n",
"ax.set_title(f'λ rolling (ventana {w}s = 5min)')\n",
"ax.set_ylabel('Trades/s')\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 2.3 Distribución de tamaños (Pareto)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"def pareto_tail_alpha(values, percentile=90):\n",
"    \"\"\"Return (x_min, alpha): Pareto maximum-likelihood exponent of the upper tail.\n",
"\n",
"    x_min is the given percentile of `values`; alpha = n_tail / sum(log(x / x_min))\n",
"    over the observations >= x_min. alpha is NaN when that log-sum is not positive\n",
"    (every tail observation equals x_min), instead of dividing by zero.\n",
"    \"\"\"\n",
"    x_min = np.percentile(values, percentile)\n",
"    tail = values[values >= x_min]\n",
"    log_excess = np.sum(np.log(tail / x_min))\n",
"    alpha = len(tail) / log_excess if log_excess > 0 else float('nan')\n",
"    return x_min, alpha\n",
"\n",
"\n",
"# Strictly positive trade sizes (qty) and notional values (qty * price).\n",
"sizes = trades['qty'].to_numpy()\n",
"sizes = sizes[sizes > 0]\n",
"costs = (trades['qty'] * trades['price']).to_numpy()\n",
"costs = costs[costs > 0]\n",
"\n",
"# Pareto MLE on the tail above the 90th percentile, same estimator for both series.\n",
"x_min_qty, alpha_qty = pareto_tail_alpha(sizes)\n",
"x_min_cost, alpha_cost = pareto_tail_alpha(costs)\n",
"\n",
"print(f'Tamaños (BTC):')\n",
"print(f' Mediana: {np.median(sizes):.6f} BTC')\n",
"print(f' p99: {np.percentile(sizes, 99):.4f} BTC')\n",
"print(f' Max: {sizes.max():.2f} BTC')\n",
"print(f' Pareto α (cola p90+): {alpha_qty:.2f}')\n",
"\n",
"print(f'\\nTurnover (USDT):')\n",
"print(f' Mediana: ${np.median(costs):,.0f}')\n",
"print(f' p99: ${np.percentile(costs, 99):,.0f}')\n",
"print(f' Max: ${costs.max():,.0f}')\n",
"print(f' Pareto α (cola p90+): {alpha_cost:.2f}')\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
"\n",
"# Log-log empirical CCDF with the fitted Pareto tail drawn from x_min upwards.\n",
"for ax, data, tail_alpha, label, xmin in [\n",
"    (axes[0], sizes, alpha_qty, 'BTC', x_min_qty),\n",
"    (axes[1], costs, alpha_cost, 'USDT', x_min_cost),\n",
"]:\n",
"    sorted_d = np.sort(data)[::-1]\n",
"    ranks = np.arange(1, len(sorted_d) + 1) / len(sorted_d)\n",
"    ax.loglog(sorted_d, ranks, '.', markersize=0.5, alpha=0.3, color='#2ecc71')\n",
"    x_fit = np.logspace(np.log10(xmin), np.log10(data.max()), 50)\n",
"    # Scale the fitted curve by the share of observations at or above x_min.\n",
"    tail_fraction = np.count_nonzero(data >= xmin) / len(data)\n",
"    ax.loglog(x_fit, (x_fit / xmin) ** (-tail_alpha) * tail_fraction,\n",
"              'r-', linewidth=2, label=f'Pareto α={tail_alpha:.2f}')\n",
"    ax.set_title(f'CCDF tamaños ({label})')\n",
"    ax.set_xlabel(label)\n",
"    ax.set_ylabel('P(X > x)')\n",
"    ax.legend()\n",
"    ax.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 2.4 Jumps y colas pesadas"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Flag jumps: one-minute returns whose magnitude exceeds three global sigmas.\n",
"threshold = 3 * sigma_1m\n",
"abs_returns = np.abs(returns)\n",
"jump_mask = abs_returns > threshold\n",
"n_jumps = np.sum(jump_mask)\n",
"jump_intensity = n_jumps / len(returns)\n",
"jump_sizes = abs_returns[jump_mask]\n",
"# A standard deviation needs at least two jumps; otherwise report 0.\n",
"jump_size_std = 0 if len(jump_sizes) <= 1 else np.std(jump_sizes)\n",
"\n",
"print(f'Jumps detectados (>3σ): {n_jumps} de {len(returns)} ({jump_intensity*100:.1f}%)')\n",
"print(f'Jump size std: {jump_size_std:.6f}')\n",
"print(f'Kurtosis: {kurtosis:.1f} (Normal=3, >3 = colas pesadas)')\n",
"\n",
"# Return series with the flagged jumps highlighted and the +/- threshold lines.\n",
"fig, ax = plt.subplots(figsize=(16, 4))\n",
"ax.plot(returns, linewidth=0.3, color='#3498db', alpha=0.6)\n",
"idx = np.flatnonzero(jump_mask)\n",
"ax.scatter(idx, returns[idx], color='red', s=10, zorder=5, label=f'Jumps ({n_jumps})')\n",
"for level in (threshold, -threshold):\n",
"    ax.axhline(y=level, color='red', linestyle='--', linewidth=0.5, alpha=0.5)\n",
"ax.set_title('Retornos 1m — jumps marcados en rojo')\n",
"ax.legend(fontsize=8)\n",
"ax.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### 2.5 Fills por aggTrade — estructura de las órdenes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# n_fills column of each aggTrade (number of fills aggregated into it).\n",
"fills = trades['n_fills'].to_numpy()\n",
"\n",
"# Count and share of aggTrades per fill-count bucket.\n",
"fill_buckets = [\n",
"    (' 1 fill (no cruzó niveles)', fills == 1),\n",
"    (' 2-5 fills', (fills >= 2) & (fills <= 5)),\n",
"    (' 6-20 fills', (fills >= 6) & (fills <= 20)),\n",
"    (' >20 fills (ballenas)', fills > 20),\n",
"]\n",
"print(f'Fills por aggTrade:')\n",
"for bucket_label, bucket_mask in fill_buckets:\n",
"    print(f'{bucket_label}: {np.sum(bucket_mask):,} ({np.mean(bucket_mask)*100:.1f}%)')\n",
"print(f' Max fills: {fills.max()}')\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"ax_hist, ax_scatter = axes\n",
"\n",
"# Histogram restricted to aggTrades with at most 20 fills, one bin per integer.\n",
"ax_hist.hist(fills[fills <= 20], bins=range(1, 22), color='#3498db', alpha=0.6, edgecolor='white')\n",
"ax_hist.set_title('Fills por aggTrade (≤20)')\n",
"ax_hist.set_xlabel('Número de fills')\n",
"ax_hist.set_ylabel('Frecuencia')\n",
"\n",
"# Qty vs n_fills on log-log axes, drawn from a seeded sample of at most 50,000 rows.\n",
"sample = trades.sample(min(50000, trades.shape[0]), seed=42)\n",
"ax_scatter.scatter(sample['n_fills'].to_numpy(), sample['qty'].to_numpy(), s=0.5, alpha=0.2, color='#e67e22')\n",
"ax_scatter.set_xlabel('Fills por aggTrade')\n",
"ax_scatter.set_ylabel('Qty (BTC)')\n",
"ax_scatter.set_title('Tamaño de orden vs fills (más grande = barre más niveles)')\n",
"ax_scatter.set_yscale('log')\n",
"ax_scatter.set_xscale('log')\n",
"\n",
"for panel in axes:\n",
"    panel.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---\n",
|
||
"## 3. Bitstamp L3: comparar con Binance"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Load the Bitstamp L3 captures when the files exist.\n",
"bs_orders_path = DATA / 'bitstamp_btcusd_l3_orders.csv'\n",
"bs_trades_path = DATA / 'bitstamp_btcusd_l3_trades.csv'\n",
"\n",
"if bs_orders_path.exists():\n",
"    bs_orders = pl.read_csv(str(bs_orders_path))\n",
"    print(f'Bitstamp L3 orders: {bs_orders.shape[0]:,}')\n",
"    print(bs_orders.group_by('event').agg(pl.len().alias('count')).sort('count', descending=True))\n",
"    print()\n",
"\n",
"    # Event counts used for the delete/create ratio.\n",
"    creates = bs_orders.filter(pl.col('event') == 'order_created').shape[0]\n",
"    deletes = bs_orders.filter(pl.col('event') == 'order_deleted').shape[0]\n",
"    changes = bs_orders.filter(pl.col('event') == 'order_changed').shape[0]\n",
"    print(f'Creadas: {creates:,} Borradas: {deletes:,} Cambiadas: {changes:,}')\n",
"\n",
"    # A capture without order_created events would make the ratio divide by zero,\n",
"    # so the ratio and its interpretation are only printed when it is defined.\n",
"    if creates > 0:\n",
"        print(f'Ratio delete/create: {deletes/creates:.2f} (cercano a 1 = la mayoría se cancela sin ejecutar)')\n",
"        print(f'\\nEsto revela algo fundamental: la mayoría de órdenes se CANCELAN, no se ejecutan.')\n",
"        print(f'Los makers constantemente ponen y quitan órdenes para ajustar sus quotes.')\n",
"    else:\n",
"        print('Ratio delete/create: n/a (la captura no contiene eventos order_created)')\n",
"\n",
"if bs_trades_path.exists():\n",
"    bs_trades = pl.read_csv(str(bs_trades_path))\n",
"    print(f'\\nBitstamp L3 trades: {bs_trades.shape[0]:,}')\n",
"    print(bs_trades.head(3))\n",
"\n",
"    # The trades file carries buy_order_id and sell_order_id columns for each trade.\n",
"    print(f'\\nCon L3 vemos los IDs del buyer y seller de cada trade:')\n",
"    print(f' Unique buy_order_ids: {bs_trades[\"buy_order_id\"].n_unique():,}')\n",
"    print(f' Unique sell_order_ids: {bs_trades[\"sell_order_id\"].n_unique():,}')\n",
"\n",
"if not bs_orders_path.exists():\n",
"    print('Bitstamp L3 aún no disponible. Ejecutar notebook 05 primero.')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---\n",
|
||
"## 4. Resumen: parámetros calibrados desde datos reales\n",
|
||
"\n",
|
||
"Estos son los valores que usaríamos para que nuestra simulación genere datos similares a BTC/USDT."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Collect every calibrated parameter into one report and print it in a single call.\n",
"rule = '=' * 65\n",
"report_lines = [\n",
"    rule,\n",
"    ' PARÁMETROS CALIBRADOS DESDE BTC/USDT REAL',\n",
"    ' Dataset: 1M aggTrades, ~26 horas',\n",
"    rule,\n",
"    '',\n",
"    ' # Precio fundamental',\n",
"    f' sigma = {sigma_1m:.6f} # por minuto',\n",
"    f' mu = {np.mean(returns):.8f} # drift (cercano a 0)',\n",
"    '',\n",
"    ' # Jumps',\n",
"    f' jump_intensity = {jump_intensity:.4f} # {jump_intensity*100:.1f}% de velas tienen jump',\n",
"    f' jump_size_std = {jump_size_std:.6f}',\n",
"    '',\n",
"    ' # Arrival rate',\n",
"    f' n_takers_lambda = {lambda_mean:.1f} # aggTrades/segundo',\n",
"    '',\n",
"    ' # Hawkes clustering',\n",
"    f' hawkes_alpha = {hawkes_a:.4f}',\n",
"    f' hawkes_beta = {hawkes_b:.4f}',\n",
"    f' branching_ratio = {branching:.3f}',\n",
"    '',\n",
"    ' # Distribución de tamaños',\n",
"    f' taker_size_alpha = {alpha_qty:.2f} # Pareto exponent (cola p90+)',\n",
"    f' taker_size_min = {np.percentile(sizes, 5):.6f} # BTC (p5)',\n",
"    f' taker_size_max = {np.percentile(sizes, 99.9):.4f} # BTC (p99.9)',\n",
"    '',\n",
"    ' # Estructura de fills',\n",
"    f' median_fills_per_order = {np.median(fills):.0f}',\n",
"    f' pct_single_fill = {np.mean(fills==1)*100:.1f}%',\n",
"    '',\n",
"    ' # Resumen estadístico',\n",
"    f' kurtosis = {kurtosis:.1f}',\n",
"    f' skewness = {skew:.3f}',\n",
"    f' var_mean_ratio = {var_mean:.1f}',\n",
"    rule,\n",
"]\n",
"print('\\n'.join(report_lines))"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"name": "python",
|
||
"version": "3.13.0"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|