{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Alpha Research: señales de microestructura\n", "\n", "Exploramos señales que podrían predecir movimientos de precio a corto plazo.\n", "\n", "Para cada señal:\n", "1. La calculamos sobre los datos reales\n", "2. Medimos su correlación con retornos futuros a distintos horizontes\n", "3. Visualizamos si tiene poder predictivo\n", "\n", "**Datos:** 1M aggTrades BTC/USDT (~26h)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Trades: 1,000,000\n", "Columnas: ['agg_trade_id', 'price', 'qty', 'first_trade_id', 'last_trade_id', 'timestamp', 'is_buyer_maker', 'side', 'n_fills']\n" ] } ], "source": [ "import polars as pl\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from pathlib import Path\n", "from scipy.stats import spearmanr\n", "\n", "DATA = Path('../data')\n", "trades = pl.read_csv(str(DATA / 'binance_btcusdt_aggtrades_1M.csv'))\n", "print(f'Trades: {trades.shape[0]:,}')\n", "print(f'Columnas: {trades.columns}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparación: agrupar en barras de tiempo\n", "\n", "Las señales se calculan sobre ventanas de tiempo, no sobre trades individuales.\n", "Creamos barras de 1 segundo con todas las métricas que necesitamos." 
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1-second bars. Only seconds that contain at least one trade produce a row, so the\n",
    "# row-based shifts / rolling windows used below span N bars (N seconds only when no\n",
    "# second is empty).\n",
    "# NOTE(review): `timestamp // 1000` assumes epoch milliseconds -- confirm against the CSV.\n",
    "bars = trades.with_columns(\n",
    "    (pl.col('timestamp') // 1000).alias('second'),\n",
    "    (pl.col('price') * pl.col('qty')).alias('turnover'),\n",
    "    pl.when(pl.col('side') == 'buy').then(pl.col('qty')).otherwise(0.0).alias('buy_qty'),\n",
    "    pl.when(pl.col('side') == 'sell').then(pl.col('qty')).otherwise(0.0).alias('sell_qty'),\n",
    "    pl.when(pl.col('side') == 'buy').then(1).otherwise(0).alias('is_buy'),\n",
    ").group_by('second').agg(\n",
    "    pl.col('price').last().alias('close'),\n",
    "    pl.col('price').first().alias('open'),\n",
    "    pl.col('price').max().alias('high'),\n",
    "    pl.col('price').min().alias('low'),\n",
    "    pl.col('qty').sum().alias('volume'),\n",
    "    pl.col('turnover').sum().alias('turnover'),\n",
    "    pl.len().alias('n_trades'),\n",
    "    pl.col('buy_qty').sum().alias('buy_volume'),\n",
    "    pl.col('sell_qty').sum().alias('sell_volume'),\n",
    "    pl.col('is_buy').sum().alias('n_buys'),\n",
    "    (pl.len() - pl.col('is_buy').sum()).alias('n_sells'),\n",
    "    pl.col('n_fills').max().alias('max_fills'),  # biggest order this second\n",
    "    pl.col('qty').max().alias('max_qty'),\n",
    ").sort('second')\n",
    "\n",
    "# Per-second VWAP\n",
    "bars = bars.with_columns(\n",
    "    (pl.col('turnover') / pl.col('volume')).alias('vwap'),\n",
    ")\n",
    "\n",
    "# Forward log returns at several horizons (targets used to evaluate the signals)\n",
    "for horizon in [1, 5, 10, 30, 60]:\n",
    "    bars = bars.with_columns(\n",
    "        (pl.col('close').shift(-horizon).log() - pl.col('close').log()).alias(f'fwd_ret_{horizon}s')\n",
    "    )\n",
    "\n",
    "print(f'Barras de 1s: {bars.shape[0]:,}')\n",
    "print(f'Columnas: {bars.columns}')\n",
    "print(bars.head(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_signal(bars: pl.DataFrame, signal_col: str, name: str, horizons=(1, 5, 10, 30, 60)):\n",
    "    \"\"\"Evaluate a signal: Spearman correlation with forward returns, plus plots.\n",
    "\n",
    "    Draws one figure with ``len(horizons) + 1`` panels: the first 2000 signal\n",
    "    values, then, for every horizon, a bar chart of the mean forward return\n",
    "    (in bps) per signal-quantile bin. Prints one summary line per horizon.\n",
    "\n",
    "    Args:\n",
    "        bars: frame holding ``signal_col`` and one ``fwd_ret_{h}s`` column for\n",
    "            every ``h`` in ``horizons``.\n",
    "        signal_col: name of the signal column in ``bars``.\n",
    "        name: label used in titles and axis labels.\n",
    "        horizons: forward-return horizons to evaluate.\n",
    "\n",
    "    Returns:\n",
    "        List of ``(horizon, rho, pval)`` tuples in ``horizons`` order. A horizon\n",
    "        with fewer than 100 usable rows is reported as ``(horizon, 0.0, 1.0)``.\n",
    "    \"\"\"\n",
    "    horizons = list(horizons)\n",
    "    n_panels = len(horizons) + 1\n",
    "    fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4))\n",
    "\n",
    "    # Panel 1: the signal over time\n",
    "    ax = axes[0]\n",
    "    sig = bars[signal_col].to_numpy()\n",
    "    ax.plot(sig[:2000], linewidth=0.3, color='#3498db', alpha=0.7)\n",
    "    ax.set_title(f'{name}\\n(primeros 2000s)', fontsize=9)\n",
    "    ax.set_xlabel('Segundo')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # Panels 2+: binned signal vs forward return, one panel per horizon\n",
    "    n_bins = 20\n",
    "    corrs = []\n",
    "    for i, h in enumerate(horizons):\n",
    "        ax = axes[i + 1]\n",
    "        ret_col = f'fwd_ret_{h}s'\n",
    "\n",
    "        clean = bars.select([signal_col, ret_col]).drop_nulls()\n",
    "        x = clean[signal_col].to_numpy()\n",
    "        y = clean[ret_col].to_numpy()\n",
    "\n",
    "        # drop_nulls() keeps NaN/inf (e.g. a 0/0 ratio); with spearmanr's default\n",
    "        # nan_policy a single NaN would turn the whole correlation into NaN.\n",
    "        finite = np.isfinite(x) & np.isfinite(y)\n",
    "        x, y = x[finite], y[finite]\n",
    "\n",
    "        if x.size < 100:\n",
    "            # Too few observations: neutral placeholder (floats keep the result schema uniform)\n",
    "            corrs.append((h, 0.0, 1.0))\n",
    "            ax.set_title(f'{h}s: datos insuficientes', fontsize=9, color='gray')\n",
    "            continue\n",
    "\n",
    "        # Spearman (rank correlation, more robust to outliers)\n",
    "        rho, pval = spearmanr(x, y)\n",
    "        corrs.append((h, rho, pval))\n",
    "\n",
    "        # Binned view: split the signal into up to 20 quantile bins and plot the mean\n",
    "        # forward return of each bin. Skipped when the signal has too few distinct\n",
    "        # quantile edges to form at least two bins.\n",
    "        edges = np.unique(np.percentile(x, np.linspace(0, 100, n_bins + 1)))\n",
    "        if len(edges) >= 3:\n",
    "            bin_idx = np.clip(np.digitize(x, edges) - 1, 0, len(edges) - 2)\n",
    "            bin_means_y = [\n",
    "                np.mean(y[bin_idx == b]) * 10000  # in bps\n",
    "                for b in range(len(edges) - 1)\n",
    "                if np.any(bin_idx == b)\n",
    "            ]\n",
    "            ax.bar(range(len(bin_means_y)), bin_means_y, color='#2ecc71' if rho > 0 else '#e74c3c', alpha=0.6)\n",
    "\n",
    "        color = 'green' if abs(rho) > 0.02 and pval < 0.01 else 'gray'\n",
    "        ax.set_title(f'{h}s: ρ={rho:.4f}\\np={pval:.2e}', fontsize=9, color=color)\n",
    "        ax.set_xlabel(f'Bin de {name}')\n",
    "        if i == 0:\n",
    "            ax.set_ylabel('Ret futuro (bps)')\n",
    "        ax.axhline(y=0, color='black', linewidth=0.5)\n",
    "        ax.grid(True, alpha=0.3)\n",
    "\n",
    "    fig.suptitle(f'Señal: {name}', fontsize=12, fontweight='bold')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Text summary, one line per horizon\n",
    "    for h, rho, pval in corrs:\n",
    "        sig_marker = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''\n",
    "        print(f' {h:>3}s: ρ={rho:+.4f} (p={pval:.2e}) {sig_marker}')\n",
    "\n",
    "    return corrs\n",
    "\n",
    "print('evaluate_signal() definida')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 1: Order Flow Imbalance (OFI)\n",
    "\n",
    "**Qué mide:** La diferencia entre volumen de compras y ventas en los últimos N segundos.  \n",
    "**Intuición:** Si llegan más market buys que sells, hay presión compradora → el precio debería subir.  \n",
    "**Fórmula:** `OFI = (buy_volume - sell_volume) / (buy_volume + sell_volume)`  \n",
    "Normalizado entre -1 (todo sells) y +1 (todo buys)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OFI over rolling windows of 5, 10 and 30 bars\n",
    "for w in [5, 10, 30]:\n",
    "    buy_sum = bars['buy_volume'].rolling_sum(window_size=w)\n",
    "    sell_sum = bars['sell_volume'].rolling_sum(window_size=w)\n",
    "    total = buy_sum + sell_sum\n",
    "    ofi = (buy_sum - sell_sum) / total\n",
    "    bars = bars.with_columns(ofi.alias(f'ofi_{w}s'))\n",
    "\n",
    "print('OFI 5s:')\n",
    "corrs_ofi5 = evaluate_signal(bars, 'ofi_5s', 'OFI 5s')\n",
    "print('\\nOFI 10s:')\n",
    "corrs_ofi10 = evaluate_signal(bars, 'ofi_10s', 'OFI 10s')\n",
    "print('\\nOFI 30s:')\n",
    "corrs_ofi30 = evaluate_signal(bars, 'ofi_30s', 'OFI 30s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 2: Trade Count Imbalance\n",
    "\n",
    "**Qué mide:** Diferencia entre número de buys y sells (no volumen, sino conteo).  \n",
    "**Intuición:** Muchos trades pequeños de compra pueden ser más informativos que un solo trade grande.  
\n",
    "**Fórmula:** `TCI = (n_buys - n_sells) / (n_buys + n_sells)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rolling buy / sell trade counts -> normalised count imbalance per window\n",
    "tci_exprs = []\n",
    "for window in (5, 10, 30):\n",
    "    buys = pl.col('n_buys').rolling_sum(window_size=window)\n",
    "    sells = pl.col('n_sells').rolling_sum(window_size=window)\n",
    "    tci_exprs.append(((buys - sells) / (buys + sells)).alias(f'tci_{window}s'))\n",
    "bars = bars.with_columns(tci_exprs)\n",
    "\n",
    "print('Trade Count Imbalance 10s:')\n",
    "corrs_tci = evaluate_signal(bars, 'tci_10s', 'TCI 10s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 3: Trade Intensity (aceleración de actividad)\n",
    "\n",
    "**Qué mide:** ¿Están llegando trades más rápido que lo normal?  \n",
    "**Intuición:** Aceleraciones predicen movimientos — los informados tradean antes del movimiento.  \n",
    "**Fórmula:** `intensity = trades_last_5s / trades_last_60s_avg`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean trades per bar over the last 5 bars, relative to the mean over the last 60 bars\n",
    "recent_rate = pl.col('n_trades').rolling_sum(window_size=5) / 5\n",
    "baseline_rate = pl.col('n_trades').rolling_mean(window_size=60)\n",
    "bars = bars.with_columns(\n",
    "    (recent_rate / baseline_rate).alias('trade_intensity')\n",
    ")\n",
    "\n",
    "print('Trade Intensity (5s / 60s avg):')\n",
    "corrs_intensity = evaluate_signal(bars, 'trade_intensity', 'Trade Intensity')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 4: Volume-Weighted Imbalance\n",
    "\n",
    "**Qué mide:** OFI pero ponderando más los trades grandes (ballenas).  \n",
    "**Intuición:** Un trade de 1 BTC tiene más información que 100 trades de 0.001 BTC.  
\n",
    "**Fórmula:** `whale_flow_10s = Σ(buy_qty − sell_qty)` de los trades con más de 5 fills (proxy de trade grande), acumulado en las últimas 10 barras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Signal built from the largest trades of each second.\n",
    "# max_qty already captures the biggest trade per bar, but not its side, so the signed\n",
    "# flow is rebuilt from the raw trades.\n",
    "# n_fills is used as a proxy: more fills = a larger order that swept more levels.\n",
    "\n",
    "# Proxy: signed volume of the trades with more than 5 fills (whales)\n",
    "whale_trades = trades.filter(pl.col('n_fills') > 5).with_columns(\n",
    "    (pl.col('timestamp') // 1000).alias('second'),\n",
    "    pl.when(pl.col('side') == 'buy').then(pl.col('qty')).otherwise(-pl.col('qty')).alias('signed_qty'),\n",
    ")\n",
    "\n",
    "whale_flow = whale_trades.group_by('second').agg(\n",
    "    pl.col('signed_qty').sum().alias('whale_flow'),\n",
    "    pl.len().alias('whale_count'),\n",
    ").sort('second')\n",
    "\n",
    "# Left join keeps every bar; bars without whale trades get zero flow / count.\n",
    "# Re-sort afterwards: the rolling sum below depends on row order, and join does not\n",
    "# promise to preserve the order of the left frame.\n",
    "bars = bars.join(whale_flow, on='second', how='left').with_columns(\n",
    "    pl.col('whale_flow').fill_null(0.0),\n",
    "    pl.col('whale_count').fill_null(0),\n",
    ").sort('second')\n",
    "\n",
    "# Rolling whale flow over the last 10 bars\n",
    "bars = bars.with_columns(\n",
    "    pl.col('whale_flow').rolling_sum(window_size=10).alias('whale_flow_10s'),\n",
    ")\n",
    "\n",
    "print('Whale Flow 10s (trades con >5 fills):')\n",
    "print(f'Trades clasificados como ballena: {whale_trades.shape[0]:,} ({whale_trades.shape[0]/trades.shape[0]*100:.1f}%)')\n",
    "corrs_whale = evaluate_signal(bars, 'whale_flow_10s', 'Whale Flow 10s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 5: VWAP Deviation\n",
    "\n",
    "**Qué mide:** ¿El precio actual está por encima o debajo del VWAP reciente?  \n",
    "**Intuición:** El precio tiende a revertir al VWAP (mean reversion).  
\n",
    "**Fórmula:** `deviation = (close - vwap_rolling) / close`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative distance of the close from the rolling VWAP, one column per window\n",
    "vwap_dev_exprs = []\n",
    "for window in (30, 60, 300):\n",
    "    window_turnover = pl.col('turnover').rolling_sum(window_size=window)\n",
    "    window_volume = pl.col('volume').rolling_sum(window_size=window)\n",
    "    window_vwap = window_turnover / window_volume\n",
    "    vwap_dev_exprs.append(\n",
    "        ((pl.col('close') - window_vwap) / pl.col('close')).alias(f'vwap_dev_{window}s')\n",
    "    )\n",
    "bars = bars.with_columns(vwap_dev_exprs)\n",
    "\n",
    "print('VWAP Deviation 30s:')\n",
    "corrs_vwap30 = evaluate_signal(bars, 'vwap_dev_30s', 'VWAP Dev 30s')\n",
    "print('\\nVWAP Deviation 60s:')\n",
    "corrs_vwap60 = evaluate_signal(bars, 'vwap_dev_60s', 'VWAP Dev 60s')\n",
    "print('\\nVWAP Deviation 300s (5min):')\n",
    "corrs_vwap300 = evaluate_signal(bars, 'vwap_dev_300s', 'VWAP Dev 5min')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 6: Volatility Breakout\n",
    "\n",
    "**Qué mide:** ¿La volatilidad actual es anormalmente alta?  \n",
    "**Intuición:** Picos de volatilidad preceden movimientos direccionales (momentum post-breakout).  \n",
    "**Fórmula:** `breakout = vol_5s / vol_60s`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Realised volatility proxy: per-bar range (high - low) / close\n",
    "bar_range = (pl.col('high') - pl.col('low')) / pl.col('close')\n",
    "bars = bars.with_columns(bar_range.alias('range_pct'))\n",
    "\n",
    "# Ratio of the 5-bar mean range to the 60-bar mean range\n",
    "recent_range = pl.col('range_pct').rolling_mean(window_size=5)\n",
    "baseline_range = pl.col('range_pct').rolling_mean(window_size=60)\n",
    "bars = bars.with_columns(\n",
    "    (recent_range / baseline_range).alias('vol_breakout')\n",
    ")\n",
    "\n",
    "print('Volatility Breakout (5s / 60s):')\n",
    "corrs_volbreak = evaluate_signal(bars, 'vol_breakout', 'Vol Breakout')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Señal 7: Retorno reciente (momentum/reversal)\n",
    "\n",
    "**Qué mide:** ¿El precio acaba de subir o bajar?  
\n",
    "**Intuición:** A muy corto plazo puede haber momentum (inercia) o reversal (rebote).  \n",
    "**Fórmula:** `ret_Ns = log(close) - log(close_N_ago)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trailing log returns over the last w bars\n",
    "for w in [1, 5, 10, 30, 60]:\n",
    "    bars = bars.with_columns(\n",
    "        (pl.col('close').log() - pl.col('close').shift(w).log()).alias(f'past_ret_{w}s')\n",
    "    )\n",
    "\n",
    "print('Past Return 1s (ultra corto):')\n",
    "corrs_ret1 = evaluate_signal(bars, 'past_ret_1s', 'Past Ret 1s')\n",
    "print('\\nPast Return 5s:')\n",
    "corrs_ret5 = evaluate_signal(bars, 'past_ret_5s', 'Past Ret 5s')\n",
    "print('\\nPast Return 30s:')\n",
    "corrs_ret30 = evaluate_signal(bars, 'past_ret_30s', 'Past Ret 30s')\n",
    "print('\\nPast Return 60s:')\n",
    "corrs_ret60 = evaluate_signal(bars, 'past_ret_60s', 'Past Ret 60s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "## Resumen: ranking de señales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every (signal, horizon) correlation computed above\n",
    "all_signals = [\n",
    "    ('OFI 5s', corrs_ofi5),\n",
    "    ('OFI 10s', corrs_ofi10),\n",
    "    ('OFI 30s', corrs_ofi30),\n",
    "    ('TCI 10s', corrs_tci),\n",
    "    ('Trade Intensity', corrs_intensity),\n",
    "    ('Whale Flow 10s', corrs_whale),\n",
    "    ('VWAP Dev 30s', corrs_vwap30),\n",
    "    ('VWAP Dev 60s', corrs_vwap60),\n",
    "    ('VWAP Dev 5min', corrs_vwap300),\n",
    "    ('Vol Breakout', corrs_volbreak),\n",
    "    ('Past Ret 1s', corrs_ret1),\n",
    "    ('Past Ret 5s', corrs_ret5),\n",
    "    ('Past Ret 30s', corrs_ret30),\n",
    "    ('Past Ret 60s', corrs_ret60),\n",
    "]\n",
    "\n",
    "# float() keeps both numeric columns a uniform float type even when a horizon carries\n",
    "# a placeholder value instead of a scipy result.\n",
    "records = [\n",
    "    {'signal': name, 'horizon_s': h, 'spearman_rho': round(float(rho), 5), 'p_value': float(pval)}\n",
    "    for name, corrs in all_signals\n",
    "    for h, rho, pval in corrs\n",
    "]\n",
    "results = pl.DataFrame(records)\n",
    "\n",
    "\n",
    "def significance_stars(pval):\n",
    "    \"\"\"Return '***', '**' or '*' for p < 0.001, < 0.01, < 0.05; '' otherwise.\"\"\"\n",
    "    if pval < 0.001:\n",
    "        return '***'\n",
    "    if pval < 0.01:\n",
    "        return '**'\n",
    "    if pval < 0.05:\n",
    "        return '*'\n",
    "    return ''\n",
    "\n",
    "\n",
    "# Correlation heatmap: rows = signals, columns = horizons\n",
    "signal_names = [s[0] for s in all_signals]\n",
    "horizons = [1, 5, 10, 30, 60]\n",
    "\n",
    "matrix = np.zeros((len(signal_names), len(horizons)))\n",
    "pvalues = np.ones_like(matrix)  # p = 1 -> no star for a (signal, horizon) pair without a record\n",
    "for row in results.iter_rows(named=True):\n",
    "    i = signal_names.index(row['signal'])\n",
    "    j = horizons.index(row['horizon_s'])\n",
    "    matrix[i, j] = row['spearman_rho']\n",
    "    pvalues[i, j] = row['p_value']\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 10))\n",
    "# nanmax: a single NaN correlation must not collapse the colour scale\n",
    "vmax = max(0.01, np.nanmax(np.abs(matrix)))\n",
    "im = ax.imshow(matrix, cmap='RdBu_r', aspect='auto', vmin=-vmax, vmax=vmax)\n",
    "ax.set_xticks(range(len(horizons)))\n",
    "ax.set_xticklabels([f'{h}s' for h in horizons], fontsize=10)\n",
    "ax.set_yticks(range(len(signal_names)))\n",
    "ax.set_yticklabels(signal_names, fontsize=10)\n",
    "\n",
    "# Annotate each cell with rho and its significance stars\n",
    "for i in range(len(signal_names)):\n",
    "    for j in range(len(horizons)):\n",
    "        val = matrix[i, j]\n",
    "        star = significance_stars(pvalues[i, j])\n",
    "        color = 'white' if abs(val) > vmax * 0.6 else 'black'\n",
    "        ax.text(j, i, f'{val:.4f}\\n{star}', ha='center', va='center', fontsize=8, color=color)\n",
    "\n",
    "ax.set_title('Spearman ρ: señal vs retorno futuro\\n(rojo = predice subida, azul = predice bajada, *** = p<0.001)', fontsize=12)\n",
    "ax.set_xlabel('Horizonte futuro')\n",
    "plt.colorbar(im, label='ρ')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Strongest signals by |rho| among those significant at p < 0.01\n",
    "print('\\nTop 15 señales por |ρ| (significativas p<0.01):')\n",
    "top = results.filter(pl.col('p_value') < 0.01).with_columns(\n",
    "    pl.col('spearman_rho').abs().alias('abs_rho')\n",
    ").sort('abs_rho', descending=True).head(15)\n",
    "print(top.select(['signal', 'horizon_s', 'spearman_rho', 'p_value']))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}