Files
estudio_mercados/notebooks/.ipynb_checkpoints/09_alpha_signals-checkpoint.ipynb
T

513 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Alpha Research: señales de microestructura\n",
"\n",
"Exploramos señales que podrían predecir movimientos de precio a corto plazo.\n",
"\n",
"Para cada señal:\n",
"1. La calculamos sobre los datos reales\n",
"2. Medimos su correlación con retornos futuros a distintos horizontes\n",
"3. Visualizamos si tiene poder predictivo\n",
"\n",
"**Datos:** 1M aggTrades BTC/USDT (~26h)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trades: 1,000,000\n",
"Columnas: ['agg_trade_id', 'price', 'qty', 'first_trade_id', 'last_trade_id', 'timestamp', 'is_buyer_maker', 'side', 'n_fills']\n"
]
}
],
"source": [
"import polars as pl\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from scipy.stats import spearmanr\n",
"\n",
"# Location of the input data, relative to the notebook directory.\n",
"DATA = Path('../data')\n",
"\n",
"# Load the aggregated-trades CSV, then report its row count and column names.\n",
"trades_csv = DATA / 'binance_btcusdt_aggtrades_1M.csv'\n",
"trades = pl.read_csv(str(trades_csv))\n",
"print(f'Trades: {trades.height:,}')\n",
"print(f'Columnas: {trades.columns}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparación: agrupar en barras de tiempo\n",
"\n",
"Las señales se calculan sobre ventanas de tiempo, no sobre trades individuales.\n",
"Creamos barras de 1 segundo con todas las métricas que necesitamos."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1-second bars.\n",
"# first()/last() pick the first/last row of each group, so open/close are only\n",
"# correct when trades are in chronological order: sort explicitly rather than\n",
"# relying on the row order of the CSV.\n",
"buy_side = pl.col('side') == 'buy'\n",
"sell_side = pl.col('side') == 'sell'\n",
"\n",
"bars = (\n",
"    trades\n",
"    .sort('timestamp', 'agg_trade_id')\n",
"    .with_columns(\n",
"        # NOTE(review): integer division by 1000 assumes `timestamp` is in milliseconds - confirm.\n",
"        (pl.col('timestamp') // 1000).alias('second'),\n",
"        (pl.col('price') * pl.col('qty')).alias('turnover'),\n",
"        pl.when(buy_side).then(pl.col('qty')).otherwise(0.0).alias('buy_qty'),\n",
"        pl.when(sell_side).then(pl.col('qty')).otherwise(0.0).alias('sell_qty'),\n",
"        pl.when(buy_side).then(1).otherwise(0).alias('is_buy'),\n",
"    )\n",
"    .group_by('second')\n",
"    .agg(\n",
"        pl.col('price').last().alias('close'),\n",
"        pl.col('price').first().alias('open'),\n",
"        pl.col('price').max().alias('high'),\n",
"        pl.col('price').min().alias('low'),\n",
"        pl.col('qty').sum().alias('volume'),\n",
"        pl.col('turnover').sum().alias('turnover'),\n",
"        pl.len().alias('n_trades'),\n",
"        pl.col('buy_qty').sum().alias('buy_volume'),\n",
"        pl.col('sell_qty').sum().alias('sell_volume'),\n",
"        pl.col('is_buy').sum().alias('n_buys'),\n",
"        (pl.len() - pl.col('is_buy').sum()).alias('n_sells'),\n",
"        pl.col('n_fills').max().alias('max_fills'),  # largest n_fills among this second's trades\n",
"        pl.col('qty').max().alias('max_qty'),\n",
"    )\n",
"    .sort('second')\n",
")\n",
"\n",
"# Per-second VWAP.\n",
"bars = bars.with_columns(\n",
"    (pl.col('turnover') / pl.col('volume')).alias('vwap'),\n",
")\n",
"\n",
"# Forward log returns at several horizons (the targets used to score signals).\n",
"# Seconds without trades have no bar, so shifting `bars` by h rows is not the same\n",
"# as looking h seconds ahead. Build a gap-free per-second close series (forward-\n",
"# filled with the last traded price), shift that, and join the result back.\n",
"fwd_horizons = (1, 5, 10, 30, 60)\n",
"close_by_second = (\n",
"    pl.DataFrame({'second': pl.int_range(bars['second'].min(), bars['second'].max() + 1, eager=True)})\n",
"    .join(bars.select('second', 'close'), on='second', how='left')\n",
"    .sort('second')\n",
"    .with_columns(pl.col('close').forward_fill())\n",
")\n",
"fwd_returns = close_by_second.select(\n",
"    'second',\n",
"    *[\n",
"        (pl.col('close').shift(-h).log() - pl.col('close').log()).alias(f'fwd_ret_{h}s')\n",
"        for h in fwd_horizons\n",
"    ],\n",
")\n",
"# Re-sort after the join: later cells apply rolling windows that depend on row order.\n",
"bars = bars.join(fwd_returns, on='second', how='left').sort('second')\n",
"\n",
"print(f'Barras de 1s: {bars.shape[0]:,}')\n",
"print(f'Columnas: {bars.columns}')\n",
"print(bars.head(3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _binned_mean_returns_bps(x, y, n_bins=20):\n",
"    \"\"\"Mean of `y`, in basis points, within quantile bins of `x`.\n",
"\n",
"    Returns an empty list when `x` has too few distinct quantile edges to form\n",
"    at least two bins. Empty bins are skipped.\n",
"    \"\"\"\n",
"    edges = np.unique(np.percentile(x, np.linspace(0, 100, n_bins + 1)))\n",
"    if len(edges) < 3:\n",
"        return []\n",
"    # digitize() is 1-based and places x == max(x) past the last edge; clipping\n",
"    # folds those points back into the last bin.\n",
"    bin_idx = np.clip(np.digitize(x, edges) - 1, 0, len(edges) - 2)\n",
"    means = []\n",
"    for b in range(len(edges) - 1):\n",
"        in_bin = bin_idx == b\n",
"        if in_bin.any():\n",
"            means.append(y[in_bin].mean() * 10000)\n",
"    return means\n",
"\n",
"\n",
"def evaluate_signal(bars: pl.DataFrame, signal_col: str, name: str, horizons=(1, 5, 10, 30, 60)):\n",
"    \"\"\"Score a signal against forward returns and plot the result.\n",
"\n",
"    Draws one figure: the first panel plots the first 2000 values of the signal;\n",
"    each following panel shows, for one horizon, the mean forward return (bps)\n",
"    per quantile bin of the signal. A per-horizon summary is printed afterwards.\n",
"\n",
"    Args:\n",
"        bars: frame holding `signal_col` and a `fwd_ret_{h}s` column per horizon.\n",
"        signal_col: name of the signal column.\n",
"        name: label used in titles and axis labels.\n",
"        horizons: forward horizons to evaluate.\n",
"\n",
"    Returns:\n",
"        List of `(horizon, rho, p_value)` tuples, one per horizon. A horizon with\n",
"        fewer than 100 usable observations gets the placeholder `(h, 0.0, 1.0)`.\n",
"    \"\"\"\n",
"    horizons = list(horizons)\n",
"    n_panels = len(horizons) + 1\n",
"    # squeeze=False keeps `axes` indexable even when `horizons` is empty.\n",
"    fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), squeeze=False)\n",
"    axes = axes[0]\n",
"\n",
"    # Panel 1: the signal itself.\n",
"    ax = axes[0]\n",
"    sig = bars[signal_col].to_numpy()\n",
"    ax.plot(sig[:2000], linewidth=0.3, color='#3498db', alpha=0.7)\n",
"    ax.set_title(f'{name}\\n(primeros 2000s)', fontsize=9)\n",
"    ax.set_xlabel('Segundo')\n",
"    ax.grid(True, alpha=0.3)\n",
"\n",
"    # Panels 2+: signal vs forward return, one panel per horizon.\n",
"    corrs = []\n",
"    for i, h in enumerate(horizons):\n",
"        ax = axes[i + 1]\n",
"        ret_col = f'fwd_ret_{h}s'\n",
"\n",
"        clean = bars.select([signal_col, ret_col]).drop_nulls()\n",
"        x = clean[signal_col].to_numpy().astype(float)\n",
"        y = clean[ret_col].to_numpy().astype(float)\n",
"        # drop_nulls() keeps NaN/inf (e.g. from 0/0); a single NaN would make\n",
"        # spearmanr return NaN, so keep only finite pairs.\n",
"        finite = np.isfinite(x) & np.isfinite(y)\n",
"        x, y = x[finite], y[finite]\n",
"\n",
"        if x.size < 100:\n",
"            corrs.append((h, 0.0, 1.0))\n",
"            ax.set_title(f'{h}s: datos insuficientes', fontsize=9, color='gray')\n",
"            continue\n",
"\n",
"        # Spearman is a rank correlation, hence robust to outliers.\n",
"        rho, pval = spearmanr(x, y)\n",
"        rho, pval = float(rho), float(pval)\n",
"        corrs.append((h, rho, pval))\n",
"\n",
"        # Binned view: mean forward return per quantile bin of the signal.\n",
"        bin_means_y = _binned_mean_returns_bps(x, y, n_bins=20)\n",
"        if bin_means_y:\n",
"            ax.bar(range(len(bin_means_y)), bin_means_y, color='#2ecc71' if rho > 0 else '#e74c3c', alpha=0.6)\n",
"\n",
"        color = 'green' if abs(rho) > 0.02 and pval < 0.01 else 'gray'\n",
"        ax.set_title(f'{h}s: ρ={rho:.4f}\\np={pval:.2e}', fontsize=9, color=color)\n",
"        ax.set_xlabel(f'Bin de {name}')\n",
"        if i == 0:\n",
"            ax.set_ylabel('Ret futuro (bps)')\n",
"        ax.axhline(y=0, color='black', linewidth=0.5)\n",
"        ax.grid(True, alpha=0.3)\n",
"\n",
"    fig.suptitle(f'Señal: {name}', fontsize=12, fontweight='bold')\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"    # Release the figure; this function is called once per signal.\n",
"    plt.close(fig)\n",
"\n",
"    # Summary\n",
"    for h, rho, pval in corrs:\n",
"        sig_marker = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''\n",
"        print(f' {h:>3}s: ρ={rho:+.4f} (p={pval:.2e}) {sig_marker}')\n",
"\n",
"    return corrs\n",
"\n",
"print('evaluate_signal() definida')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 1: Order Flow Imbalance (OFI)\n",
"\n",
"**Qué mide:** La diferencia entre volumen de compras y ventas en los últimos N segundos. \n",
"**Intuición:** Si llegan más market buys que sells, hay presión compradora → el precio debería subir. \n",
"**Fórmula:** `OFI = (buy_volume - sell_volume) / (buy_volume + sell_volume)` \n",
"Normalizado entre -1 (todo sells) y +1 (todo buys)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Order-flow imbalance over rolling windows of 5, 10 and 30 bars.\n",
"# NOTE(review): window_size counts bars (rows); that equals seconds only if every\n",
"# second has at least one trade - confirm.\n",
"ofi_windows = (5, 10, 30)\n",
"\n",
"ofi_exprs = []\n",
"for w in ofi_windows:\n",
"    buys = pl.col('buy_volume').rolling_sum(window_size=w)\n",
"    sells = pl.col('sell_volume').rolling_sum(window_size=w)\n",
"    ofi_exprs.append(((buys - sells) / (buys + sells)).alias(f'ofi_{w}s'))\n",
"bars = bars.with_columns(ofi_exprs)\n",
"\n",
"# Evaluate each window; every header after the first is preceded by a blank line.\n",
"ofi_corrs = {}\n",
"for pos, w in enumerate(ofi_windows):\n",
"    header = f'OFI {w}s:'\n",
"    print(header if pos == 0 else '\\n' + header)\n",
"    ofi_corrs[w] = evaluate_signal(bars, f'ofi_{w}s', f'OFI {w}s')\n",
"\n",
"corrs_ofi5, corrs_ofi10, corrs_ofi30 = (ofi_corrs[w] for w in ofi_windows)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 2: Trade Count Imbalance\n",
"\n",
"**Qué mide:** Diferencia entre número de buys y sells (no volumen, sino conteo). \n",
"**Intuición:** Muchos trades pequeños de compra pueden ser más informativos que un solo trade grande. \n",
"**Fórmula:** `TCI = (n_buys - n_sells) / (n_buys + n_sells)`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Trade-count imbalance: same shape as OFI but built from trade counts, not volume.\n",
"tci_exprs = []\n",
"for w in (5, 10, 30):\n",
"    buy_count = pl.col('n_buys').rolling_sum(window_size=w)\n",
"    sell_count = pl.col('n_sells').rolling_sum(window_size=w)\n",
"    tci_exprs.append(((buy_count - sell_count) / (buy_count + sell_count)).alias(f'tci_{w}s'))\n",
"bars = bars.with_columns(tci_exprs)\n",
"\n",
"# Only the 10-bar window is evaluated.\n",
"print('Trade Count Imbalance 10s:')\n",
"corrs_tci = evaluate_signal(bars, 'tci_10s', 'TCI 10s')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 3: Trade Intensity (aceleración de actividad)\n",
"\n",
"**Qué mide:** ¿Están llegando trades más rápido que lo normal? \n",
"**Intuición:** Aceleraciones predicen movimientos — los informados tradean antes del movimiento. \n",
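"**Fórmula:** `intensity = (trades_last_5s / 5) / trades_per_second_avg_60s` (tasa media de trades por segundo en los últimos 5 s dividida por la de los últimos 60 s)"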
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recent trade rate (5-bar total divided by 5) relative to the 60-bar mean of trades per bar.\n",
"recent_rate = pl.col('n_trades').rolling_sum(window_size=5) / 5\n",
"baseline_rate = pl.col('n_trades').rolling_mean(window_size=60)\n",
"bars = bars.with_columns((recent_rate / baseline_rate).alias('trade_intensity'))\n",
"\n",
"print('Trade Intensity (5s / 60s avg):')\n",
"corrs_intensity = evaluate_signal(bars, 'trade_intensity', 'Trade Intensity')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 4: Volume-Weighted Imbalance\n",
"\n",
"**Qué mide:** OFI pero ponderando más los trades grandes (ballenas). \n",
"**Intuición:** Un trade de 1 BTC tiene más información que 100 trades de 0.001 BTC. \n",
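"**Fórmula:** Tomar los trades con más de 5 fills (proxy de trades grandes) y sumar su volumen con signo (compras +, ventas −) en una ventana de 10 barras: `whale_flow_10s = Σ signed_qty`"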
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Signal built from the largest trades of each second.\n",
"# max_qty already captures the largest trade, but its side is also needed, so\n",
"# n_fills is used as a proxy: more fills = a larger order that swept more levels.\n",
"\n",
"# Proxy: signed volume of trades with more than 5 fills (\"whales\").\n",
"whale_trades = trades.filter(pl.col('n_fills') > 5).with_columns(\n",
"    (pl.col('timestamp') // 1000).alias('second'),\n",
"    pl.when(pl.col('side') == 'buy').then(pl.col('qty')).otherwise(-pl.col('qty')).alias('signed_qty'),\n",
")\n",
"\n",
"whale_flow = whale_trades.group_by('second').agg(\n",
"    pl.col('signed_qty').sum().alias('whale_flow'),\n",
"    pl.len().alias('whale_count'),\n",
").sort('second')\n",
"\n",
"# Drop whale columns left by a previous run of this cell; otherwise the join adds\n",
"# `*_right` duplicates and a further re-run fails on the duplicated names.\n",
"whale_cols = ('whale_flow', 'whale_count', 'whale_flow_10s')\n",
"bars = (\n",
"    bars.select([c for c in bars.columns if c not in whale_cols])\n",
"    .join(whale_flow, on='second', how='left')\n",
"    # The rolling sum below depends on row order; do not rely on the join keeping it.\n",
"    .sort('second')\n",
"    .with_columns(\n",
"        # Seconds without whale trades contribute zero flow.\n",
"        pl.col('whale_flow').fill_null(0.0),\n",
"        pl.col('whale_count').fill_null(0),\n",
"    )\n",
"    .with_columns(\n",
"        pl.col('whale_flow').rolling_sum(window_size=10).alias('whale_flow_10s'),\n",
"    )\n",
")\n",
"\n",
"print('Whale Flow 10s (trades con >5 fills):')\n",
"print(f'Trades clasificados como ballena: {whale_trades.shape[0]:,} ({whale_trades.shape[0]/trades.shape[0]*100:.1f}%)')\n",
"corrs_whale = evaluate_signal(bars, 'whale_flow_10s', 'Whale Flow 10s')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 5: VWAP Deviation\n",
"\n",
"**Qué mide:** ¿El precio actual está por encima o debajo del VWAP reciente? \n",
"**Intuición:** El precio tiende a revertir al VWAP (mean reversion). \n",
"**Fórmula:** `deviation = (close - vwap_rolling) / close`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Relative distance of the close from the rolling VWAP over 30, 60 and 300 bars.\n",
"vwap_exprs = []\n",
"for w in (30, 60, 300):\n",
"    vwap_w = pl.col('turnover').rolling_sum(window_size=w) / pl.col('volume').rolling_sum(window_size=w)\n",
"    vwap_exprs.append(((pl.col('close') - vwap_w) / pl.col('close')).alias(f'vwap_dev_{w}s'))\n",
"bars = bars.with_columns(vwap_exprs)\n",
"\n",
"# (printed header, signal column, plot label) for each evaluated window.\n",
"vwap_runs = [\n",
"    ('VWAP Deviation 30s:', 'vwap_dev_30s', 'VWAP Dev 30s'),\n",
"    ('\\nVWAP Deviation 60s:', 'vwap_dev_60s', 'VWAP Dev 60s'),\n",
"    ('\\nVWAP Deviation 300s (5min):', 'vwap_dev_300s', 'VWAP Dev 5min'),\n",
"]\n",
"vwap_corrs = []\n",
"for header, column, label in vwap_runs:\n",
"    print(header)\n",
"    vwap_corrs.append(evaluate_signal(bars, column, label))\n",
"\n",
"corrs_vwap30, corrs_vwap60, corrs_vwap300 = vwap_corrs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 6: Volatility Breakout\n",
"\n",
"**Qué mide:** ¿La volatilidad actual es anormalmente alta? \n",
"**Intuición:** Picos de volatilidad preceden movimientos direccionales (momentum post-breakout). \n",
"**Fórmula:** `breakout = vol_5s / vol_60s`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Realised volatility proxied by the bar range: (high - low) / close.\n",
"bars = bars.with_columns(\n",
"    ((pl.col('high') - pl.col('low')) / pl.col('close')).alias('range_pct')\n",
")\n",
"\n",
"short_vol = pl.col('range_pct').rolling_mean(window_size=5)\n",
"long_vol = pl.col('range_pct').rolling_mean(window_size=60)\n",
"bars = bars.with_columns(\n",
"    # A flat stretch (high == low on every bar of the window) gives long_vol == 0;\n",
"    # dividing would yield NaN/inf, which is not null and would leak into the\n",
"    # correlation. Emit null instead so the row is dropped downstream.\n",
"    pl.when(long_vol > 0).then(short_vol / long_vol).otherwise(None).alias('vol_breakout')\n",
")\n",
"\n",
"print('Volatility Breakout (5s / 60s):')\n",
"corrs_volbreak = evaluate_signal(bars, 'vol_breakout', 'Vol Breakout')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Señal 7: Retorno reciente (momentum/reversal)\n",
"\n",
"**Qué mide:** ¿El precio acaba de subir o bajar? \n",
"**Intuición:** A muy corto plazo puede haber momentum (inercia) o reversal (rebote). \n",
"**Fórmula:** `ret_Ns = log(close) - log(close_N_ago)`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Trailing log returns over 1, 5, 10, 30 and 60 seconds.\n",
"# Seconds without trades have no bar, so shifting `bars` by w rows is not the same\n",
"# as looking w seconds back. Build a gap-free per-second close series (forward-\n",
"# filled with the last traded price), shift that, and join the result back.\n",
"past_windows = (1, 5, 10, 30, 60)\n",
"past_cols = [f'past_ret_{w}s' for w in past_windows]\n",
"\n",
"close_by_second = (\n",
"    pl.DataFrame({'second': pl.int_range(bars['second'].min(), bars['second'].max() + 1, eager=True)})\n",
"    .join(bars.select('second', 'close'), on='second', how='left')\n",
"    .sort('second')\n",
"    .with_columns(pl.col('close').forward_fill())\n",
")\n",
"past_returns = close_by_second.select(\n",
"    'second',\n",
"    *[\n",
"        (pl.col('close').log() - pl.col('close').shift(w).log()).alias(f'past_ret_{w}s')\n",
"        for w in past_windows\n",
"    ],\n",
")\n",
"# Drop columns from a previous run first so the cell stays re-runnable, and\n",
"# re-sort because the join is not guaranteed to keep row order.\n",
"bars = (\n",
"    bars.select([c for c in bars.columns if c not in past_cols])\n",
"    .join(past_returns, on='second', how='left')\n",
"    .sort('second')\n",
")\n",
"\n",
"print('Past Return 1s (ultra corto):')\n",
"corrs_ret1 = evaluate_signal(bars, 'past_ret_1s', 'Past Ret 1s')\n",
"print('\\nPast Return 5s:')\n",
"corrs_ret5 = evaluate_signal(bars, 'past_ret_5s', 'Past Ret 5s')\n",
"print('\\nPast Return 30s:')\n",
"corrs_ret30 = evaluate_signal(bars, 'past_ret_30s', 'Past Ret 30s')\n",
"print('\\nPast Return 60s:')\n",
"corrs_ret60 = evaluate_signal(bars, 'past_ret_60s', 'Past Ret 60s')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Resumen: ranking de señales"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect the per-horizon correlations of every evaluated signal.\n",
"all_signals = [\n",
"    ('OFI 5s', corrs_ofi5),\n",
"    ('OFI 10s', corrs_ofi10),\n",
"    ('OFI 30s', corrs_ofi30),\n",
"    ('TCI 10s', corrs_tci),\n",
"    ('Trade Intensity', corrs_intensity),\n",
"    ('Whale Flow 10s', corrs_whale),\n",
"    ('VWAP Dev 30s', corrs_vwap30),\n",
"    ('VWAP Dev 60s', corrs_vwap60),\n",
"    ('VWAP Dev 5min', corrs_vwap300),\n",
"    ('Vol Breakout', corrs_volbreak),\n",
"    ('Past Ret 1s', corrs_ret1),\n",
"    ('Past Ret 5s', corrs_ret5),\n",
"    ('Past Ret 30s', corrs_ret30),\n",
"    ('Past Ret 60s', corrs_ret60),\n",
"]\n",
"\n",
"# float() normalises numpy scalars and the integer placeholders (0, 1) so each\n",
"# column ends up with a single dtype.\n",
"records = [\n",
"    {'signal': name, 'horizon_s': h, 'spearman_rho': round(float(rho), 5), 'p_value': float(pval)}\n",
"    for name, corrs in all_signals\n",
"    for h, rho, pval in corrs\n",
"]\n",
"results = pl.DataFrame(records)\n",
"\n",
"# Correlation heatmap: one row per signal, one column per horizon.\n",
"signal_names = [name for name, _ in all_signals]\n",
"horizons = [1, 5, 10, 30, 60]\n",
"row_of = {name: i for i, name in enumerate(signal_names)}\n",
"col_of = {h: j for j, h in enumerate(horizons)}\n",
"\n",
"matrix = np.zeros((len(signal_names), len(horizons)))\n",
"pvalues = {}  # (row, col) -> p-value; looked up when annotating cells\n",
"for rec in records:\n",
"    i, j = row_of[rec['signal']], col_of[rec['horizon_s']]\n",
"    matrix[i, j] = rec['spearman_rho']\n",
"    pvalues[(i, j)] = rec['p_value']\n",
"\n",
"fig, ax = plt.subplots(figsize=(10, 10))\n",
"# Ignore NaN correlations when scaling the colour map; a single NaN would\n",
"# otherwise collapse the scale to the 0.01 floor.\n",
"finite_abs = np.abs(matrix[np.isfinite(matrix)])\n",
"vmax = max(0.01, float(finite_abs.max())) if finite_abs.size else 0.01\n",
"im = ax.imshow(matrix, cmap='RdBu_r', aspect='auto', vmin=-vmax, vmax=vmax)\n",
"ax.set_xticks(range(len(horizons)))\n",
"ax.set_xticklabels([f'{h}s' for h in horizons], fontsize=10)\n",
"ax.set_yticks(range(len(signal_names)))\n",
"ax.set_yticklabels(signal_names, fontsize=10)\n",
"\n",
"# Annotate each cell with its value and significance stars.\n",
"for (i, j), val in np.ndenumerate(matrix):\n",
"    pv = pvalues.get((i, j))\n",
"    if pv is None:\n",
"        star = ''\n",
"    else:\n",
"        star = '***' if pv < 0.001 else '**' if pv < 0.01 else '*' if pv < 0.05 else ''\n",
"    color = 'white' if abs(val) > vmax * 0.6 else 'black'\n",
"    ax.text(j, i, f'{val:.4f}\\n{star}', ha='center', va='center', fontsize=8, color=color)\n",
"\n",
"ax.set_title('Spearman ρ: señal vs retorno futuro\\n(rojo = predice subida, azul = predice bajada, *** = p<0.001)', fontsize=12)\n",
"ax.set_xlabel('Horizonte futuro')\n",
"plt.colorbar(im, label='ρ')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Strongest signals by |rho| among those significant at p < 0.01.\n",
"print('\\nTop 15 señales por |ρ| (significativas p<0.01):')\n",
"top = results.filter(pl.col('p_value') < 0.01).with_columns(\n",
"    pl.col('spearman_rho').abs().alias('abs_rho')\n",
").sort('abs_rho', descending=True).head(15)\n",
"print(top.select(['signal', 'horizon_s', 'spearman_rho', 'p_value']))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}