{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46e94147-fc78-4423-8142-024587261562",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "7edb270b",
   "metadata": {},
   "source": [
    "# Comparativa: almacenamiento y recuperación local de embeddings\n",
    "\n",
    "Buscamos el equivalente a SQLite/DuckDB para vectores: **embebido, sin servidor, archivo local**.\n",
    "\n",
    "Candidatos:\n",
    "- **FAISS** — Meta, gold standard ANN, sin metadata nativa\n",
    "- **sqlite-vec** — Extensión SQLite pura en C, vectores junto a datos SQL\n",
    "- **LanceDB** — DB columnar para vectores, persiste en directorio\n",
    "- **ChromaDB** — Popular, metadata rica, modo local\n",
    "- **USearch** — Ultra ligero, rivaliza FAISS en velocidad\n",
    "\n",
    "Medimos: inserción, búsqueda top-10, persistencia a disco, carga desde disco, tamaño en disco."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8a2144f",
   "metadata": {},
   "source": [
    "## 1. Setup y corpus de embeddings pre-generados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "36e79f24",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dimensión: 384\n",
      "Tamaños de corpus: [1000, 10000, 50000]\n",
      "Queries: 100\n",
      "Embeddings pre-generados (normalizados, float32)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import time, os, shutil, json\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.style.use('seaborn-v0_8-whitegrid')\n",
    "\n",
    "# Synthetic embeddings (dim=384, like e5-small) so the notebook does not depend on a model.\n",
    "# This isolates the storage benchmark from the encoding benchmark.\n",
    "np.random.seed(42)\n",
    "\n",
    "SIZES = [1_000, 10_000, 50_000]\n",
    "DIM = 384\n",
    "DATA_DIR = 'data/vector_bench'\n",
    "os.makedirs(DATA_DIR, exist_ok=True)\n",
    "\n",
    "# Pre-generate one L2-normalized float32 corpus per size (stands in for e5-small output)\n",
    "datasets = {}\n",
    "for n in SIZES:\n",
    "    vecs = np.random.randn(n, DIM).astype(np.float32)\n",
    "    # Normalize the way sentence-transformers would\n",
    "    norms = np.linalg.norm(vecs, axis=1, keepdims=True)\n",
    "    vecs = vecs / norms\n",
    "    datasets[n] = vecs\n",
    "\n",
    "# Query set (100 vectors), normalized the same way as the corpus\n",
    "N_QUERIES = 100\n",
    "queries = np.random.randn(N_QUERIES, DIM).astype(np.float32)\n",
    "queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)\n",
    "\n",
    "# Simulated metadata\n",
    "def make_metadata(n):\n",
    "    \"\"\"Return `n` metadata dicts with keys 'id', 'category' and 'text'.\n",
    "\n",
    "    Categories are assigned round-robin from a fixed list of five labels.\n",
    "    \"\"\"\n",
    "    categories = ['programación', 'ciencia', 'cocina', 'finanzas', 'geografía']\n",
    "    return [{'id': i, 'category': categories[i % len(categories)], 'text': f'documento_{i}'} for i in range(n)]\n",
    "\n",
    "print(f'Dimensión: {DIM}')\n",
    "print(f'Tamaños de corpus: {SIZES}')\n",
    "print(f'Queries: {N_QUERIES}')\n",
    "print(f'Embeddings pre-generados (normalizados, float32)')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbadd0ba",
   "metadata": {},
   "source": [
    "## 2. Benchmark framework\n",
    "\n",
    "Cada backend implementa: insert, search, save, load, size_on_disk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f187cb44",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Framework de benchmark listo\n"
     ]
    }
   ],
   "source": [
    "K = 10  # top-k for search\n",
    "\n",
    "def bench_backend(name, insert_fn, search_fn, save_fn, load_search_fn, size_fn, cleanup_fn):\n",
    "    \"\"\"Run the full benchmark for one vector backend over every corpus size in SIZES.\n",
    "\n",
    "    Args:\n",
    "        name: Label used in the result rows, the progress line and the on-disk path.\n",
    "        insert_fn: Called as insert_fn(vecs, meta, path); its return value is the\n",
    "            opaque backend state handed to search_fn and save_fn.\n",
    "        search_fn: Called as search_fn(state, queries, K) with the whole query set.\n",
    "        save_fn: Called as save_fn(state, path) to persist the state.\n",
    "        load_search_fn: Called as load_search_fn(path, queries[:1], K); reloads from\n",
    "            disk and answers a single query.\n",
    "        size_fn: Called as size_fn(path); returns the on-disk size in MB.\n",
    "        cleanup_fn: Called as cleanup_fn(path) before and after each corpus size.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict of rounded timings and disk size per corpus size.\n",
    "\n",
    "    Raises:\n",
    "        Whatever a backend callable raises; the artifacts for the current size are\n",
    "        removed first so a failed run does not leave files behind in DATA_DIR.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for n in SIZES:\n",
    "        vecs = datasets[n]\n",
    "        meta = make_metadata(n)\n",
    "        path = os.path.join(DATA_DIR, f'{name}_{n}')\n",
    "\n",
    "        # Remove leftovers from a previous run\n",
    "        cleanup_fn(path)\n",
    "\n",
    "        try:\n",
    "            # INSERT\n",
    "            t0 = time.perf_counter()\n",
    "            state = insert_fn(vecs, meta, path)\n",
    "            insert_time = time.perf_counter() - t0\n",
    "\n",
    "            # SEARCH (100 queries)\n",
    "            t0 = time.perf_counter()\n",
    "            results_search = search_fn(state, queries, K)\n",
    "            search_time = time.perf_counter() - t0\n",
    "            search_per_query = search_time / N_QUERIES\n",
    "\n",
    "            # SAVE / persist\n",
    "            t0 = time.perf_counter()\n",
    "            save_fn(state, path)\n",
    "            save_time = time.perf_counter() - t0\n",
    "\n",
    "            # SIZE on disk\n",
    "            disk_mb = size_fn(path)\n",
    "\n",
    "            # LOAD from disk + search (single query)\n",
    "            t0 = time.perf_counter()\n",
    "            loaded_results = load_search_fn(path, queries[:1], K)\n",
    "            load_search_time = time.perf_counter() - t0\n",
    "\n",
    "            results.append({\n",
    "                'backend': name,\n",
    "                'n_vectors': n,\n",
    "                'insert_s': round(insert_time, 4),\n",
    "                'search_100q_s': round(search_time, 4),\n",
    "                'per_query_ms': round(search_per_query * 1000, 3),\n",
    "                'save_s': round(save_time, 4),\n",
    "                'load_and_search_s': round(load_search_time, 4),\n",
    "                'disk_mb': round(disk_mb, 2),\n",
    "            })\n",
    "\n",
    "            print(f' {name:12s} | {n:6d} vecs | insert={insert_time:.3f}s | '\n",
    "                  f'search={search_per_query*1000:.2f}ms/q | save={save_time:.3f}s | '\n",
    "                  f'load+search={load_search_time:.3f}s | disk={disk_mb:.1f}MB')\n",
    "        finally:\n",
    "            # Always drop this size's artifacts, including when a backend call raised\n",
    "            cleanup_fn(path)\n",
    "\n",
    "    return results\n",
    "\n",
    "def dir_size_mb(path):\n",
    "    \"\"\"Return the size of `path` in MB: a file's size, a directory's recursive total, or 0 if missing.\"\"\"\n",
    "    total = 0\n",
    "    if os.path.isfile(path):\n",
    "        return os.path.getsize(path) / (1024*1024)\n",
    "    if not os.path.exists(path):\n",
    "        return 0\n",
    "    for dp, dn, fns in os.walk(path):\n",
    "        for f in fns:\n",
    "            total += os.path.getsize(os.path.join(dp, f))\n",
    "    return total / (1024*1024)\n",
    "\n",
    "def cleanup_path(path):\n",
    "    \"\"\"Delete `path` (file or directory) plus any sibling `path + suffix` artifacts.\"\"\"\n",
    "    if os.path.isfile(path):\n",
    "        os.remove(path)\n",
    "    elif os.path.isdir(path):\n",
    "        shutil.rmtree(path, ignore_errors=True)\n",
    "    # Backends that write `path + suffix` instead of `path` itself\n",
    "    for suffix in ['.faiss', '.ids.npy', '.usearch', '.meta.json', '.db']:\n",
    "        p = path + suffix\n",
    "        if os.path.exists(p):\n",
    "            os.remove(p)\n",
    "\n",
    "print('Framework de benchmark listo')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47e89c3c",
   "metadata": {},
   "source": [
    "## 3. 
Implementaciones por backend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bc86400d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ FAISS listo\n"
     ]
    }
   ],
   "source": [
    "# ── FAISS ──────────────────────────────────────────────────────\n",
    "import faiss\n",
    "\n",
    "def faiss_insert(vecs, meta, path):\n",
    "    \"\"\"Build an in-memory FAISS index over `vecs`; `meta` and `path` are unused.\"\"\"\n",
    "    # IndexFlatIP is FAISS's exact (brute-force) inner-product index, not an ANN structure\n",
    "    index = faiss.IndexFlatIP(DIM)\n",
    "    index.add(vecs)\n",
    "    return index\n",
    "\n",
    "def faiss_search(index, queries, k):\n",
    "    \"\"\"Return the ids of the top-`k` hits for every row of `queries`.\"\"\"\n",
    "    scores, ids = index.search(queries, k)\n",
    "    return ids\n",
    "\n",
    "def faiss_save(index, path):\n",
    "    \"\"\"Write the index to `path + '.faiss'`.\"\"\"\n",
    "    faiss.write_index(index, path + '.faiss')\n",
    "\n",
    "def faiss_load_search(path, queries, k):\n",
    "    \"\"\"Read the index back from `path + '.faiss'` and search it with `queries`.\"\"\"\n",
    "    index = faiss.read_index(path + '.faiss')\n",
    "    scores, ids = index.search(queries, k)\n",
    "    return ids\n",
    "\n",
    "def faiss_size(path):\n",
    "    \"\"\"Size in MB of the saved `.faiss` file.\"\"\"\n",
    "    return dir_size_mb(path + '.faiss')\n",
    "\n",
    "print('✓ FAISS listo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "30a64cf0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ sqlite-vec listo\n"
     ]
    }
   ],
   "source": [
    "# ── sqlite-vec ─────────────────────────────────────────────────\n",
    "import sqlite3\n",
    "import sqlite_vec\n",
    "\n",
    "# Single KNN statement shared by the in-process search and the reload path\n",
    "_SQLVEC_KNN_SQL = 'SELECT rowid, distance FROM vec_items WHERE embedding MATCH ? ORDER BY distance LIMIT ?'\n",
    "\n",
    "def _sqlvec_connect(path):\n",
    "    \"\"\"Open `path + '.db'` with the sqlite-vec extension loaded.\"\"\"\n",
    "    db = sqlite3.connect(path + '.db')\n",
    "    try:\n",
    "        db.enable_load_extension(True)\n",
    "        sqlite_vec.load(db)\n",
    "        # Turn extension loading back off once sqlite-vec is registered\n",
    "        db.enable_load_extension(False)\n",
    "    except Exception:\n",
    "        db.close()\n",
    "        raise\n",
    "    return db\n",
    "\n",
    "def _sqlvec_knn(db, q, k):\n",
    "    \"\"\"Return the rowids of the `k` nearest stored vectors to the single query `q`.\"\"\"\n",
    "    rows = db.execute(_SQLVEC_KNN_SQL, (q.tobytes(), k)).fetchall()\n",
    "    return [r[0] for r in rows]\n",
    "\n",
    "def sqlvec_insert(vecs, meta, path):\n",
    "    \"\"\"Create the vec0 table in `path + '.db'` and insert `vecs` with rowid = row index.\n",
    "\n",
    "    Returns the open connection; `meta` is unused. The connection is closed before\n",
    "    re-raising if any statement fails.\n",
    "    \"\"\"\n",
    "    db = _sqlvec_connect(path)\n",
    "    try:\n",
    "        db.execute(f'CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0(embedding float[{DIM}])')\n",
    "        # Insert in batches\n",
    "        batch_size = 500\n",
    "        for i in range(0, len(vecs), batch_size):\n",
    "            batch = [(j, vecs[j].tobytes()) for j in range(i, min(i + batch_size, len(vecs)))]\n",
    "            db.executemany('INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)', batch)\n",
    "        db.commit()\n",
    "    except Exception:\n",
    "        db.close()\n",
    "        raise\n",
    "    return db\n",
    "\n",
    "def sqlvec_search(db, queries, k):\n",
    "    \"\"\"Run one KNN statement per query; returns a list of rowid lists.\"\"\"\n",
    "    return [_sqlvec_knn(db, q, k) for q in queries]\n",
    "\n",
    "def sqlvec_save(db, path):\n",
    "    \"\"\"Close the connection; `path` is unused.\"\"\"\n",
    "    db.close()  # Already persisted in the .db file\n",
    "\n",
    "def sqlvec_load_search(path, queries, k):\n",
    "    \"\"\"Reopen the database and answer the first query only; the connection is always closed.\"\"\"\n",
    "    db = _sqlvec_connect(path)\n",
    "    try:\n",
    "        return _sqlvec_knn(db, queries[0], k)\n",
    "    finally:\n",
    "        db.close()\n",
    "\n",
    "def sqlvec_size(path):\n",
    "    \"\"\"Size in MB of the `.db` file.\"\"\"\n",
    "    return dir_size_mb(path + '.db')\n",
    "\n",
    "print('✓ sqlite-vec listo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "74d1bb27",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ LanceDB listo\n"
     ]
    }
   ],
   "source": [
    "# ── LanceDB ────────────────────────────────────────────────────\n",
    "import lancedb\n",
    "import pyarrow as pa\n",
    "\n",
    "def lance_insert(vecs, meta, path):\n",
    "    \"\"\"Create (or overwrite) the 'vectors' table under `path` with id, category and vector columns.\"\"\"\n",
    "    db = lancedb.connect(path)\n",
    "    # The per-row tolist() conversion runs inside this function, so it counts\n",
    "    # towards whatever the caller times around the insert.\n",
    "    data = pa.table({\n",
    "        'id': list(range(len(vecs))),\n",
    "        'category': [meta[i]['category'] for i in range(len(vecs))],\n",
    "        'vector': [v.tolist() for v in vecs],\n",
    "    })\n",
    "    tbl = db.create_table('vectors', data, mode='overwrite')\n",
    "    return tbl\n",
    "\n",
    "def lance_search(tbl, queries, k):\n",
    "    \"\"\"Search one query at a time; returns a list of id lists.\"\"\"\n",
    "    results = []\n",
    "    for q in queries:\n",
    "        r = tbl.search(q.tolist()).limit(k).to_list()\n",
    "        results.append([row['id'] for row in r])\n",
    "    return results\n",
    "\n",
    "def lance_save(tbl, path):\n",
    "    \"\"\"No-op save hook.\"\"\"\n",
    "    pass  # LanceDB persists automatically\n",
    "\n",
    "def lance_load_search(path, queries, k):\n",
    "    \"\"\"Reopen the 'vectors' table under `path` and answer the first query only.\"\"\"\n",
    "    db = lancedb.connect(path)\n",
    "    tbl = db.open_table('vectors')\n",
    "    q = queries[0]\n",
    "    r = tbl.search(q.tolist()).limit(k).to_list()\n",
    "    return [row['id'] for row in r]\n",
    "\n",
    "def lance_size(path):\n",
    "    \"\"\"Size in MB of the LanceDB directory.\"\"\"\n",
    "    return dir_size_mb(path)\n",
    "\n",
    "print('✓ LanceDB listo')"
   ]
},
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1541cef1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ ChromaDB listo\n"
     ]
    }
   ],
   "source": [
    "# ── ChromaDB ───────────────────────────────────────────────────\n",
    "import chromadb\n",
    "\n",
    "def chroma_insert(vecs, meta, path):\n",
    "    \"\"\"Fill the 'vectors' collection of a persistent client rooted at `path`.\n",
    "\n",
    "    Ids are the stringified row indices. Returns the (client, collection) pair.\n",
    "    \"\"\"\n",
    "    client = chromadb.PersistentClient(path=path)\n",
    "    col = client.get_or_create_collection('vectors', metadata={'hnsw:space': 'ip'})\n",
    "    # Chroma limits the batch size, so insert in chunks\n",
    "    chunk = 5000\n",
    "    total = len(vecs)\n",
    "    for start in range(0, total, chunk):\n",
    "        stop = min(start + chunk, total)\n",
    "        rows = range(start, stop)\n",
    "        col.add(\n",
    "            ids=[str(j) for j in rows],\n",
    "            embeddings=[v.tolist() for v in vecs[start:stop]],\n",
    "            metadatas=[meta[j] for j in rows],\n",
    "        )\n",
    "    return (client, col)\n",
    "\n",
    "def chroma_search(state, queries, k):\n",
    "    \"\"\"Query the collection with all `queries` in one call and return the 'ids' field.\"\"\"\n",
    "    col = state[1]\n",
    "    hits = col.query(query_embeddings=[q.tolist() for q in queries], n_results=k)\n",
    "    return hits['ids']\n",
    "\n",
    "def chroma_save(state, path):\n",
    "    \"\"\"No-op save hook.\"\"\"\n",
    "    pass  # PersistentClient persists automatically\n",
    "\n",
    "def chroma_load_search(path, queries, k):\n",
    "    \"\"\"Open a client on `path`, fetch the 'vectors' collection and answer the first query only.\"\"\"\n",
    "    col = chromadb.PersistentClient(path=path).get_collection('vectors')\n",
    "    hits = col.query(query_embeddings=[queries[0].tolist()], n_results=k)\n",
    "    return hits['ids']\n",
    "\n",
    "def chroma_size(path):\n",
    "    \"\"\"Size in MB of the Chroma directory.\"\"\"\n",
    "    return dir_size_mb(path)\n",
    "\n",
    "print('✓ ChromaDB listo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6bec353d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ USearch listo\n"
     ]
    }
   ],
   "source": [
    "# ── USearch ────────────────────────────────────────────────────\n",
    "from usearch.index import Index\n",
    "\n",
    "def usearch_insert(vecs, meta, path):\n",
    "    \"\"\"Build an in-memory USearch index keyed by row index; `meta` and `path` are unused.\"\"\"\n",
    "    idx = Index(ndim=DIM, metric='ip', dtype='f32')\n",
    "    row_keys = np.arange(len(vecs), dtype=np.uint64)\n",
    "    idx.add(row_keys, vecs)\n",
    "    return idx\n",
    "\n",
    "def usearch_search(index, queries, k):\n",
    "    \"\"\"Search all `queries` in one call and return the matched keys.\"\"\"\n",
    "    return index.search(queries, k).keys\n",
    "\n",
    "def usearch_save(index, path):\n",
    "    \"\"\"Write the index to `path + '.usearch'`.\"\"\"\n",
    "    index.save(path + '.usearch')\n",
    "\n",
    "def usearch_load_search(path, queries, k):\n",
    "    \"\"\"Load `path + '.usearch'` into a fresh index and search with the first query only.\"\"\"\n",
    "    restored = Index(ndim=DIM, metric='ip', dtype='f32')\n",
    "    restored.load(path + '.usearch')\n",
    "    return restored.search(queries[:1], k).keys\n",
    "\n",
    "def usearch_size(path):\n",
    "    \"\"\"Size in MB of the saved `.usearch` file.\"\"\"\n",
    "    return dir_size_mb(path + '.usearch')\n",
    "\n",
    "print('✓ USearch listo')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c82f6260",
   "metadata": {},
   "source": [
    "## 4. Ejecutar benchmarks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e02de33b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "── FAISS ──\n",
      " FAISS | 1000 vecs | insert=0.003s | search=0.23ms/q | save=0.006s | load+search=0.003s | disk=1.5MB\n",
      " FAISS | 10000 vecs | insert=0.020s | search=0.48ms/q | save=0.030s | load+search=0.034s | disk=14.6MB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " FAISS | 50000 vecs | insert=0.240s | search=2.77ms/q | save=0.169s | load+search=0.211s | disk=73.2MB\n",
      "\n",
      "── sqlite-vec ──\n",
      " sqlite-vec | 1000 vecs | insert=0.053s | search=0.35ms/q | save=0.001s | load+search=0.001s | disk=1.6MB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " sqlite-vec | 10000 vecs | insert=0.293s | search=4.87ms/q | save=0.000s | load+search=0.006s | disk=15.3MB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " sqlite-vec | 50000 vecs | insert=1.275s | search=19.50ms/q | save=0.000s | load+search=0.018s | disk=74.7MB\n",
      "\n",
      "── LanceDB ──\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[90m[\u001b[0m2026-04-02T15:27:41Z \u001b[33mWARN \u001b[0m lance::dataset::write::insert\u001b[90m]\u001b[0m No existing dataset at /home/lucas/fn_registry/analysis/estudio_embeddings/notebooks/data/vector_bench/LanceDB_1000/vectors.lance, it will 
be created\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " LanceDB | 1000 vecs | insert=0.040s | search=3.32ms/q | save=0.000s | load+search=0.007s | disk=1.5MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[90m[\u001b[0m2026-04-02T15:27:42Z \u001b[33mWARN \u001b[0m lance::dataset::write::insert\u001b[90m]\u001b[0m No existing dataset at /home/lucas/fn_registry/analysis/estudio_embeddings/notebooks/data/vector_bench/LanceDB_10000/vectors.lance, it will be created\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " LanceDB | 10000 vecs | insert=0.297s | search=12.30ms/q | save=0.000s | load+search=0.011s | disk=14.7MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[90m[\u001b[0m2026-04-02T15:27:47Z \u001b[33mWARN \u001b[0m lance::dataset::write::insert\u001b[90m]\u001b[0m No existing dataset at /home/lucas/fn_registry/analysis/estudio_embeddings/notebooks/data/vector_bench/LanceDB_50000/vectors.lance, it will be created\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " LanceDB | 50000 vecs | insert=3.740s | search=28.88ms/q | save=0.000s | load+search=0.030s | disk=73.7MB\n",
      "\n",
      "── ChromaDB ──\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ChromaDB | 1000 vecs | insert=0.492s | search=0.30ms/q | save=0.000s | load+search=0.006s | disk=4.1MB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ChromaDB | 10000 vecs | insert=2.778s | search=0.53ms/q | save=0.000s | load+search=0.006s | disk=29.6MB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ChromaDB | 50000 vecs | insert=19.123s | search=1.23ms/q | save=0.000s | load+search=0.010s | disk=108.6MB\n",
      "\n",
      "── USearch ──\n",
      " ERROR: usearch.index.Index() got multiple values for keyword argument 'ndim'\n",
      "\n",
      "Benchmark completo\n"
     ]
    }
   ],
   "source": [
    "# Accumulates one result row per (backend, corpus size) across all backends\n",
    "all_results = []\n",
    "\n",
    "# Each entry: (label, insert, search, save, load+search, size, cleanup),\n",
    "# i.e. the positional argument order of bench_backend.\n",
    "backends = [\n",
    "    ('FAISS', faiss_insert, faiss_search, faiss_save, faiss_load_search, faiss_size, cleanup_path),\n",
    "    ('sqlite-vec', sqlvec_insert, sqlvec_search, sqlvec_save, sqlvec_load_search, sqlvec_size, cleanup_path),\n",
    "    ('LanceDB', lance_insert, lance_search, lance_save, lance_load_search, lance_size, cleanup_path),\n",
    "    ('ChromaDB', chroma_insert, chroma_search, chroma_save, chroma_load_search, chroma_size, cleanup_path),\n",
    "    ('USearch', usearch_insert, usearch_search, usearch_save, usearch_load_search, usearch_size, cleanup_path),\n",
    "]\n",
    "\n",
    "for name, *fns in backends:\n",
    "    print(f'\\n── {name} ──')\n",
    "    try:\n",
    "        res = bench_backend(name, *fns)\n",
    "        all_results.extend(res)\n",
    "    except Exception as e:\n",
    "        # Best effort: report the failure and move on to the next backend.\n",
    "        # NOTE(review): a backend that fails contributes no rows at all, so it is\n",
    "        # silently missing from `df` below - check this cell's output for ERROR lines.\n",
    "        print(f' ERROR: {e}')\n",
    "\n",
    "# One row per (backend, corpus size) that completed\n",
    "df = pd.DataFrame(all_results)\n",
    "print('\\nBenchmark completo')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f126167a",
   "metadata": {},
   "source": [
    "## 5. Tabla resumen y visualizaciones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "311f88fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "
| \n", " | Backend | \n", "Vectors | \n", "Insert (s) | \n", "Search 100q (s) | \n", "Per query (ms) | \n", "Save (s) | \n", "Load+Search (s) | \n", "Disk (MB) | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "FAISS | \n", "1000 | \n", "0.002700 | \n", "0.022600 | \n", "0.226000 | \n", "0.006000 | \n", "0.002600 | \n", "1.460000 | \n", "
| 1 | \n", "FAISS | \n", "10000 | \n", "0.020000 | \n", "0.048000 | \n", "0.480000 | \n", "0.030200 | \n", "0.034000 | \n", "14.650000 | \n", "
| 2 | \n", "FAISS | \n", "50000 | \n", "0.240500 | \n", "0.277200 | \n", "2.772000 | \n", "0.169200 | \n", "0.210900 | \n", "73.240000 | \n", "
| 3 | \n", "sqlite-vec | \n", "1000 | \n", "0.053100 | \n", "0.035500 | \n", "0.355000 | \n", "0.000600 | \n", "0.001300 | \n", "1.550000 | \n", "
| 4 | \n", "sqlite-vec | \n", "10000 | \n", "0.292600 | \n", "0.486800 | \n", "4.868000 | \n", "0.000300 | \n", "0.006000 | \n", "15.260000 | \n", "
| 5 | \n", "sqlite-vec | \n", "50000 | \n", "1.274600 | \n", "1.949800 | \n", "19.498000 | \n", "0.000300 | \n", "0.018300 | \n", "74.690000 | \n", "
| 6 | \n", "LanceDB | \n", "1000 | \n", "0.040000 | \n", "0.332400 | \n", "3.324000 | \n", "0.000000 | \n", "0.007100 | \n", "1.480000 | \n", "
| 7 | \n", "LanceDB | \n", "10000 | \n", "0.296800 | \n", "1.230500 | \n", "12.305000 | \n", "0.000000 | \n", "0.010700 | \n", "14.740000 | \n", "
| 8 | \n", "LanceDB | \n", "50000 | \n", "3.740100 | \n", "2.888400 | \n", "28.884000 | \n", "0.000000 | \n", "0.030100 | \n", "73.670000 | \n", "
| 9 | \n", "ChromaDB | \n", "1000 | \n", "0.491600 | \n", "0.029500 | \n", "0.295000 | \n", "0.000000 | \n", "0.006000 | \n", "4.090000 | \n", "
| 10 | \n", "ChromaDB | \n", "10000 | \n", "2.778100 | \n", "0.053500 | \n", "0.535000 | \n", "0.000000 | \n", "0.005500 | \n", "29.570000 | \n", "
| 11 | \n", "ChromaDB | \n", "50000 | \n", "19.122500 | \n", "0.122800 | \n", "1.228000 | \n", "0.000000 | \n", "0.009800 | \n", "108.640000 | \n", "
| 12 | \n", "USearch | \n", "1000 | \n", "0.010800 | \n", "0.001300 | \n", "0.013000 | \n", "0.001100 | \n", "0.001500 | \n", "1.610000 | \n", "
| 13 | \n", "USearch | \n", "10000 | \n", "0.289400 | \n", "0.002500 | \n", "0.025000 | \n", "0.013400 | \n", "0.018800 | \n", "16.070000 | \n", "
| 14 | \n", "USearch | \n", "50000 | \n", "5.965700 | \n", "0.013500 | \n", "0.135000 | \n", "0.111700 | \n", "0.188300 | \n", "80.320000 | \n", "