0fa16a033c
Funciones Python para embeddings: carga/guardado de modelos, encoding de texto, y almacenamiento/búsqueda vectorial con sqlite-vec y usearch. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
75 lines
2.3 KiB
Python
75 lines
2.3 KiB
Python
"""Vector storage and retrieval with sqlite-vec."""
|
|
|
|
import sqlite3
|
|
|
|
import numpy as np
|
|
import sqlite_vec
|
|
|
|
|
|
def embedding_store_sqlvec(
    db_path: str,
    table: str,
    ids: list,
    embeddings: list,
    dim: int = 384,
) -> int:
    """Insert embeddings into a sqlite-vec ``vec0`` virtual table.

    The virtual table is created if it does not exist yet. Rows are written
    in batches of 500 and committed once at the end; if an insert fails the
    connection is closed without committing, so the pending inserts are
    discarded.

    Args:
        db_path: Path to the SQLite database file.
        table: Name of the vec0 virtual table. The name is interpolated into
            the SQL text (bracket-quoted), so it must come from a trusted
            source.
        ids: Integer IDs, one per embedding; each becomes the row's rowid.
        embeddings: One vector per ID. Each vector is converted to a float32
            numpy array before being stored.
        dim: Embedding dimension used when the table is created
            (default 384, e.g. for e5-small).

    Returns:
        Number of embeddings inserted.

    Raises:
        ValueError: If ``ids`` and ``embeddings`` have different lengths.
        sqlite3.Error: If writing to the database fails.
    """
    total = len(ids)
    if len(embeddings) != total:
        # Fail before touching the database instead of raising a late
        # IndexError or silently dropping the surplus embeddings.
        raise ValueError(
            f"ids and embeddings must have the same length "
            f"(got {total} ids and {len(embeddings)} embeddings)"
        )

    db = sqlite3.connect(db_path)
    try:
        db.enable_load_extension(True)
        sqlite_vec.load(db)
        # Extension loading is only needed for the call above; switch it off
        # again so later SQL on this connection cannot load arbitrary code.
        db.enable_load_extension(False)

        db.execute(
            f"CREATE VIRTUAL TABLE IF NOT EXISTS [{table}] USING vec0(embedding float[{dim}])"
        )

        insert_sql = f"INSERT INTO [{table}](rowid, embedding) VALUES (?, ?)"
        batch_size = 500
        count = 0
        for start in range(0, total, batch_size):
            stop = start + batch_size
            # Vectors are bound as raw float32 bytes.
            batch = [
                (int(row_id), np.asarray(vector, dtype=np.float32).tobytes())
                for row_id, vector in zip(ids[start:stop], embeddings[start:stop])
            ]
            db.executemany(insert_sql, batch)
            count += len(batch)

        db.commit()
    finally:
        # Always release the connection, including on errors.
        db.close()
    return count
|
|
|
|
|
|
def embedding_search_sqlvec(
    db_path: str,
    table: str,
    query_embedding: list,
    k: int = 10,
) -> list:
    """Find the k nearest neighbours of a query vector in a sqlite-vec table.

    Args:
        db_path: Path to the SQLite database file containing the vec0 table.
        table: Name of the virtual table. The name is interpolated into the
            SQL text (bracket-quoted), so it must come from a trusted source.
        query_embedding: Query vector; converted to a float32 numpy array
            before being sent to SQLite.
        k: Number of results to return.

    Returns:
        List of dicts with keys ``'rowid'`` and ``'distance'``, ordered by
        ascending distance (closest first).

    Raises:
        sqlite3.Error: If the table does not exist or the read fails.
    """
    # Serialize the query before opening the connection so a bad input does
    # not leave a connection behind. The vector is bound as raw float32 bytes.
    q_bytes = np.asarray(query_embedding, dtype=np.float32).tobytes()

    db = sqlite3.connect(db_path)
    try:
        db.enable_load_extension(True)
        sqlite_vec.load(db)
        # Extension loading is only needed for the call above; switch it off
        # again so later SQL on this connection cannot load arbitrary code.
        db.enable_load_extension(False)

        rows = db.execute(
            f"SELECT rowid, distance FROM [{table}] WHERE embedding MATCH ? ORDER BY distance LIMIT ?",
            # int() lets numpy integer values of k bind as a plain SQLite integer.
            (q_bytes, int(k)),
        ).fetchall()
    finally:
        # Always release the connection, including when the query fails.
        db.close()

    return [{"rowid": rowid, "distance": distance} for rowid, distance in rows]
|