182 lines
5.9 KiB
Plaintext
182 lines
5.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "36832aa6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1.2.2\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import duckdb\n",
|
|
"print(duckdb.__version__)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "c357d895",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda135ac30>"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Open the database before using it: no earlier cell defines `conn`, so on a\n",
"# fresh kernel this cell would otherwise fail with NameError.\n",
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
"conn.execute(\"INSTALL vss; LOAD vss;\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "4a7a3ffb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "BinderException",
|
|
"evalue": "Binder Error: HNSW indexes can only be created in in-memory databases, or when the configuration option 'hnsw_enable_experimental_persistence' is set to true.",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
"\u001b[31mBinderException\u001b[39m Traceback (most recent call last)",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 38\u001b[39m\n\u001b[32m 35\u001b[39m conn.execute(\u001b[33m\"\u001b[39m\u001b[33mINSERT INTO textos VALUES (?, ?, ?)\u001b[39m\u001b[33m\"\u001b[39m, (i, texto, vector))\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Crear el índice HNSW en la columna embedding\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m38\u001b[39m \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCREATE INDEX vss_idx ON textos USING HNSW (embedding);\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 40\u001b[39m \u001b[38;5;66;03m# Realizar una búsqueda de los 3 textos más similares a una consulta\u001b[39;00m\n\u001b[32m 41\u001b[39m consulta = \u001b[33m\"\u001b[39m\u001b[33minteligencia artificial en Python\u001b[39m\u001b[33m\"\u001b[39m\n",
|
|
"\u001b[31mBinderException\u001b[39m: Binder Error: HNSW indexes can only be created in in-memory databases, or when the configuration option 'hnsw_enable_experimental_persistence' is set to true."
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import duckdb\n",
"import numpy as np\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Connect to the on-disk database\n",
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
"\n",
"# Install and load the VSS extension\n",
"conn.execute(\"INSTALL vss;\")\n",
"conn.execute(\"LOAD vss;\")\n",
"\n",
"# A file-backed database rejects CREATE INDEX ... USING HNSW unless this\n",
"# experimental option is enabled, so set it on this connection before the\n",
"# index is built.\n",
"conn.execute(\"SET hnsw_enable_experimental_persistence = true;\")\n",
"\n",
"# Embedding model\n",
"model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"\n",
"# Texts to index\n",
"extractos = [\n",
"    \"El rápido zorro marrón salta sobre el perro perezoso.\",\n",
"    \"La inteligencia artificial está revolucionando el mundo.\",\n",
"    \"DuckDB permite análisis embebidos rápidos en Python.\",\n",
"    \"Los modelos de lenguaje pueden entender consultas humanas.\",\n",
"    \"Python es un lenguaje poderoso para ciencia de datos.\"\n",
"]\n",
"\n",
"# Encode every text in one batch; the embedding width sizes the array column\n",
"vectores = model.encode(extractos).astype(np.float32)\n",
"dim = vectores.shape[1]\n",
"\n",
"# HNSW needs a fixed-size FLOAT[N] array column, not a variable-length FLOAT[] list.\n",
"# The table is rebuilt on every run so that re-executing this cell does not hit\n",
"# duplicate primary keys or an already existing index.\n",
"conn.execute(f\"\"\"\n",
"CREATE OR REPLACE TABLE textos (\n",
"    id INTEGER PRIMARY KEY,\n",
"    texto TEXT,\n",
"    embedding FLOAT[{dim}]\n",
");\n",
"\"\"\")\n",
"\n",
"# Insert the texts together with their embeddings\n",
"for i, (texto, vector) in enumerate(zip(extractos, vectores)):\n",
"    conn.execute(\"INSERT INTO textos VALUES (?, ?, ?)\", (i, texto, vector.tolist()))\n",
"\n",
"# Create the HNSW index on the embedding column\n",
"conn.execute(\"CREATE INDEX vss_idx ON textos USING HNSW (embedding);\")\n",
"\n",
"# Search for the 3 texts most similar to a query\n",
"consulta = \"inteligencia artificial en Python\"\n",
"vector_consulta = model.encode(consulta).astype(np.float32).tolist()\n",
"# The bound list is cast to FLOAT[N] so both array_distance arguments are\n",
"# arrays of the same size.\n",
"resultados = conn.execute(f\"\"\"\n",
"    SELECT id, texto, array_distance(embedding, ?::FLOAT[{dim}]) AS distancia\n",
"    FROM textos\n",
"    ORDER BY distancia ASC\n",
"    LIMIT 3;\n",
"\"\"\", (vector_consulta,)).fetchall()\n",
"\n",
"# Show the results\n",
"for id_, texto, distancia in resultados:\n",
"    print(f\"[ID: {id_}] Distancia: {distancia:.4f} → {texto}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "2f6059bd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda32d9730>"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
|
|
"conn.execute(\"SET hnsw_enable_experimental_persistence=true;\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "449bebc1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda322a5b0>"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"conn.execute(\"DELETE FROM textos;\")\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|