This repository has been archived on 2025-11-27. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Fitz_Studio/pruebas_conceptos/duckdb/prueba_duckdb.ipynb
T
2025-05-05 02:21:55 +02:00

182 lines
5.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "36832aa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.2.2\n"
]
}
],
"source": [
"import duckdb\n",
"print(duckdb.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c357d895",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda135ac30>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Open the database in this cell so it does not rely on a `conn` variable\n",
"# left over from an earlier kernel session (NameError on Restart & Run All).\n",
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
"\n",
"# Install (first run only downloads it) and load the VSS extension.\n",
"conn.execute(\"INSTALL vss; LOAD vss;\")\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4a7a3ffb",
"metadata": {},
"outputs": [
{
"ename": "BinderException",
"evalue": "Binder Error: HNSW indexes can only be created in in-memory databases, or when the configuration option 'hnsw_enable_experimental_persistence' is set to true.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mBinderException\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 38\u001b[39m\n\u001b[32m 35\u001b[39m conn.execute(\u001b[33m\"\u001b[39m\u001b[33mINSERT INTO textos VALUES (?, ?, ?)\u001b[39m\u001b[33m\"\u001b[39m, (i, texto, vector))\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Crear el índice HNSW en la columna embedding\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m38\u001b[39m \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mCREATE INDEX vss_idx ON textos USING HNSW (embedding);\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 40\u001b[39m \u001b[38;5;66;03m# Realizar una búsqueda de los 3 textos más similares a una consulta\u001b[39;00m\n\u001b[32m 41\u001b[39m consulta = \u001b[33m\"\u001b[39m\u001b[33minteligencia artificial en Python\u001b[39m\u001b[33m\"\u001b[39m\n",
"\u001b[31mBinderException\u001b[39m: Binder Error: HNSW indexes can only be created in in-memory databases, or when the configuration option 'hnsw_enable_experimental_persistence' is set to true."
]
}
],
"source": [
"import duckdb\n",
"import numpy as np\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Open (or create) the on-disk database file.\n",
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
"\n",
"# Install and load the VSS (vector similarity search) extension.\n",
"conn.execute(\"INSTALL vss;\")\n",
"conn.execute(\"LOAD vss;\")\n",
"\n",
"# DuckDB refuses to create HNSW indexes in a file-backed database unless this\n",
"# experimental option is enabled. It must be set on this connection, after the\n",
"# extension that defines it has been loaded.\n",
"conn.execute(\"SET hnsw_enable_experimental_persistence = true;\")\n",
"\n",
"# Embedding model; its output size fixes the width of the ARRAY column.\n",
"model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"embedding_dim = model.get_sentence_embedding_dimension()\n",
"\n",
"# HNSW indexes and array_distance need a fixed-size FLOAT[N] array column,\n",
"# not a variable-length FLOAT[] list.\n",
"# CREATE OR REPLACE drops any earlier copy of the table (and its index), so\n",
"# re-running this cell does not hit primary-key or duplicate-index errors.\n",
"# NOTE: this discards whatever rows `textos` held before the cell ran.\n",
"conn.execute(f\"\"\"\n",
"CREATE OR REPLACE TABLE textos (\n",
"    id INTEGER PRIMARY KEY,\n",
"    texto TEXT,\n",
"    embedding FLOAT[{embedding_dim}]\n",
");\n",
"\"\"\")\n",
"\n",
"# Sample texts to index.\n",
"extractos = [\n",
"    \"El rápido zorro marrón salta sobre el perro perezoso.\",\n",
"    \"La inteligencia artificial está revolucionando el mundo.\",\n",
"    \"DuckDB permite análisis embebidos rápidos en Python.\",\n",
"    \"Los modelos de lenguaje pueden entender consultas humanas.\",\n",
"    \"Python es un lenguaje poderoso para ciencia de datos.\"\n",
"]\n",
"\n",
"# Encode all texts in one batch and insert them with a single executemany call.\n",
"# The explicit cast turns the bound Python list into the fixed-size array type.\n",
"vectores = model.encode(extractos).astype(np.float32)\n",
"filas = [\n",
"    (i, texto, vector.tolist())\n",
"    for i, (texto, vector) in enumerate(zip(extractos, vectores))\n",
"]\n",
"conn.executemany(\n",
"    f\"INSERT INTO textos VALUES (?, ?, ?::FLOAT[{embedding_dim}])\",\n",
"    filas,\n",
")\n",
"\n",
"# Build the HNSW index on the embedding column.\n",
"conn.execute(\"CREATE INDEX vss_idx ON textos USING HNSW (embedding);\")\n",
"\n",
"# Find the 3 texts closest to the query; the query vector is cast to the same\n",
"# fixed-size array type as the column so array_distance accepts it.\n",
"consulta = \"inteligencia artificial en Python\"\n",
"vector_consulta = model.encode(consulta).astype(np.float32).tolist()\n",
"resultados = conn.execute(f\"\"\"\n",
"    SELECT id, texto, array_distance(embedding, ?::FLOAT[{embedding_dim}]) AS distancia\n",
"    FROM textos\n",
"    ORDER BY distancia ASC\n",
"    LIMIT 3;\n",
"\"\"\", (vector_consulta,)).fetchall()\n",
"\n",
"# Show the results, nearest first.\n",
"for id_, texto, distancia in resultados:\n",
"    print(f\"[ID: {id_}] Distancia: {distancia:.4f} → {texto}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2f6059bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda32d9730>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"conn = duckdb.connect(\"textos_vss.duckdb\")\n",
"conn.execute(\"SET hnsw_enable_experimental_persistence=true;\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "449bebc1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x1cda322a5b0>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"conn.execute(\"DELETE FROM textos;\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}