{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# Semantic text search with DuckDB VSS\n",
    "\n",
    "Embeds a handful of Spanish sentences with `sentence-transformers`, stores them in a file-backed DuckDB database, builds an HNSW index with the `vss` extension, and retrieves the sentences closest to a query.\n",
    "\n",
    "**Notes**\n",
    "\n",
    "- Developed against DuckDB 1.2.2 (the imports cell prints the version in use).\n",
    "- DuckDB refuses to create HNSW indexes in a file-backed database unless `hnsw_enable_experimental_persistence` is enabled. The feature is flagged experimental, so treat the database file as disposable.\n",
    "- The `textos` table is rebuilt on every run, so the notebook survives **Restart Kernel → Run All**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36832aa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "import numpy as np\n",
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "print(duckdb.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "DB_PATH = \"textos_vss.duckdb\"\n",
    "MODEL_NAME = \"all-MiniLM-L6-v2\"\n",
    "TOP_K = 3  # number of nearest sentences returned by the search"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-connect",
   "metadata": {},
   "source": [
    "## Connect and enable the VSS extension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c357d895",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to the database file (created if it does not exist yet).\n",
    "conn = duckdb.connect(DB_PATH)\n",
    "\n",
    "# Install and load the VSS extension, which provides the HNSW index type.\n",
    "conn.execute(\"INSTALL vss;\")\n",
    "conn.execute(\"LOAD vss;\")\n",
    "\n",
    "# Must be set before CREATE INDEX: without it DuckDB raises a BinderException\n",
    "# when an HNSW index is created in a database that is not in-memory.\n",
    "conn.execute(\"SET hnsw_enable_experimental_persistence = true;\");"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-model",
   "metadata": {},
   "source": [
    "## Embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "model-load",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer(MODEL_NAME)\n",
    "\n",
    "# The embedding column must be a fixed-size array, so its width is taken from\n",
    "# the model instead of being hard-coded.\n",
    "EMBEDDING_DIM = int(model.get_sentence_embedding_dimension())\n",
    "EMBEDDING_DIM"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-load",
   "metadata": {},
   "source": [
    "## Create the table and insert the embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a7a3ffb",
   "metadata": {},
   "outputs": [],
   "source": [
    "extractos = [\n",
    "    \"El rápido zorro marrón salta sobre el perro perezoso.\",\n",
    "    \"La inteligencia artificial está revolucionando el mundo.\",\n",
    "    \"DuckDB permite análisis embebidos rápidos en Python.\",\n",
    "    \"Los modelos de lenguaje pueden entender consultas humanas.\",\n",
    "    \"Python es un lenguaje poderoso para ciencia de datos.\"\n",
    "]\n",
    "\n",
    "# Recreate the table on every run. This keeps the cell idempotent (no\n",
    "# primary-key clashes with rows from an earlier run) and replaces tables that\n",
    "# were created with a variable-length FLOAT[] column, which cannot be indexed.\n",
    "conn.execute(f\"\"\"\n",
    "CREATE OR REPLACE TABLE textos (\n",
    "    id INTEGER PRIMARY KEY,\n",
    "    texto TEXT,\n",
    "    embedding FLOAT[{EMBEDDING_DIM}]\n",
    ");\n",
    "\"\"\")\n",
    "\n",
    "# Encode all sentences in one batch; each row is (id, text, vector).\n",
    "vectores = model.encode(extractos).astype(np.float32)\n",
    "filas = [\n",
    "    (i, texto, vector.tolist())\n",
    "    for i, (texto, vector) in enumerate(zip(extractos, vectores))\n",
    "]\n",
    "conn.executemany(\n",
    "    f\"INSERT INTO textos VALUES (?, ?, ?::FLOAT[{EMBEDDING_DIM}])\", filas\n",
    ")\n",
    "\n",
    "n_filas = conn.execute(\"SELECT count(*) FROM textos\").fetchone()[0]\n",
    "assert n_filas == len(extractos), \"not every sentence was inserted\"\n",
    "n_filas"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-index",
   "metadata": {},
   "source": [
    "## Build the HNSW index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "index-create",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the HNSW index on the embedding column. IF NOT EXISTS lets this cell\n",
    "# be re-run on its own without failing on the existing index.\n",
    "conn.execute(\"CREATE INDEX IF NOT EXISTS vss_idx ON textos USING HNSW (embedding);\");"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-search",
   "metadata": {},
   "source": [
    "## Nearest-neighbour search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f6059bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the TOP_K sentences closest to the query.\n",
    "consulta = \"inteligencia artificial en Python\"\n",
    "vector_consulta = model.encode(consulta).astype(np.float32).tolist()\n",
    "\n",
    "# The query vector arrives as a list and is cast to the column's fixed-size\n",
    "# array type so that array_distance accepts it.\n",
    "resultados = conn.execute(f\"\"\"\n",
    "    SELECT id, texto, array_distance(embedding, ?::FLOAT[{EMBEDDING_DIM}]) AS distancia\n",
    "    FROM textos\n",
    "    ORDER BY distancia ASC\n",
    "    LIMIT {int(TOP_K)};\n",
    "\"\"\", (vector_consulta,)).fetchall()\n",
    "\n",
    "# Show the results, closest first.\n",
    "for id_, texto, distancia in resultados:\n",
    "    print(f\"[ID: {id_}] Distancia: {distancia:.4f} → {texto}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-cleanup",
   "metadata": {},
   "source": [
    "## Clean up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "449bebc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release the database file so other processes can open it.\n",
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}