Add initial project files including Vosk model and audio recognition script

2024-11-30 20:42:31 +01:00
parent 20cdafb2bf
commit 7c416c5f85
21 changed files with 407 additions and 0 deletions
@@ -0,0 +1 @@
+venv
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comienza a hablar...\n",
+      "analiza este comprador y comprueba que tan verídico puede ser \n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[8], line 26\u001b[0m\n\u001b[0;32m     23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComienza a hablar...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m     25\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m---> 26\u001b[0m     data \u001b[38;5;241m=\u001b[39m \u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBUFFER\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# Lee el flujo de audio del micrófono\u001b[39;00m\n\u001b[0;32m     28\u001b[0m     \u001b[38;5;66;03m# Si no se recibe audio, sale del bucle\u001b[39;00m\n\u001b[0;32m     29\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
+      "File \u001b[1;32mc:\\Users\\Tr4shhh\\.conda\\envs\\data\\Lib\\site-packages\\pyaudio\\__init__.py:570\u001b[0m, in \u001b[0;36mPyAudio.Stream.read\u001b[1;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[0;32m    567\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_input:\n\u001b[0;32m    568\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot input stream\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    569\u001b[0m                   paCanNotReadFromAnOutputOnlyStream)\n\u001b[1;32m--> 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_frames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    571\u001b[0m \u001b[43m                      \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "import pyaudio\n",
+    "from vosk import Model, KaldiRecognizer\n",
+    "import keyboard  # Nueva biblioteca para escribir\n",
+    "\n",
+    "\n",
+    "BUFFER = 500 # Tamaño del buffer de audio\n",
+    "\n",
+    "# Carga el modelo de Vosk (español)\n",
+    "model = Model(\"vosk-model-small-es-0.42\")\n",
+    "\n",
+    "# Inicializa PyAudio\n",
+    "p = pyaudio.PyAudio()\n",
+    "\n",
+    "# Configura el micrófono (entrada de audio)\n",
+    "stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4000)\n",
+    "stream.start_stream()\n",
+    "\n",
+    "# Inicializa el reconocedor\n",
+    "recognizer = KaldiRecognizer(model, 16000)\n",
+    "\n",
+    "print(\"Comienza a hablar...\")\n",
+    "\n",
+    "while True:\n",
+    "    data = stream.read(BUFFER, exception_on_overflow=False)  # Lee el flujo de audio del micrófono\n",
+    "\n",
+    "    # Si no se recibe audio, sale del bucle\n",
+    "    if len(data) == 0:\n",
+    "        break\n",
+    "\n",
+    "    # Procesa cada fragmento de audio en tiempo real\n",
+    "    if recognizer.AcceptWaveform(data):\n",
+    "        result = recognizer.Result()  # Obtiene la transcripción de ese fragmento\n",
+    "        text = eval(result)['text'].encode('utf-8').decode('utf-8')\n",
+    "        text = text + ' ' # Agrega un espacio al final para separar las palabras\n",
+    "        print(text)\n",
+    "        keyboard.write(text)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "El archivo vosk-model-small-es-0.42.zip ha sido descomprimido.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import zipfile\n",
+    "\n",
+    "# Nombre del archivo zip\n",
+    "zip_filename = r'vosk-model-small-es-0.42.zip'\n",
+    "\n",
+    "# Descomprime el archivo zip en la misma carpeta\n",
+    "with zipfile.ZipFile(zip_filename, 'r') as zip_ref:\n",
+    "    zip_ref.extractall('.')\n",
+    "\n",
+    "print(f\"El archivo {zip_filename} ha sido descomprimido.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ID: 0, Nombre: Microsoft Sound Mapper - Input, Canales de entrada: 2\n",
+      "ID: 1, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
+      "ID: 2, Nombre: Microsoft Sound Mapper - Output, Canales de entrada: 0\n",
+      "ID: 3, Nombre: Speakers (2- High Definition Au, Canales de entrada: 0\n",
+      "ID: 4, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
+      "ID: 5, Nombre: Primary Sound Capture Driver, Canales de entrada: 2\n",
+      "ID: 6, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
+      "ID: 7, Nombre: Primary Sound Driver, Canales de entrada: 0\n",
+      "ID: 8, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
+      "ID: 9, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
+      "ID: 10, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
+      "ID: 11, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
+      "ID: 12, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
+      "ID: 13, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
+      "ID: 14, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
+      "ID: 15, Nombre: Speakers (HD Audio Speaker), Canales de entrada: 0\n",
+      "ID: 16, Nombre: SPDIF Out (HD Audio SPDIF out), Canales de entrada: 0\n",
+      "ID: 17, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
+      "ID: 18, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pyaudio\n",
+    "\n",
+    "p = pyaudio.PyAudio()\n",
+    "for i in range(p.get_device_count()):\n",
+    "    info = p.get_device_info_by_index(i)\n",
+    "    print(f\"ID: {i}, Nombre: {info['name']}, Canales de entrada: {info['maxInputChannels']}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Presiona una tecla para ver su nombre (Ctrl+C para salir):\n",
+      "Tecla presionada: alt gr\n",
+      "Tecla presionada: alt gr\n",
+      "Tecla presionada: alt gr\n",
+      "Tecla presionada: ñ\n",
+      "Tecla presionada: alt gr\n",
+      "Tecla presionada: ñ\n"
+     ]
+    }
+   ],
+   "source": [
+    "import keyboard\n",
+    "\n",
+    "print(\"Presiona una tecla para ver su nombre (Ctrl+C para salir):\")\n",
+    "\n",
+    "try:\n",
+    "    while True:\n",
+    "        event = keyboard.read_event()\n",
+    "        if event.event_type == keyboard.KEY_DOWN:\n",
+    "            print(f\"Tecla presionada: {event.name}\")\n",
+    "except KeyboardInterrupt:\n",
+    "    print(\"Finalizado.\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "data",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,3 @@
+vosk
+pyaudio
+keyboard
@@ -0,0 +1,56 @@
+import json
+import os
+import sys
+import time
+
+import keyboard
+import pyaudio
+from vosk import Model, KaldiRecognizer
+
+BUFFER = 500  # Tamaño del buffer de audio
+RECOGNITION_ACTIVE = False  # Estado inicial del reconocimiento
+
+# Carga el modelo de Vosk (español)
+model = Model("vosk-model-small-es-0.42")
+
+# Inicializa PyAudio
+p = pyaudio.PyAudio()
+
+# Configura el micrófono (entrada de audio)
+stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=2000)
+stream.start_stream()
+
+# Inicializa el reconocedor
+recognizer = KaldiRecognizer(model, 16000)
+
+print("Presiona 'AltGr' + 'Ñ' para activar/desactivar el reconocimiento de voz...")
+
+try:
+    while True:
+        # Detecta si 'AltGr' + 'Ñ' son presionadas juntas
+        if keyboard.is_pressed('alt gr') and keyboard.is_pressed('ñ'):
+            # Cambia el estado de activación del reconocimiento de voz
+            RECOGNITION_ACTIVE = not RECOGNITION_ACTIVE
+            state = "activado" if RECOGNITION_ACTIVE else "desactivado"
+            print(f"Reconocimiento de voz {state}.")
+            
+            # Espera a que se suelten las teclas antes de continuar
+            while keyboard.is_pressed('alt gr') or keyboard.is_pressed('ñ'):
+                pass
+
+        # Si el reconocimiento está activado, procesa el audio
+        if RECOGNITION_ACTIVE:
+            data = stream.read(BUFFER, exception_on_overflow=False)  # Lee el flujo de audio del micrófono
+
+            # Procesa cada fragmento de audio en tiempo real
+            if recognizer.AcceptWaveform(data):
+                result = recognizer.Result()  # Obtiene la transcripción de ese fragmento
+                text = eval(result)['text'].encode('utf-8').decode('utf-8')
+                text = text + ' '  # Agrega un espacio al final para separar las palabras
+                print(text)  # Imprime la transcripción
+                keyboard.write(text)  # Escribe el texto transcrito
+
+except KeyboardInterrupt:
+    print("\nPrograma terminado por el usuario.")
+
+# Cierra los recursos
+stream.stop_stream()
+stream.close()
+p.terminate()
@@ -0,0 +1 @@
+running
@@ -0,0 +1,10 @@
+Copyright 2022-2050 AC Technologies LLC
+
+Small Spanish model for Vosk
+
+WER
+
+%WER 42.63 [ 52779 / 123813, 4570 ins, 22135 del, 26074 sub ] exp/chain/tdnn/decode_test_call/wer_10_0.0
+%WER 16.02 [ 24405 / 152364, 3121 ins, 5007 del, 16277 sub ] exp/chain/tdnn/decode_test_cv/wer_11_0.0
+%WER 11.21 [ 9922 / 88499, 1811 ins, 1675 del, 6436 sub ] exp/chain/tdnn/decode_test_mls/wer_10_0.0
+%WER 16.72 [ 2410 / 14416, 445 ins, 696 del, 1269 sub ] exp/chain/tdnn/decode_test_mtedx/wer_10_0.0
@@ -0,0 +1,8 @@
+--sample-frequency=16000.0
+--use-energy=false
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=20
+--high-freq=7600
+--allow-upsample=true
+--allow-downsample=true
@@ -0,0 +1,10 @@
+--min-active=200
+--max-active=4000
+--beam=11.0
+--lattice-beam=4.0
+--acoustic-scale=1.0
+--frame-subsampling-factor=3
+--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
+--endpoint.rule2.min-trailing-silence=0.5
+--endpoint.rule3.min-trailing-silence=1.0
+--endpoint.rule4.min-trailing-silence=2.0
@@ -0,0 +1,10 @@
+7601
+7602
+7603
+7604
+7605
+7606
+7607
+7608
+7609
+7610
@@ -0,0 +1,110 @@
+1 nonword
+2 begin
+3 end
+4 internal
+5 singleton
+6 nonword
+7 begin
+8 end
+9 internal
+10 singleton
+11 begin
+12 end
+13 internal
+14 singleton
+15 begin
+16 end
+17 internal
+18 singleton
+19 begin
+20 end
+21 internal
+22 singleton
+23 begin
+24 end
+25 internal
+26 singleton
+27 begin
+28 end
+29 internal
+30 singleton
+31 begin
+32 end
+33 internal
+34 singleton
+35 begin
+36 end
+37 internal
+38 singleton
+39 begin
+40 end
+41 internal
+42 singleton
+43 begin
+44 end
+45 internal
+46 singleton
+47 begin
+48 end
+49 internal
+50 singleton
+51 begin
+52 end
+53 internal
+54 singleton
+55 begin
+56 end
+57 internal
+58 singleton
+59 begin
+60 end
+61 internal
+62 singleton
+63 begin
+64 end
+65 internal
+66 singleton
+67 begin
+68 end
+69 internal
+70 singleton
+71 begin
+72 end
+73 internal
+74 singleton
+75 begin
+76 end
+77 internal
+78 singleton
+79 begin
+80 end
+81 internal
+82 singleton
+83 begin
+84 end
+85 internal
+86 singleton
+87 begin
+88 end
+89 internal
+90 singleton
+91 begin
+92 end
+93 internal
+94 singleton
+95 begin
+96 end
+97 internal
+98 singleton
+99 begin
+100 end
+101 internal
+102 singleton
+103 begin
+104 end
+105 internal
+106 singleton
+107 begin
+108 end
+109 internal
+110 singleton
@@ -0,0 +1,3 @@
+ [
+  8.46276e+10 7.072285e+08 -1.289904e+10 5.671974e+09 -1.209136e+10 -2.034736e+10 -6.875634e+09 -1.557388e+10 -2.806233e+09 -3.674485e+09 -6.921794e+09 6.946679e+08 -8.52497e+09 -7.260168e+08 -4.343575e+09 -5.486674e+09 -1.710502e+09 -1.996049e+09 -2.165263e+09 -6.832425e+07 -9.641756e+08 5.967002e+07 -1.110601e+08 4.368464e+07 -1.341292e+07 5.138013e+08 1.807307e+08 -1.273191e+08 9.086873e+08 -7.03787e+08 6.228164e+08 2.102328e+08 4.091046e+07 1.103751e+09 1.753195e+07 6.810451e+08 3.860742e+08 -6.252538e+08 2.375626e+08 -4.708622e+08 8.625583e+08 
+  8.661931e+12 5.440279e+11 7.661625e+11 5.86159e+11 5.687293e+11 1.020851e+12 6.045613e+11 8.250028e+11 4.274593e+11 3.851017e+11 4.398129e+11 4.172123e+11 3.736421e+11 2.438057e+11 2.363456e+11 2.023758e+11 1.314776e+11 8.931146e+10 5.928187e+10 3.395453e+10 1.88439e+10 6.181701e+09 7.988108e+08 2.281808e+08 3.149188e+09 8.009975e+09 1.215929e+10 1.646801e+10 2.218979e+10 2.492957e+10 2.43543e+10 2.243619e+10 2.445217e+10 2.796862e+10 2.215869e+10 1.68782e+10 1.565755e+10 1.461083e+10 1.140419e+10 7.335776e+09 0 ]
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,2 @@
+--left-context=3
+--right-context=3
				`@@ -0,0 +1 @@`
				`# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh`