diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ceb386 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +venv diff --git a/audio_a_texto.ipynb b/audio_a_texto.ipynb new file mode 100644 index 0000000..f92c0e5 --- /dev/null +++ b/audio_a_texto.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comienza a hablar...\n", + "analiza este comprador y comprueba que tan verídico puede ser \n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[8], line 26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComienza a hablar...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m---> 26\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBUFFER\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Lee el flujo de audio del micrófono\u001b[39;00m\n\u001b[0;32m 28\u001b[0m \u001b[38;5;66;03m# Si no se recibe audio, sale del bucle\u001b[39;00m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\Tr4shhh\\.conda\\envs\\data\\Lib\\site-packages\\pyaudio\\__init__.py:570\u001b[0m, in 
\u001b[0;36mPyAudio.Stream.read\u001b[1;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_input:\n\u001b[0;32m 568\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot input stream\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 569\u001b[0m paCanNotReadFromAnOutputOnlyStream)\n\u001b[1;32m--> 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_frames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 571\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import pyaudio\n", + "from vosk import Model, KaldiRecognizer\n", + "import keyboard # Nueva biblioteca para escribir\n", + "\n", + "\n", + "BUFFER = 500 # Tamaño del buffer de audio\n", + "\n", + "# Carga el modelo de Vosk (español)\n", + "model = Model(\"vosk-model-small-es-0.42\")\n", + "\n", + "# Inicializa PyAudio\n", + "p = pyaudio.PyAudio()\n", + "\n", + "# Configura el micrófono (entrada de audio)\n", + "stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4000)\n", + "stream.start_stream()\n", + "\n", + "# Inicializa el reconocedor\n", + "recognizer = KaldiRecognizer(model, 16000)\n", + "\n", + "print(\"Comienza a hablar...\")\n", + "\n", + "while True:\n", + " data = stream.read(BUFFER, exception_on_overflow=False) # Lee el flujo de audio del micrófono\n", + "\n", + " # Si no se recibe audio, sale del bucle\n", + " if 
len(data) == 0:\n", + " break\n", + "\n", + " # Procesa cada fragmento de audio en tiempo real\n", + " if recognizer.AcceptWaveform(data):\n", + " result = recognizer.Result() # Obtiene la transcripción de ese fragmento\n", + " text = eval(result)['text'].encode('utf-8').decode('utf-8')\n", + " text = text + ' ' # Agrega un espacio al final para separar las palabras\n", + " print(text)\n", + " keyboard.write(text)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "El archivo vosk-model-small-es-0.42.zip ha sido descomprimido.\n" + ] + } + ], + "source": [ + "import zipfile\n", + "\n", + "# Nombre del archivo zip\n", + "zip_filename = r'vosk-model-small-es-0.42.zip'\n", + "\n", + "# Descomprime el archivo zip en la misma carpeta\n", + "with zipfile.ZipFile(zip_filename, 'r') as zip_ref:\n", + " zip_ref.extractall('.')\n", + "\n", + "print(f\"El archivo {zip_filename} ha sido descomprimido.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ID: 0, Nombre: Microsoft Sound Mapper - Input, Canales de entrada: 2\n", + "ID: 1, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n", + "ID: 2, Nombre: Microsoft Sound Mapper - Output, Canales de entrada: 0\n", + "ID: 3, Nombre: Speakers (2- High Definition Au, Canales de entrada: 0\n", + "ID: 4, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n", + "ID: 5, Nombre: Primary Sound Capture Driver, Canales de entrada: 2\n", + "ID: 6, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n", + "ID: 7, Nombre: Primary Sound Driver, Canales de entrada: 0\n", + "ID: 8, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n", + "ID: 9, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n", + "ID: 10, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n", + "ID: 11, Nombre: Speakers (2- 
High Definition Audio Device), Canales de entrada: 0\n", + "ID: 12, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n", + "ID: 13, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n", + "ID: 14, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n", + "ID: 15, Nombre: Speakers (HD Audio Speaker), Canales de entrada: 0\n", + "ID: 16, Nombre: SPDIF Out (HD Audio SPDIF out), Canales de entrada: 0\n", + "ID: 17, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n", + "ID: 18, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n" + ] + } + ], + "source": [ + "import pyaudio\n", + "\n", + "p = pyaudio.PyAudio()\n", + "for i in range(p.get_device_count()):\n", + " info = p.get_device_info_by_index(i)\n", + " print(f\"ID: {i}, Nombre: {info['name']}, Canales de entrada: {info['maxInputChannels']}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Presiona una tecla para ver su nombre (Ctrl+C para salir):\n", + "Tecla presionada: alt gr\n", + "Tecla presionada: alt gr\n", + "Tecla presionada: alt gr\n", + "Tecla presionada: ñ\n", + "Tecla presionada: alt gr\n", + "Tecla presionada: ñ\n" + ] + } + ], + "source": [ + "import keyboard\n", + "\n", + "print(\"Presiona una tecla para ver su nombre (Ctrl+C para salir):\")\n", + "\n", + "try:\n", + " while True:\n", + " event = keyboard.read_event()\n", + " if event.event_type == keyboard.KEY_DOWN:\n", + " print(f\"Tecla presionada: {event.name}\")\n", + "except KeyboardInterrupt:\n", + " print(\"Finalizado.\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "data", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"""Toggleable Spanish voice dictation.

Captures audio from the default microphone, transcribes it offline with a
Vosk model and types the recognised text into whichever window has focus.
Pressing AltGr + Ñ turns recognition on or off; Ctrl+C ends the program.
"""
import json
import os
import sys
import time

BUFFER = 500  # Number of audio frames read from the microphone per iteration
RECOGNITION_ACTIVE = False  # Initial recognition state (the program starts paused)

SAMPLE_RATE = 16000  # Hz; must be the same for the input stream and the recognizer
FRAMES_PER_BUFFER = 2000  # Buffer size requested when opening the input stream
MODEL_PATH = "vosk-model-small-es-0.42"  # Directory holding the Spanish Vosk model
HOTKEY = ("alt gr", "ñ")  # Keys that must be held together to toggle recognition
POLL_INTERVAL = 0.01  # Seconds to sleep while idle so the loops do not spin the CPU


def extract_text(result_json):
    """Return the transcript stored under the ``"text"`` key of a Vosk result.

    Args:
        result_json: JSON document (as a string) produced by
            ``KaldiRecognizer.Result()``.

    Returns:
        The recognised text, or ``""`` when the result has no ``"text"`` key.

    Raises:
        json.JSONDecodeError: If ``result_json`` is not valid JSON.
    """
    # The result is JSON, so it is parsed with json.loads rather than eval():
    # eval() would execute arbitrary expressions and fails on JSON literals
    # such as null/true/false.
    return json.loads(result_json).get("text", "")


def main():
    """Run the dictation loop until the user presses Ctrl+C.

    Returns:
        Process exit status: 0 on a normal exit, 1 when the model directory
        is missing.
    """
    # Third-party backends are imported here so that the pure helpers above
    # stay importable on machines without audio or keyboard drivers.
    import keyboard
    import pyaudio
    from vosk import KaldiRecognizer, Model

    if not os.path.isdir(MODEL_PATH):
        print(f"No se encontró el modelo de Vosk en '{MODEL_PATH}'.", file=sys.stderr)
        return 1

    # Load the Vosk model (Spanish) and create the recognizer
    model = Model(MODEL_PATH)
    recognizer = KaldiRecognizer(model, SAMPLE_RATE)

    audio = pyaudio.PyAudio()
    stream = None
    active = RECOGNITION_ACTIVE

    try:
        # Open the microphone as a 16-bit mono input stream
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
        )
        stream.start_stream()

        print("Presiona 'AltGr' + 'Ñ' para activar/desactivar el reconocimiento de voz...")

        while True:
            # Toggle recognition when every key of the hotkey is held down
            if all(keyboard.is_pressed(key) for key in HOTKEY):
                active = not active
                state = "activado" if active else "desactivado"
                print(f"Reconocimiento de voz {state}.")

                # Wait for the keys to be released so a single press toggles
                # only once; sleep between polls instead of busy-waiting.
                while any(keyboard.is_pressed(key) for key in HOTKEY):
                    time.sleep(POLL_INTERVAL)

            if not active:
                # Nothing to process: yield the CPU until the next hotkey poll
                time.sleep(POLL_INTERVAL)
                continue

            # Read the next chunk of microphone audio
            data = stream.read(BUFFER, exception_on_overflow=False)

            # AcceptWaveform returns true once a full utterance is available
            if recognizer.AcceptWaveform(data):
                text = extract_text(recognizer.Result())
                if text:  # Skip silence so no stray spaces are typed
                    text = text + ' '  # Trailing space separates utterances
                    print(text)
                    keyboard.write(text)

    except KeyboardInterrupt:
        print("\nPrograma terminado por el usuario.")
    finally:
        # Release the audio device even when an unexpected error occurs
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()

    return 0


if __name__ == "__main__":
    sys.exit(main())
+--num-ceps=40 +--low-freq=20 +--high-freq=7600 +--allow-upsample=true +--allow-downsample=true diff --git a/vosk-model-small-es-0.42/conf/model.conf b/vosk-model-small-es-0.42/conf/model.conf new file mode 100644 index 0000000..5e66614 --- /dev/null +++ b/vosk-model-small-es-0.42/conf/model.conf @@ -0,0 +1,10 @@ +--min-active=200 +--max-active=4000 +--beam=11.0 +--lattice-beam=4.0 +--acoustic-scale=1.0 +--frame-subsampling-factor=3 +--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 +--endpoint.rule2.min-trailing-silence=0.5 +--endpoint.rule3.min-trailing-silence=1.0 +--endpoint.rule4.min-trailing-silence=2.0 diff --git a/vosk-model-small-es-0.42/graph/Gr.fst b/vosk-model-small-es-0.42/graph/Gr.fst new file mode 100644 index 0000000..cc6fc69 Binary files /dev/null and b/vosk-model-small-es-0.42/graph/Gr.fst differ diff --git a/vosk-model-small-es-0.42/graph/HCLr.fst b/vosk-model-small-es-0.42/graph/HCLr.fst new file mode 100644 index 0000000..52aa040 Binary files /dev/null and b/vosk-model-small-es-0.42/graph/HCLr.fst differ diff --git a/vosk-model-small-es-0.42/graph/disambig_tid.int b/vosk-model-small-es-0.42/graph/disambig_tid.int new file mode 100644 index 0000000..77c56f6 --- /dev/null +++ b/vosk-model-small-es-0.42/graph/disambig_tid.int @@ -0,0 +1,10 @@ +7601 +7602 +7603 +7604 +7605 +7606 +7607 +7608 +7609 +7610 diff --git a/vosk-model-small-es-0.42/graph/phones/word_boundary.int b/vosk-model-small-es-0.42/graph/phones/word_boundary.int new file mode 100644 index 0000000..09d5c8b --- /dev/null +++ b/vosk-model-small-es-0.42/graph/phones/word_boundary.int @@ -0,0 +1,110 @@ +1 nonword +2 begin +3 end +4 internal +5 singleton +6 nonword +7 begin +8 end +9 internal +10 singleton +11 begin +12 end +13 internal +14 singleton +15 begin +16 end +17 internal +18 singleton +19 begin +20 end +21 internal +22 singleton +23 begin +24 end +25 internal +26 singleton +27 begin +28 end +29 internal +30 singleton +31 begin +32 end +33 internal +34 singleton +35 begin +36 end +37 
internal +38 singleton +39 begin +40 end +41 internal +42 singleton +43 begin +44 end +45 internal +46 singleton +47 begin +48 end +49 internal +50 singleton +51 begin +52 end +53 internal +54 singleton +55 begin +56 end +57 internal +58 singleton +59 begin +60 end +61 internal +62 singleton +63 begin +64 end +65 internal +66 singleton +67 begin +68 end +69 internal +70 singleton +71 begin +72 end +73 internal +74 singleton +75 begin +76 end +77 internal +78 singleton +79 begin +80 end +81 internal +82 singleton +83 begin +84 end +85 internal +86 singleton +87 begin +88 end +89 internal +90 singleton +91 begin +92 end +93 internal +94 singleton +95 begin +96 end +97 internal +98 singleton +99 begin +100 end +101 internal +102 singleton +103 begin +104 end +105 internal +106 singleton +107 begin +108 end +109 internal +110 singleton diff --git a/vosk-model-small-es-0.42/ivector/final.dubm b/vosk-model-small-es-0.42/ivector/final.dubm new file mode 100644 index 0000000..1fa6379 Binary files /dev/null and b/vosk-model-small-es-0.42/ivector/final.dubm differ diff --git a/vosk-model-small-es-0.42/ivector/final.ie b/vosk-model-small-es-0.42/ivector/final.ie new file mode 100644 index 0000000..0884b7f Binary files /dev/null and b/vosk-model-small-es-0.42/ivector/final.ie differ diff --git a/vosk-model-small-es-0.42/ivector/final.mat b/vosk-model-small-es-0.42/ivector/final.mat new file mode 100644 index 0000000..6780ca2 Binary files /dev/null and b/vosk-model-small-es-0.42/ivector/final.mat differ diff --git a/vosk-model-small-es-0.42/ivector/global_cmvn.stats b/vosk-model-small-es-0.42/ivector/global_cmvn.stats new file mode 100644 index 0000000..1be324e --- /dev/null +++ b/vosk-model-small-es-0.42/ivector/global_cmvn.stats @@ -0,0 +1,3 @@ + [ + 8.46276e+10 7.072285e+08 -1.289904e+10 5.671974e+09 -1.209136e+10 -2.034736e+10 -6.875634e+09 -1.557388e+10 -2.806233e+09 -3.674485e+09 -6.921794e+09 6.946679e+08 -8.52497e+09 -7.260168e+08 -4.343575e+09 -5.486674e+09 
-1.710502e+09 -1.996049e+09 -2.165263e+09 -6.832425e+07 -9.641756e+08 5.967002e+07 -1.110601e+08 4.368464e+07 -1.341292e+07 5.138013e+08 1.807307e+08 -1.273191e+08 9.086873e+08 -7.03787e+08 6.228164e+08 2.102328e+08 4.091046e+07 1.103751e+09 1.753195e+07 6.810451e+08 3.860742e+08 -6.252538e+08 2.375626e+08 -4.708622e+08 8.625583e+08 + 8.661931e+12 5.440279e+11 7.661625e+11 5.86159e+11 5.687293e+11 1.020851e+12 6.045613e+11 8.250028e+11 4.274593e+11 3.851017e+11 4.398129e+11 4.172123e+11 3.736421e+11 2.438057e+11 2.363456e+11 2.023758e+11 1.314776e+11 8.931146e+10 5.928187e+10 3.395453e+10 1.88439e+10 6.181701e+09 7.988108e+08 2.281808e+08 3.149188e+09 8.009975e+09 1.215929e+10 1.646801e+10 2.218979e+10 2.492957e+10 2.43543e+10 2.243619e+10 2.445217e+10 2.796862e+10 2.215869e+10 1.68782e+10 1.565755e+10 1.461083e+10 1.140419e+10 7.335776e+09 0 ] diff --git a/vosk-model-small-es-0.42/ivector/online_cmvn.conf b/vosk-model-small-es-0.42/ivector/online_cmvn.conf new file mode 100644 index 0000000..7748a4a --- /dev/null +++ b/vosk-model-small-es-0.42/ivector/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/vosk-model-small-es-0.42/ivector/splice.conf b/vosk-model-small-es-0.42/ivector/splice.conf new file mode 100644 index 0000000..960cd2e --- /dev/null +++ b/vosk-model-small-es-0.42/ivector/splice.conf @@ -0,0 +1,2 @@ +--left-context=3 +--right-context=3