Add initial project files including Vosk model and audio recognition script

This commit is contained in:
2024-11-30 20:42:31 +01:00
parent 20cdafb2bf
commit 7c416c5f85
21 changed files with 407 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
venv
+192
View File
@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Comienza a hablar...\n",
"analiza este comprador y comprueba que tan verídico puede ser \n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[8], line 26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComienza a hablar...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m---> 26\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBUFFER\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Lee el flujo de audio del micrófono\u001b[39;00m\n\u001b[0;32m 28\u001b[0m \u001b[38;5;66;03m# Si no se recibe audio, sale del bucle\u001b[39;00m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\Tr4shhh\\.conda\\envs\\data\\Lib\\site-packages\\pyaudio\\__init__.py:570\u001b[0m, in \u001b[0;36mPyAudio.Stream.read\u001b[1;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_input:\n\u001b[0;32m 568\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot input stream\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 569\u001b[0m paCanNotReadFromAnOutputOnlyStream)\n\u001b[1;32m--> 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_frames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 571\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import sys\n",
"import os\n",
"import json\n",
"import pyaudio\n",
"from vosk import Model, KaldiRecognizer\n",
"import keyboard  # Used to type the recognized text into the focused window\n",
"\n",
"\n",
"SAMPLE_RATE = 16000  # Hz; the microphone stream and the recognizer must agree on this\n",
"BUFFER = 500  # Frames read from the microphone per loop iteration\n",
"\n",
"# Load the Vosk model (Spanish) from the local folder\n",
"model = Model(\"vosk-model-small-es-0.42\")\n",
"\n",
"# Initialize PyAudio\n",
"p = pyaudio.PyAudio()\n",
"\n",
"# Open the microphone as a 16-bit mono input stream\n",
"stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE, input=True, frames_per_buffer=4000)\n",
"stream.start_stream()\n",
"\n",
"# Initialize the recognizer\n",
"recognizer = KaldiRecognizer(model, SAMPLE_RATE)\n",
"\n",
"print(\"Comienza a hablar...\")\n",
"\n",
"try:\n",
"    while True:\n",
"        data = stream.read(BUFFER, exception_on_overflow=False)  # Read audio from the microphone\n",
"\n",
"        # Leave the loop if no audio was received\n",
"        if len(data) == 0:\n",
"            break\n",
"\n",
"        # AcceptWaveform returns True once a full utterance has been decoded\n",
"        if recognizer.AcceptWaveform(data):\n",
"            # Result() is a JSON document; parse it as data instead of evaluating it as code\n",
"            text = json.loads(recognizer.Result()).get('text', '')\n",
"            if not text:\n",
"                continue  # Silence yields an empty transcript; do not type a lone space\n",
"            text = text + ' '  # Trailing space separates consecutive utterances\n",
"            print(text)\n",
"            keyboard.write(text)\n",
"except KeyboardInterrupt:\n",
"    # Interrupting the kernel is the normal way to stop this cell\n",
"    print(\"Finalizado.\")\n",
"finally:\n",
"    # Release the microphone on every exit path\n",
"    stream.stop_stream()\n",
"    stream.close()\n",
"    p.terminate()\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"El archivo vosk-model-small-es-0.42.zip ha sido descomprimido.\n"
]
}
],
"source": [
"import zipfile\n",
"\n",
"# Archive holding the Vosk model; the relative path is resolved against the working directory\n",
"zip_filename = 'vosk-model-small-es-0.42.zip'\n",
"\n",
"# Unpack every member into the current working directory\n",
"with zipfile.ZipFile(zip_filename) as model_archive:\n",
"    model_archive.extractall()\n",
"\n",
"print(f\"El archivo {zip_filename} ha sido descomprimido.\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID: 0, Nombre: Microsoft Sound Mapper - Input, Canales de entrada: 2\n",
"ID: 1, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
"ID: 2, Nombre: Microsoft Sound Mapper - Output, Canales de entrada: 0\n",
"ID: 3, Nombre: Speakers (2- High Definition Au, Canales de entrada: 0\n",
"ID: 4, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
"ID: 5, Nombre: Primary Sound Capture Driver, Canales de entrada: 2\n",
"ID: 6, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
"ID: 7, Nombre: Primary Sound Driver, Canales de entrada: 0\n",
"ID: 8, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
"ID: 9, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
"ID: 10, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
"ID: 11, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
"ID: 12, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
"ID: 13, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
"ID: 14, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
"ID: 15, Nombre: Speakers (HD Audio Speaker), Canales de entrada: 0\n",
"ID: 16, Nombre: SPDIF Out (HD Audio SPDIF out), Canales de entrada: 0\n",
"ID: 17, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
"ID: 18, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n"
]
}
],
"source": [
"import pyaudio\n",
"\n",
"# List every audio device PyAudio reports, with its index and input channel count.\n",
"# A dedicated instance is used and released here so the recognition cell's `p` is not replaced.\n",
"device_probe = pyaudio.PyAudio()\n",
"try:\n",
"    for i in range(device_probe.get_device_count()):\n",
"        info = device_probe.get_device_info_by_index(i)\n",
"        print(f\"ID: {i}, Nombre: {info['name']}, Canales de entrada: {info['maxInputChannels']}\")\n",
"finally:\n",
"    # Always release the instance, even if querying a device fails\n",
"    device_probe.terminate()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Presiona una tecla para ver su nombre (Ctrl+C para salir):\n",
"Tecla presionada: alt gr\n",
"Tecla presionada: alt gr\n",
"Tecla presionada: alt gr\n",
"Tecla presionada: ñ\n",
"Tecla presionada: alt gr\n",
"Tecla presionada: ñ\n"
]
}
],
"source": [
"import keyboard\n",
"\n",
"print(\"Presiona una tecla para ver su nombre (Ctrl+C para salir):\")\n",
"\n",
"# Print the name of each key as it goes down; other event types are ignored.\n",
"try:\n",
"    while True:\n",
"        event = keyboard.read_event()\n",
"        if event.event_type != keyboard.KEY_DOWN:\n",
"            continue\n",
"        print(f\"Tecla presionada: {event.name}\")\n",
"except KeyboardInterrupt:\n",
"    # Interrupting the kernel ends the probe\n",
"    print(\"Finalizado.\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "data",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
+3
View File
@@ -0,0 +1,3 @@
vosk
pyaudio
keyboard
+56
View File
@@ -0,0 +1,56 @@
import json
import os
import sys
import time

import keyboard
import pyaudio
from vosk import Model, KaldiRecognizer
# Push-to-toggle voice typing: audio from the default microphone is transcribed
# with Vosk and typed into the focused window while recognition is active.

SAMPLE_RATE = 16000  # Hz; the microphone stream and the recognizer must agree on this
BUFFER = 500  # Frames read from the microphone per loop iteration
RECOGNITION_ACTIVE = False  # Recognition starts disabled until the hotkey is pressed

# Load the Vosk model (Spanish) from the local folder
model = Model("vosk-model-small-es-0.42")

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open the microphone as a 16-bit mono input stream
stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE, input=True, frames_per_buffer=2000)
stream.start_stream()

# Initialize the recognizer
recognizer = KaldiRecognizer(model, SAMPLE_RATE)

print("Presiona 'AltGr' + 'Ñ' para activar/desactivar el reconocimiento de voz...")

try:
    while True:
        # Toggle recognition when 'AltGr' and 'Ñ' are held together
        if keyboard.is_pressed('alt gr') and keyboard.is_pressed('ñ'):
            RECOGNITION_ACTIVE = not RECOGNITION_ACTIVE
            state = "activado" if RECOGNITION_ACTIVE else "desactivado"
            print(f"Reconocimiento de voz {state}.")

            # Wait for both keys to be released so one press toggles only once;
            # sleep between polls instead of spinning at full CPU.
            while keyboard.is_pressed('alt gr') or keyboard.is_pressed('ñ'):
                time.sleep(0.01)

        # Always drain the microphone, even while inactive: the blocking read
        # paces this loop, and it prevents audio captured while recognition was
        # off from being transcribed once it is switched back on.
        data = stream.read(BUFFER, exception_on_overflow=False)
        if not RECOGNITION_ACTIVE:
            continue

        # AcceptWaveform returns True once a full utterance has been decoded
        if recognizer.AcceptWaveform(data):
            # Result() is a JSON document; parse it as data rather than
            # evaluating it as Python code.
            text = json.loads(recognizer.Result()).get("text", "")
            if text:  # Silence yields an empty transcript; do not type a lone space
                text = text + ' '  # Trailing space separates consecutive utterances
                print(text)
                keyboard.write(text)
except KeyboardInterrupt:
    print("\nPrograma terminado por el usuario.")
finally:
    # Release the audio resources on every exit path, not only on Ctrl+C
    stream.stop_stream()
    stream.close()
    p.terminate()
+1
View File
@@ -0,0 +1 @@
running
Binary file not shown.
Binary file not shown.
+10
View File
@@ -0,0 +1,10 @@
Copyright 2022-2050 AC Technologies LLC
Small Spanish model for Vosk
WER
%WER 42.63 [ 52779 / 123813, 4570 ins, 22135 del, 26074 sub ] exp/chain/tdnn/decode_test_call/wer_10_0.0
%WER 16.02 [ 24405 / 152364, 3121 ins, 5007 del, 16277 sub ] exp/chain/tdnn/decode_test_cv/wer_11_0.0
%WER 11.21 [ 9922 / 88499, 1811 ins, 1675 del, 6436 sub ] exp/chain/tdnn/decode_test_mls/wer_10_0.0
%WER 16.72 [ 2410 / 14416, 445 ins, 696 del, 1269 sub ] exp/chain/tdnn/decode_test_mtedx/wer_10_0.0
Binary file not shown.
+8
View File
@@ -0,0 +1,8 @@
--sample-frequency=16000.0
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-upsample=true
--allow-downsample=true
+10
View File
@@ -0,0 +1,10 @@
--min-active=200
--max-active=4000
--beam=11.0
--lattice-beam=4.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=1.0
--endpoint.rule4.min-trailing-silence=2.0
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
@@ -0,0 +1,110 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,3 @@
[
8.46276e+10 7.072285e+08 -1.289904e+10 5.671974e+09 -1.209136e+10 -2.034736e+10 -6.875634e+09 -1.557388e+10 -2.806233e+09 -3.674485e+09 -6.921794e+09 6.946679e+08 -8.52497e+09 -7.260168e+08 -4.343575e+09 -5.486674e+09 -1.710502e+09 -1.996049e+09 -2.165263e+09 -6.832425e+07 -9.641756e+08 5.967002e+07 -1.110601e+08 4.368464e+07 -1.341292e+07 5.138013e+08 1.807307e+08 -1.273191e+08 9.086873e+08 -7.03787e+08 6.228164e+08 2.102328e+08 4.091046e+07 1.103751e+09 1.753195e+07 6.810451e+08 3.860742e+08 -6.252538e+08 2.375626e+08 -4.708622e+08 8.625583e+08
8.661931e+12 5.440279e+11 7.661625e+11 5.86159e+11 5.687293e+11 1.020851e+12 6.045613e+11 8.250028e+11 4.274593e+11 3.851017e+11 4.398129e+11 4.172123e+11 3.736421e+11 2.438057e+11 2.363456e+11 2.023758e+11 1.314776e+11 8.931146e+10 5.928187e+10 3.395453e+10 1.88439e+10 6.181701e+09 7.988108e+08 2.281808e+08 3.149188e+09 8.009975e+09 1.215929e+10 1.646801e+10 2.218979e+10 2.492957e+10 2.43543e+10 2.243619e+10 2.445217e+10 2.796862e+10 2.215869e+10 1.68782e+10 1.565755e+10 1.461083e+10 1.140419e+10 7.335776e+09 0 ]
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,2 @@
--left-context=3
--right-context=3