Add initial project files including Vosk model and audio recognition script
This commit is contained in:
@@ -0,0 +1 @@
|
||||
venv
|
||||
@@ -0,0 +1,192 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Comienza a hablar...\n",
|
||||
"analiza este comprador y comprueba que tan verídico puede ser \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[8], line 26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComienza a hablar...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m---> 26\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBUFFER\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Lee el flujo de audio del micrófono\u001b[39;00m\n\u001b[0;32m 28\u001b[0m \u001b[38;5;66;03m# Si no se recibe audio, sale del bucle\u001b[39;00m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
|
||||
"File \u001b[1;32mc:\\Users\\Tr4shhh\\.conda\\envs\\data\\Lib\\site-packages\\pyaudio\\__init__.py:570\u001b[0m, in \u001b[0;36mPyAudio.Stream.read\u001b[1;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_input:\n\u001b[0;32m 568\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNot input stream\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 569\u001b[0m paCanNotReadFromAnOutputOnlyStream)\n\u001b[1;32m--> 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_frames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 571\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_on_overflow\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"import pyaudio\n",
|
||||
"from vosk import Model, KaldiRecognizer\n",
|
||||
"import keyboard # Nueva biblioteca para escribir\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"BUFFER = 500 # Tamaño del buffer de audio\n",
|
||||
"\n",
|
||||
"# Carga el modelo de Vosk (español)\n",
|
||||
"model = Model(\"vosk-model-small-es-0.42\")\n",
|
||||
"\n",
|
||||
"# Inicializa PyAudio\n",
|
||||
"p = pyaudio.PyAudio()\n",
|
||||
"\n",
|
||||
"# Configura el micrófono (entrada de audio)\n",
|
||||
"stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4000)\n",
|
||||
"stream.start_stream()\n",
|
||||
"\n",
|
||||
"# Inicializa el reconocedor\n",
|
||||
"recognizer = KaldiRecognizer(model, 16000)\n",
|
||||
"\n",
|
||||
"print(\"Comienza a hablar...\")\n",
|
||||
"\n",
|
||||
"while True:\n",
|
||||
" data = stream.read(BUFFER, exception_on_overflow=False) # Lee el flujo de audio del micrófono\n",
|
||||
"\n",
|
||||
" # Si no se recibe audio, sale del bucle\n",
|
||||
" if len(data) == 0:\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" # Procesa cada fragmento de audio en tiempo real\n",
|
||||
" if recognizer.AcceptWaveform(data):\n",
|
||||
" result = recognizer.Result() # Obtiene la transcripción de ese fragmento\n",
|
||||
" text = eval(result)['text'].encode('utf-8').decode('utf-8')\n",
|
||||
" text = text + ' ' # Agrega un espacio al final para separar las palabras\n",
|
||||
" print(text)\n",
|
||||
" keyboard.write(text)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"El archivo vosk-model-small-es-0.42.zip ha sido descomprimido.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import zipfile\n",
|
||||
"\n",
|
||||
"# Nombre del archivo zip\n",
|
||||
"zip_filename = r'vosk-model-small-es-0.42.zip'\n",
|
||||
"\n",
|
||||
"# Descomprime el archivo zip en la misma carpeta\n",
|
||||
"with zipfile.ZipFile(zip_filename, 'r') as zip_ref:\n",
|
||||
" zip_ref.extractall('.')\n",
|
||||
"\n",
|
||||
"print(f\"El archivo {zip_filename} ha sido descomprimido.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ID: 0, Nombre: Microsoft Sound Mapper - Input, Canales de entrada: 2\n",
|
||||
"ID: 1, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
|
||||
"ID: 2, Nombre: Microsoft Sound Mapper - Output, Canales de entrada: 0\n",
|
||||
"ID: 3, Nombre: Speakers (2- High Definition Au, Canales de entrada: 0\n",
|
||||
"ID: 4, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
|
||||
"ID: 5, Nombre: Primary Sound Capture Driver, Canales de entrada: 2\n",
|
||||
"ID: 6, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
|
||||
"ID: 7, Nombre: Primary Sound Driver, Canales de entrada: 0\n",
|
||||
"ID: 8, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
|
||||
"ID: 9, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
|
||||
"ID: 10, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
|
||||
"ID: 11, Nombre: Speakers (2- High Definition Audio Device), Canales de entrada: 0\n",
|
||||
"ID: 12, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n",
|
||||
"ID: 13, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
|
||||
"ID: 14, Nombre: Output (NVIDIA High Definition Audio), Canales de entrada: 0\n",
|
||||
"ID: 15, Nombre: Speakers (HD Audio Speaker), Canales de entrada: 0\n",
|
||||
"ID: 16, Nombre: SPDIF Out (HD Audio SPDIF out), Canales de entrada: 0\n",
|
||||
"ID: 17, Nombre: Speakers (RAGEX40), Canales de entrada: 0\n",
|
||||
"ID: 18, Nombre: Microphone (RAGEX40), Canales de entrada: 1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pyaudio\n",
|
||||
"\n",
|
||||
"p = pyaudio.PyAudio()\n",
|
||||
"for i in range(p.get_device_count()):\n",
|
||||
" info = p.get_device_info_by_index(i)\n",
|
||||
" print(f\"ID: {i}, Nombre: {info['name']}, Canales de entrada: {info['maxInputChannels']}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Presiona una tecla para ver su nombre (Ctrl+C para salir):\n",
|
||||
"Tecla presionada: alt gr\n",
|
||||
"Tecla presionada: alt gr\n",
|
||||
"Tecla presionada: alt gr\n",
|
||||
"Tecla presionada: ñ\n",
|
||||
"Tecla presionada: alt gr\n",
|
||||
"Tecla presionada: ñ\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import keyboard\n",
|
||||
"\n",
|
||||
"print(\"Presiona una tecla para ver su nombre (Ctrl+C para salir):\")\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" while True:\n",
|
||||
" event = keyboard.read_event()\n",
|
||||
" if event.event_type == keyboard.KEY_DOWN:\n",
|
||||
" print(f\"Tecla presionada: {event.name}\")\n",
|
||||
"except KeyboardInterrupt:\n",
|
||||
" print(\"Finalizado.\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "data",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
vosk
|
||||
pyaudio
|
||||
keyboard
|
||||
@@ -0,0 +1,56 @@
|
||||
"""Push-to-toggle speech-to-text typing with Vosk.

Reads audio from the default microphone, transcribes it with the small
Spanish Vosk model and types each finished transcription into the focused
window through the ``keyboard`` library. Recognition is toggled on and off
with the 'alt gr' + 'ñ' key combination; Ctrl+C ends the program.
"""

import json
import os
import sys
import time

import keyboard
import pyaudio
from vosk import KaldiRecognizer, Model

BUFFER = 500  # Number of audio frames read from the microphone per iteration
SAMPLE_RATE = 16000  # Hz; shared by the input stream and the recognizer
POLL_INTERVAL = 0.01  # Seconds to sleep while idle so the loops do not spin the CPU
RECOGNITION_ACTIVE = False  # Initial recognition state (flipped by the hotkey)


def transcription_from_result(result_json):
    """Return the ``text`` field of a recognizer result JSON string.

    The result is parsed with ``json.loads`` rather than ``eval`` so the
    payload is never executed as Python code and JSON-only literals
    (``true``, ``false``, ``null``) do not raise ``NameError``.

    Args:
        result_json: JSON document as returned by ``recognizer.Result()``.

    Returns:
        The recognized text, or an empty string when the ``text`` key is
        missing.
    """
    return json.loads(result_json).get("text", "")


# Load the Vosk model (Spanish)
model = Model("vosk-model-small-es-0.42")

# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None

try:
    # Configure the microphone (audio input)
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=2000,
    )
    stream.start_stream()

    # Initialize the recognizer
    recognizer = KaldiRecognizer(model, SAMPLE_RATE)

    print("Presiona 'AltGr' + 'Ñ' para activar/desactivar el reconocimiento de voz...")

    while True:
        # Detect whether 'alt gr' and 'ñ' are held down together
        if keyboard.is_pressed('alt gr') and keyboard.is_pressed('ñ'):
            # Flip the recognition state and report it
            RECOGNITION_ACTIVE = not RECOGNITION_ACTIVE
            state = "activado" if RECOGNITION_ACTIVE else "desactivado"
            print(f"Reconocimiento de voz {state}.")

            # Wait for both keys to be released so one press toggles once;
            # sleep between polls instead of busy-waiting.
            while keyboard.is_pressed('alt gr') or keyboard.is_pressed('ñ'):
                time.sleep(POLL_INTERVAL)

        if not RECOGNITION_ACTIVE:
            # Nothing to process: yield the CPU before polling the hotkey again
            time.sleep(POLL_INTERVAL)
            continue

        # Read the next chunk of audio from the microphone
        data = stream.read(BUFFER, exception_on_overflow=False)

        # AcceptWaveform is truthy once a finished result is available
        if recognizer.AcceptWaveform(data):
            text = transcription_from_result(recognizer.Result())
            if text:  # Skip empty results so no stray spaces get typed
                text = text + ' '  # Trailing space separates consecutive phrases
                print(text)  # Print the transcription
                keyboard.write(text)  # Type the transcribed text

except KeyboardInterrupt:
    print("\nPrograma terminado por el usuario.")

finally:
    # Release the audio resources on every exit path, not only on Ctrl+C
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()
|
||||
@@ -0,0 +1 @@
|
||||
running
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
Copyright 2022-2050 AC Technologies LLC
|
||||
|
||||
Small Spanish model for Vosk
|
||||
|
||||
WER
|
||||
|
||||
%WER 42.63 [ 52779 / 123813, 4570 ins, 22135 del, 26074 sub ] exp/chain/tdnn/decode_test_call/wer_10_0.0
|
||||
%WER 16.02 [ 24405 / 152364, 3121 ins, 5007 del, 16277 sub ] exp/chain/tdnn/decode_test_cv/wer_11_0.0
|
||||
%WER 11.21 [ 9922 / 88499, 1811 ins, 1675 del, 6436 sub ] exp/chain/tdnn/decode_test_mls/wer_10_0.0
|
||||
%WER 16.72 [ 2410 / 14416, 445 ins, 696 del, 1269 sub ] exp/chain/tdnn/decode_test_mtedx/wer_10_0.0
|
||||
Binary file not shown.
@@ -0,0 +1,8 @@
|
||||
--sample-frequency=16000.0
|
||||
--use-energy=false
|
||||
--num-mel-bins=40
|
||||
--num-ceps=40
|
||||
--low-freq=20
|
||||
--high-freq=7600
|
||||
--allow-upsample=true
|
||||
--allow-downsample=true
|
||||
@@ -0,0 +1,10 @@
|
||||
--min-active=200
|
||||
--max-active=4000
|
||||
--beam=11.0
|
||||
--lattice-beam=4.0
|
||||
--acoustic-scale=1.0
|
||||
--frame-subsampling-factor=3
|
||||
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
|
||||
--endpoint.rule2.min-trailing-silence=0.5
|
||||
--endpoint.rule3.min-trailing-silence=1.0
|
||||
--endpoint.rule4.min-trailing-silence=2.0
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
7601
|
||||
7602
|
||||
7603
|
||||
7604
|
||||
7605
|
||||
7606
|
||||
7607
|
||||
7608
|
||||
7609
|
||||
7610
|
||||
@@ -0,0 +1,110 @@
|
||||
1 nonword
|
||||
2 begin
|
||||
3 end
|
||||
4 internal
|
||||
5 singleton
|
||||
6 nonword
|
||||
7 begin
|
||||
8 end
|
||||
9 internal
|
||||
10 singleton
|
||||
11 begin
|
||||
12 end
|
||||
13 internal
|
||||
14 singleton
|
||||
15 begin
|
||||
16 end
|
||||
17 internal
|
||||
18 singleton
|
||||
19 begin
|
||||
20 end
|
||||
21 internal
|
||||
22 singleton
|
||||
23 begin
|
||||
24 end
|
||||
25 internal
|
||||
26 singleton
|
||||
27 begin
|
||||
28 end
|
||||
29 internal
|
||||
30 singleton
|
||||
31 begin
|
||||
32 end
|
||||
33 internal
|
||||
34 singleton
|
||||
35 begin
|
||||
36 end
|
||||
37 internal
|
||||
38 singleton
|
||||
39 begin
|
||||
40 end
|
||||
41 internal
|
||||
42 singleton
|
||||
43 begin
|
||||
44 end
|
||||
45 internal
|
||||
46 singleton
|
||||
47 begin
|
||||
48 end
|
||||
49 internal
|
||||
50 singleton
|
||||
51 begin
|
||||
52 end
|
||||
53 internal
|
||||
54 singleton
|
||||
55 begin
|
||||
56 end
|
||||
57 internal
|
||||
58 singleton
|
||||
59 begin
|
||||
60 end
|
||||
61 internal
|
||||
62 singleton
|
||||
63 begin
|
||||
64 end
|
||||
65 internal
|
||||
66 singleton
|
||||
67 begin
|
||||
68 end
|
||||
69 internal
|
||||
70 singleton
|
||||
71 begin
|
||||
72 end
|
||||
73 internal
|
||||
74 singleton
|
||||
75 begin
|
||||
76 end
|
||||
77 internal
|
||||
78 singleton
|
||||
79 begin
|
||||
80 end
|
||||
81 internal
|
||||
82 singleton
|
||||
83 begin
|
||||
84 end
|
||||
85 internal
|
||||
86 singleton
|
||||
87 begin
|
||||
88 end
|
||||
89 internal
|
||||
90 singleton
|
||||
91 begin
|
||||
92 end
|
||||
93 internal
|
||||
94 singleton
|
||||
95 begin
|
||||
96 end
|
||||
97 internal
|
||||
98 singleton
|
||||
99 begin
|
||||
100 end
|
||||
101 internal
|
||||
102 singleton
|
||||
103 begin
|
||||
104 end
|
||||
105 internal
|
||||
106 singleton
|
||||
107 begin
|
||||
108 end
|
||||
109 internal
|
||||
110 singleton
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,3 @@
|
||||
[
|
||||
8.46276e+10 7.072285e+08 -1.289904e+10 5.671974e+09 -1.209136e+10 -2.034736e+10 -6.875634e+09 -1.557388e+10 -2.806233e+09 -3.674485e+09 -6.921794e+09 6.946679e+08 -8.52497e+09 -7.260168e+08 -4.343575e+09 -5.486674e+09 -1.710502e+09 -1.996049e+09 -2.165263e+09 -6.832425e+07 -9.641756e+08 5.967002e+07 -1.110601e+08 4.368464e+07 -1.341292e+07 5.138013e+08 1.807307e+08 -1.273191e+08 9.086873e+08 -7.03787e+08 6.228164e+08 2.102328e+08 4.091046e+07 1.103751e+09 1.753195e+07 6.810451e+08 3.860742e+08 -6.252538e+08 2.375626e+08 -4.708622e+08 8.625583e+08
|
||||
8.661931e+12 5.440279e+11 7.661625e+11 5.86159e+11 5.687293e+11 1.020851e+12 6.045613e+11 8.250028e+11 4.274593e+11 3.851017e+11 4.398129e+11 4.172123e+11 3.736421e+11 2.438057e+11 2.363456e+11 2.023758e+11 1.314776e+11 8.931146e+10 5.928187e+10 3.395453e+10 1.88439e+10 6.181701e+09 7.988108e+08 2.281808e+08 3.149188e+09 8.009975e+09 1.215929e+10 1.646801e+10 2.218979e+10 2.492957e+10 2.43543e+10 2.243619e+10 2.445217e+10 2.796862e+10 2.215869e+10 1.68782e+10 1.565755e+10 1.461083e+10 1.140419e+10 7.335776e+09 0 ]
|
||||
@@ -0,0 +1 @@
|
||||
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
|
||||
@@ -0,0 +1,2 @@
|
||||
--left-context=3
|
||||
--right-context=3
|
||||
Reference in New Issue
Block a user