From d3ed8d1ee0ba61e935f0803e083f7583c17a5094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=A4usler?= Date: Mon, 30 Jun 2025 04:05:29 +0200 Subject: [PATCH] fix: try pygame for playback --- speech_to_text.py | 118 ++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 77 deletions(-) diff --git a/speech_to_text.py b/speech_to_text.py index 5c6cb68..d666c9a 100644 --- a/speech_to_text.py +++ b/speech_to_text.py @@ -13,6 +13,7 @@ import numpy as np from typing import Optional, Dict, Any, Callable from gtts import gTTS import tempfile +import pygame # For smoother audio playback class SpeechToText: @@ -33,6 +34,13 @@ class SpeechToText: self.recording_thread = None self.callback = None + # Initialize pygame for audio playback + if not pygame.get_init(): + try: + pygame.init() + except Exception as e: + print(f"Warning: Failed to initialize pygame: {e}") + # Audio settings from config or defaults self.format = pyaudio.paInt16 self.channels = 1 @@ -195,8 +203,12 @@ class SpeechToText: print(f"Error processing audio: {e}") def speak_text(self, text: str) -> None: - """Convert text to speech and play it back on the audio device.""" + """Convert text to speech and play it back using pygame mixer (smoother playback).""" try: + # Initialize pygame mixer if not already done + if not pygame.get_init(): + pygame.mixer.init(frequency=self.rate, channels=self.channels) + print("Converting text to speech...") # Create a temporary file to store the TTS audio with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file: @@ -206,98 +218,50 @@ class SpeechToText: tts = gTTS(text=text, lang=self.language) tts.save(temp_filename) - # Play the audio file print("Playing TTS feedback...") - # Convert MP3 to WAV with matching sample rate - wav_file = self._convert_mp3_to_wav(temp_filename) - if not wav_file: - print("Failed to convert speech audio for playback") - return - - # Open the audio file - wf = wave.open(wav_file, 'rb') - - # Use the same sample rate we know works for the device try: - output_stream = self.p.open( - format=self.p.get_format_from_width(wf.getsampwidth()), - channels=wf.getnchannels(), - rate=self.rate, # Use the known working sample rate - output=True, - output_device_index=self.device_index - ) + # Use pygame mixer for smoother playback + pygame.mixer.music.set_volume(1.0) + pygame.mixer.music.load(temp_filename) + pygame.mixer.music.play() + + # Wait for playback to finish + while pygame.mixer.music.get_busy(): + # Using a short sleep to not consume CPU + pygame.time.wait(100) # Wait 100ms between checks + + print("TTS playback completed") + except Exception as e: - print(f"Failed to open audio output stream: {e}") - # Try again with device default settings - device_info = self.p.get_device_info_by_index(self.device_index) - output_stream = self.p.open( - format=pyaudio.paInt16, - channels=self.channels, - rate=int(device_info['defaultSampleRate']), - output=True, - output_device_index=self.device_index - ) - print(f"Using device default sample rate: {device_info['defaultSampleRate']}") - - # Play the audio with larger buffer for smoother playback - # Using larger chunk size and adding a small delay to allow the buffer to fill - chunk_size = 8192 # Increased from 1024 to reduce stuttering - - # Pre-buffer data for smoother playback - audio_data = [] - while True: - data = wf.readframes(chunk_size) - if len(data) == 0: - break - audio_data.append(data) - - # Reset file position for playback - wf.rewind() - - # Set a larger buffer size for output - buffer_size = chunk_size * 4 - - print(f"Playing audio with buffer size {buffer_size} bytes") - - # Play the buffered data with lower CPU load - for data in audio_data: - output_stream.write(data) - # Small sleep to reduce CPU load and allow buffer to process - time.sleep(0.005) - - # Clean up resources - output_stream.stop_stream() - output_stream.close() - wf.close() - - # Remove temporary files + print(f"Error during pygame playback: {e}") + # Fall back to ffplay for playback + self._play_with_ffplay(temp_filename) + + # Remove temporary file try: os.unlink(temp_filename) - os.unlink(wav_file) # Also remove the WAV file except Exception: pass except Exception as e: print(f"Error generating or playing speech: {e}") - def _convert_mp3_to_wav(self, mp3_file: str) -> str: - """Convert MP3 to WAV format with correct sample rate for PyAudio.""" - try: - import subprocess - wav_file = mp3_file.replace('.mp3', '.wav') + - # Use ffmpeg to convert MP3 to WAV with the same sample rate as the recording - # This ensures the device can play it back properly - subprocess.call(['ffmpeg', '-y', '-i', mp3_file, '-ar', str(self.rate), - '-ac', str(self.channels), wav_file], + def _play_with_ffplay(self, audio_file: str) -> None: + """Play audio file using ffplay as a fallback method.""" + try: + print("Trying ffplay fallback playback...") + import subprocess + # The -nodisp flag disables the graphical window + # -autoexit will close ffplay when playback finishes + subprocess.call(['ffplay', '-nodisp', '-autoexit', audio_file], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - print(f"Converted MP3 to WAV with sample rate {self.rate} Hz") - return wav_file + print("ffplay playback completed") except Exception as e: - print(f"Error converting MP3 to WAV: {e}") - return "" + print(f"Error using ffplay for playback: {e}") def cleanup(self) -> None: """Clean up resources."""