From d3ed8d1ee0ba61e935f0803e083f7583c17a5094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=A4usler?= <jan.haeusler@metropolitan-cities.com>
Date: Mon, 30 Jun 2025 04:05:29 +0200
Subject: [PATCH] fix: try pygame for playback

---
 speech_to_text.py | 118 ++++++++++++++++------------------------------
 1 file changed, 41 insertions(+), 77 deletions(-)

diff --git a/speech_to_text.py b/speech_to_text.py
index 5c6cb68..d666c9a 100644
--- a/speech_to_text.py
+++ b/speech_to_text.py
@@ -13,6 +13,7 @@ import numpy as np
 from typing import Optional, Dict, Any, Callable
 from gtts import gTTS
 import tempfile
+import pygame  # For smoother audio playback
 
 
 class SpeechToText:
@@ -33,6 +34,13 @@ class SpeechToText:
         self.recording_thread = None
         self.callback = None
         
+        # Initialize pygame for audio playback
+        if not pygame.get_init():
+            try:
+                pygame.init()
+            except Exception as e:
+                print(f"Warning: Failed to initialize pygame: {e}")
+        
         # Audio settings from config or defaults
         self.format = pyaudio.paInt16
         self.channels = 1
@@ -195,8 +203,12 @@ class SpeechToText:
             print(f"Error processing audio: {e}")
             
     def speak_text(self, text: str) -> None:
-        """Convert text to speech and play it back on the audio device."""
+        """Convert text to speech and play it back using pygame mixer (smoother playback)."""
         try:
+            # pygame.get_init() is already True after __init__; check the mixer itself
+            if not pygame.mixer.get_init():
+                pygame.mixer.init(frequency=self.rate, channels=self.channels)
+            
             print("Converting text to speech...")
             # Create a temporary file to store the TTS audio
             with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
@@ -206,98 +218,50 @@ class SpeechToText:
             tts = gTTS(text=text, lang=self.language)
             tts.save(temp_filename)
             
-            # Play the audio file
             print("Playing TTS feedback...")
             
-            # Convert MP3 to WAV with matching sample rate
-            wav_file = self._convert_mp3_to_wav(temp_filename)
-            if not wav_file:
-                print("Failed to convert speech audio for playback")
-                return
-                
-            # Open the audio file
-            wf = wave.open(wav_file, 'rb')
-            
-            # Use the same sample rate we know works for the device
             try:
-                output_stream = self.p.open(
-                    format=self.p.get_format_from_width(wf.getsampwidth()),
-                    channels=wf.getnchannels(),
-                    rate=self.rate,  # Use the known working sample rate
-                    output=True,
-                    output_device_index=self.device_index
-                )
+                # Use pygame mixer for smoother playback
+                pygame.mixer.music.set_volume(1.0)
+                pygame.mixer.music.load(temp_filename)
+                pygame.mixer.music.play()
+                
+                # Wait for playback to finish
+                while pygame.mixer.music.get_busy():
+                    # Using a short sleep to not consume CPU
+                    pygame.time.wait(100)  # Wait 100ms between checks
+                
+                print("TTS playback completed")
+                
             except Exception as e:
-                print(f"Failed to open audio output stream: {e}")
-                # Try again with device default settings
-                device_info = self.p.get_device_info_by_index(self.device_index)
-                output_stream = self.p.open(
-                    format=pyaudio.paInt16,
-                    channels=self.channels,
-                    rate=int(device_info['defaultSampleRate']),
-                    output=True,
-                    output_device_index=self.device_index
-                )
-                print(f"Using device default sample rate: {device_info['defaultSampleRate']}")
-            
-            # Play the audio with larger buffer for smoother playback
-            # Using larger chunk size and adding a small delay to allow the buffer to fill
-            chunk_size = 8192  # Increased from 1024 to reduce stuttering
-            
-            # Pre-buffer data for smoother playback
-            audio_data = []
-            while True:
-                data = wf.readframes(chunk_size)
-                if len(data) == 0:
-                    break
-                audio_data.append(data)
-            
-            # Reset file position for playback
-            wf.rewind()
-            
-            # Set a larger buffer size for output
-            buffer_size = chunk_size * 4
-            
-            print(f"Playing audio with buffer size {buffer_size} bytes")
-            
-            # Play the buffered data with lower CPU load
-            for data in audio_data:
-                output_stream.write(data)
-                # Small sleep to reduce CPU load and allow buffer to process
-                time.sleep(0.005)
-            
-            # Clean up resources
-            output_stream.stop_stream()
-            output_stream.close()
-            wf.close()
-            
-            # Remove temporary files
+                print(f"Error during pygame playback: {e}")
+                # Fall back to ffplay for playback
+                self._play_with_ffplay(temp_filename)
+                
+            # Remove temporary file
             try:
                 os.unlink(temp_filename)
-                os.unlink(wav_file)  # Also remove the WAV file
             except Exception:
                 pass
                 
         except Exception as e:
             print(f"Error generating or playing speech: {e}")
             
-    def _convert_mp3_to_wav(self, mp3_file: str) -> str:
-        """Convert MP3 to WAV format with correct sample rate for PyAudio."""
-        try:
-            import subprocess
-            wav_file = mp3_file.replace('.mp3', '.wav')
+
             
-            # Use ffmpeg to convert MP3 to WAV with the same sample rate as the recording
-            # This ensures the device can play it back properly
-            subprocess.call(['ffmpeg', '-y', '-i', mp3_file, '-ar', str(self.rate), 
-                           '-ac', str(self.channels), wav_file], 
+    def _play_with_ffplay(self, audio_file: str) -> None:
+        """Play audio file using ffplay as a fallback method."""
+        try:
+            print("Trying ffplay fallback playback...")
+            import subprocess
+            # The -nodisp flag disables the graphical window
+            # -autoexit will close ffplay when playback finishes
+            subprocess.call(['ffplay', '-nodisp', '-autoexit', audio_file], 
                            stdout=subprocess.DEVNULL, 
                            stderr=subprocess.DEVNULL)
-            print(f"Converted MP3 to WAV with sample rate {self.rate} Hz")
-            return wav_file
+            print("ffplay playback completed")
         except Exception as e:
-            print(f"Error converting MP3 to WAV: {e}")
-            return ""
+            print(f"Error using ffplay for playback: {e}")
             
     def cleanup(self) -> None:
         """Clean up resources."""