Skip to content

Commit

Permalink
speechmatics initial setup as model selection
Browse files Browse the repository at this point in the history
  • Loading branch information
josancamon19 committed Sep 19, 2024
1 parent 59072b1 commit 76d0eb7
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 4 deletions.
4 changes: 4 additions & 0 deletions app/lib/backend/http/api/memories.dart
Original file line number Diff line number Diff line change
Expand Up @@ -188,18 +188,22 @@ class TranscriptsResponse {
List<TranscriptSegment> deepgram;
List<TranscriptSegment> soniox;
List<TranscriptSegment> whisperx;
List<TranscriptSegment> speechmatics;

/// Creates a response holding one transcript-segment list per STT provider.
///
/// Every provider list defaults to empty so a response missing a provider
/// (e.g. `speechmatics` on older backends) is still representable.
TranscriptsResponse({
this.deepgram = const [],
this.soniox = const [],
this.whisperx = const [],
this.speechmatics = const [],
});

/// Parses a [TranscriptsResponse] from the backend JSON payload.
///
/// A missing or `null` provider key yields an empty segment list instead of
/// throwing a cast error, so responses from backends that do not yet emit
/// `speechmatics` (or any other provider) still parse.
factory TranscriptsResponse.fromJson(Map<String, dynamic> json) {
  // One shared parser for all providers instead of four identical chains.
  List<TranscriptSegment> parse(String key) =>
      ((json[key] as List<dynamic>?) ?? []).map((segment) => TranscriptSegment.fromJson(segment)).toList();

  return TranscriptsResponse(
    deepgram: parse('deepgram'),
    soniox: parse('soniox'),
    whisperx: parse('whisperx'),
    speechmatics: parse('speechmatics'),
  );
}
}
Expand Down
20 changes: 20 additions & 0 deletions app/lib/backend/schema/transcript_segment.dart
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,26 @@ class TranscriptSegment {
cleanSegments(joinedSimilarSegments);

segments.addAll(joinedSimilarSegments);

// for i, segment in enumerate(segments):
// segments[i].text = (
// segments[i].text.strip()
// .replace(' ', '')
// .replace(' ,', ',')
// .replace(' .', '.')
// .replace(' ?', '?')
// )

// Speechmatics specific issue with punctuation
for (var i = 0; i < segments.length; i++) {
segments[i].text = segments[i]
.text
.replaceAll(' ', '')
.replaceAll(' ,', ',')
.replaceAll(' .', '.')
.replaceAll(' ?', '?')
.trim();
}
}

static String segmentsAsString(
Expand Down
21 changes: 19 additions & 2 deletions app/lib/pages/memory_detail/compare_transcripts.dart
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class _CompareTranscriptsPageState extends State<CompareTranscriptsPage> {
backgroundColor: Theme.of(context).colorScheme.primary,
),
body: DefaultTabController(
length: 3,
length: 4,
initialIndex: 0,
child: Column(
children: [
Expand All @@ -50,7 +50,12 @@ class _CompareTranscriptsPageState extends State<CompareTranscriptsPage> {
padding: EdgeInsets.zero,
indicatorPadding: EdgeInsets.zero,
labelStyle: Theme.of(context).textTheme.titleLarge!.copyWith(fontSize: 18),
tabs: const [Tab(text: 'Deepgram'), Tab(text: 'Soniox'), Tab(text: 'Whisper-x')],
tabs: const [
Tab(text: 'Deepgram'),
Tab(text: 'Soniox'),
Tab(text: 'SpeechMatics'),
Tab(text: 'Whisper-x'),
],
indicator: BoxDecoration(color: Colors.transparent, borderRadius: BorderRadius.circular(16)),
),
Expanded(
Expand Down Expand Up @@ -84,6 +89,18 @@ class _CompareTranscriptsPageState extends State<CompareTranscriptsPage> {
)
],
),
ListView(
shrinkWrap: true,
children: [
TranscriptWidget(
segments: transcripts?.speechmatics ?? [],
horizontalMargin: false,
topMargin: false,
canDisplaySeconds: true,
isMemoryDetail: true,
)
],
),
ListView(
shrinkWrap: true,
children: [
Expand Down
2 changes: 1 addition & 1 deletion app/lib/pages/settings/developer.dart
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class __DeveloperSettingsPageState extends State<_DeveloperSettingsPage> {
underline: Container(height: 0, color: Colors.white),
isExpanded: true,
itemHeight: 48,
items: ['deepgram', 'soniox'].map<DropdownMenuItem<String>>((String value) {
items: ['deepgram', 'soniox', 'speechmatics'].map<DropdownMenuItem<String>>((String value) {
return DropdownMenuItem<String>(
value: value,
child: Text(
Expand Down
119 changes: 118 additions & 1 deletion backend/utils/stt/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@ async def on_message():
segments[i]['text'] = segments[i]['text'].strip().replace(' ', '')

# print('Soniox:', transcript.replace('<end>', ''))
stream_transcript(segments, stream_id)
if segments:
stream_transcript(segments, stream_id)
except websockets.exceptions.ConnectionClosedOK:
print("Soniox connection closed normally.")
except Exception as e:
Expand All @@ -276,3 +277,119 @@ async def on_message():
except Exception as e:
print(f"Exception in process_audio_soniox: {e}")
raise # Re-raise the exception to be handled by the caller


# Default transcription language for Speechmatics real-time STT.
LANGUAGE = "en"
# Speechmatics EU real-time endpoint (plain literal; the f-string had no placeholders).
CONNECTION_URL = "wss://eu2.rt.speechmatics.com/v2"


def speechmatics_results_to_segments(results: list) -> list:
    """Converts Speechmatics `AddTranscript` results into segment dicts.

    Consecutive tokens from the same speaker are merged into one segment;
    tokens with confidence below 0.4 are dropped. Returns a list of dicts
    with keys: speaker, start, end, text, is_user, person_id.
    """
    segments = []
    for r in results:
        if not r['alternatives']:
            continue
        r_data = r['alternatives'][0]
        r_start = r['start_time']
        r_end = r['end_time']

        r_content = r_data['content']
        if r_data['confidence'] < 0.4:
            print('Low confidence:', r)
            continue
        # 'UU' means "unknown speaker"; otherwise labels look like 'S1', 'S2', ...
        r_speaker = r_data['speaker'][1:] if r_data['speaker'] != 'UU' else '1'
        # NOTE(review): assumes single-digit speaker ids (max_speakers is 4 below).
        speaker = f"SPEAKER_0{r_speaker}"
        if segments and segments[-1]['speaker'] == speaker:
            last_segment = segments[-1]
            # Tokens are joined with a leading space (punctuation tokens included);
            # the client strips space-before-punctuation afterwards.
            last_segment['text'] += f' {r_content}'
            # Extend the segment to the latest token's end time.
            # (Was `+= r_end`, which accumulated end times and inflated segment bounds.)
            last_segment['end'] = r_end
        else:
            segments.append({
                'speaker': speaker,
                'start': r_start,
                'end': r_end,
                'text': r_content,
                'is_user': False,
                'person_id': None,
            })
    return segments


async def process_audio_speechmatics(stream_transcript, stream_id: int, language: str, uid: str):
    """Opens a Speechmatics real-time STT websocket and streams transcript segments.

    Connects to the Speechmatics RT API, sends the recognition configuration,
    and spawns a background task that turns `AddTranscript` messages into
    segment dicts forwarded to `stream_transcript(segments, stream_id)`.

    Args:
        stream_transcript: callback invoked with (segments, stream_id).
        stream_id: opaque id passed through to the callback.
        language: transcription language code (e.g. 'en').
        uid: user id; currently unused (speech-profile support is disabled).

    Returns:
        The connected websocket, so the caller can feed it raw PCM-s16le
        16 kHz audio frames.

    Raises:
        Whatever `websockets.connect` raises when the connection fails.
    """
    api_key = os.getenv('SPEECHMATICS_API_KEY')
    # has_speech_profile = create_user_speech_profile(uid)  # only english too

    request = {
        "message": "StartRecognition",
        "transcription_config": {
            "language": language,
            "diarization": "speaker",
            "operating_point": "enhanced",
            "max_delay_mode": "flexible",
            "max_delay": 3,
            "enable_partials": False,
            "enable_entities": True,
            "speaker_diarization_config": {"max_speakers": 4}
        },
        "audio_format": {"type": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
        # "audio_events_config": {"types": ["laughter", "music", "applause"]}
    }
    try:
        print("Connecting to Speechmatics WebSocket...")
        socket = await websockets.connect(CONNECTION_URL, extra_headers={"Authorization": f"Bearer {api_key}"})
        print("Connected to Speechmatics WebSocket.")

        # Send the initial StartRecognition request.
        await socket.send(json.dumps(request))
        print(f"Sent initial request: {request}")

        # Background task: listen for messages from Speechmatics.
        async def on_message():
            try:
                async for message in socket:
                    response = json.loads(message)
                    if response['message'] == 'AudioAdded':
                        continue
                    if response['message'] == 'AddTranscript':
                        segments = speechmatics_results_to_segments(response['results'])
                        if segments:
                            stream_transcript(segments, stream_id)
                    else:
                        # Unexpected message type; log it for debugging.
                        print(response)
            except websockets.exceptions.ConnectionClosedOK:
                print("Speechmatics connection closed normally.")
            except Exception as e:
                print(f"Error receiving from Speechmatics: {e}")
            finally:
                if not socket.closed:
                    await socket.close()
                    print("Speechmatics WebSocket closed in on_message.")

        asyncio.create_task(on_message())
        return socket
    except Exception as e:
        print(f"Exception in process_audio_speechmatics: {e}")
        raise

0 comments on commit 76d0eb7

Please sign in to comment.