added speaker_id conversions to Amazon and Google

2019-03-08 10:23:01 -05:00
parent 3fc6dacfde
commit 0301b3be23
6 changed files with 130 additions and 55 deletions
--- a/tests/test_amazon.py
+++ b/tests/test_amazon.py
@@ -0,0 +1,38 @@
 import json
 import os
 import pytest
 from transcript_processing.converters.amazon import AmazonConverter
@pytest.fixture
def transcript_data():
    """Load the Amazon transcript fixture as parsed JSON.

    The file path is read from the ``AMAZON_TRANSCRIPT_TEST_FILE``
    environment variable.
    """
    fixture_path = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')
    with open(fixture_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed
@pytest.fixture
def converter(transcript_data):
    """Build an ``AmazonConverter`` from the loaded transcript data."""
    amazon_converter = AmazonConverter(transcript_data)
    return amazon_converter
 def test_get_word_objects(converter):
     """``get_word_objects`` returns a truthy result for the fixture data."""
     assert converter.get_word_objects(converter.json_data)
 def test_get_speaker_segments(converter):
     """``get_speaker_segments`` returns a truthy result for the fixture data."""
     assert converter.get_speaker_segments()
 def test_get_speaker_id(converter):
     """Check the speaker ids recorded for two known word start times."""
     expected_speakers = {54.58: 0, 32.36: 1}
     segments = converter.get_speaker_segments()
     for start_time, speaker in expected_speakers.items():
         assert segments[start_time] == speaker
 def test_convert(converter):
     """Run the conversion and print the converted words for inspection."""
     converter.convert()
     converted = converter.converted_words
     print(converted)
--- a/tests/test_convert_google.py
+++ b/tests/test_convert_google.py
@@ -1,33 +0,0 @@
 import json
 import os
 import pytest
 from transcript_processing.converters.google import (
    make_json_friendly,
    GoogleConverter,
        )
 from transcript_processing.config import GOOGLE_TRANSCRIPT_TEST_FILE
@pytest.fixture
def transcript():
    """Read the raw transcript text from ``GOOGLE_TRANSCRIPT_TEST_FILE``."""
    with open(GOOGLE_TRANSCRIPT_TEST_FILE, 'r') as handle:
        raw_text = handle.read()
    return raw_text
 def test_make_json_friendly(transcript):
     """``make_json_friendly`` output must load as truthy JSON."""
     assert json.loads(make_json_friendly(transcript))
 def test_pre_process(transcript):
     """A ``GoogleConverter`` built from raw text exposes truthy ``json_data``."""
     # The ``transcript`` fixture is requested but not used here; the file
     # named by the GOOGLE_TRANSCRIPT_TEST_FILE environment variable is read
     # directly instead.
     source_path = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')
     with open(source_path, 'r') as handle:
         raw_text = handle.read()
     google_converter = GoogleConverter(raw_text)
     assert google_converter.json_data
     print(google_converter.json_data)
--- a/tests/test_google.py
+++ b/tests/test_google.py
@@ -0,0 +1,39 @@
 import json
 import os
 import pytest
 from transcript_processing.converters.google import (
    make_json_friendly,
    GoogleConverter,
        )
@pytest.fixture
def transcript_data():
    """Read the raw Google transcript text.

    The file path is read from the ``GOOGLE_TRANSCRIPT_TEST_FILE``
    environment variable.
    """
    fixture_path = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')
    with open(fixture_path, 'r') as handle:
        raw_text = handle.read()
    return raw_text
@pytest.fixture
def converter(transcript_data):
    """Build a ``GoogleConverter`` from the raw transcript text."""
    google_converter = GoogleConverter(transcript_data)
    return google_converter
 def test_get_word_objects(converter):
     """``get_word_objects`` returns a truthy result for the fixture data."""
     assert converter.get_word_objects(converter.json_data)
 def test_make_json_friendly(transcript_data):
     """``make_json_friendly`` output must load as truthy JSON."""
     assert json.loads(make_json_friendly(transcript_data))
 def test_pre_process(converter):
     """Constructing the converter leaves it with truthy ``json_data``."""
     loaded = converter.json_data
     assert loaded
 def test_convert(converter):
     """Run the conversion and print the converted words for inspection."""
     converter.convert()
     converted = converter.converted_words
     print(converted)
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -62,7 +62,7 @@ class TranscriptConverter:
    @staticmethod
    @abc.abstractmethod
-    def get_speaker_id(word_object):
+    def get_speaker_id(word_object, speaker_segments=None):
        pass
    @staticmethod
@@ -77,7 +77,14 @@ class TranscriptConverter:
        word_category = tagged_words[index][1] 
        return word_category in helpers.PROPER_NOUN_TAGS
-    def get_word_object(self, word_object, index, tagged_words, word_objects):
+    def get_word_object(
            self, 
            word_object, 
            index, 
            tagged_words, 
            word_objects,
            speaker_segments=None,
            ):
        word = self.get_word_word(word_object)
        return Word(
            self.get_word_start(word_object),
@@ -86,7 +93,7 @@ class TranscriptConverter:
            word,
            self.check_if_always_capitalized(word, index, tagged_words),
            self.get_next_word(word_objects, index),
-            self.get_speaker_id(word_object),
+            self.get_speaker_id(word_object, speaker_segments),
                ) 
    def get_next_word(self, word_objects, index):
--- a/transcript_processing/converters/amazon.py
+++ b/transcript_processing/converters/amazon.py
@@ -1,4 +1,5 @@
 import json
 from typing import Dict, Optional
 from ..converter import TranscriptConverter
 from .. import helpers
@@ -12,9 +13,28 @@ class AmazonConverter(TranscriptConverter):
    def __init__(self, json_data):
        """Initialise the converter by passing ``json_data`` to the base class."""
        super().__init__(json_data)
-    def get_word_objects(self, json_data):
+    def get_word_objects(self, json_data) -> list:
        return json_data['results']['items']
    def get_speaker_segments(self) -> Optional[Dict[float, int]]:
        """Map each labelled word's start time to its numeric speaker id.

        Reads ``self.json_data['results']['speaker_labels']['segments']`` and
        walks every word entry under each segment's ``'items'`` list.

        Returns:
            A dict keyed by the word's ``start_time`` (as ``float``) whose
            value is the integer formed from the decimal digits found in the
            word's ``speaker_label``; or ``None`` when the transcript has no
            speaker-label section.

        Raises:
            ValueError: if a ``speaker_label`` contains no decimal digits.
        """
        try:
            segments = self.json_data['results']['speaker_labels']['segments']
        except KeyError:
            # No speaker labels in this transcript.
            return None

        segment_dict = {}
        for segment in segments:
            for word in segment['items']:
                start_time = float(word['start_time'])
                # ``isdecimal`` keeps exactly the characters ``int`` can
                # parse; ``isnumeric`` would also admit e.g. superscripts
                # and fractions, which make ``int`` raise.
                digits = ''.join(
                    char for char in word['speaker_label'] if char.isdecimal()
                )
                segment_dict[start_time] = int(digits)
        return segment_dict
    @staticmethod
    def get_word_start(word_object):
        """Return the word's ``start_time`` field converted to a float."""
        start_time = word_object['start_time']
        return float(start_time)
@@ -35,12 +55,17 @@ class AmazonConverter(TranscriptConverter):
            word_word = 'I'
        return word_word
-    @staticmethod
+    @classmethod
-    def get_speaker_id(word_object):
+    def get_speaker_id(cls, word_object, speaker_segments=None):
-        return None
+        if speaker_segments is None:
            return None
        else:
            word_start = cls.get_word_start(word_object)
            return speaker_segments[word_start]
    def convert_words(self, word_objects, words, tagged_words=None):
        converted_words = []
        speaker_segments = self.get_speaker_segments()
        punc_before = False
        punc_after = False
@@ -49,7 +74,13 @@ class AmazonConverter(TranscriptConverter):
            if w['type'] == 'punctuation':
                continue
            next_word_punc_after = None
-            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            word_obj = self.get_word_object(
                    w, 
                    i,
                    tagged_words,
                    word_objects,
                    speaker_segments,
                    )
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -57,6 +57,10 @@ class GoogleConverter(TranscriptConverter):
        return converted_words
    @staticmethod
    def get_speaker_id(word_object, speaker_segments=None):
        """Return the word's ``speaker_tag`` value, or ``None`` if absent.

        Args:
            word_object: mapping for a single word; only its ``'speaker_tag'``
                key is read.
            speaker_segments: unused here. It is optional so the signature
                matches ``TranscriptConverter.get_speaker_id`` and the method
                can be called with or without it.
        """
        return word_object.get('speaker_tag')
    @classmethod
    def get_word_start(cls, word_object):
        """Return the word's start time as computed by ``cls.get_seconds``."""
        raw_start = word_object['start_time']
        return cls.get_seconds(raw_start)
@@ -88,16 +92,6 @@ def make_json_friendly(json_string):
    lines = [line.strip() for line in json_string.split('\n')]
    new_string = '['
    fields = [
        'words {', 
        'start_time {', 
        '}',
        'end_time {', 
        '}',
        'word: ', 
        'confidence: '
        ]
    start_field = 'words {'
    open_braces = 0
@@ -114,13 +108,13 @@ def make_json_friendly(json_string):
        if '}' in line:
            open_braces -= 1
        if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]:
            line = line + ', '
        if open_braces == 0:
            new_string += '}, '
            continue
        elif '{' not in line and '}' not in lines[index + 1]:
            line = line + ', '
        line = re.sub('^(?!")([0-9a-zA-Z_]+)',
                '"\\1"',
                line)
@@ -134,5 +128,4 @@ def make_json_friendly(json_string):
    new_string = new_string.replace('\\', '')
    return new_string[:-2] + ']'