Skip to content

Instantly share code, notes, and snippets.

@GabrielSGoncalves
Last active September 24, 2019 14:31
Show Gist options
  • Save GabrielSGoncalves/8e339baee85e05fc97feb8f2533151b9 to your computer and use it in GitHub Desktop.
Save GabrielSGoncalves/8e339baee85e05fc97feb8f2533151b9 to your computer and use it in GitHub Desktop.
Third part of the NLP analysis for the Medium article on AWS ML/AI tools
# 10) Function to get text from the JSON file generated using Amazon Transcribe
def get_text_from_json(bucket, key):
    """
    Download a transcription JSON object from S3 and return its transcript text.

    Parameters
    bucket (str): Name of the S3 bucket holding the JSON object
    key (str): Key of the JSON object inside the bucket

    Return
    str: Value stored under results -> transcripts[0] -> transcript
    """
    s3 = boto3.client('s3')
    # Named `response` rather than `object` so the builtin is not shadowed.
    response = s3.get_object(Bucket=bucket, Key=key)
    serialized_body = response['Body'].read()
    data = json.loads(serialized_body)
    # Only the first entry of the 'transcripts' list is used.
    return data.get('results').get('transcripts')[0].get('transcript')
# 11) Reading the original transcription from the JSON file
# Decode explicitly as UTF-8 (the standard JSON encoding) so the result does
# not depend on the platform's locale default encoding.
with open('original_transcripts.json', 'r', encoding='utf-8') as f:
    original_transcriptions = json.load(f)
# 12) Function to process text
def process_text(text):
    """
    Reduce a text to a space-separated string of lemmas.

    The text is lower-cased and parsed with the module-level spaCy
    pipeline ``nlp``. A token is dropped when its text is listed in
    ``nlp.Defaults.stop_words``, when it is punctuation, or when its
    lemma is the '-PRON-' placeholder. The lemmas of the remaining
    tokens are joined with single spaces.

    Parameters
    text (str): Any given text

    Return
    str: Processed text
    """
    stop_words = nlp.Defaults.stop_words

    def is_discarded(token):
        # Same three filters as before, checked in the same order.
        return (
            token.text in stop_words
            or token.is_punct
            or token.lemma_ == '-PRON-'
        )

    lemmas = [
        token.lemma_
        for token in nlp(text.lower())
        if not is_discarded(token)
    ]
    return " ".join(lemmas)
# 13) Iterate over the speakers comparing the transcription texts using spaCy
nlp = spacy.load('en_core_web_lg')
for index, row in df_audio.iterrows():
    # Clean the reference text for this row, then parse the cleaned string.
    reference_text = process_text(original_transcriptions.get(index))
    original_transcription = nlp(reference_text)

    # Fetch the transcript JSON from S3 and give it the same treatment.
    transcribed_text = process_text(
        get_text_from_json(bucket_name, row["json_transcription"])
    )
    transcribe_transcription = nlp(transcribed_text)

    # Score the two parsed documents and store the result on the row.
    w2v_similaraty = original_transcription.similarity(transcribe_transcription)
    df_audio.at[index, 'w2v_text_similarity'] = w2v_similaraty
    print(f"Processed Word2vec Similiraty for {index}'s speech: {w2v_similaraty}")
# 14) Iterate over the speakers to get the Word Mover's Distance using spaCy and wmd
# Load a fresh pipeline and append the WMD similarity hook as its last component.
nlp = spacy.load('en_core_web_lg')
wmd_hook = wmd.WMD.SpacySimilarityHook(nlp)
nlp.add_pipe(wmd_hook, last=True)
for index, row in df_audio.iterrows():
    # Clean the reference text for this row, then parse the cleaned string.
    reference_text = process_text(original_transcriptions.get(index))
    original_transcription = nlp(reference_text)

    # Fetch the transcript JSON from S3 and give it the same treatment.
    transcribed_text = process_text(
        get_text_from_json(bucket_name, row["json_transcription"])
    )
    transcribe_transcription = nlp(transcribed_text)

    # NOTE(review): with the WMD hook installed, similarity() presumably
    # returns a distance (lower = closer), not a similarity - confirm
    # against the wmd package documentation.
    wmd_similaraty = original_transcription.similarity(transcribe_transcription)
    df_audio.at[index, 'wmd_similarity'] = wmd_similaraty
    print(f"Word Movers Distance Similiraty for {index}'s speech: {wmd_similaraty}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment