Skip to content

Instantly share code, notes, and snippets.

@GabrielSGoncalves
Last active September 24, 2019 16:26
Show Gist options
  • Save GabrielSGoncalves/d629524d418cdd1301e3fae6d779df97 to your computer and use it in GitHub Desktop.
Save GabrielSGoncalves/d629524d418cdd1301e3fae6d779df97 to your computer and use it in GitHub Desktop.
Fifth part of the NLP analysis for the Medium article on AWS ML/AI tools for NLP.
# 16) Function to call Amazon Comprehend service using boto3
def start_comprehend_job(text):
    """
    Executes sentiment analysis of a text using Amazon Comprehend.

    The text can be larger than 5000 bytes (the limit assumed here for a
    single request), as the function splits it into several chunks, analyzes
    each chunk separately and returns an averaged value for each sentiment.
    Each chunk is weighted by its share of the analyzed characters, so the
    weights always add up to 1.

    Parameter
    - text (str): The text to be analyzed

    Return
    - final_dict (dict): Dictionary with the score of each one of the 4
    sentiments evaluated on Amazon Comprehend model (Positive, Negative,
    Neutral, Mixed). All scores are 0.0 when the text is empty or contains
    only whitespace (no request is sent in that case).
    """
    sentiments = ('Positive', 'Negative', 'Neutral', 'Mixed')
    final_dict = {sentiment: 0.0 for sentiment in sentiments}

    # Nothing to analyze: avoid a pointless request and a division by zero.
    if not text.strip():
        return final_dict

    # Whitespace-only chunks carry no sentiment, so they are not submitted.
    chunks = [
        chunk for chunk in _split_text_for_comprehend(text) if chunk.strip()
    ]
    total_chars = sum(len(chunk) for chunk in chunks)

    # One client is enough for all requests; no need to rebuild it per chunk.
    comprehend_client = boto3.client(
        service_name='comprehend', region_name='us-east-1'
    )
    for chunk in chunks:
        response = comprehend_client.detect_sentiment(
            Text=chunk, LanguageCode='en'
        )
        scores = response['SentimentScore']
        weight = len(chunk) / total_chars
        for sentiment in sentiments:
            final_dict[sentiment] += scores[sentiment] * weight
    return final_dict


def _split_text_for_comprehend(text, max_bytes=5000):
    """
    Splits a text into chunks of at most max_bytes UTF-8 bytes.

    Chunks are cut at sentence boundaries ('.') whenever possible. A single
    sentence that is larger than max_bytes on its own is cut on character
    boundaries instead. Joining the returned chunks reproduces the text.

    Parameter
    - text (str): The text to be split
    - max_bytes (int): Maximum UTF-8 encoded size of each chunk

    Return
    - chunks (list): List of non-empty strings
    """
    sentences = text.split('.')
    # Put the period back on every sentence but the last one, which is
    # whatever followed the final period (possibly an empty string).
    pieces = [sentence + '.' for sentence in sentences[:-1]]
    pieces.append(sentences[-1])

    chunks = []
    current = ''
    current_bytes = 0
    for piece in pieces:
        piece_bytes = len(piece.encode('utf-8'))
        if current_bytes + piece_bytes <= max_bytes:
            current += piece
            current_bytes += piece_bytes
            continue

        if current:
            chunks.append(current)

        # The sentence alone is over the limit: slice the encoded bytes and
        # drop any multi-byte character that was cut in half at the end.
        while piece_bytes > max_bytes:
            head = piece.encode('utf-8')[:max_bytes].decode(
                'utf-8', errors='ignore'
            )
            chunks.append(head)
            piece = piece[len(head):]
            piece_bytes = len(piece.encode('utf-8'))

        current = piece
        current_bytes = piece_bytes

    if current:
        chunks.append(current)
    return chunks
# 17) Run the sentiment analysis using start_comprehend_job function
# For every row, load the transcription text referenced by the
# json_transcription column, score it, and store each sentiment score in a
# column of df_audio named after the sentiment.
for row_label, transcription_key in df_audio['json_transcription'].items():
    transcript_text = get_text_from_json(bucket_name, transcription_key)
    sentiment_scores = start_comprehend_job(transcript_text)
    for sentiment_name in sentiment_scores:
        df_audio.at[row_label, sentiment_name] = sentiment_scores[sentiment_name]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment