Last active
September 24, 2019 16:26
-
-
Save GabrielSGoncalves/d629524d418cdd1301e3fae6d779df97 to your computer and use it in GitHub Desktop.
Fifth part of the NLP analysis for the Medium article on AWS ML/AI tools for NLP.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 16) Function to call Amazon Comprehend service using boto3
def start_comprehend_job(text):
    """
    Executes sentiment analysis of a text using Amazon Comprehend.

    A single DetectSentiment request is limited in size (5000 bytes of
    UTF-8 text), so longer texts are split into several chunks, each chunk
    is analyzed separately, and the per-chunk scores are combined into a
    length-weighted average.

    Parameter
        - text (str): The text to be analyzed

    Return
        - final_dict (dict): Dictionary with the averaged score (between 0
          and 1) of each one of the 4 sentiments evaluated by the Amazon
          Comprehend model ('Positive', 'Negative', 'Neutral', 'Mixed').
          All scores are 0 when the text is empty or only whitespace, in
          which case the service is not called at all.
    """
    list_sentiments = ('Positive', 'Negative', 'Neutral', 'Mixed')
    final_dict = dict.fromkeys(list_sentiments, 0)

    chunks = _split_text_into_chunks(text) if text else []
    if not chunks:
        # Nothing to analyze; avoids a request with empty text and a
        # division by zero when computing the weights below.
        return final_dict

    # The client does not depend on the chunk, so build it only once.
    comprehend_client = boto3.client(service_name='comprehend',
                                     region_name='us-east-1')

    # Weights are relative to what is actually sent, so they sum to 1.
    total_length = float(sum(len(chunk) for chunk in chunks))
    for chunk in chunks:
        sentiment_data = comprehend_client.detect_sentiment(
            Text=chunk, LanguageCode='en')
        scores = sentiment_data['SentimentScore']
        weight = len(chunk) / total_length
        for sentiment in list_sentiments:
            final_dict[sentiment] += scores[sentiment] * weight
    return final_dict


def _split_text_into_chunks(text, max_bytes=5000):
    """
    Splits a text into chunks of at most max_bytes UTF-8 bytes each.

    Chunks are cut at sentence boundaries ('.') whenever possible. A single
    sentence that is larger than max_bytes on its own is cut between
    characters instead. Chunks containing only whitespace are dropped.

    Parameters
        - text (str): The text to be split
        - max_bytes (int): Maximum UTF-8 size of each chunk

    Return
        - chunks (list): List of strings, in their original order
    """
    # Re-attach the '.' to every sentence except a trailing remainder that
    # had no period, so that joining the sentences gives back the text.
    pieces = text.split('.')
    sentences = [f'{piece}.' for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])

    chunks = []
    current = ''
    current_size = 0
    for sentence in sentences:
        sentence_size = len(sentence.encode('utf-8'))

        if sentence_size > max_bytes:
            # The sentence alone does not fit in one request: flush what
            # was accumulated and cut the sentence between characters.
            if current:
                chunks.append(current)
            current = ''
            current_size = 0
            for char in sentence:
                char_size = len(char.encode('utf-8'))
                if current and current_size + char_size > max_bytes:
                    chunks.append(current)
                    current = ''
                    current_size = 0
                current += char
                current_size += char_size
            continue

        if current_size + sentence_size > max_bytes:
            chunks.append(current)
            current = sentence
            current_size = sentence_size
        else:
            current += sentence
            current_size += sentence_size

    if current:
        chunks.append(current)
    return [chunk for chunk in chunks if chunk.strip()]
# 17) Run the sentiment analysis using start_comprehend_job function
# For every row: fetch the transcript referenced by the row's
# json_transcription value, score it, and store each sentiment score in the
# column of df_audio named after that sentiment.
for index, row in df_audio.iterrows():
    sentiment_scores = start_comprehend_job(
        get_text_from_json(bucket_name, row.json_transcription)
    )
    for sentiment_name in sentiment_scores:
        df_audio.at[index, sentiment_name] = sentiment_scores[sentiment_name]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment