# GabrielSGoncalves/nlp_aws_medium_part5.py

## nlp_aws_medium_part5.py
# 16) Function to call Amazon Comprehend service using boto3
def start_comprehend_job(text):
    """
    Executes sentiment analysis of a text using Amazon Comprehend.
    The text can be larger than 5000 bytes (the per-request limit this
    function works around): it is split into chunks of at most 5000 UTF-8
    bytes, each chunk is analyzed separately and the scores are combined
    into an average weighted by the number of characters in each chunk.

    Parameter
    - text (str): The text to be analyzed

    Return
    - final_dict (dict): Dictionary with the weighted average score of each
    one of the 4 sentiments evaluated on Amazon Comprehend model (Positive,
    Negative, Neutral, Mixed). All scores are 0.0 when the text is empty or
    contains only whitespace, in which case no request is made.
    """
    list_sentiments = ('Positive', 'Negative', 'Neutral', 'Mixed')
    final_dict = dict.fromkeys(list_sentiments, 0.0)

    # Whitespace-only chunks carry nothing to analyze, so they are dropped
    # before any request is made.
    chunks = [chunk for chunk in _chunk_text_by_bytes(text) if chunk.strip()]
    if not chunks:
        return final_dict

    # Weights are relative to what is actually sent, so they always add up
    # to 1 regardless of how the text was split.
    total_chars = sum(len(chunk) for chunk in chunks)

    # One client is enough for all the requests.
    comprehend_client = boto3.client(service_name='comprehend', region_name='us-east-1')
    for chunk in chunks:
        sentiment_data = comprehend_client.detect_sentiment(Text=chunk, LanguageCode='en')
        scores = sentiment_data['SentimentScore']
        weight = len(chunk) / total_chars
        for sentiment in list_sentiments:
            final_dict[sentiment] += scores[sentiment] * weight

    return final_dict


def _chunk_text_by_bytes(text, max_bytes=5000):
    """
    Splits a text into consecutive chunks of at most max_bytes UTF-8 bytes.
    Chunks end right after a '.' whenever possible; a single sentence that
    is larger than max_bytes on its own is cut between characters. Joining
    the returned chunks gives back the original text.

    Parameters
    - text (str): The text to be split
    - max_bytes (int): Maximum UTF-8 encoded size of each chunk

    Return
    - chunks (list): List of str chunks, in their original order
    """
    if max_bytes < 4:
        # A UTF-8 character can take up to 4 bytes; below that the cut
        # position in the loop further down could fail to move forward.
        raise ValueError('max_bytes must be at least 4')

    # str.split drops the separator, so put the '.' back on every piece
    # except the last one (which is whatever follows the final '.').
    *terminated, tail = text.split('.')
    sentences = [f'{piece}.' for piece in terminated]
    if tail:
        sentences.append(tail)

    chunks = []
    pending = b''
    for sentence in sentences:
        encoded = sentence.encode('utf-8')
        if pending and len(pending) + len(encoded) > max_bytes:
            chunks.append(pending.decode('utf-8'))
            pending = b''
        pending += encoded

        # Only entered when a single sentence is above the limit by itself.
        while len(pending) > max_bytes:
            cut = max_bytes
            # Bytes of the form 10xxxxxx continue a multi-byte character;
            # step back so the cut lands on a character boundary.
            while (pending[cut] & 0xC0) == 0x80:
                cut -= 1
            chunks.append(pending[:cut].decode('utf-8'))
            pending = pending[cut:]

    if pending:
        chunks.append(pending.decode('utf-8'))
    return chunks


# 17) Run the sentiment analysis using start_comprehend_job function
for row_label, audio_row in df_audio.iterrows():
    # Load the text for this row through get_text_from_json and hand it
    # straight to start_comprehend_job.
    comprehend_results = start_comprehend_job(
        get_text_from_json(bucket_name, audio_row.json_transcription)
    )
    # Write every returned score into the column named after its key.
    for sentiment_name in comprehend_results:
        df_audio.at[row_label, sentiment_name] = comprehend_results[sentiment_name]
	# 16) Function to call Amazon Comprehend service using boto3
	def start_comprehend_job(text):
	    """
	    Executes sentiment analysis of a text using Amazon Comprehend.
	    The text can be larger than 5000 bytes (the per-request limit this
	    function works around): it is split into chunks of at most 5000 UTF-8
	    bytes, each chunk is analyzed separately and the scores are combined
	    into an average weighted by the number of characters in each chunk.

	    Parameter
	    - text (str): The text to be analyzed

	    Return
	    - final_dict (dict): Dictionary with the weighted average score of each
	    one of the 4 sentiments evaluated on Amazon Comprehend model (Positive,
	    Negative, Neutral, Mixed). All scores are 0.0 when the text is empty or
	    contains only whitespace, in which case no request is made.
	    """
	    list_sentiments = ('Positive', 'Negative', 'Neutral', 'Mixed')
	    final_dict = dict.fromkeys(list_sentiments, 0.0)

	    # Whitespace-only chunks carry nothing to analyze, so they are dropped
	    # before any request is made.
	    chunks = [chunk for chunk in _chunk_text_by_bytes(text) if chunk.strip()]
	    if not chunks:
	        return final_dict

	    # Weights are relative to what is actually sent, so they always add up
	    # to 1 regardless of how the text was split.
	    total_chars = sum(len(chunk) for chunk in chunks)

	    # One client is enough for all the requests.
	    comprehend_client = boto3.client(service_name='comprehend', region_name='us-east-1')
	    for chunk in chunks:
	        sentiment_data = comprehend_client.detect_sentiment(Text=chunk, LanguageCode='en')
	        scores = sentiment_data['SentimentScore']
	        weight = len(chunk) / total_chars
	        for sentiment in list_sentiments:
	            final_dict[sentiment] += scores[sentiment] * weight

	    return final_dict


	def _chunk_text_by_bytes(text, max_bytes=5000):
	    """
	    Splits a text into consecutive chunks of at most max_bytes UTF-8 bytes.
	    Chunks end right after a '.' whenever possible; a single sentence that
	    is larger than max_bytes on its own is cut between characters. Joining
	    the returned chunks gives back the original text.

	    Parameters
	    - text (str): The text to be split
	    - max_bytes (int): Maximum UTF-8 encoded size of each chunk

	    Return
	    - chunks (list): List of str chunks, in their original order
	    """
	    if max_bytes < 4:
	        # A UTF-8 character can take up to 4 bytes; below that the cut
	        # position in the loop further down could fail to move forward.
	        raise ValueError('max_bytes must be at least 4')

	    # str.split drops the separator, so put the '.' back on every piece
	    # except the last one (which is whatever follows the final '.').
	    *terminated, tail = text.split('.')
	    sentences = [f'{piece}.' for piece in terminated]
	    if tail:
	        sentences.append(tail)

	    chunks = []
	    pending = b''
	    for sentence in sentences:
	        encoded = sentence.encode('utf-8')
	        if pending and len(pending) + len(encoded) > max_bytes:
	            chunks.append(pending.decode('utf-8'))
	            pending = b''
	        pending += encoded

	        # Only entered when a single sentence is above the limit by itself.
	        while len(pending) > max_bytes:
	            cut = max_bytes
	            # Bytes of the form 10xxxxxx continue a multi-byte character;
	            # step back so the cut lands on a character boundary.
	            while (pending[cut] & 0xC0) == 0x80:
	                cut -= 1
	            chunks.append(pending[:cut].decode('utf-8'))
	            pending = pending[cut:]

	    if pending:
	        chunks.append(pending.decode('utf-8'))
	    return chunks


	# 17) Run the sentiment analysis using start_comprehend_job function
	for row_label, audio_row in df_audio.iterrows():
	    # Load the text for this row through get_text_from_json and hand it
	    # straight to start_comprehend_job.
	    comprehend_results = start_comprehend_job(
	        get_text_from_json(bucket_name, audio_row.json_transcription)
	    )
	    # Write every returned score into the column named after its key.
	    for sentiment_name in comprehend_results:
	        df_audio.at[row_label, sentiment_name] = comprehend_results[sentiment_name]