Last active
December 15, 2020 18:01
-
-
Save devamitranjan/ce8dafa3d78d23a0d098036450e8623d to your computer and use it in GitHub Desktop.
This is the final version of the sentiment analysis of YouTube video comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import googleapiclient.discovery | |
# Fetching the data from Youtube API | |
def google_api(key, vidId):
    """Fetch one page of comment threads for a YouTube video.

    Args:
        key: Developer API key passed to the client as ``developerKey``.
        vidId: ID of the video whose comment threads are requested.

    Returns:
        Whatever ``request.execute()`` returns for a single
        ``commentThreads().list`` call (at most 100 threads, ordered by
        relevance). No further pages are requested.
    """
    # Process-wide environment flag, set before the client is built.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    client = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=key
    )
    query = client.commentThreads().list(
        part="id,snippet",
        maxResults=100,
        order="relevance",
        videoId=vidId,
    )
    return query.execute()
# Demo inputs: the key below is a placeholder and must be replaced.
key = "Your_API_KEY"
vidId = "621oD2zBSbI"

# Fetch the comment threads at module level and show the raw payload.
response = google_api(key, vidId)
print(response)
# Creation of Data Frame | |
import pandas as pd | |
def create_df_author_comments(response):
    """Build a DataFrame of top-level comments indexed by author name.

    Args:
        response: Mapping with an ``"items"`` list; each item must provide
            ``item["snippet"]["topLevelComment"]["snippet"]`` containing
            ``"authorDisplayName"`` and ``"textOriginal"``.

    Returns:
        A DataFrame with a single ``"Comments"`` column holding each
        ``textOriginal`` and the matching ``authorDisplayName`` as the row
        label. Authors who commented more than once produce duplicate
        labels.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    # Resolve the nested snippet once per item instead of twice per index.
    snippets = [
        item["snippet"]["topLevelComment"]["snippet"]
        for item in response["items"]
    ]
    authorname = [snippet["authorDisplayName"] for snippet in snippets]
    comments = [snippet["textOriginal"] for snippet in snippets]
    return pd.DataFrame(comments, index=authorname, columns=["Comments"])
# Turn the raw API payload into the working DataFrame and show it.
df = create_df_author_comments(response)
print(df)
# Cleaning Round 1 | |
import re | |
# The hard-coded emoji list in this copy of the file is mis-encoded (it reads
# as Greek letters and currency signs), so the exact characters cannot be
# recovered. The class below removes code points U+1F000-U+1FAFF and
# U+2600-U+27BF plus the variation selector U+FE0F and the joiner U+200D.
# "|" stays in the class because the original class also matched it literally.
_ROUND1_EMOJI_RE = re.compile(
    r"[\U0001F000-\U0001FAFF\u2600-\u27BF\uFE0F\u200D|]+"
)
_ROUND1_DIGITS_RE = re.compile(r"[0-9]+")
# Same characters as the original class, written without the redundant
# backslashes (which are invalid escapes in a non-raw string literal).
_ROUND1_PUNCT_RE = re.compile(r'[:@)*.$!?,%"|]+')


def cleaning_comments(comment):
    """Apply cleaning round 1 to a single comment string.

    Steps, in order:
      1. delete runs of emoji/pictograph characters (and ``|``);
      2. delete runs of ASCII digits;
      3. replace each run of ``: @ ) * . $ ! ? , % " |`` with one space.

    Args:
        comment: The comment text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave consecutive spaces behind.
    """
    comment = _ROUND1_EMOJI_RE.sub("", comment)
    comment = _ROUND1_DIGITS_RE.sub("", comment)
    comment = _ROUND1_PUNCT_RE.sub(" ", comment)
    return comment
# Run cleaning round 1 over every comment and show the result.
df["Comments"] = df["Comments"].apply(cleaning_comments)
print(df)
# Cleaning Round 2 | |
# The emoji listed in this copy of the file are mis-encoded and cannot be
# recovered exactly, so this class removes code points U+1F000-U+1FAFF and
# U+2600-U+27BF plus U+FE0F and U+200D. "|" stays because the original class
# also matched it literally.
_ROUND2_EMOJI_RE = re.compile(
    r"[\U0001F000-\U0001FAFF\u2600-\u27BF\uFE0F\u200D|]+"
)
# ( - # ! / & and the guillemets come straight from the original class.
# NOTE(review): two further characters are unreadable in this copy; they
# appear to be typographic dashes or quotes, so U+2010-U+201F is stripped
# here. Confirm against the original file.
_ROUND2_PUNCT_RE = re.compile(r"[(\-#!/&|\u00AB\u00BB\u2010-\u201F]+")


def cleaning_comments1(comment):
    """Apply cleaning round 2 to a single comment string.

    Deletes (without inserting a space) runs of emoji/pictograph characters
    and runs of ``( - # ! / & |``, guillemets and typographic dashes/quotes.

    Args:
        comment: The comment text.

    Returns:
        The cleaned text. Because matches are deleted outright, hyphenated
        words are joined (``"well-known"`` becomes ``"wellknown"``).
    """
    comment = _ROUND2_EMOJI_RE.sub("", comment)
    comment = _ROUND2_PUNCT_RE.sub("", comment)
    return comment
# Run cleaning round 2 over every comment and show the result.
df["Comments"] = df["Comments"].apply(cleaning_comments1)
print(df)
# Cleaning Round 3 | |
# Apostrophe, "|" and ";" come straight from the original class.
# NOTE(review): the remaining members are mis-encoded in this copy. They look
# like a flag emoji and a fullwidth punctuation mark, so the regional
# indicator symbols (U+1F1E6-U+1F1FF) and fullwidth punctuation
# (U+FF01-U+FF0F, U+FF1A-U+FF1F) are stripped here. Confirm against the
# original file.
_ROUND3_STRIP_RE = re.compile(
    r"['|;\U0001F1E6-\U0001F1FF\uFF01-\uFF0F\uFF1A-\uFF1F]+"
)


def cleaning_comments3(comment):
    """Apply cleaning round 3 to a single comment string.

    Replaces every newline with a space, then deletes apostrophes, ``|``,
    ``;``, flag (regional indicator) characters and fullwidth punctuation.

    Args:
        comment: The comment text.

    Returns:
        The cleaned, single-line text.
    """
    comment = comment.replace("\n", " ")
    comment = _ROUND3_STRIP_RE.sub("", comment)
    return comment
# Run cleaning round 3 over every comment and show the result.
df["Comments"] = df["Comments"].apply(cleaning_comments3)
print(df)
#Cleaning Round 4 | |
def lower(comment):
    """Return ``comment`` converted to lower case."""
    return comment.lower()


# Cleaning round 4: lower-case every comment.
df['Comments'] = df['Comments'].apply(lower)
print(df)

# Final DataFrame after cleaning
print(df)
#Removing Empty Comments | |
def remove_comments(df):
    """Remove rows whose comment is empty, modifying ``df`` in place.

    A comment counts as empty when nothing remains after stripping
    whitespace, so comments reduced to spaces are removed as well.

    Rows are removed by position rather than by index label. The index may
    hold duplicate labels, and dropping by label would also discard the
    non-empty rows that share a label with an empty one.

    Args:
        df: DataFrame with a ``"Comments"`` column of strings.

    Returns:
        The same ``df`` object, with the empty-comment rows removed and the
        remaining index labels unchanged.
    """
    # Boolean mask of the rows to discard.
    blank = df["Comments"].map(
        lambda comment: len(comment.strip()) == 0
    ).to_numpy(dtype=bool)

    # Swap in unique positional labels so the in-place drop hits exactly the
    # masked rows, then restore the surviving original labels.
    labels = df.index
    df.index = range(len(df))
    df.drop(df.index[blank], inplace=True)
    df.index = labels[~blank]
    return df
# Drop the empty comments from the working DataFrame.
df = remove_comments(df)
# Bare expression: displays the frame in a notebook, no effect in a script.
df
# Removing Non English Comments | |
from textblob.blob import TextBlob | |
def lang_detection(text):
    """Return the language code reported by ``TextBlob.detect_language``.

    NOTE(review): TextBlob's changelog lists ``detect_language`` as
    deprecated and later removed. Confirm the installed version still
    provides it.
    """
    return TextBlob(text).detect_language()


def remove_non_english_comments(df, detect=None):
    """Remove rows whose comment is not detected as English, in place.

    Rows are removed by position rather than by index label. The index may
    hold duplicate labels, and dropping by label would also discard English
    comments that share a label with a non-English one.

    Args:
        df: DataFrame with a ``"Comments"`` column of strings.
        detect: Optional callable mapping a comment string to a language
            code. Defaults to ``lang_detection``. Rows for which it returns
            anything other than ``'en'`` are removed.

    Returns:
        The same ``df`` object, with the non-English rows removed and the
        remaining index labels unchanged.
    """
    if detect is None:
        detect = lang_detection

    # Boolean mask of the rows to discard.
    foreign = (df["Comments"].map(detect) != 'en').to_numpy(dtype=bool)

    # Swap in unique positional labels so the in-place drop hits exactly the
    # masked rows, then restore the surviving original labels.
    labels = df.index
    df.index = range(len(df))
    df.drop(df.index[foreign], inplace=True)
    df.index = labels[~foreign]
    return df
# Keep only the comments detected as English.
df = remove_non_english_comments(df)
# Bare expression: displays the frame in a notebook, no effect in a script.
df
#Finding Subjectivity | |
def find_subjectivity_on_single_comment(text):
    """Return the ``sentiment.subjectivity`` value TextBlob gives ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def apply_subjectivity_on_all_comments(df):
    """Add a ``'Subjectivity'`` column computed from ``df['Comments']``.

    The column is written onto ``df`` itself, and that same object is
    returned.
    """
    scores = df['Comments'].apply(find_subjectivity_on_single_comment)
    df['Subjectivity'] = scores
    return df
# Score every comment for subjectivity.
df = apply_subjectivity_on_all_comments(df)
# Bare expression: displays the frame in a notebook, no effect in a script.
df
#Finding Polarity | |
def find_polarity_of_single_comment(text):
    """Return the ``sentiment.polarity`` value TextBlob gives ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def find_polarity_of_every_comment(df):
    """Add a ``'Polarity'`` column computed from ``df['Comments']``.

    The column is written onto ``df`` itself, and that same object is
    returned.
    """
    scores = df['Comments'].apply(find_polarity_of_single_comment)
    df['Polarity'] = scores
    return df
# Score every comment for polarity.
df = find_polarity_of_every_comment(df)
# Bare expression: displays the frame in a notebook, no effect in a script.
df
# Analysis Based on Polarity | |
def analysis(polarity):
    """Classify a polarity score as ``'Positive'``, ``'Neutral'`` or ``'Negative'``.

    Args:
        polarity: Numeric polarity score.

    Returns:
        ``'Positive'`` when ``polarity > 0``, ``'Neutral'`` when
        ``polarity == 0``, otherwise ``'Negative'``. A NaN fails both
        comparisons and is therefore labelled ``'Negative'``.
    """
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'
def analysis_based_on_polarity(df):
    """Add an ``'Analysis'`` column by applying ``analysis`` to ``df['Polarity']``.

    The column is written onto ``df`` itself, and that same object is
    returned.
    """
    labels = df['Polarity'].apply(analysis)
    df['Analysis'] = labels
    return df
# Label every comment from its polarity score.
df = analysis_based_on_polarity(df)
# Bare expression: displays the frame in a notebook, no effect in a script.
df
# Printing Positive Comments | |
def print_positive_comments(frame=None):
    """Print every comment labelled ``'Positive'``, in ascending polarity order.

    Rows are read positionally. The frame's index holds arbitrary labels, so
    looking rows up with an integer key would depend on pandas' deprecated
    positional fallback.

    Args:
        frame: DataFrame with ``'Polarity'``, ``'Analysis'`` and
            ``'Comments'`` columns. Defaults to the module-level ``df``.

    Each printed line is prefixed with the row's 1-based position in the
    polarity-sorted frame and followed by a blank line.
    """
    if frame is None:
        frame = df
    print('Printing positive comments:\n')
    ranked = frame.sort_values(by=['Polarity'])
    rows = zip(ranked['Analysis'], ranked['Comments'])
    for position, (label, comment) in enumerate(rows, start=1):
        if label == 'Positive':
            print(str(position) + '> ' + comment)
            print()
# List the positive comments of the module-level frame.
print_positive_comments()
# Printing Negative Comments | |
def print_negative_comments(frame=None):
    """Print every comment labelled ``'Negative'``, in ascending polarity order.

    Rows are read positionally. The frame's index holds arbitrary labels, so
    looking rows up with an integer key would depend on pandas' deprecated
    positional fallback.

    Args:
        frame: DataFrame with ``'Polarity'``, ``'Analysis'`` and
            ``'Comments'`` columns. Defaults to the module-level ``df``.

    Each printed line is prefixed with the row's 1-based position in the
    polarity-sorted frame and followed by a blank line.
    """
    if frame is None:
        frame = df
    print('Printing negative comments:\n')
    ranked = frame.sort_values(by=['Polarity'])
    rows = zip(ranked['Analysis'], ranked['Comments'])
    for position, (label, comment) in enumerate(rows, start=1):
        if label == 'Negative':
            print(str(position) + '> ' + comment)
            print()
# List the negative comments of the module-level frame.
print_negative_comments()
# Printing Neutral Comments | |
def print_neutral_comments(frame=None):
    """Print every comment labelled ``'Neutral'``, in ascending polarity order.

    Rows are read positionally. The frame's index holds arbitrary labels, so
    looking rows up with an integer key would depend on pandas' deprecated
    positional fallback.

    Args:
        frame: DataFrame with ``'Polarity'``, ``'Analysis'`` and
            ``'Comments'`` columns. Defaults to the module-level ``df``.

    Each printed line is prefixed with the row's 1-based position in the
    polarity-sorted frame and followed by a blank line.
    """
    if frame is None:
        frame = df
    print('Printing neutral comments:\n')
    ranked = frame.sort_values(by=['Polarity'])
    rows = zip(ranked['Analysis'], ranked['Comments'])
    for position, (label, comment) in enumerate(rows, start=1):
        if label == 'Neutral':
            print(str(position) + '> ' + comment)
            print()
# List the neutral comments of the module-level frame.
print_neutral_comments()
# Forming WordClouds | |
import matplotlib.pyplot as plt | |
from sklearn.feature_extraction import text | |
from wordcloud import WordCloud | |
def generate_word_clouds(df):
    """Display a word cloud built from every entry of ``df['Comments']``.

    All comments are joined with single spaces and passed to a ``WordCloud``
    configured with ``text.ENGLISH_STOP_WORDS`` and a fixed ``random_state``.
    The resulting image is shown with matplotlib, with the axes hidden.
    """
    corpus = ' '.join(df['Comments'])
    cloud_builder = WordCloud(
        stopwords=text.ENGLISH_STOP_WORDS,
        width=1000,
        height=600,
        random_state=21,
        max_font_size=110,
    )
    image = cloud_builder.generate(corpus)

    plt.imshow(image, interpolation="bilinear")
    plt.axis('off')
    plt.show()
# Render the word cloud for the cleaned, labelled comments.
generate_word_clouds(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment