Created
October 28, 2023 05:34
-
-
Save dheerajinampudi/ebf8094c52ba5d0af1ea5bc8648f4b91 to your computer and use it in GitHub Desktop.
Fetch YouTube comment data using the YouTube Data API and analyze it with scikit-learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import pandas as pd | |
from googleapiclient.discovery import build | |
from googleapiclient.errors import HttpError | |
from collections import defaultdict | |
from sklearn.feature_extraction.text import CountVectorizer | |
import configparser | |
# %%
# Read the API key from global.ini and build the shared API client.
config = configparser.ConfigParser()
config.read("global.ini")

# Replace with your own API key (stored under [google_api_key] in global.ini).
API_KEY = config["google_api_key"]["api_key"]

# Module-level YouTube Data API v3 client used by the functions below.
youtube = build("youtube", "v3", developerKey=API_KEY)
def get_comments(video_id, client=None):
    """Fetch all top-level comments for a YouTube video.

    Pages through the ``commentThreads`` endpoint 100 comments at a time
    (the API maximum) until no ``nextPageToken`` is returned.

    Args:
        video_id: The YouTube video id (the ``v=`` URL query parameter).
        client: Optional YouTube API client. Defaults to the module-level
            ``youtube`` client, so existing callers are unaffected; passing
            a client explicitly enables testing/dependency injection.

    Returns:
        A list of ``[comment_text, like_count, reply_count]`` rows. On an
        HTTP error the rows collected so far are returned (best-effort).
    """
    client = client if client is not None else youtube
    comments_data = []
    page_token = None  # nextPageToken from the previous response, if any
    while True:
        try:
            # Only the API call can raise HttpError; keep the try minimal.
            response = (
                client.commentThreads()
                .list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=100,  # maximum allowed per request
                    textFormat="plainText",
                    pageToken=page_token,
                )
                .execute()
            )
        except HttpError as e:
            # Best-effort: report the failure and return what we have.
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            break
        for item in response["items"]:
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments_data.append(
                [
                    snippet["textDisplay"],
                    snippet["likeCount"],
                    item["snippet"]["totalReplyCount"],
                ]
            )
        page_token = response.get("nextPageToken")
        if not page_token:
            break  # no more pages of comments
    return comments_data
def extract_top_reasons(df):
    """Rank the most-mentioned non-generic n-grams in a comment DataFrame.

    Bug fix: the original body read an undefined global ``comments_df``
    instead of the ``df`` parameter, raising NameError at call time.

    Args:
        df: DataFrame with at least a "Comment" (str) column and a
            "Likes" (numeric) column.

    Returns:
        DataFrame with columns "Reason", "Mentions", and "Total Likes" for
        the top 20 uni/bi/tri-grams, sorted by mention count descending.
    """
    # Combine all comments into a single document so CountVectorizer
    # produces corpus-wide term frequencies in one pass.
    all_comments = " ".join(df["Comment"])

    vectorizer = CountVectorizer(
        stop_words="english", ngram_range=(1, 3), max_features=100
    )
    X = vectorizer.fit_transform([all_comments])

    # Map each extracted feature (n-gram) to its count in the corpus.
    features = vectorizer.get_feature_names_out()
    reasons_count = dict(zip(features, X.toarray()[0]))

    # Generic words that dominate the counts but carry no signal for this
    # analysis; "don" is the stemmed remnant of "don't" after tokenization.
    ignore_terms = {
        "india",
        "world",
        "class",
        "just",
        "people",
        "indian",
        "country",
        "citizenship",
        "indians",
        "like",
        "countries",
        "life",
        "good",
        "reason",
        "want",
        "better",
        "leave",
        "don",
        "high",
    }

    # Drop the generic terms, then keep the 20 most-mentioned n-grams.
    filtered_reasons = {
        key: value for key, value in reasons_count.items() if key not in ignore_terms
    }
    sorted_filtered_reasons = sorted(
        filtered_reasons.items(), key=lambda kv: kv[1], reverse=True
    )[:20]

    # Total likes over comments mentioning each reason. regex=False makes
    # the n-gram a literal substring match (equivalent for these word
    # tokens, but safe against regex metacharacters).
    reason_likes = defaultdict(int)
    for reason, _ in sorted_filtered_reasons:
        mask = df["Comment"].str.contains(reason, case=False, na=False, regex=False)
        reason_likes[reason] = df.loc[mask, "Likes"].sum()

    return pd.DataFrame(
        {
            "Reason": [item[0] for item in sorted_filtered_reasons],
            "Mentions": [item[1] for item in sorted_filtered_reasons],
            "Total Likes": [reason_likes[item[0]] for item in sorted_filtered_reasons],
        }
    )
## Main Code starts here
video_id = "-MYwV49LbZY"  # video whose comment threads we analyze

# Pull every top-level comment for the target video.
comments = get_comments(video_id)
# %%
# Tabulate the raw rows and order them most-liked first.
comments = pd.DataFrame(
    comments, columns=["Comment", "Likes", "Replies"]
).sort_values(by="Likes", ascending=False)
# %%
results = extract_top_reasons(comments)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment