@dheerajinampudi
Created October 28, 2023 05:34
Fetch YouTube comment data via the YouTube Data API and analyze it with scikit-learn
# %%
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import configparser
# %%
config = configparser.ConfigParser()
config.read("global.ini")
# Replace with your own API key
API_KEY = config["google_api_key"]["api_key"]
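# Note: a minimal sketch of the expected global.ini layout, inferred from the
# section/option names read above; the placeholder value is an assumption:
# [google_api_key]
# api_key = YOUR_YOUTUBE_DATA_API_KEY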
# Build the YouTube API client
youtube = build("youtube", "v3", developerKey=API_KEY)


def get_comments(video_id):
    """Retrieve all top-level comments for a video, with like and reply counts."""
    comments_data = []
    page_token = None  # Token for the next page of comments
    while True:  # Keep looping until all pages of comments have been retrieved
        try:
            response = (
                youtube.commentThreads()
                .list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=100,  # Get 100 comments per request (max allowed)
                    textFormat="plainText",
                    pageToken=page_token,  # Pass the token for the next page (if any)
                )
                .execute()
            )
            for item in response["items"]:
                comment = item["snippet"]["topLevelComment"]["snippet"]
                comment_text = comment["textDisplay"]
                likes = comment["likeCount"]
                replies = item["snippet"]["totalReplyCount"]
                comments_data.append([comment_text, likes, replies])
            # Check if there are more pages of comments
            page_token = response.get("nextPageToken")
            if not page_token:
                break  # Exit the loop if there are no more pages of comments
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            break  # Exit the loop if an error occurs
    return comments_data


def extract_top_reasons(comments_df):
    """Count frequent n-grams ("reasons") in the comments and tally their likes."""
    # Load the CSV file into a DataFrame instead (if loading from CSV)
    # comments_df = pd.read_csv(filename)
    # Combine all the comments into a single text
    all_comments = " ".join(comments_df["Comment"])
    # Use CountVectorizer to tokenize and count the reasons
    vectorizer = CountVectorizer(
        stop_words="english", ngram_range=(1, 3), max_features=100
    )
    X = vectorizer.fit_transform([all_comments])
    # Create a dictionary with the reasons and their respective counts
    reasons_count = defaultdict(int)
    features = vectorizer.get_feature_names_out()
    for feature, count in zip(features, X.toarray()[0]):
        reasons_count[feature] = count
    # Define a list of generic words or terms to ignore
    ignore_terms = [
        "india",
        "world",
        "class",
        "just",
        "people",
        "indian",
        "country",
        "citizenship",
        "indians",
        "like",
        "countries",
        "life",
        "good",
        "reason",
        "want",
        "better",
        "leave",
        "don",
        "high",
    ]
    # Filter out the generic terms from the top reasons
    filtered_reasons = {
        key: value for key, value in reasons_count.items() if key not in ignore_terms
    }
    sorted_filtered_reasons = sorted(
        filtered_reasons.items(), key=lambda x: x[1], reverse=True
    )[:20]
    # Calculate the total number of likes received for comments mentioning each reason
    reason_likes = defaultdict(int)
    for reason, _ in sorted_filtered_reasons:
        mask = comments_df["Comment"].str.contains(reason, case=False, na=False)
        total_likes = comments_df[mask]["Likes"].sum()
        reason_likes[reason] = total_likes
    # Create a DataFrame with the reason, number of mentions, and total likes
    df_output = pd.DataFrame(
        {
            "Reason": [item[0] for item in sorted_filtered_reasons],
            "Mentions": [item[1] for item in sorted_filtered_reasons],
            "Total Likes": [reason_likes[item[0]] for item in sorted_filtered_reasons],
        }
    )
    return df_output


## Main Code starts here
# Get comments from a specific video
video_id = "-MYwV49LbZY"
comments = get_comments(video_id)
# %%
# Convert the list of comments to a DataFrame
comments = pd.DataFrame(comments, columns=["Comment", "Likes", "Replies"])
# sort pandas dataframe by Likes column
comments = comments.sort_values(by="Likes", ascending=False)
# %%
results = extract_top_reasons(comments)
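# %%
# A minimal follow-up sketch, not part of the original gist: inspect the top
# reasons and optionally persist them; the output filename is an assumption.
print(results.head(20))
results.to_csv("top_reasons.csv", index=False)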