@dheerajinampudi
Created October 28, 2023 05:34
Fetch YouTube comment data via the YouTube Data API and analyze it with scikit-learn
# %%
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import configparser
# %%
config = configparser.ConfigParser()
config.read("global.ini")
# Replace with your own API key
API_KEY = config["google_api_key"]["api_key"]
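# Note: a minimal sketch of the expected global.ini layout, inferred from the
# section/option names read above; the placeholder value is an assumption:
# [google_api_key]
# api_key = YOUR_YOUTUBE_DATA_API_KEY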
# Build the YouTube API client
youtube = build("youtube", "v3", developerKey=API_KEY)


def get_comments(video_id):
    """Retrieve all top-level comments for a video, with like and reply counts."""
    comments_data = []
    page_token = None  # Token for the next page of comments
    while True:  # Keep looping until all pages of comments have been retrieved
        try:
            response = (
                youtube.commentThreads()
                .list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=100,  # Get 100 comments per request (max allowed)
                    textFormat="plainText",
                    pageToken=page_token,  # Pass the token for the next page (if any)
                )
                .execute()
            )
            for item in response["items"]:
                comment = item["snippet"]["topLevelComment"]["snippet"]
                comment_text = comment["textDisplay"]
                likes = comment["likeCount"]
                replies = item["snippet"]["totalReplyCount"]
                comments_data.append([comment_text, likes, replies])
            # Check if there are more pages of comments
            page_token = response.get("nextPageToken")
            if not page_token:
                break  # Exit the loop if there are no more pages of comments
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            break  # Exit the loop if an error occurs
    return comments_data


def extract_top_reasons(comments_df):
    """Count frequent n-grams ("reasons") in the comments and tally their likes."""
    # Load the CSV file into a DataFrame instead (if loading from CSV)
    # comments_df = pd.read_csv(filename)
    # Combine all the comments into a single text
    all_comments = " ".join(comments_df["Comment"])
    # Use CountVectorizer to tokenize and count the reasons
    vectorizer = CountVectorizer(
        stop_words="english", ngram_range=(1, 3), max_features=100
    )
    X = vectorizer.fit_transform([all_comments])
    # Create a dictionary with the reasons and their respective counts
    reasons_count = defaultdict(int)
    features = vectorizer.get_feature_names_out()
    for feature, count in zip(features, X.toarray()[0]):
        reasons_count[feature] = count
    # Define a list of generic words or terms to ignore
    ignore_terms = [
        "india",
        "world",
        "class",
        "just",
        "people",
        "indian",
        "country",
        "citizenship",
        "indians",
        "like",
        "countries",
        "life",
        "good",
        "reason",
        "want",
        "better",
        "leave",
        "don",
        "high",
    ]
    # Filter out the generic terms from the top reasons
    filtered_reasons = {
        key: value for key, value in reasons_count.items() if key not in ignore_terms
    }
    sorted_filtered_reasons = sorted(
        filtered_reasons.items(), key=lambda x: x[1], reverse=True
    )[:20]
    # Calculate the total number of likes received for comments mentioning each reason
    reason_likes = defaultdict(int)
    for reason, _ in sorted_filtered_reasons:
        mask = comments_df["Comment"].str.contains(reason, case=False, na=False)
        total_likes = comments_df[mask]["Likes"].sum()
        reason_likes[reason] = total_likes
    # Create a DataFrame with the reason, number of mentions, and total likes
    df_output = pd.DataFrame(
        {
            "Reason": [item[0] for item in sorted_filtered_reasons],
            "Mentions": [item[1] for item in sorted_filtered_reasons],
            "Total Likes": [reason_likes[item[0]] for item in sorted_filtered_reasons],
        }
    )
    return df_output


## Main Code starts here
# Get comments from a specific video
video_id = "-MYwV49LbZY"
comments = get_comments(video_id)
# %%
# Convert the list of comments to a DataFrame
comments = pd.DataFrame(comments, columns=["Comment", "Likes", "Replies"])
# sort pandas dataframe by Likes column
comments = comments.sort_values(by="Likes", ascending=False)
# %%
results = extract_top_reasons(comments)
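# %%
# A minimal follow-up sketch, not part of the original gist: inspect the top
# reasons and optionally persist them; the output filename is an assumption.
print(results.head(20))
results.to_csv("top_reasons.csv", index=False)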