# agmarrugo/coauthored_articles_code.py

## coauthored_articles_code.py

import pandas as pd
from fuzzywuzzy import fuzz

import re

def filter_by_identifier(name):
    """Tell whether an author name should be kept.

    The first run of digits enclosed in parentheses inside ``name`` is
    treated as the author's identifier. A name is rejected only when that
    identifier is listed in the module-level ``identifier_skip_list``;
    names carrying no such identifier are always kept.

    Args:
        name: Author name string, optionally containing "(<digits>)".

    Returns:
        True to keep the name, False to drop it.
    """
    found = re.search(r'\((\d+)\)', name)
    if found is None:
        # No parenthesised numeric identifier: nothing to check against.
        return True
    return found.group(1) not in identifier_skip_list

def generate_name_variants(full_name):
    """Build alternative spellings of a person's name.

    The name is split on whitespace. The first token is taken as the given
    name. With three or more tokens the last two are taken as the first and
    second surnames; with exactly two tokens the last one is the only
    surname.

    Args:
        full_name: Name as a single string, e.g. "Juan Perez Gomez".

    Returns:
        A list of variants in this order: "First Last", "First" and, when a
        second surname exists, "Last-Second, First". A one-word name yields
        just that word, and an empty or whitespace-only name yields an
        empty list.
    """
    parts = full_name.split()
    if not parts:
        # Nothing to build from; indexing parts[0] would raise IndexError.
        return []

    first_name = parts[0]
    if len(parts) == 1:
        # A single token is both "first" and "last"; avoid emitting "X X".
        return [first_name]

    if len(parts) > 2:
        last_name, second_last_name = parts[-2], parts[-1]
    else:
        last_name, second_last_name = parts[-1], ''

    variants = [f"{first_name} {last_name}", first_name]
    if second_last_name:
        variants.append(f"{last_name}-{second_last_name}, {first_name}")
    return variants

# Input and output locations (please update the paths to your local files)
students_df_path = 'lista-estudiantes-maestria-2015-2022.csv'
articles_df_path = 'pi-maestria-utb-2016-2023.csv'
csv_file_path = 'coauthored_articles_2015_2022_threshold.csv'

# Minimum fuzz.token_set_ratio score for a student/author pair to count as a match
match_threshold = 60

# Author identifiers (the digits in parentheses inside an author name) whose
# rows are dropped from the detailed results; read by filter_by_identifier()
identifier_skip_list = ["24329839300", "57193171659", "8374046000", "55791991200"]

# Load the CSV files
students_df = pd.read_csv(students_df_path)
articles_df = pd.read_csv(articles_df_path)

# Unique, non-missing student names
filtered_student_names_set = set(students_df['NOMBRES Y APELLIDOS'].dropna())

# Titles of matched articles grouped by year (collected before the skip list is applied)
filtered_unique_coauthored_articles = {}

# One record per (article, student, author) combination that reaches the threshold
detailed_coauthored_articles = []

# Single pass over the articles: every student name is fuzzy-matched against
# every author name, feeding both the per-year title sets and the detailed list
for idx, row in articles_df.iterrows():
    author_field = row['Author full names']
    if not isinstance(author_field, str):
        # A missing author list is not a string (pandas yields NaN) and has
        # no .split(); there is nothing to match, so skip the article
        continue

    year = row['Year']
    title = row['Title']
    authors = author_field.split('; ')

    for student_name in filtered_student_names_set:
        for author in authors:
            score = fuzz.token_set_ratio(student_name, author)
            if score < match_threshold:
                continue
            filtered_unique_coauthored_articles.setdefault(year, set()).add(title)
            detailed_coauthored_articles.append({
                'Year': year,
                'Student_Name': student_name,
                'Author_Name': author,
                'Article_Title': title,
                'Match_Score': score,
            })

# Count the number of unique matched article titles per year
count_filtered_unique_by_year = {
    year: len(titles)
    for year, titles in filtered_unique_coauthored_articles.items()
}

# Convert the list of dictionaries to a DataFrame; naming the columns keeps
# them present even when no match was found
result_columns = ['Year', 'Student_Name', 'Author_Name', 'Article_Title', 'Match_Score']
detailed_coauthored_articles_df = pd.DataFrame(detailed_coauthored_articles, columns=result_columns)

# Drop rows whose author identifier is in the skip list; the explicit bool
# cast keeps this a row mask even when the frame is empty
keep_mask = detailed_coauthored_articles_df['Author_Name'].apply(filter_by_identifier).astype(bool)
detailed_coauthored_articles_df = detailed_coauthored_articles_df[keep_mask]

# Save the DataFrame to a CSV file
detailed_coauthored_articles_df.to_csv(csv_file_path, index=False)

# Rows per publication year. NOTE: each row is one student/author match, so an
# article matched by several students is counted once per match
yearly_counts = detailed_coauthored_articles_df['Year'].value_counts().sort_index()

# Convert to dictionary for easier lookup
yearly_counts_dict = yearly_counts.to_dict()

print("Article Counts by Year:", yearly_counts_dict)

# Number of distinct student names among the matches of each year
unique_students_per_year = detailed_coauthored_articles_df.groupby('Year')['Student_Name'].nunique()

# Convert to dictionary for easier lookup
unique_students_per_year_dict = unique_students_per_year.to_dict()

print("Unique Students Per Year:", unique_students_per_year_dict)

# Print the path to the saved CSV file
print("Detailed data saved to:", csv_file_path)

	import pandas as pd
	from fuzzywuzzy import fuzz

	import re

	def filter_by_identifier(name):
	    """Tell whether an author name should be kept.

	    The first run of digits enclosed in parentheses inside ``name`` is
	    treated as the author's identifier. A name is rejected only when that
	    identifier is listed in the module-level ``identifier_skip_list``;
	    names carrying no such identifier are always kept.

	    Args:
	        name: Author name string, optionally containing "(<digits>)".

	    Returns:
	        True to keep the name, False to drop it.
	    """
	    match = re.search(r'\((\d+)\)', name)
	    if match:
	        identifier = match.group(1)
	        return identifier not in identifier_skip_list
	    return True

	def generate_name_variants(full_name):
	    """Build alternative spellings of a person's name.

	    The name is split on whitespace. The first token is taken as the given
	    name. With three or more tokens the last two are taken as the first and
	    second surnames; with exactly two tokens the last one is the only
	    surname.

	    Args:
	        full_name: Name as a single string, e.g. "Juan Perez Gomez".

	    Returns:
	        A list of variants in this order: "First Last", "First" and, when a
	        second surname exists, "Last-Second, First". A one-word name yields
	        just that word, and an empty or whitespace-only name yields an
	        empty list.
	    """
	    parts = full_name.split()
	    if not parts:
	        # Nothing to build from; indexing parts[0] would raise IndexError.
	        return []

	    first_name = parts[0]
	    if len(parts) == 1:
	        # A single token is both "first" and "last"; avoid emitting "X X".
	        return [first_name]

	    if len(parts) > 2:
	        last_name, second_last_name = parts[-2], parts[-1]
	    else:
	        last_name, second_last_name = parts[-1], ''

	    variants = [f"{first_name} {last_name}", first_name]
	    if second_last_name:
	        variants.append(f"{last_name}-{second_last_name}, {first_name}")
	    return variants

	# Input and output locations (please update the paths to your local files)
	students_df_path = 'lista-estudiantes-maestria-2015-2022.csv'
	articles_df_path = 'pi-maestria-utb-2016-2023.csv'
	csv_file_path = 'coauthored_articles_2015_2022_threshold.csv'

	# Minimum fuzz.token_set_ratio score for a student/author pair to count as a match
	match_threshold = 60

	# Author identifiers (the digits in parentheses inside an author name) whose
	# rows are dropped from the detailed results; read by filter_by_identifier()
	identifier_skip_list = ["24329839300", "57193171659", "8374046000", "55791991200"]

	# Load the CSV files
	students_df = pd.read_csv(students_df_path)
	articles_df = pd.read_csv(articles_df_path)

	# Unique, non-missing student names
	filtered_student_names_set = set(students_df['NOMBRES Y APELLIDOS'].dropna())

	# Titles of matched articles grouped by year (collected before the skip list is applied)
	filtered_unique_coauthored_articles = {}

	# One record per (article, student, author) combination that reaches the threshold
	detailed_coauthored_articles = []

	# Single pass over the articles: every student name is fuzzy-matched against
	# every author name, feeding both the per-year title sets and the detailed list
	for idx, row in articles_df.iterrows():
	    author_field = row['Author full names']
	    if not isinstance(author_field, str):
	        # A missing author list is not a string (pandas yields NaN) and has
	        # no .split(); there is nothing to match, so skip the article
	        continue

	    year = row['Year']
	    title = row['Title']
	    authors = author_field.split('; ')

	    for student_name in filtered_student_names_set:
	        for author in authors:
	            score = fuzz.token_set_ratio(student_name, author)
	            if score < match_threshold:
	                continue
	            filtered_unique_coauthored_articles.setdefault(year, set()).add(title)
	            detailed_coauthored_articles.append({
	                'Year': year,
	                'Student_Name': student_name,
	                'Author_Name': author,
	                'Article_Title': title,
	                'Match_Score': score,
	            })

	# Count the number of unique matched article titles per year
	count_filtered_unique_by_year = {
	    year: len(titles)
	    for year, titles in filtered_unique_coauthored_articles.items()
	}

	# Convert the list of dictionaries to a DataFrame; naming the columns keeps
	# them present even when no match was found
	result_columns = ['Year', 'Student_Name', 'Author_Name', 'Article_Title', 'Match_Score']
	detailed_coauthored_articles_df = pd.DataFrame(detailed_coauthored_articles, columns=result_columns)

	# Drop rows whose author identifier is in the skip list; the explicit bool
	# cast keeps this a row mask even when the frame is empty
	keep_mask = detailed_coauthored_articles_df['Author_Name'].apply(filter_by_identifier).astype(bool)
	detailed_coauthored_articles_df = detailed_coauthored_articles_df[keep_mask]

	# Save the DataFrame to a CSV file
	detailed_coauthored_articles_df.to_csv(csv_file_path, index=False)

	# Rows per publication year. NOTE: each row is one student/author match, so an
	# article matched by several students is counted once per match
	yearly_counts = detailed_coauthored_articles_df['Year'].value_counts().sort_index()

	# Convert to dictionary for easier lookup
	yearly_counts_dict = yearly_counts.to_dict()

	print("Article Counts by Year:", yearly_counts_dict)

	# Number of distinct student names among the matches of each year
	unique_students_per_year = detailed_coauthored_articles_df.groupby('Year')['Student_Name'].nunique()

	# Convert to dictionary for easier lookup
	unique_students_per_year_dict = unique_students_per_year.to_dict()

	print("Unique Students Per Year:", unique_students_per_year_dict)

	# Print the path to the saved CSV file
	print("Detailed data saved to:", csv_file_path)