Skip to content

Instantly share code, notes, and snippets.

@agmarrugo
Created September 25, 2023 21:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agmarrugo/fa6fb8cf2cffd9046c7ef91d19f3472b to your computer and use it in GitHub Desktop.
Save agmarrugo/fa6fb8cf2cffd9046c7ef91d19f3472b to your computer and use it in GitHub Desktop.
This Python program analyzes a dataset of academic articles and a list of students to identify co-authored articles. It aims to find which articles were co-authored by students from a specific Master's program and count them on a yearly basis.
import pandas as pd
from fuzzywuzzy import fuzz
import re
def filter_by_identifier(name, skip_list=None):
    """Return True if an author entry should be kept, False if it is skipped.

    The author string is searched for the first run of digits enclosed in
    parentheses, e.g. ``"Doe, Jane (12345678900)"``. The entry is rejected
    only when that identifier appears in the skip list; entries that carry no
    such identifier are always kept.

    Args:
        name: Author string to inspect. Non-string values (for example the
            NaN pandas uses for a missing cell) are kept rather than raising.
        skip_list: Optional collection of identifier strings to reject. When
            omitted, the module-level ``identifier_skip_list`` is used, which
            preserves the original one-argument behaviour.

    Returns:
        bool: False when the identifier is in the skip list, True otherwise.
    """
    # re.search() raises TypeError on non-strings; treat those as "no identifier".
    if not isinstance(name, str):
        return True
    match = re.search(r'\((\d+)\)', name)
    if match is None:
        return True
    # Resolve the global lazily so names without an identifier never need it.
    if skip_list is None:
        skip_list = identifier_skip_list
    return match.group(1) not in skip_list
def generate_name_variants(full_name):
    """Build alternative spellings of a person's name.

    The name is split on whitespace. The first token is taken as the first
    name. With more than two tokens, the last two tokens are taken as the
    first and second surname; with exactly two tokens, the last token is the
    only surname.

    Args:
        full_name: Name as a single whitespace-separated string.

    Returns:
        list[str]: Up to three variants, in this order:
        ``"<first> <surname>"``, ``"<first>"`` and, when a second surname
        exists, ``"<surname>-<second surname>, <first>"``. An empty or
        whitespace-only name yields ``[]`` and a single-token name yields
        just ``[<token>]``.
    """
    parts = full_name.split()
    # Nothing to derive from an empty name (indexing parts[0] would raise).
    if not parts:
        return []

    first_name = parts[0]
    # With one token there is no surname; avoid the duplicated "X X" variant.
    if len(parts) == 1:
        return [first_name]

    if len(parts) > 2:
        last_name, second_last_name = parts[-2], parts[-1]
    else:
        last_name, second_last_name = parts[-1], ''

    variants = [f"{first_name} {last_name}", first_name]
    if second_last_name:
        variants.append(f"{last_name}-{second_last_name}, {first_name}")
    return variants
# Input locations (adjust these to wherever the CSV exports live locally).
students_df_path = 'lista-estudiantes-maestria-2015-2022.csv'
articles_df_path = 'pi-maestria-utb-2016-2023.csv'

# Read the student roster first, then the article export.
students_df, articles_df = (
    pd.read_csv(csv_path) for csv_path in (students_df_path, articles_df_path)
)

# Distinct student names, leaving out rows whose name cell is missing.
filtered_student_names_set = {
    student for student in students_df['NOMBRES Y APELLIDOS'] if pd.notna(student)
}
# Unique article titles with at least one fuzzy-matched student author,
# keyed by publication year: {year: {title, ...}}.
filtered_unique_coauthored_articles = {}

for _, row in articles_df.iterrows():
    author_field = row['Author full names']
    # A missing author cell is read by pandas as NaN (a float), which has no
    # .split(); such rows cannot match any student, so skip them.
    if not isinstance(author_field, str):
        continue

    year = row['Year']
    title = row['Title']
    authors = author_field.split('; ')

    # A token_set_ratio score of 60 or higher is considered a match. Titles
    # are stored in a set, so one match is enough: any() stops at the first
    # hit instead of scoring every remaining student/author pair.
    has_student_author = any(
        fuzz.token_set_ratio(student_name, author) >= 60
        for student_name in filtered_student_names_set
        for author in authors
    )
    if has_student_author:
        filtered_unique_coauthored_articles.setdefault(year, set()).add(title)

# Number of unique matched article titles per year.
count_filtered_unique_by_year = {
    year: len(titles)
    for year, titles in filtered_unique_coauthored_articles.items()
}
# One record per (article, student, author) combination whose fuzzy score
# reaches the threshold.
detailed_coauthored_articles = []

# Sets of strings iterate in an order that can change between interpreter
# runs; sorting once up front makes the row order of the output reproducible.
ordered_student_names = sorted(filtered_student_names_set)

for _, row in articles_df.iterrows():
    author_field = row['Author full names']
    # A missing author cell is read by pandas as NaN (a float), which has no
    # .split(); such rows cannot match any student, so skip them.
    if not isinstance(author_field, str):
        continue

    year = row['Year']
    title = row['Title']
    authors = author_field.split('; ')

    for student_name in ordered_student_names:
        for author in authors:
            score = fuzz.token_set_ratio(student_name, author)
            if score >= 60:  # a score of 60 or higher is considered a match
                detailed_coauthored_articles.append({
                    'Year': year,
                    'Student_Name': student_name,
                    'Author_Name': author,
                    'Article_Title': title,
                    'Match_Score': score,
                })

# Naming the columns explicitly keeps them present even when nothing matched;
# otherwise an empty list yields a frame with no columns and later column
# lookups raise KeyError.
detailed_coauthored_articles_df = pd.DataFrame(
    detailed_coauthored_articles,
    columns=['Year', 'Student_Name', 'Author_Name', 'Article_Title', 'Match_Score'],
)
# Identifiers (the digits in parentheses inside an author string) whose
# matches must be discarded.
# NOTE(review): these look like Scopus author IDs -- confirm with the data owner.
identifier_skip_list = ["24329839300", "57193171659", "8374046000", "55791991200"]

# Keep only rows whose author is not on the skip list. The mask is cast to
# bool explicitly: on an empty frame .apply() returns an empty object-dtype
# Series, which pandas does not recognise as a boolean mask and would use as
# an (empty) column selection, silently dropping every column.
keep_row_mask = (
    detailed_coauthored_articles_df['Author_Name']
    .apply(filter_by_identifier)
    .astype(bool)
)
detailed_coauthored_articles_df = detailed_coauthored_articles_df[keep_row_mask]
# Write the detailed match table to disk (adjust the path as needed).
csv_file_path = 'coauthored_articles_2015_2022_threshold.csv'
detailed_coauthored_articles_df.to_csv(csv_file_path, index=False)

# Articles per year. The table holds one row per (article, student, author)
# match, so counting rows would count an article once for every matching
# pair; count distinct titles instead. groupby() sorts by year by default.
yearly_counts = detailed_coauthored_articles_df.groupby('Year')['Article_Title'].nunique()
yearly_counts_dict = yearly_counts.to_dict()
print("Article Counts by Year:", yearly_counts_dict)

# Distinct matched students per year.
unique_students_per_year = detailed_coauthored_articles_df.groupby('Year')['Student_Name'].nunique()
unique_students_per_year_dict = unique_students_per_year.to_dict()
print("Unique Students Per Year:", unique_students_per_year_dict)

# Tell the user where the detailed table was written.
print("Detailed data saved to:", csv_file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment