|
|
|
import pandas as pd |
|
from fuzzywuzzy import fuzz |
|
|
|
import re |
|
|
|
def filter_by_identifier(name, skip_list=None):
    """Return True if *name* should be kept, False if its identifier is skipped.

    The identifier is the first run of digits enclosed in parentheses found
    in *name* (for example the ``123`` in ``"Perez, Juan (123)"``).

    Args:
        name: Author entry to inspect. Non-string values (e.g. NaN cells
            coming from a DataFrame column) carry no identifier and are kept.
        skip_list: Optional collection of identifier strings to reject.
            When omitted, the module-level ``identifier_skip_list`` is used.

    Returns:
        False when an identifier is found and it is in the skip list,
        True otherwise (including when no identifier is present).
    """
    if not isinstance(name, str):
        # re.search would raise TypeError on NaN/None; nothing to filter on.
        return True

    match = re.search(r'\((\d+)\)', name)
    if match is None:
        return True

    if skip_list is None:
        # Looked up lazily, only when an identifier was actually found, so the
        # module-level list may be defined after this function.
        skip_list = identifier_skip_list

    return match.group(1) not in skip_list
|
|
|
def generate_name_variants(full_name):
    """Build alternative spellings of a person's full name.

    The name is split on whitespace. The first token is treated as the first
    name. With three or more tokens the last two tokens are treated as the
    two surnames; with exactly two tokens the last token is the only surname.

    Args:
        full_name: Whitespace-separated full name.

    Returns:
        A list of variants, in this order:
        ``"<first> <last>"``, ``"<first>"`` and, only when a second surname
        exists, ``"<last>-<second_last>, <first>"``.
        An empty or whitespace-only name yields ``[]``; a single-token name
        yields just ``[<token>]``.
    """
    parts = full_name.split()
    if not parts:
        # Nothing to derive variants from (avoids IndexError on parts[0]).
        return []

    first_name = parts[0]
    if len(parts) == 1:
        # A lone token has no surname; do not fabricate "<token> <token>".
        return [first_name]

    if len(parts) > 2:
        last_name, second_last_name = parts[-2], parts[-1]
    else:
        last_name, second_last_name = parts[-1], ''

    variants = [f"{first_name} {last_name}", first_name]
    if second_last_name:
        variants.append(f"{last_name}-{second_last_name}, {first_name}")
    return variants
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Input CSV files (please update the paths to your local files).
students_df_path = 'lista-estudiantes-maestria-2015-2022.csv'
articles_df_path = 'pi-maestria-utb-2016-2023.csv'

# Output CSV file (please update the path to your desired location).
csv_file_path = 'coauthored_articles_2015_2022_threshold.csv'

# Minimum fuzz.token_set_ratio score for a student/author pair to be
# considered a match.
MATCH_SCORE_THRESHOLD = 60

# Identifiers (the digits in parentheses inside an author entry) of authors
# whose matches must be discarded. Kept at module level under this exact name
# because filter_by_identifier reads it as a global.
identifier_skip_list = ["24329839300", "57193171659", "8374046000", "55791991200"]

# Column layout of the detailed result; passed explicitly so the DataFrame
# has these columns even when no match is found.
DETAIL_COLUMNS = ['Year', 'Student_Name', 'Author_Name', 'Article_Title', 'Match_Score']

# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------

students_df = pd.read_csv(students_df_path)
articles_df = pd.read_csv(articles_df_path)

# Distinct, non-missing student names.
filtered_student_names_set = set(students_df['NOMBRES Y APELLIDOS'].dropna())

# ---------------------------------------------------------------------------
# Fuzzy-match every student against every author of every article
# ---------------------------------------------------------------------------

# year -> set of titles with at least one matching student/author pair.
filtered_unique_coauthored_articles = {}

# One record per matching (article, student, author) triple.
detailed_coauthored_articles = []

# A single pass fills both structures; the fuzzy comparison is the expensive
# step, so it is performed only once per (article, student, author) triple.
for year, author_field, title in zip(
    articles_df['Year'],
    articles_df['Author full names'],
    articles_df['Title'],
):
    if not isinstance(author_field, str):
        # Missing author cell (NaN): there is nothing to split or match.
        continue

    authors = author_field.split('; ')

    for student_name in filtered_student_names_set:
        for author in authors:
            score = fuzz.token_set_ratio(student_name, author)
            if score < MATCH_SCORE_THRESHOLD:
                continue

            filtered_unique_coauthored_articles.setdefault(year, set()).add(title)
            detailed_coauthored_articles.append({
                'Year': year,
                'Student_Name': student_name,
                'Author_Name': author,
                'Article_Title': title,
                'Match_Score': score,
            })

# Number of unique matched titles per year (computed before the skip-list
# filter below is applied).
count_filtered_unique_by_year = {
    year: len(titles)
    for year, titles in filtered_unique_coauthored_articles.items()
}

# Convert the list of dictionaries to a DataFrame.
detailed_coauthored_articles_df = pd.DataFrame(
    detailed_coauthored_articles, columns=DETAIL_COLUMNS
)

# ---------------------------------------------------------------------------
# Drop matches whose author identifier is in the skip list
# ---------------------------------------------------------------------------

# astype(bool) + .loc keep this a row filter even when the frame is empty
# (an empty object-dtype mask would otherwise be read as a column selection).
keep_mask = detailed_coauthored_articles_df['Author_Name'].apply(filter_by_identifier).astype(bool)
detailed_coauthored_articles_df = detailed_coauthored_articles_df.loc[keep_mask]

# ---------------------------------------------------------------------------
# Save and report
# ---------------------------------------------------------------------------

detailed_coauthored_articles_df.to_csv(csv_file_path, index=False)

# NOTE: this counts match rows per year (one row per student/author pair), so
# an article matched by several pairs contributes more than once.
yearly_counts = detailed_coauthored_articles_df['Year'].value_counts().sort_index()

# Convert to dictionary for easier lookup.
yearly_counts_dict = yearly_counts.to_dict()

print("Article Counts by Year:", yearly_counts_dict)

# Number of distinct matched students per year.
unique_students_per_year = detailed_coauthored_articles_df.groupby('Year')['Student_Name'].nunique()

# Convert to dictionary for easier lookup.
unique_students_per_year_dict = unique_students_per_year.to_dict()

print("Unique Students Per Year:", unique_students_per_year_dict)

# Print the path to the saved CSV file.
print("Detailed data saved to:", csv_file_path)