Skip to content

Instantly share code, notes, and snippets.

@natesheehan
Created March 1, 2024 11:05
Show Gist options
  • Save natesheehan/f59bdd0c072326ad563681ead64324d5 to your computer and use it in GitHub Desktop.
Save natesheehan/f59bdd0c072326ad563681ead64324d5 to your computer and use it in GitHub Desktop.
import pandas as pd
import matplotlib.pyplot as plt

# Load the aggregated interview-theme export.
file_path = 'Interviews-Aggregated - Sheet1 (2).csv' # Make sure to update this path
data = pd.read_csv(file_path)

# Clean the `Theme` column: drop missing themes explicitly (the blank-string
# filter below does NOT catch NaN), trim whitespace, then drop blanks.
data = data.dropna(subset=['Theme'])
data['Theme'] = data['Theme'].str.strip()
data = data[data['Theme'] != '']

# Extract the role prefix from `Interview_ID` — assumes IDs look like
# "<role>_<n>" (TODO confirm against the CSV). Missing IDs become 'Unknown'.
data['Role'] = data['Interview_ID'].apply(
    lambda x: str(x).split('_')[0] if pd.notnull(x) else 'Unknown'
)
# Collapse the fine-grained themes into three broader theme groups.
# Stated group -> member themes, then inverted into a lookup table.
_group_members = {
    'Open Science Practices and Scientific Collaboration': [
        'Barriers in Scientific Collaboration',
        'Understandings of Open Science',
        'Diversity and Inclusion in Research',
    ],
    'Data Management, Curation, and Governance': [
        'Data Management and Curation',
        'Governance and Policy in Data Centric Practices',
    ],
    'Socio-Political Dynamics of Open Infrastructures': [
        'Socio-Political Data Infrastructures',
        'Funding, Resource Allocation, and Economic Factors',
        'Research Environment Dynamics',
    ],
}
theme_grouping = {
    theme: group
    for group, members in _group_members.items()
    for theme in members
}
data['Theme Group'] = data['Theme'].map(theme_grouping)
# Themes absent from the lookup map to NaN — discard those rows.
data = data.dropna(subset=['Theme Group'])
# Distinct interviews per role — used as the normalisation denominator.
interview_counts = data.groupby('Role')['Interview_ID'].nunique()

# Raw theme-group mention counts per role.
theme_counts = data.groupby(['Role', 'Theme Group']).size().reset_index(name='Counts')

# Normalise to "themes per interviewee" so roles with more interviews don't
# dominate. Vectorised divide (map the per-role divisor onto each row) instead
# of the original row-wise `.apply` lambda — same values, one C-level pass.
theme_counts['Weighted Counts'] = (
    theme_counts['Counts'] / theme_counts['Role'].map(interview_counts)
)

# Wide layout for a grouped bar chart; role/group combinations that never
# occur become 0 rather than NaN.
theme_counts_pivot = theme_counts.pivot(
    index='Role', columns='Theme Group', values='Weighted Counts'
).fillna(0)
# Render the weighted theme-group breakdown as a grouped bar chart and save
# it to disk. Uses the Axes handle returned by DataFrame.plot rather than the
# pyplot global state; output file and all labels are unchanged.
ax = theme_counts_pivot.plot(kind='bar', figsize=(14, 8), width=0.8)
ax.set_title('Weighted Breakdown of Theme Groups Across Different Roles')
ax.set_ylabel('Weighted Number of Themes per Interviewee')
ax.set_xlabel('Role')
plt.xticks(rotation=45)
ax.legend(title='Theme Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
# Write the figure, keeping the out-of-axes legend inside the saved image.
plt.savefig('weighted_grouped_theme_plot.png', bbox_inches='tight')
plt.clf()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment