Skip to content

Instantly share code, notes, and snippets.

@natesheehan
Created March 1, 2024 11:05
Show Gist options
  • Save natesheehan/f59bdd0c072326ad563681ead64324d5 to your computer and use it in GitHub Desktop.
Save natesheehan/f59bdd0c072326ad563681ead64324d5 to your computer and use it in GitHub Desktop.
import pandas as pd
import matplotlib.pyplot as plt

# Load the aggregated interview-theme export.
file_path = 'Interviews-Aggregated - Sheet1 (2).csv' # Make sure to update this path
data = pd.read_csv(file_path)

# Clean the `Theme` column: drop missing themes explicitly (the blank-string
# filter below does NOT catch NaN), trim whitespace, then drop blanks.
data = data.dropna(subset=['Theme'])
data['Theme'] = data['Theme'].str.strip()
data = data[data['Theme'] != '']

# Extract the role prefix from `Interview_ID` — assumes IDs look like
# "<role>_<n>" (TODO confirm against the CSV). Missing IDs become 'Unknown'.
data['Role'] = data['Interview_ID'].apply(
    lambda x: str(x).split('_')[0] if pd.notnull(x) else 'Unknown'
)
# Collapse the fine-grained themes into three broader theme groups.
# Stated group -> member themes, then inverted into a lookup table.
_group_members = {
    'Open Science Practices and Scientific Collaboration': [
        'Barriers in Scientific Collaboration',
        'Understandings of Open Science',
        'Diversity and Inclusion in Research',
    ],
    'Data Management, Curation, and Governance': [
        'Data Management and Curation',
        'Governance and Policy in Data Centric Practices',
    ],
    'Socio-Political Dynamics of Open Infrastructures': [
        'Socio-Political Data Infrastructures',
        'Funding, Resource Allocation, and Economic Factors',
        'Research Environment Dynamics',
    ],
}
theme_grouping = {
    theme: group
    for group, members in _group_members.items()
    for theme in members
}
data['Theme Group'] = data['Theme'].map(theme_grouping)
# Themes absent from the lookup map to NaN — discard those rows.
data = data.dropna(subset=['Theme Group'])
# Distinct interviews per role — used as the normalisation denominator.
interview_counts = data.groupby('Role')['Interview_ID'].nunique()

# Raw theme-group mention counts per role.
theme_counts = data.groupby(['Role', 'Theme Group']).size().reset_index(name='Counts')

# Normalise to "themes per interviewee" so roles with more interviews don't
# dominate. Vectorised divide (map the per-role divisor onto each row) instead
# of the original row-wise `.apply` lambda — same values, one C-level pass.
theme_counts['Weighted Counts'] = (
    theme_counts['Counts'] / theme_counts['Role'].map(interview_counts)
)

# Wide layout for a grouped bar chart; role/group combinations that never
# occur become 0 rather than NaN.
theme_counts_pivot = theme_counts.pivot(
    index='Role', columns='Theme Group', values='Weighted Counts'
).fillna(0)
# Render the weighted theme-group breakdown as a grouped bar chart and save
# it to disk. Uses the Axes handle returned by DataFrame.plot rather than the
# pyplot global state; output file and all labels are unchanged.
ax = theme_counts_pivot.plot(kind='bar', figsize=(14, 8), width=0.8)
ax.set_title('Weighted Breakdown of Theme Groups Across Different Roles')
ax.set_ylabel('Weighted Number of Themes per Interviewee')
ax.set_xlabel('Role')
plt.xticks(rotation=45)
ax.legend(title='Theme Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
# Write the figure, keeping the out-of-axes legend inside the saved image.
plt.savefig('weighted_grouped_theme_plot.png', bbox_inches='tight')
plt.clf()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment