Skip to content

Instantly share code, notes, and snippets.

@zjrwtx
Last active June 22, 2024 07:38
Show Gist options
  • Save zjrwtx/fcd1f56bbe64a9967d04da3476be5753 to your computer and use it in GitHub Desktop.
Save zjrwtx/fcd1f56bbe64a9967d04da3476be5753 to your computer and use it in GitHub Desktop.
医学检验推荐项目的数据分析部分源码
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
# Bar chart of the mean score of each survey metric.
# Load the survey responses from the Excel workbook.
data = pd.read_excel("../检验推荐系统调查数据.xlsx")
# Mean of every numeric column in the sheet.
average_values = data.mean(numeric_only=True)
# Score columns shown in the chart, in display order.
columns = ["推荐准确性", "推荐相关性", "操作便捷性", "生成报告时间", "数据隐私保护", "系统可接受性"]
# Mean score per selected column.
values = average_values[columns]
# Select the font by name so the Chinese labels can be rendered.
plt.rcParams['font.family'] = ['SimHei']
# Narrow figure with slim bars (bar width 0.3).
plt.figure(figsize=(3.5, 4))
plt.bar(values.index, values.values, color='skyblue', width=0.3)
# Tick labels: x ticks rotated, size 10; y ticks size 12.
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=12)
# Axis titles (size 10) and chart title (size 12).
plt.xlabel('指标', fontsize=10)
plt.ylabel('平均分', fontsize=10)
plt.title('推荐系统各项指标的平均分', fontsize=12)
# Fix the y axis to the range 0-10.
plt.ylim(0, 10)
# Write each mean, rounded to two decimals, just above its bar.
for bar_position, bar_height in enumerate(values.values):
    plt.text(bar_position, bar_height + 0.1, round(bar_height, 2), ha='center', fontsize=10)
# Apply the layout BEFORE saving so the files on disk use the same layout
# as the figure that is displayed.
plt.tight_layout()
# Save the chart as PNG and as TIFF.
plt.savefig('recommendation_system_average_scores.png', format='png', bbox_inches='tight')
plt.savefig('recommendation_system_average_scores.tiff', format='tiff', bbox_inches='tight')
# Display the chart.
plt.show()
# In[3]:
import seaborn as sns
# Heat map of the pairwise correlations between the score columns.
# Correlation matrix of the selected columns (DataFrame.corr defaults).
correlation_matrix = data[columns].corr()
# One large figure so the annotated cells stay readable.
plt.figure(figsize=(10, 8))
# Annotated heat map with a labelled colour bar.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 22},
    cbar=True,
    cbar_kws={'label': 'Correlation Coefficient'},
)
# Title and tick labels.
plt.title('推荐系统各项指标的相关性分析', fontsize=20)
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(rotation=0, ha="right", fontsize=16)
# Apply the layout BEFORE saving so the saved files use the same layout
# as the figure that is displayed.
plt.tight_layout()
# Save the heat map as PNG and as TIFF.
plt.savefig('Heat map correlation analysis.png', format='png', bbox_inches='tight')
plt.savefig('Heat map correlation analysis.tiff', format='tiff', bbox_inches='tight')
# Show the plot.
plt.show()
# In[ ]:
# In[7]:
import pandas as pd
from scipy.stats import pearsonr
# Pairwise Pearson p-values between the score columns.
# Step 1: load the survey workbook (replace with the path to your Excel file).
file_path = '../检验推荐系统调查数据.xlsx'
data = pd.read_excel(file_path)

# Step 2: columns that take part in the correlation analysis.
columns = ['推荐准确性', '推荐相关性', '操作便捷性', '生成报告时间', '数据隐私保护', '系统可接受性']

# Step 3: count missing values per column; if any column has gaps,
# fill them with that column's mean.
missing_values = data[columns].isnull().sum()
if missing_values.any():
    column_means = data[columns].mean()
    data[columns] = data[columns].fillna(column_means)

# Step 4: p-value for every unordered pair of columns. Pairing each column
# only with the columns after it visits each pair exactly once.
p_values = {}
for position, col1 in enumerate(columns):
    for col2 in columns[position + 1:]:
        _, p_value = pearsonr(data[col1], data[col2])
        p_values[(col1, col2)] = p_value

# Step 5: print the p-values, two decimals each.
for (col1, col2), p_value in p_values.items():
    print(f"p值 between {col1} and {col2}: {p_value:.2f}")
# In[6]:
import pandas as pd
from scipy.stats import kruskal
from scipy.stats import pearsonr
# Descriptive statistics, correlation matrix and an overall score.
# Path of the survey workbook, then load it.
file_path = '../检验推荐系统调查数据.xlsx'
data = pd.read_excel(file_path)

# Score columns used throughout this cell.
quantitative_columns = ['推荐准确性', '推荐相关性', '操作便捷性', '生成报告时间', '数据隐私保护', '系统可接受性']

# Work on the score columns only (taken before the new column is added).
score_frame = data[quantitative_columns]

# Summary statistics per score column.
descriptive_stats = score_frame.describe()
print("描述性统计信息:\n", descriptive_stats)

# Pairwise correlation matrix of the score columns.
correlation_matrix = score_frame.corr()
print("\n相关性矩阵:\n", correlation_matrix)

# Overall satisfaction = row-wise mean of the score columns.
data['overall_satisfaction'] = score_frame.mean(axis=1)
# 检查异常值并删除它们
def find_and_remove_outliers(data, columns):
    """Return a copy of *data* without rows that are IQR outliers.

    A row is dropped when, in at least one of *columns*, its value lies
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` of that column.

    Args:
        data: DataFrame holding the columns to inspect.
        columns: Names of the numeric columns to check for outliers.

    Returns:
        A new DataFrame containing only the rows that are not outliers in
        any of *columns*. *data* itself is not modified.
    """
    # Rows are flagged by position in a boolean mask. De-duplicating the
    # outlier *values* (as Series.drop_duplicates does) would keep only the
    # first row for each distinct value and leave the other outlier rows in.
    outlier_mask = pd.Series(False, index=data.index)
    for col in columns:
        q25 = data[col].quantile(0.25)
        q75 = data[col].quantile(0.75)
        iqr = q75 - q25
        lower_bound = q25 - 1.5 * iqr
        upper_bound = q75 + 1.5 * iqr
        # NaN compares False on both sides, so missing values are kept.
        outlier_mask |= (data[col] < lower_bound) | (data[col] > upper_bound)
    return data[~outlier_mask]
# Drop outlier rows before running the tests below.
data_cleaned = find_and_remove_outliers(data, quantitative_columns)

# Kruskal-Wallis H test across the score columns (one sample per column,
# passed in the order of quantitative_columns).
kruskal_results_cleaned = kruskal(
    *(data_cleaned[score_name] for score_name in quantitative_columns)
)
print("\nKruskal-Wallis H检验结果:\n", kruskal_results_cleaned)

# Pearson correlation coefficient between overall satisfaction and each
# score column; only the coefficient is stored, the p-value is discarded.
pearson_results = {}
for col in quantitative_columns:
    coefficient, _ = pearsonr(data_cleaned['overall_satisfaction'], data_cleaned[col])
    pearson_results[col] = coefficient
print("\nPearson相关系数检验结果:\n", pearson_results)
# In[3]:
# Most common free-text answers in the "系统改进建议" column.
# Strip full stops (Chinese and ASCII) BEFORE counting, so answers that
# differ only by a trailing period are counted together and the chart
# labels carry no stray punctuation.
data["系统改进建议"] = data["系统改进建议"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统改进建议"].value_counts()
# Answer frequencies for the workflow-impact column.
workflow_impact = data["系统对工作流程的影响"].value_counts()
# Answer frequencies for the medical-quality-impact column.
quality_impact = data["系统对医疗质量的影响"].value_counts()
# Horizontal bar chart of the ten most frequent suggestions.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='skyblue')
# Tick labels, axis titles and chart title all use font size 16.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统改进建议', fontsize=16)
plt.title('最常见的10条系统改进建议', fontsize=16)
# Invert the y axis so the most frequent suggestion is at the top.
plt.gca().invert_yaxis()
# Save the chart as PNG and as TIFF, then display it.
plt.savefig('最常见的10条系统改进建议.png', format='png', bbox_inches='tight')
plt.savefig('最常见的10条系统改进建议.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[10]:
# Most common free-text answers in the "系统对工作流程的影响" column.
# Strip full stops (Chinese and ASCII) BEFORE counting, so answers that
# differ only by a trailing period are counted together and the chart
# labels carry no stray punctuation.
data["系统对工作流程的影响"] = data["系统对工作流程的影响"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统对工作流程的影响"].value_counts()
# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightcoral')
# Tick labels, axis titles and chart title all use font size 16.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对工作流程的影响', fontsize=16)
plt.title('最常见的10种系统对工作流程的影响', fontsize=16)
# Invert the y axis so the most frequent answer is at the top.
plt.gca().invert_yaxis()
# Save the chart as PNG and as TIFF, then display it.
plt.savefig('系统对工作流程的影响.png', format='png', bbox_inches='tight')
plt.savefig('系统对工作流程的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[4]:
# Most common free-text answers in the "系统对医疗质量的影响" column.
# Strip full stops (Chinese and ASCII) BEFORE counting, so answers that
# differ only by a trailing period are counted together and the chart
# labels carry no stray punctuation.
data["系统对医疗质量的影响"] = data["系统对医疗质量的影响"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统对医疗质量的影响"].value_counts()
# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightgreen')
# Tick labels, axis titles and chart title all use font size 16.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对医疗质量的影响', fontsize=16)
plt.title('最常见的10种系统对医疗质量的影响', fontsize=16)
# Invert the y axis so the most frequent answer is at the top.
plt.gca().invert_yaxis()
# Save under this chart's own name; the previous file names were those of
# the workflow-impact chart, so that chart was being overwritten.
plt.savefig('系统对医疗质量的影响.png', format='png', bbox_inches='tight')
plt.savefig('系统对医疗质量的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[10]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plots showing the distribution of five score columns.
# Use a font that can render Chinese text, e.g. "SimHei".
plt.rcParams['font.sans-serif'] = ['SimHei']
# Default font size for all text in the figures.
plt.rcParams['font.size'] = 16

# Score column -> text used for the y-axis label of its plot.
boxplot_y_labels = {
    '推荐准确性': '推荐准确性',
    '推荐相关性': '推荐相关性',
    '操作便捷性': '操作便捷性',
    '生成报告时间': '生成报告时间的总时间',
    '数据隐私保护': '数据隐私保护',
}

for score_column, y_label in boxplot_y_labels.items():
    # Create a fresh figure for EVERY plot; creating it once left only the
    # first box plot at 6x4 and the rest at the default figure size.
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=data[score_column], width=0.2)
    plt.xticks(fontsize=16)
    plt.title(f'{score_column}的分布')
    plt.ylabel(y_label)
    # Save each plot as PNG and as TIFF, named after its title.
    for image_format in ('png', 'tiff'):
        plt.savefig(f'{score_column}的分布.{image_format}', format=image_format, bbox_inches='tight')
    plt.show()
# In[11]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of the "系统可接受性" score distribution.
# Font able to render Chinese text (e.g. "SimHei") and default font size 16.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.size': 16,
})

# New 6x4 figure for the plot.
plt.figure(figsize=(6, 4))
sns.boxplot(x=data['系统可接受性'], width=0.2)
plt.xticks(fontsize=16)
plt.title('系统可接受性的分布')
plt.ylabel('系统可接受性')

# Write the figure to disk as PNG, then as TIFF.
for output_path, output_format in (
    ('系统可接受性的分布.png', 'png'),
    ('系统可接受性的分布.tiff', 'tiff'),
):
    plt.savefig(output_path, format=output_format, bbox_inches='tight')
plt.show()
# In[12]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of "系统可接受性" with thin frame lines and vertical guide lines.
# Font able to render Chinese text (e.g. "SimHei") and default font size 16.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.size': 16,
})

# New 6x4 figure; keep a handle on its axes to style the frame.
plt.figure(figsize=(6, 4))
ax = plt.gca()

# Frame (spine) line widths: the left edge is slightly thicker than the rest.
for side, thickness in (('top', 0.1), ('bottom', 0.1), ('left', 0.2), ('right', 0.1)):
    ax.spines[side].set_linewidth(thickness)

# `data` is expected to be a pandas DataFrame with a '系统可接受性' column.
sns.boxplot(x=data['系统可接受性'], width=0.2)
plt.xticks(fontsize=16)
plt.title('系统可接受性的分布')
plt.ylabel('系统可接受性')

# Thin grey vertical guide lines; adjust the x positions as needed.
guide_positions = [0.5, 2, 3.5, 5.5, 8, 10]
for x in guide_positions:
    plt.axvline(x=x, color='gray', linestyle='-', linewidth=0.3)

# Write the figure to disk as PNG, then as TIFF.
for output_path, output_format in (
    ('系统可接受性的分布.png', 'png'),
    ('系统可接受性的分布.tiff', 'tiff'),
):
    plt.savefig(output_path, format=output_format, bbox_inches='tight')
plt.show()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment