Last active
June 22, 2024 07:38
-
-
Save zjrwtx/fcd1f56bbe64a9967d04da3476be5753 to your computer and use it in GitHub Desktop.
医学检验推荐项目的数据分析部分源码
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Read the survey responses from the Excel workbook.
data = pd.read_excel("../检验推荐系统调查数据.xlsx")

# Per-column mean, restricted to the numeric columns.
average_values = data.mean(numeric_only=True)

# Rating indicators shown in the chart, in display order.
columns = ["推荐准确性", "推荐相关性", "操作便捷性", "生成报告时间", "数据隐私保护", "系统可接受性"]

# Mean score of each selected indicator.
values = average_values[columns]

# Request the SimHei font family for all text in the figures.
plt.rcParams['font.family'] = ['SimHei']

# Narrow figure with one thin bar (width 0.3) per indicator.
plt.figure(figsize=(3.5, 4))
plt.bar(values.index, values.values, color='skyblue', width=0.3)

# Tick labels: x axis at 10 pt rotated by 45 degrees, y axis at 12 pt.
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=12)

# Axis titles at 10 pt, chart title at 12 pt.
plt.xlabel('指标', fontsize=10)
plt.ylabel('平均分', fontsize=10)
plt.title('推荐系统各项指标的平均分', fontsize=12)

# Fix the y axis to the range 0-10.
plt.ylim(0, 10)

# Write each mean, rounded to two decimals, slightly above its bar.
for bar_position, mean_score in enumerate(values.values):
    plt.text(bar_position, mean_score + 0.1, round(mean_score, 2), ha='center', fontsize=10)

# Export the chart once as PNG and once as TIFF.
for image_format in ('png', 'tiff'):
    plt.savefig(
        f'recommendation_system_average_scores.{image_format}',
        format=image_format,
        bbox_inches='tight',
    )

# Display the plot.
plt.tight_layout()
plt.show()
# In[3]: | |
import seaborn as sns | |
# Pairwise correlation matrix of the rating indicators.
correlation_matrix = data[columns].corr()

# Canvas for the heatmap.
plt.figure(figsize=(10, 8))

# Annotated heatmap: each cell prints its coefficient with two decimals,
# and the colour bar carries an explanatory label.
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': ".2f",
    'annot_kws': {"size": 22},
    'cbar': True,
    'cbar_kws': {'label': 'Correlation Coefficient'},
}
sns.heatmap(correlation_matrix, **heatmap_options)

# Title and tick-label styling.
plt.title('推荐系统各项指标的相关性分析', fontsize=20)
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(rotation=0, ha="right", fontsize=16)

# Export the heatmap once as PNG and once as TIFF.
for image_format in ('png', 'tiff'):
    plt.savefig(
        f'Heat map correlation analysis.{image_format}',
        format=image_format,
        bbox_inches='tight',
    )

# Show the plot.
plt.tight_layout()
plt.show()
# In[ ]: | |
# In[7]: | |
import pandas as pd | |
from scipy.stats import pearsonr | |
# Pairwise Pearson significance test between the rating indicators.
from itertools import combinations

# 1. Load the survey data.
file_path = '../检验推荐系统调查数据.xlsx'  # replace with the path to your Excel file
data = pd.read_excel(file_path)

# 2. Columns included in the correlation analysis.
columns = ['推荐准确性', '推荐相关性', '操作便捷性', '生成报告时间', '数据隐私保护', '系统可接受性']

# 3. Count missing values per column; if any exist, fill them with the
#    column mean before the tests are run.
missing_values = data[columns].isnull().sum()
if missing_values.any():
    data[columns] = data[columns].fillna(data[columns].mean())

# 4. Compute the p-value of the Pearson correlation for every unordered pair
#    of columns. combinations() yields each pair exactly once, in the same
#    order as iterating over the upper triangle of the column grid.
p_values = {}
for col1, col2 in combinations(columns, 2):
    _, p_value = pearsonr(data[col1], data[col2])
    p_values[(col1, col2)] = p_value

# 5. Print the p-values with three significant digits. A fixed two-decimal
#    format would show every value below 0.005 as "0.00", hiding how small
#    it actually is.
for (col1, col2), p_value in p_values.items():
    print(f"p值 between {col1} and {col2}: {p_value:.3g}")
# In[6]: | |
import pandas as pd | |
from scipy.stats import kruskal | |
from scipy.stats import pearsonr | |
# Location of the survey workbook, and the data loaded from it.
file_path = '../检验推荐系统调查数据.xlsx'
data = pd.read_excel(file_path)

# Names of the rating-indicator columns.
quantitative_columns = ['推荐准确性', '推荐相关性', '操作便捷性', '生成报告时间', '数据隐私保护', '系统可接受性']

# Work on the rating columns only for the summaries below.
ratings = data[quantitative_columns]

# Descriptive statistics of the ratings.
descriptive_stats = ratings.describe()
print("描述性统计信息:\n", descriptive_stats)

# Correlation matrix of the ratings.
correlation_matrix = ratings.corr()
print("\n相关性矩阵:\n", correlation_matrix)

# Overall satisfaction of each row = mean of that row's ratings.
data['overall_satisfaction'] = ratings.mean(axis=1)
# 检查异常值并删除它们 | |
def find_and_remove_outliers(data, columns):
    """Return a copy of *data* without the rows that hold an IQR outlier.

    A value is an outlier for its column when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the
    column's 25th/75th percentiles. A row is removed as soon as it holds an
    outlier in at least one of *columns*. Missing values never compare as
    outliers, so rows are not removed because of them.

    Args:
        data: DataFrame to filter; it is not modified.
        columns: Names of the numeric columns to check.

    Returns:
        A new DataFrame containing only the rows without outliers.
    """
    # Rows are tracked with a boolean mask rather than by collecting the
    # outlying values: de-duplicating the values would make two rows sharing
    # the same outlying value collapse into one, leaving the second row in.
    is_outlier_row = pd.Series(False, index=data.index)
    for col in columns:
        q25 = data[col].quantile(0.25)
        q75 = data[col].quantile(0.75)
        iqr = q75 - q25
        lower_bound = q25 - 1.5 * iqr
        upper_bound = q75 + 1.5 * iqr
        column_outliers = (data[col] < lower_bound) | (data[col] > upper_bound)
        is_outlier_row = is_outlier_row | column_outliers
    return data[~is_outlier_row]
# Drop every row that holds an outlier in one of the rating columns.
data_cleaned = find_and_remove_outliers(data, quantitative_columns)

# Kruskal-Wallis H test across the rating indicators, one sample per column
# (passed in the order given by quantitative_columns).
kruskal_results_cleaned = kruskal(
    *[data_cleaned[indicator] for indicator in quantitative_columns]
)
print("\nKruskal-Wallis H检验结果:\n", kruskal_results_cleaned)

# Pearson correlation coefficient between overall satisfaction and each
# indicator; the accompanying p-value is discarded.
pearson_results = {}
for col in quantitative_columns:
    coefficient, _ = pearsonr(data_cleaned['overall_satisfaction'], data_cleaned[col])
    pearson_results[col] = coefficient
print("\nPearson相关系数检验结果:\n", pearson_results)
# In[3]: | |
# Most common answers in the "系统改进建议" (improvement suggestions) column.

# Strip full stops (Chinese "。" and ASCII ".") BEFORE counting, so answers
# that differ only in that punctuation are tallied as the same suggestion.
# Counting first would leave the cleanup with no effect on the chart.
data["系统改进建议"] = data["系统改进建议"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统改进建议"].value_counts()

# Answer frequencies of the workflow-impact and medical-quality-impact columns.
workflow_impact = data["系统对工作流程的影响"].value_counts()
quality_impact = data["系统对医疗质量的影响"].value_counts()

# Horizontal bar chart of the ten most frequent suggestions.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='skyblue')

# Tick labels, axis titles and chart title all use 16 pt text.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统改进建议', fontsize=16)
plt.title('最常见的10条系统改进建议', fontsize=16)

# Invert the y axis so the most frequent suggestion is at the top.
plt.gca().invert_yaxis()

# Save the plot as a PNG file and as a TIFF file.
plt.savefig('最常见的10条系统改进建议.png', format='png', bbox_inches='tight')
plt.savefig('最常见的10条系统改进建议.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[10]: | |
# Most common answers in the "系统对工作流程的影响" (impact on workflow) column.

# Strip full stops (Chinese "。" and ASCII ".") BEFORE counting, so answers
# that differ only in that punctuation are tallied together. Counting first
# would leave the cleanup with no effect on the chart.
data["系统对工作流程的影响"] = data["系统对工作流程的影响"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统对工作流程的影响"].value_counts()

# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightcoral')

# Tick labels, axis titles and chart title all use 16 pt text.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对工作流程的影响', fontsize=16)
plt.title('最常见的10种系统对工作流程的影响', fontsize=16)

# Invert the y axis so the most frequent answer is at the top.
plt.gca().invert_yaxis()

# Save the plot as a PNG file and as a TIFF file.
plt.savefig('系统对工作流程的影响.png', format='png', bbox_inches='tight')
plt.savefig('系统对工作流程的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[4]: | |
# Most common answers in the "系统对医疗质量的影响" (impact on medical quality) column.

# Strip full stops (Chinese "。" and ASCII ".") BEFORE counting, so answers
# that differ only in that punctuation are tallied together. Counting first
# would leave the cleanup with no effect on the chart.
data["系统对医疗质量的影响"] = data["系统对医疗质量的影响"].str.replace(r'[。.]', '', regex=True)
suggestions = data["系统对医疗质量的影响"].value_counts()

# Horizontal bar chart of the ten most frequent answers.
top_suggestions = suggestions.head(10)
plt.figure(figsize=(6, 6))
top_suggestions.plot(kind='barh', color='lightgreen')

# Tick labels, axis titles and chart title all use 16 pt text.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('出现次数', fontsize=16)
plt.ylabel('系统对医疗质量的影响', fontsize=16)
plt.title('最常见的10种系统对医疗质量的影响', fontsize=16)

# Invert the y axis so the most frequent answer is at the top.
plt.gca().invert_yaxis()

# Save under this chart's own name. Reusing the workflow-impact file names
# here would overwrite the workflow chart on disk.
plt.savefig('系统对医疗质量的影响.png', format='png', bbox_inches='tight')
plt.savefig('系统对医疗质量的影响.tiff', format='tiff', bbox_inches='tight')
plt.show()
# In[10]: | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# One horizontal boxplot per rating indicator, each saved as PNG and TIFF.

# Use a font that can render Chinese text, e.g. "SimHei".
plt.rcParams['font.sans-serif'] = ['SimHei']
# Default text size for the figures.
plt.rcParams['font.size'] = 16

# (column to plot, y-axis label) pairs. The label equals the column name
# except for the report-time indicator, which carries a longer label.
boxplot_specs = [
    ('推荐准确性', '推荐准确性'),
    ('推荐相关性', '推荐相关性'),
    ('操作便捷性', '操作便捷性'),
    ('生成报告时间', '生成报告时间的总时间'),
    ('数据隐私保护', '数据隐私保护'),
]

for score_column, y_label in boxplot_specs:
    # Open a fresh 6x4 figure for EVERY plot. Creating it only once would
    # size just the first plot; the later ones would be drawn on implicit
    # default-size figures after the preceding plt.show().
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=data[score_column], width=0.2)
    plt.xticks(fontsize=16)
    plt.title(f'{score_column}的分布')
    plt.ylabel(y_label)
    # Save the plot as a PNG file and as a TIFF file.
    plt.savefig(f'{score_column}的分布.png', format='png', bbox_inches='tight')
    plt.savefig(f'{score_column}的分布.tiff', format='tiff', bbox_inches='tight')
    plt.show()
# In[11]: | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Boxplot of the "系统可接受性" (system acceptability) ratings.

# Use a font that can render Chinese text, e.g. "SimHei", at 16 pt by default.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.size'] = 16

# Create the figure and draw a horizontal boxplot of the column.
plt.figure(figsize=(6, 4))
acceptability_scores = data['系统可接受性']
sns.boxplot(x=acceptability_scores, width=0.2)

plt.xticks(fontsize=16)
plt.title('系统可接受性的分布')
plt.ylabel('系统可接受性')

# Export the plot once as PNG and once as TIFF.
for image_format in ('png', 'tiff'):
    plt.savefig(f'系统可接受性的分布.{image_format}', format=image_format, bbox_inches='tight')

plt.show()
# In[12]: | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Boxplot of the "系统可接受性" ratings with thin borders and vertical guide lines.

# Use a font that can render Chinese text, e.g. "SimHei", at 16 pt by default.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.size'] = 16

# Create the figure and grab its current axes.
plt.figure(figsize=(6, 4))
ax = plt.gca()

# Thin out the axes border; the left spine is slightly thicker than the rest.
spine_widths = {'top': 0.1, 'bottom': 0.1, 'left': 0.2, 'right': 0.1}
for side, line_width in spine_widths.items():
    ax.spines[side].set_linewidth(line_width)

# data is expected to be a pandas DataFrame containing the '系统可接受性' column.
sns.boxplot(x=data['系统可接受性'], width=0.2)
plt.xticks(fontsize=16)
plt.title('系统可接受性的分布')
plt.ylabel('系统可接受性')

# Thin grey vertical guide lines; adjust the x positions as needed.
guide_positions = [0.5, 2, 3.5, 5.5, 8, 10]
for x_position in guide_positions:
    plt.axvline(x=x_position, color='gray', linestyle='-', linewidth=0.3)

# Export the plot once as PNG and once as TIFF.
for image_format in ('png', 'tiff'):
    plt.savefig(f'系统可接受性的分布.{image_format}', format=image_format, bbox_inches='tight')

plt.show()
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment