Last active
May 19, 2023 08:47
-
-
Save bonifazi/4d5850e334c4f6b81d950e0e00adbee2 to your computer and use it in GitHub Desktop.
PCA for large datasets with Plink2 --pca approx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First compute PCA files using Plink2 from the command line | |
# Refer to https://www.cog-genomics.org/plink/2.0/strat#pca for how PCA approximation is done | |
# As suggested in the link above, --maf is applied for variants below 0.01 | |
# Plink2 command line is: | |
# plink2 --cow --bfile <mybfile> --keep-allele-order --maf 0.01 --pca approx | |
######### plot in R ################ | |
# Output: a 4 pages pdf file with: pages 1 to 3 are scatterplots of PCx vs PCx (with PCx from 1 to 3), page 4 is a 3D scatterplot of the first 3 PCs. | |
pacman::p_load(dplyr, data.table, ggplot2, scatterplot3d) | |
pca_res <- fread(file = "plink2.eigenvec") | |
group_info <- fread(file = "group.txt") # contains ID and grouping variable to differentiate plink samples IDs (e.g. IDs of different populations or different breeds). | |
#head(pca_res) | |
#head(group_info) | |
# attach group info to PCA ID results to add colours by group in plots | |
merged_res <- pca_res %>% | |
left_join(group_info %>% select(id, group), by=c("IID"="id")) | |
# head(merged_res) | |
pdf("PCA_plot.pdf") | |
# PC1 vs PC2 | |
merged_res %>% | |
ggplot(aes(x=PC1, y=PC2, colour = group)) + | |
geom_point(aes(colour = group))+ | |
xlab("PC1")+ylab("PC2") | |
# PC2 vs PC3 | |
merged_res %>% | |
ggplot(aes(x=PC2, y=PC3, colour = group)) + | |
geom_point(aes(colour = group))+ | |
xlab("PC2")+ylab("PC3") | |
# PC1 vs PC3 | |
merged_res %>% | |
ggplot(aes(x=PC1, y=PC3, colour = group)) + | |
geom_point(aes(colour = group))+ | |
xlab("PC1")+ylab("PC3") | |
# 3d plot | |
colors <- c("#999999", "#E69F00", "#56B4E9") # define as many colours as number of 'groups' in the dataset | |
colors_vec <- colors[as.factor(merged_res$group)] | |
scatterplot3d(merged_res[,3:5], pch = 16, color=colors_vec) | |
legend("bottom", legend = levels(as.factor(merged_res$group)), | |
col = colors, pch = 16, | |
inset = -0.18, xpd = TRUE, horiz = TRUE) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment