bonifazi/Plink2PCA.R

## Plink2PCA.R
# First compute PCA files using Plink2 from the command line
# Refer to https://www.cog-genomics.org/plink/2.0/strat#pca for how PCA approximation is done
# As suggested in the link above, --maf is applied for variants below 0.01
# Plink2 command line is:
# plink2 --cow --bfile <mybfile> --keep-allele-order --maf 0.01 --pca approx

######### plot in R  ################
# Output: a 4 pages pdf file with: pages 1 to 3 are scatterplots of PCx vs PCx (with PCx from 1 to 3), page 4 is a 3D scatterplot of the first 3 PCs.
pacman::p_load(dplyr, data.table, ggplot2, scatterplot3d)
pca_res <- fread(file = "plink2.eigenvec")
group_info <- fread(file = "group.txt") # contains ID and grouping variable to differentiate plink samples IDs (e.g. IDs of different populations or different breeds).
#head(pca_res)
#head(group_info)

# attach group info to PCA ID results to add colours by group in plots
merged_res <- pca_res %>%
  left_join(group_info %>% select(id, group), by=c("IID"="id"))
# head(merged_res)

pdf("PCA_plot.pdf")
# PC1 vs PC2
merged_res %>%
  ggplot(aes(x=PC1, y=PC2, colour = group)) +
  geom_point(aes(colour = group))+
  xlab("PC1")+ylab("PC2")

# PC2 vs PC3
merged_res %>%
  ggplot(aes(x=PC2, y=PC3, colour = group)) +
  geom_point(aes(colour = group))+
  xlab("PC2")+ylab("PC3")

# PC1 vs PC3
merged_res %>%
  ggplot(aes(x=PC1, y=PC3, colour = group)) +
  geom_point(aes(colour = group))+
  xlab("PC1")+ylab("PC3")

# 3d plot
colors <- c("#999999", "#E69F00", "#56B4E9") # define as many colours as number of 'groups' in the dataset
colors_vec <- colors[as.factor(merged_res$group)]
scatterplot3d(merged_res[,3:5], pch = 16, color=colors_vec)
legend("bottom", legend = levels(as.factor(merged_res$group)),
       col =  colors, pch = 16,
       inset = -0.18, xpd = TRUE, horiz = TRUE)

dev.off()
	# First compute PCA files using Plink2 from the command line
	# Refer to https://www.cog-genomics.org/plink/2.0/strat#pca for how PCA approximation is done
	# As suggested in the link above, --maf is applied for variants below 0.01
	# Plink2 command line is:
	# plink2 --cow --bfile <mybfile> --keep-allele-order --maf 0.01 --pca approx

	######### plot in R ################
	# Output: a 4 pages pdf file with: pages 1 to 3 are scatterplots of PCx vs PCx (with PCx from 1 to 3), page 4 is a 3D scatterplot of the first 3 PCs.
	pacman::p_load(dplyr, data.table, ggplot2, scatterplot3d)
	pca_res <- fread(file = "plink2.eigenvec")
	group_info <- fread(file = "group.txt") # contains ID and grouping variable to differentiate plink samples IDs (e.g. IDs of different populations or different breeds).
	#head(pca_res)
	#head(group_info)

	# attach group info to PCA ID results to add colours by group in plots
	merged_res <- pca_res %>%
	left_join(group_info %>% select(id, group), by=c("IID"="id"))
	# head(merged_res)

	pdf("PCA_plot.pdf")
	# PC1 vs PC2
	merged_res %>%
	ggplot(aes(x=PC1, y=PC2, colour = group)) +
	geom_point(aes(colour = group))+
	xlab("PC1")+ylab("PC2")

	# PC2 vs PC3
	merged_res %>%
	ggplot(aes(x=PC2, y=PC3, colour = group)) +
	geom_point(aes(colour = group))+
	xlab("PC2")+ylab("PC3")

	# PC1 vs PC3
	merged_res %>%
	ggplot(aes(x=PC1, y=PC3, colour = group)) +
	geom_point(aes(colour = group))+
	xlab("PC1")+ylab("PC3")

	# 3d plot
	colors <- c("#999999", "#E69F00", "#56B4E9") # define as many colours as number of 'groups' in the dataset
	colors_vec <- colors[as.factor(merged_res$group)]
	scatterplot3d(merged_res[,3:5], pch = 16, color=colors_vec)
	legend("bottom", legend = levels(as.factor(merged_res$group)),
	col = colors, pch = 16,
	inset = -0.18, xpd = TRUE, horiz = TRUE)

	dev.off()