Skip to content

Instantly share code, notes, and snippets.

@samwalrus
Last active September 16, 2020 12:38
Show Gist options
  • Save samwalrus/aac96371de8f5c4e70d0de53e60040ce to your computer and use it in GitHub Desktop.
Save samwalrus/aac96371de8f5c4e70d0de53e60040ce to your computer and use it in GitHub Desktop.
Bash script and AWK script for creating genetic principle components including removing long range LD regions
($1 == 1) && ($4 >= 48227413) && ($4 <= 52227412) {print $2}
($1 == 2) && ($4 >= 86000000) && ($4 <= 100500000) {print $2}
($1 == 2) && ($4 >= 183291755) && ($4 <= 190291755) {print $2}
($1 == 3) && ($4 >= 47524996) && ($4 <= 50024996) {print $2}
($1 == 3) && ($4 >= 83417310) && ($4 <= 86917310) {print $2}
($1 == 5) && ($4 >= 128972101) && ($4 <= 131972101) {print $2}
($1 == 5) && ($4 >= 44500000) && ($4 <= 50500000) {print $2}
($1 == 6) && ($4 >= 57000000) && ($4 <= 64000000) {print $2}
($1 == 6) && ($4 >= 25392021) && ($4 <= 33392022) {print $2}
($1 == 6) && ($4 >= 139958307) && ($4 <= 142458307) {print $2}
($1 == 7) && ($4 >= 55000000) && ($4 <= 66000000) {print $2}
($1 == 8) && ($4 >= 7962590) && ($4 <= 11962591) {print $2}
($1 == 8) && ($4 >= 111930824) && ($4 <= 114930824) {print $2}
($1 == 8) && ($4 >= 43000000) && ($4 <= 50000000) {print $2}
($1 == 10) && ($4 >= 37000000) && ($4 <= 43000000) {print $2}
($1 == 11) && ($4 >= 87860352) && ($4 <= 90860352) {print $2}
($1 == 12) && ($4 >= 33000000) && ($4 <= 40000000) {print $2}
($1 == 20) && ($4 >= 32536339) && ($4 <= 35066586) {print $2}
($1 == 1) && ($4 >= 48287981) && ($4 <= 52287979) {print $2}
($1 == 2) && ($4 >= 86088343) && ($4 <= 101041482) {print $2}
($1 == 2) && ($4 >= 134666269) && ($4 <= 138166268) {print $2}
($1 == 2) && ($4 >= 183174495) && ($4 <= 190174494) {print $2}
($1 == 3) && ($4 >= 47524997) && ($4 <= 50024996) {print $2}
($1 == 3) && ($4 >= 83417311) && ($4 <= 86917310) {print $2}
($1 == 3) && ($4 >= 88917311) && ($4 <= 96017310) {print $2}
($1 == 5) && ($4 >= 44464244) && ($4 <= 50464243) {print $2}
($1 == 5) && ($4 >= 97972101) && ($4 <= 100472101) {print $2}
($1 == 5) && ($4 >= 128972102) && ($4 <= 131972101) {print $2}
($1 == 5) && ($4 >= 135472102) && ($4 <= 138472101) {print $2}
($1 == 6) && ($4 >= 25392022) && ($4 <= 33392022) {print $2}
($1 == 6) && ($4 >= 56892042) && ($4 <= 63942041) {print $2}
($1 == 6) && ($4 >= 139958308) && ($4 <= 142458307) {print $2}
($1 == 7) && ($4 >= 55225792) && ($4 <= 66555850) {print $2}
($1 == 8) && ($4 >= 7962591) && ($4 <= 11962591) {print $2}
($1 == 8) && ($4 >= 42880844) && ($4 <= 49837447) {print $2}
($1 == 8) && ($4 >= 111930825) && ($4 <= 114930824) {print $2}
($1 == 10) && ($4 >= 36959995) && ($4 <= 43679994) {print $2}
($1 == 11) && ($4 >= 46043425) && ($4 <= 57243424) {print $2}
($1 == 11) && ($4 >= 87860353) && ($4 <= 90860352) {print $2}
($1 == 12) && ($4 >= 33108734) && ($4 <= 41713733) {print $2}
($1 == 12) && ($4 >= 111037281) && ($4 <= 113537280) {print $2}
($1 == 20) && ($4 >= 32536340) && ($4 <= 35066586) {print $2}
#!/bin/bash
# This script will generate two sets of PCs, one for mothers and one for children.
# It only generates the PCs for a set of unrelated samples.
# Point to where the best guess plink versions of the ALSPAC data are
# Expect that there are mothers and children in the same plink dataset
data="/path/to/data/directory"
# Path to plink2 or plink1.9 binary
plink="/path/to/plink"
# Path to where output files will be stored
output="/path/to/output"
# Generate GRM
mkdir -p ${output}/kinships
${plink} --bfile ${data} --make-grm-bin --maf 0.01 --out ${output}/kinships/data
# Get Mother and children ID lists
awk '{ print $1, $2 }' ${data}.fam | grep "M" > ${output}/mothers.txt
awk '{ print $1, $2 }' ${data}.fam | grep -v "M" > ${output}/children.txt
# Get unrelateds
mkdir -p ${output}/unrelated_ids
plink --grm-bin ${output}/kinships/data --keep ${output}/mothers.txt --grm-cutoff 0.05 --out ${output}/unrelated_ids/mothers
plink --grm-bin ${output}/kinships/data --remove ${output}/mothers.txt --grm-cutoff 0.05 --out ${output}/unrelated_ids/children
# Get snp list with no long range LD regions
awk -f ld.awk ${data}.bim > nold.txt
# Get independent SNPs excluding any long range LD regions
${plink} --bfile ${data} --exclude nold.txt --indep 100 5 1.01 --out indep
# Calculate PCs for unrelateds
mkdir -p ${output}/principal_components
${plink} --bfile ${data} --keep ${output}/unrelated_ids/mothers/mothers.rel.id --extract indep.prune.in --pca 20 --out ${output}/principal_components/mothers
${plink} --bfile ${data} --keep ${output}/unrelated_ids/children/children.rel.id --extract indep.prune.in --pca 20 --out ${output}/principal_components/children
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment