Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created November 12, 2010 19:51
Show Gist options
  • Save stephenturner/674574 to your computer and use it in GitHub Desktop.
Save stephenturner/674574 to your computer and use it in GitHub Desktop.
impute_step0.sh
#!/bin/bash
# impute_step0.sh -- step 0 of the imputation procedure.
# Splits files and creates a random subset for step 1 of imputation.
# Adapted from an original script written by Jacki Buros.
#
# Arguments:
#   1) plink bed source file (basename, without extension)
#   2) prefix for output files
#
# Exit status:
#   65 (E_BADARGS) when either argument is missing or empty.

# Check number of arguments. Both are mandatory: the prefix ($2) is used to
# name every output file and the scratch directory, so it must not be empty.
E_BADARGS=65
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  # Usage goes to stderr so it never mixes with regular progress output.
  echo "Usage: $(basename "$0") <file basename> <output prefix> " >&2
  exit $E_BADARGS
fi
# Directory the script was started from.
orig_dir=$(pwd)
# Second argument: prefix used in output file names and in the scratch path.
prefix="$2"
# ---- settings ------
# Plink bed source file (used in step0 script)
SRCDIR="$(pwd)"
PLINK="$1"
# store output here
OUTDIR="${orig_dir}"
# num individuals per group in step 1 impute
SUBSETSIZE=300
# num individuals (total) per subset in step 2 impute
STEP2SIZE=200
# paths
# PLINKBIN holds the executable AND its fixed options, so it has to be
# expanded unquoted wherever it is invoked.
PLINKBIN="/usr/local/plink/plink --nonfounders --noweb" #CHANGE
# NOTE(review): GAWKBIN and TARBIN are not referenced by the visible part of
# the script, which calls gawk and tar by bare name -- confirm before removing.
GAWKBIN="/usr/bin/gawk" #CHANGE
TARBIN="/bin/tar" #CHANGE
# SUBSET and COMPLETE prefixes
COMPLETE="_${prefix}_complete" # name of plink files containing complete bed files where parents are set to 0 0
SUBSET="_${prefix}_subset" # name of plink files containing subsets of the above files
GROUP="_${prefix}_group" # prefix for per-group id lists
USERNAME=$(whoami)
# prepare dirs for output & scratch; stop right away if either cannot be
# created, because every later step writes into them.
mkdir -p "${OUTDIR}" || {
  echo "Cannot create output directory ${OUTDIR}" >&2
  exit 1
}
scratch="/scratch/${USERNAME}/${prefix}_impute" #CHANGE???
mkdir -p "${scratch}" || {
  echo "Cannot create scratch directory ${scratch}" >&2
  exit 1
}
# If a text pedigree file (${PLINK}.ped) was passed, convert it to a PLINK
# binary file set first. $PLINKBIN is intentionally unquoted: it carries the
# command plus its fixed options.
if [[ -f "${PLINK}.ped" ]]; then
  echo "Creating PLINK binary files"
  $PLINKBIN --file "${PLINK}" --map3 --allow-no-sex --make-bed --out "${PLINK}" || {
    echo "PLINK failed to convert ${PLINK}.ped to binary format" >&2
    exit 1
  }
fi
echo "$(date) | Copying source files in $SRCDIR to ${scratch}"
cd "${SRCDIR}" || exit 1
cp -a --dereference "${PLINK}".* "${scratch}" || {
  echo "Unable to copy ${PLINK}.* to ${scratch}" >&2
  exit 1
}
# Do all remaining work on the scratch copies. Without this cd the removal
# of ${PLINK}.* below (and the later *.ped / *.map cleanup) would delete the
# original source files instead of the copies.
cd "${scratch}" || exit 1
# Inside scratch the copies sit directly in the directory, so address them
# by their file name without any leading directory component.
plink_base="${PLINK##*/}"
echo "$(date) | Preparing base PLINK bed file (named ${COMPLETE})"
# Pull family id and individual id out of the fam file and zero both parent
# columns.
gawk '{print $1,$2,"0","0"}' "${plink_base}.fam" > _update_parents
# Rewrite the parental information with zeros and store the result as the
# COMPLETE file set.
$PLINKBIN --bfile "${plink_base}" --update-parents _update_parents --set-hh-missing --allow-no-sex --make-bed --out "$COMPLETE" || {
  echo "PLINK failed to create ${COMPLETE}" >&2
  exit 1
}
# Keep a copy of the input fam file (read again when drawing the random
# subset), then drop the scratch copies of the source file set.
FAMFILE="temp.${plink_base}.fam"
cp "${plink_base}.fam" "${FAMFILE}"
rm -- "${plink_base}".*
echo "$(date) | Preparing subset PLINK bed file (named ${SUBSET}) to be used in Step 1"
# Randomly select SUBSETSIZE individuals (family id + individual id) from the
# fam file for use in model estimation. gawk splits on any whitespace, so
# both space- and tab-delimited fam files work, and ids are never subject to
# shell word splitting or globbing.
gawk '{print $1, $2}' "$FAMFILE" | shuf -n "$SUBSETSIZE" > subset.iids
# $PLINKBIN is intentionally unquoted: it carries the command plus options.
$PLINKBIN --bfile "${COMPLETE}" --keep subset.iids --make-bed --out "${SUBSET}" || {
  echo "PLINK failed to create ${SUBSET}" >&2
  exit 1
}
# Remove intermediate files; -f keeps this quiet when a pattern matches
# nothing (e.g. no .hh file was written).
rm -f -- *.ped *.map *.log *.hh
echo "$(date) | Preparing list of ids per subset (named ${GROUP}*) to be used in step 2"
# One "family-id individual-id" line per individual in the COMPLETE set ...
gawk '{print $1,$2}' "${COMPLETE}.fam" > _idlist
# ... split into numbered chunks of STEP2SIZE lines, named ${GROUP}NN.
split -d -l "$STEP2SIZE" _idlist "$GROUP"
echo "$(date) | Preparation complete; copy files to $OUTDIR"
# Only delete the group lists and move the archive once tar has succeeded;
# otherwise everything is left in place so nothing is lost.
if tar cfz "${prefix}_step0.tar.gz" "${COMPLETE}".* "${SUBSET}".* "${GROUP}"*; then
  rm -f -- "${GROUP}"*
  mv -- "${prefix}_step0.tar.gz" "${OUTDIR}"
else
  echo "Unable to create ${prefix}_step0.tar.gz; leaving files in $(pwd)" >&2
fi
echo "$(date) | copy remaining files to $OUTDIR/rsync & clean up"
mkdir -p "${OUTDIR}/rsync"
# Exit status reported at the end: non-zero when the sync failed.
sync_status=0
# Let's clean up -- but only when everything left in scratch was copied.
if rsync -avz "${scratch}/" "${OUTDIR}/rsync/"; then
  # Leave the scratch directory before deleting it.
  cd "${orig_dir}" || cd /
  # ${scratch:?} aborts instead of expanding to nothing if the variable is
  # ever empty or unset.
  rm -rf -- "${scratch:?}"
else
  sync_status=1
  echo "Unable to sync remaining files." >&2
  echo "Please ssh to $(hostname) and" >&2
  echo "look at the content of ${scratch}" >&2
fi
echo "=========================================================="
echo "Finished on : $(date)"
echo "=========================================================="
exit "$sync_status"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment