David Obenshain obenshaindw

## Split VCF by Chromosome
# Run vcftools once per autosome (1-22; X/Y/MT are not covered), at most four
# jobs in parallel, using /split_by_chr/largevcf.chr<N> as the output prefix.
# -I {} already makes xargs run one command per input line, so -n1 is omitted:
# POSIX declares -n and -I mutually exclusive, and GNU xargs prints a warning
# and ignores -n when both are given.
seq 1 22 | xargs -P4 -I {} /usr/bin/vcftools/vcftools_0.1.11/bin/vcftools --gzvcf largevcf.vcf.gz --chr {} --recode --recode-INFO-all --out /split_by_chr/largevcf.chr{}

## Fix Chromosome Name in a VCF
# Strip the "chr" prefix only where it names a contig: at the start of a data
# record (the CHROM column) and in the ##contig header IDs. An unanchored
# s/chr//g would also remove "chr" from INFO values, sample names and header
# descriptions (e.g. "chromosome" -> "omosome").
# NOTE: contig names embedded elsewhere (e.g. breakend ALT strings) are left
# untouched by this filter.
/usr/bin/htslib/bcftools/bcftools view vcf_with_chr.vcf \
  | sed -e 's/^chr//' -e 's/^\(##contig=<ID=\)chr/\1/' \
  | /usr/bin/htslib/htslib/bgzip -c > BCM_hg19.reheader.no_chr.vcf.gz

## Reheader a VCF file
# Step 1: dump only the header. Lower-case -h means "header only"; upper-case
# -H is the opposite (records without the header) and would leave nothing to edit.
/usr/bin/htslib/bcftools/bcftools view -h vcf_with_bad_header.vcf > vcf_header.vcf
# Step 2: edit the header by hand (interactive).
vim vcf_header.vcf
#Make changes to header
# Step 3: write a copy of the VCF that carries the edited header. The header
# file name must match the one written in step 1.
/usr/bin/htslib/bcftools/bcftools reheader -h vcf_header.vcf -o reheadered.vcf vcf_with_bad_header.vcf

## Add dbSNP IDs to a VCF file
# Option 1 - GATK VariantAnnotator: slower; keeps the original ID plus the dbSNP rsID.
#   -R       reference FASTA
#   -V       VCF file to add IDs to
#   --dbsnp  dbSNP VCF (download from the NCBI FTP site)
java -jar GenomeAnalysisTK.jar \
    -R /reference/Homo_sapiens_assembly19.fasta \
    -T VariantAnnotator \
    -V vcf_to_add_id_to.vcf \
    --dbsnp /reference/dbsnp_137.b37.vcf.gz \
    --out /data/Broad.chr1.annotated.vcf

# Option 2 - bcftools annotate: faster; replaces the existing ID with the dbSNP rsID.
# No output file is given, so the annotated VCF goes to standard output.
/usr/bin/htslib/bcftools/bcftools annotate \
    --annotations /reference/dbsnp_137.b37.vcf.gz \
    --columns ID \
    vcf_to_add_id_to.vcf

## Stream VCF from S3
#!/usr/bin/bash
#
# make_gz.sh
#

# Call this script with a list of s3 locations with VCF files to parse
# aws --profile NDAR s3 ls s3://S3_URL/ | awk '{print $4}' | xargs -n1 -P4 sh make_gz.sh
# xargs -n1 -P4 accepts one argument and runs 4 parallel processes
#

## extract-genotypes.pl
use strict;
use warnings;
use Vcf;

# Path of the VCF file to read, taken from the first command-line argument.
my $filename = $ARGV[0];
die "Usage: $0 <file.vcf>\n" unless defined $filename;

# Fail loudly if the file cannot be opened; otherwise Vcf->new would be handed
# an unopened handle and the error would surface far from its cause.
open(my $handle, '<', $filename)
    or die "Cannot open '$filename' for reading: $!\n";

# Wrap the handle in a Vcf reader and consume the header before iterating.
my $vcf = Vcf->new(fh => $handle);
$vcf->parse_header();

# NOTE(review): vcf_iterate is not defined in the visible part of this script;
# presumably it is declared further down and walks the records of $vcf.
vcf_iterate();

## Zip files in s3
# Download one S3 "folder" ($1, a URL ending in /<digits>/), zip it under the
# numeric folder name, then remove the local download.
echo "$1"
# Use grep REGEX to extract portion of s3 URL to reuse as zip file name.
folder=$(printf '%s\n' "$1" | grep -Eio '\/([0-9]+)\/$' | grep -Eio '([0-9]+)')

# Refuse to continue without a folder name: with an empty $folder the cleanup
# below would expand to "rm -rf .//" and wipe the current directory.
if [ -z "$folder" ]; then
  echo "Could not derive a numeric folder name from '$1' (expected a URL ending in /<digits>/)" >&2
  exit 1
fi

# -p: do not fail when the directory is left over from an earlier run.
mkdir -p "./$folder"

echo s3cmd get --recursive "$1" "./$folder"
s3cmd get --recursive "$1" "./$folder" || exit 1

echo zip -r "$folder" "./$folder"/*
# Stop before the cleanup if zipping failed, so the download is not thrown away.
zip -r "$folder" "./$folder"/* || exit 1

echo rm -rf "./$folder/"
rm -rf "./$folder/"

## gist:bb6c2b4cf2aa7028813a
#!/bin/bash
# Pass in s3 URL=$1

# Set up Pathing
    ## File name: whatever follows the last "/" of the URL.
    fname="${1##*/}"
    ## Path: first remove the scheme, i.e. everything through the first "//" ...
    pname="${1#*//}"
    ## ... then remove the bucket name (NDAR_Central*, NDAR_Results, etc.),
    ## i.e. everything through the next "/".
    pname="${pname#*/}"

## refresh_nda_token.sh
#!/bin/bash
## NDA AWS Token Generator
## Author: NIMH Data Archives
##         http://ndar.nih.gov
## License: MIT
##          https://opensource.org/licenses/MIT

##############################################################################
#
# Script to retrieve generated AWS Tokens from NIMHDA

## mff-zipper.sh
#!/bin/bash

# Directory holding the *.mff.zip entries; empty means the current directory.
MFF_DIRECTORY=$1

# Glob prefix: empty when no directory was given, otherwise the directory with
# exactly one trailing slash, so "/data/mff" and "/data/mff/" both work.
mff_prefix=${MFF_DIRECTORY:+${MFF_DIRECTORY%/}/}

# Rename every "<name>.mff.zip" to "<name>.mff".
for mffzip in "$mff_prefix"*.mff.zip; do
  # An unmatched glob stays as the literal pattern; skip it instead of
  # handing mv a path that does not exist.
  [ -e "$mffzip" ] || continue
  echo "Renaming $mffzip directories to just ${mffzip%.zip}"
  # "--" keeps names that begin with "-" from being parsed as options.
  mv -- "$mffzip" "${mffzip%.zip}"
done

for mff in *.mff; do
	/usr/bin/htslib/bcftools view -H vcf_with_bad_header.vcf > vcf_header.vcf
	vim vcf_header.vcf
	#Make changes to header
	/usr/bin/htslib/bcftools/bcftools reheader -h vcf_header vcf_with_bad_header.vcf -o reheadered.vcf
	#GATK Method <- Slower and keeps original ID plut dbSNP rsID
	# R=Reference FASTA
	# V=VCF file to add IDs to
	# --dbsnp = dbsnp VCF -- download from NCBI FTP

	java -jar GenomeAnalysisTK.jar -R /reference/Homo_sapiens_assembly19.fasta -T VariantAnnotator -V vcf_to_add_id_to.vcf --dbsnp /reference/dbsnp_137.b37.vcf.gz --out /data/Broad.chr1.annotated.vcf

	#bcftools Method <- Faster, replaces existing ID with dbSNP rsID
	/usr/bin/htslib/bcftools/bcftools annotate -a /reference/dbsnp_137.b37.vcf.gz -c ID vcf_to_add_id_to.vcf
	#!/usr/bin/bash
	#
	# make_gz.sh
	#

	# Call this script with a list of s3 locations with VCF files to parse
	# aws --profile NDAR s3 ls s3:/S3_URL/ \| awk '{print $4}' \| xargs -n1 -P4 sh make_gz.sh
	# xargs -n1 -P4 accepts one argument and runs 4 parallel processes
	#
	use strict;
	use warnings;
	use Vcf;

	my $filename = $ARGV[0];

	open ( my $handle, "<", $filename);
	my $vcf = Vcf->new(fh=>$handle);
	$vcf->parse_header();
	vcf_iterate();
	echo $1
	# Use grep REGEX to extract portion of s3 URL to reuse as zip file name.
	folder=`echo $1 \| grep -Eio '\/([0-9]+)\/$' \| grep -Eio '([0-9]+)'`
	mkdir ./$folder
	echo s3cmd get --recursive $1 ./$folder
	s3cmd get --recursive $1 ./$folder
	echo zip -r $folder ./$folder/*
	zip -r $folder ./$folder/*
	echo rm -rf ./$folder/
	rm -rf ./$folder/
	#!/bin/bash
	# Pass in s3 URL=$1

	# Set up Pathing
	## Drop s3://
	pname=${1#*//}
	## Drop Bucket Name, i.e., NDAR_Central*, NDAR_Results, etc.
	pname=${pname#*/}
	## Get text after last /
	fname=${1##*/}
	#!/bin/bash
	## NDA AWS Token Generator
	## Author: NIMH Data Archives
	## http://ndar.nih.gov
	## License: MIT
	## https://opensource.org/licenses/MIT

	##############################################################################
	#
	# Script to retrieve generated AWS Tokens from NIMHDA
	#!/bin/bash

	MFF_DIRECTORY=$1

	for mffzip in "$MFF_DIRECTORY"*.mff.zip; do
	echo "Renaming $mffzip directories to just ${mffzip%.zip}"
	mv "$mffzip" "${mffzip%.zip}";
	done

	for mff in *.mff; do