Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Last active October 21, 2020 17:19
Show Gist options
  • Save aaronwolen/17cec4a4862280ca4fcc6131c633b31b to your computer and use it in GitHub Desktop.
Save aaronwolen/17cec4a4862280ca4fcc6131c633b31b to your computer and use it in GitHub Desktop.
Shell script to run ingest/exports with different versions of tiledb-vcf
#!/usr/bin/env bash
# variables
#############

# tiledbvcf binary and scratch space
tilevcf="dist/bin/tiledbvcf"
tmpdir="/mnt/data/tmp"
samplefile="${tmpdir}/samples.txt"

# output locations (exports and logs live under output_dir)
output_dir="data"
export_dir="${output_dir}/exports"
log_dir="${output_dir}/logs"

# version label used for dist_<version>, the array path and log/export names.
# Other labels previously toggled here:
#   v3_211, v4_211, v4_sample-string-id_211
version="v3_release_208"

# array destination: local or remote
dest=local
array_bucket="s3://genomic-datasets/vcf-samples-20"
array_filepath="/mnt/data/test-vcf-samples-20-arrays"

# regions: raw input bedfile and the filtered copy written by this script
bedfile_raw="libtiledbvcf/test/inputs/E001_15_coreMarks_dense.bed"
bedfile="${output_dir}/E001_15_coreMarks_dense_filtered.bed"

# comma-separated sample names passed to the export step
export_samples="v2-DjrIAzkP,v2-YMaDHIoW,v2-usVwJUmo,v2-ZVudhauk"
# setup directories
#####################
# Create each output/scratch directory (and any missing parents) up front.
for dir in "$export_dir" "$log_dir" "$tmpdir"; do
  mkdir -p "$dir"
done
# filter bedfile
######################
# Writes a filtered copy of $bedfile_raw to $bedfile: keeps rows whose first
# column compares <= 3 and whose fourth column is exactly "7_Enh", and
# prefixes the first column with "chr". Fields are tab-separated in and out.

# Fail early: without the input, awk would leave an empty regions file behind
# and the rest of the script would carry on with it.
if [[ ! -r "$bedfile_raw" ]]; then
  printf 'error: cannot read raw bedfile: %s\n' "$bedfile_raw" >&2
  exit 1
fi

awk -F'\t' '
  BEGIN { OFS = "\t" }
  $1 <= 3 && $4 == "7_Enh" { $1 = "chr" $1; print }
' "$bedfile_raw" > "$bedfile"

echo "Bedfile:"
# Report the number of regions kept (quoted path, no echo/word-splitting).
wc -l "$bedfile"
# link version specific binary/lib
#####################################
# Makes `dist` a symlink to dist_<version> so that $tilevcf (dist/bin/...)
# resolves to the selected build, then records a build identifier in $commit.
dist_target="dist_${version}"

if [[ -L "dist" ]]; then
  # Test for a symlink FIRST: `-d` follows symlinks, so a link left over from
  # a previous run would otherwise be treated as a real directory and moved.
  echo "Removing dist/ symbolic link"
  rm "dist"
elif [[ -d "dist" ]]; then
  # A real dist/ directory: rename it to include the version.
  if [[ -e "$dist_target" ]]; then
    # mv would silently nest dist/ inside the existing directory.
    printf 'error: %s already exists; refusing to move dist/ into it\n' \
      "$dist_target" >&2
    exit 1
  fi
  echo "Moving dist/ directory"
  mv --verbose dist "$dist_target"
fi

ln -s "$dist_target" dist
ls -al dist

"$tilevcf" version
# Third space-separated field of the first line of the version output
# (presumably the commit hash; it is used in log/export file names).
commit=$("$tilevcf" version | head -n1 | cut -d' ' -f3)
echo "$commit"
# set array destination
#########################
# Pick the array URI and the directory holding the input BCFs.
# "remote" targets the bucket; any other value of $dest uses local paths.
case "$dest" in
  remote)
    uri="${array_bucket}/${version}/vcf-samples-20"
    bcf_dir="$array_bucket/bcfs"
    ;;
  *)
    uri="$array_filepath/$version/vcf-samples-20"
    bcf_dir="/mnt/data/genomic-datasets/vcf-samples-20"
    ;;
esac
printf 'Array URI is: %s\n' "$uri"
# setup
##########
# Empties the scratch directory, deletes any existing array at $uri, and
# writes the list of input BCFs (one per line) to $samplefile.

# ${tmpdir:?} aborts rather than expanding to "/*" if tmpdir is ever empty.
if [[ -d "$tmpdir" ]]; then rm -rf -- "${tmpdir:?}"/*; fi

# create samples files
if [[ "$uri" == "s3://"* ]]; then
  echo "Deleting existing remote array"
  aws s3 rm --recursive "$uri"
  # List objects under the BCF prefix (trailing slash lists the prefix's
  # contents rather than the prefix itself) and turn each *.bcf key into a
  # full URI. The list goes to $samplefile, which later steps read.
  aws s3 ls "${bcf_dir}/" \
    | awk -v prefix="${bcf_dir}/" '$4 ~ /bcf$/ { print prefix $4 }' \
      > "$samplefile"
else
  echo "Deleting existing local array"
  mkdir -p "$(dirname "$uri")"
  if [[ -d "$uri" ]]; then rm -rf -- "$uri"; fi
  ls "$bcf_dir"/*.bcf > "$samplefile"
fi

echo "Found the following samples:"
cat "$samplefile"
# ingest data
###############
# Run create, register and store against $uri in that order. The store step's
# combined stdout/stderr is shown and also saved to a per-version log.
ingest_log="${log_dir}/ingest_${version}_${commit}.log"

"$tilevcf" create -u"$uri" -e2
"$tilevcf" register -u"$uri" -f "$samplefile" -d "$tmpdir" -s 500
"$tilevcf" store -u"$uri" -d "$tmpdir" -s 6000 --verbose -f "$samplefile" 2>&1 \
  | tee "$ingest_log"
# perform export
#################
# Export the selected samples over the filtered bedfile regions into
# $export_dir; combined stdout/stderr is shown and saved to a per-version log.
export_args=(
  --uri "$uri"
  --mem-budget-mb 512
  -Ot
  -tCHR,POS,REF,ALT,S:GT
  -s "$export_samples"
  -R "$bedfile"
  -d "$export_dir"
  -o "export-${version}_${commit}.tsv"
  --verbose
)
"$tilevcf" export "${export_args[@]}" 2>&1 \
  | tee "${log_dir}/export_${version}_${commit}.log"
# check exports
##################
# Preview the exported file and count records per SAMPLE with Miller,
# reading it as tab-separated input.
exportfile="${export_dir}/export-${version}_${commit}.tsv"
mlr_tsv=(mlr --icsv --ifs tab)

"${mlr_tsv[@]}" head "$exportfile"
"${mlr_tsv[@]}" count -g SAMPLE "$exportfile"

# compress
##################
gzip "$exportfile"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment