#chrisamiller/bash_tips_and_tricks.sh

## bash_tips_and_tricks.sh
#notes from the "bash tips and tricks" workshop

#there are two paths coming out of every command - stdout and stderr

#by default both streams are written to the screen
date #normal output goes to stdout
date --asdf #the complaint about the bogus option goes to stderr

#stdout can be redirected to a file with >
date >file.txt

#stderr is file descriptor 2, so 2> redirects it to a file
date --asdf 2>err.log

#you can get fancy and send both to the same file:
#first redirect stdout to the file, then point stderr at wherever stdout goes (2>&1)
#the order matters - 2>&1 has to come after the file redirection
date >file.txt 2>&1

#use wc -l to count the number of lines in a file
wc -l genes1

#use sort to reorder a file
sort genes1

#set operations
#sort reads every file named on its command line, so no cat is needed; letting
#sort open the files also keeps the last line of one file from being glued to
#the first line of the next when a file has no trailing newline

#find shared genes (uniq -d prints only the lines that occur more than once)
sort genes1 genes2 | uniq -d

#find genes1 specific
#genes2 is listed twice so each of its lines occurs at least twice;
#uniq -u prints only the lines that occur exactly once
sort genes1 genes2 genes2 | uniq -u

#find genes2 specific (same trick with genes1 doubled)
sort genes1 genes1 genes2 | uniq -u

#get a certain column from a file (cut splits on tabs by default)
cut -f 10 tcga.aml.tsv | head

#find the most frequent items in a certain column
#uniq -c prefixes every distinct value with the number of times it occurs;
#sorting on that count numerically and in reverse (nr) puts the largest
#counts first, so head shows the most frequent values
cut -f 10 tcga.aml.tsv | sort | uniq -c | sort -k1,1nr | head

#print the lines of one file that match genes listed in another
#(-f takes the patterns from genes1, one per line; -w accepts whole-word matches only)
grep -w -f genes1 tcga.aml.tsv | less

#the other way round: which lines of genes1 match the genes column of our complicated file?
#<(command) hands the output of a command to grep as if it were a file,
#so the de-duplicated column can serve as the pattern file
grep -w -f <(cut -f10 tcga.aml.tsv | sort | uniq) genes1


#we can also use the output of a command just as a text string or variable:
#$(command) is replaced by the command's output, which the shell splits on
#whitespace - every resulting word becomes one value of $i
#"$i" is quoted and follows -- so that each value reaches grep as exactly one
#pattern argument, even if it contains glob characters or starts with a dash
for i in $(cut -f 10 tcga.aml.tsv | sort | uniq); do
  grep -- "$i" genes2
done

#loop over known values
for i in 1 2 3 4; do
  echo "$i"
done

#loop over values from a file, one line per iteration
#IFS= and -r make read store each line exactly as it is
#(no trimming of surrounding whitespace, no backslash processing)
grep IDH genes1 | while IFS= read -r i; do
  echo "$i"
done

#do something more complicated in the loop
grep IDH genes1 | while IFS= read -r i; do
  echo "my favorite gene is $i"
done

#nested loops: for every number, read the gene list again and print each gene next to that number
#the inner loop reads the file through a redirection, so no cat is needed
#NOTE(review): this reads a file called "genes" while the other examples use genes1/genes2 - confirm the name
for num in 1 2 3 4; do
  while IFS= read -r gene; do
    echo "$gene $num"
  done <genes
done

#xargs can also be used to split things out into separate commands, individually or in chunks
#it turns the items on its input into arguments: -n 1 runs echo once per item,
#-n 5 once per group of up to five items
#the file is fed in with a redirection, so xargs is not started at all if genes1 cannot be opened
xargs -n 1 echo <genes1
xargs -n 5 echo <genes1

#cut can pull several columns out of a file at once
cut -f1,2 tcga.aml.tsv | head -n 10

#listing the fields in a different order makes no difference:
#cut always prints the selected columns in the order they have in the file
cut -f2,1 tcga.aml.tsv | head -n 10

#but awk can help! it prints fields in whatever order you ask for
#(by default awk splits on any whitespace and joins the printed fields with a single space)
awk '{ print $2, $1 }' tcga.aml.tsv | head -n 10

#some extra parameters (via an alias) can be used to keep things tab-delimited:
#-F"\t" splits the input on tabs only and OFS="\t" joins the printed fields with tabs
#aliases are only expanded in interactive shells by default, so switch the
#expansion on in case these lines are run as a script
shopt -s expand_aliases
alias tawk='awk -F"\t" -v OFS="\t"'
tawk '{print $2,$1}' tcga.aml.tsv | head


#use CTRL-R to search backwards through your history, or type something like `history | less`

#pushd and popd can be used to navigate through directories saving your place
#pushd changes into the given directory and remembers the one you came from
#NOTE(review): /gscuser/cmiller looks like a site-specific directory - use one that exists on your system
pushd /gscuser/cmiller
#popd takes you back to the directory that pushd remembered
popd

#let's change a gzipped VCF file from having "chr" prefixes to having no prefixes using only one line of code:
#a single sed pass covers both kinds of lines, so the file is decompressed only once
# - ##contig header lines lose the prefix from their ID, which keeps the header in step with the records
# - every line that does not start with # (the records) loses its leading "chr"
gunzip -c asdf.vcf.gz | sed -e 's/^\(##contig=<ID=\)chr/\1/' -e '/^#/!s/^chr//' | gzip >asdf.fixed.vcf.gz

#get the full path to your current directory
#(-P prints the physical directory, with any symbolic links resolved)
pwd -P

#get the full path to a particular file
#(readlink -f follows symbolic links and prints the resulting absolute path)
readlink -f genes1

#in ~cmiller/.bashrc and my ~cmiller/usr/bin there are lots of other useful commands. "header", "binfo", "bwait", "urlify", etc.  Do some exploring!
	#notes from the "bash tips and tricks" workshop

	#there are two paths coming out of every command - stdout and stderr

	#by default both streams are written to the screen
	date #normal output goes to stdout
	date --asdf #the complaint about the bogus option goes to stderr

	#stdout can be redirected to a file with >
	date >file.txt

	#stderr is file descriptor 2, so 2> redirects it to a file
	date --asdf 2>err.log

	#you can get fancy and send both to the same file:
	#first redirect stdout to the file, then point stderr at wherever stdout goes (2>&1)
	#the order matters - 2>&1 has to come after the file redirection
	date >file.txt 2>&1

	#use wc -l to count the number of lines in a file
	wc -l genes1

	#use sort to reorder a file
	sort genes1

	#set operations
	#sort reads every file named on its command line, so no cat is needed; letting
	#sort open the files also keeps the last line of one file from being glued to
	#the first line of the next when a file has no trailing newline

	#find shared genes (uniq -d prints only the lines that occur more than once)
	sort genes1 genes2 | uniq -d

	#find genes1 specific
	#genes2 is listed twice so each of its lines occurs at least twice;
	#uniq -u prints only the lines that occur exactly once
	sort genes1 genes2 genes2 | uniq -u

	#find genes2 specific (same trick with genes1 doubled)
	sort genes1 genes1 genes2 | uniq -u

	#get a certain column from a file (cut splits on tabs by default)
	cut -f 10 tcga.aml.tsv | head

	#find the most frequent items in a certain column
	#uniq -c prefixes every distinct value with the number of times it occurs;
	#sorting on that count numerically and in reverse (nr) puts the largest
	#counts first, so head shows the most frequent values
	cut -f 10 tcga.aml.tsv | sort | uniq -c | sort -k1,1nr | head

	#find lines in one file that match genes from another
	#(-f takes the patterns from genes1, one per line; -w accepts whole-word matches only)
	grep -wf genes1 tcga.aml.tsv | less

	#what if we want to find lines in genes1 that match the genes column in our complicated file?
	# Nested expressions! <(command) treats the output of that command as a file,
	# so the de-duplicated column can serve as the pattern file
	grep -wf <(cut -f 10 tcga.aml.tsv | sort | uniq) genes1


	#we can also use the output of a command just as a text string or variable:
	#$(command) is replaced by the command's output, which the shell splits on
	#whitespace - every resulting word becomes one value of $i
	#"$i" is quoted and follows -- so that each value reaches grep as exactly one
	#pattern argument, even if it contains glob characters or starts with a dash
	for i in $(cut -f 10 tcga.aml.tsv | sort | uniq); do
		grep -- "$i" genes2
	done

	#loop over known values
	for i in 1 2 3 4; do
		echo "$i"
	done

	#loop over values from a file, one line per iteration
	#IFS= and -r make read store each line exactly as it is
	#(no trimming of surrounding whitespace, no backslash processing)
	grep IDH genes1 | while IFS= read -r i; do
		echo "$i"
	done

	#do something more complicated in the loop
	grep IDH genes1 | while IFS= read -r i; do
		echo "my favorite gene is $i"
	done

	#nested loops: for every number, read the gene list again and print each gene next to that number
	#the inner loop reads the file through a redirection, so no cat is needed
	#NOTE(review): this reads a file called "genes" while the other examples use genes1/genes2 - confirm the name
	for num in 1 2 3 4; do
		while IFS= read -r gene; do
			echo "$gene $num"
		done <genes
	done

	#xargs can also be used to split things out into separate commands, individually or in chunks
	#it turns the items on its input into arguments: -n 1 runs echo once per item,
	#-n 5 once per group of up to five items
	#the file is fed in with a redirection, so xargs is not started at all if genes1 cannot be opened
	xargs -n 1 echo <genes1
	xargs -n 5 echo <genes1

	#grab multiple columns from a file
	cut -f 1,2 tcga.aml.tsv | head

	#this doesn't change the order of the columns:
	#cut always prints the selected columns in the order they have in the file
	cut -f 2,1 tcga.aml.tsv | head

	#but awk can help! it prints fields in whatever order you ask for
	#(by default awk splits on any whitespace and joins the printed fields with a single space)
	awk '{print $2,$1}' tcga.aml.tsv | head

	#some extra parameters (via an alias) can be used to keep things tab-delimited:
	#-F"\t" splits the input on tabs only and OFS="\t" joins the printed fields with tabs
	#aliases are only expanded in interactive shells by default, so switch the
	#expansion on in case these lines are run as a script
	shopt -s expand_aliases
	alias tawk='awk -F"\t" -v OFS="\t"'
	tawk '{print $2,$1}' tcga.aml.tsv | head


	#use CTRL-R to search backwards through your history, or type something like `history | less`

	#pushd and popd can be used to navigate through directories saving your place
	#pushd changes into the given directory and remembers the one you came from
	#NOTE(review): /gscuser/cmiller looks like a site-specific directory - use one that exists on your system
	pushd /gscuser/cmiller
	#popd takes you back to the directory that pushd remembered
	popd

	#let's change a gzipped VCF file from having "chr" prefixes to having no prefixes using only one line of code:
	#a single sed pass covers both kinds of lines, so the file is decompressed only once
	# - ##contig header lines lose the prefix from their ID, which keeps the header in step with the records
	# - every line that does not start with # (the records) loses its leading "chr"
	gunzip -c asdf.vcf.gz | sed -e 's/^\(##contig=<ID=\)chr/\1/' -e '/^#/!s/^chr//' | gzip >asdf.fixed.vcf.gz

	#get the full path to your current directory
	#(-P prints the physical directory, with any symbolic links resolved)
	pwd -P

	#get the full path to a particular file
	#(readlink -f follows symbolic links and prints the resulting absolute path)
	readlink -f genes1

	#in ~cmiller/.bashrc and my ~cmiller/usr/bin there are lots of other useful commands. "header", "binfo", "bwait", "urlify", etc. Do some exploring!