elowy01/AWK cheat sheet

## AWK cheat sheet
awk '/gold/' coins.txt #looks for all the records containing the word gold and prints
these rows
//
awk '{if ($3 < 1980) print $3, "    ",$5,$6,$7,$8}' coins.txt #$3 is a variable
that stores the 3rd word of each row . "    " introduces 4 whitespaces for the
printing
//
awk '{if ($3 >= 0) print $3}' filename #same as the previous one but we add the equal sign
//
NR holds the number of records read so far, i.e. the current record (line) number.
In the following awk NR example, the NR variable holds the line number; in the END section NR gives the total number of records in the file.

$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
Processing Record -  1
Processing Record -  2
Processing Record -  3
Processing Record -  4
Processing Record -  5
5 Students Records are processed
//
awk 'END { print NR }' data #Count the lines in a file
//
NF # Number of fields (columns) in a record

For example, if we have a file like the following:

cat student-marks
Jones 2143 78 84 77
Gondrol 2321 56 58 45
RinRao 2122 38 37
Edwin 2537 78 67 45
Dayan 2415 30 47

The following awk will generate:

$ awk '{print NR,"->",NF}' student-marks
1 -> 5
2 -> 5
3 -> 4
4 -> 5
5 -> 4

//
awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk
can be told to execute the commands
//
awk 'program' input-file1 input-file2.... #If the program is short, we can run the
program from the command-line
//
$example++ #increments the specified variable by one
//
Example:
-rw-r--r--  1 arnold   user   1933 Nov  7 13:05 Makefile
-rw-r--r--  1 arnold   user  10809 Nov  7 13:03 awk.h
-rw-r--r--  1 arnold   user    983 Apr 13 12:14 awk.tab.h
-rw-r--r--  1 arnold   user  31869 Jun 15 12:20 awk.y
-rw-r--r--  1 arnold   user  22414 Nov  7 13:03 awk1.c
-rw-r--r--  1 arnold   user  37455 Nov  7 13:03 awk2.c
-rw-r--r--  1 arnold   user  27511 Dec  9 13:07 awk3.c
-rw-r--r--  1 arnold   user   7989 Nov  7 13:03 awk4.c

ls -l | awk '$6 == "Nov" { sum += $5 }
             END { print sum }'
#when the 6th field is equal to Nov, the action is executed. In this case it adds the 5th
field value to the sum variable. At the end we print the value of sum.
//
#another arithmetic operation
awk '{sum+=$3-$2} END {print sum}' test.txt
//
/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of
them on a line. This is accomplished by separating the statements
with a semicolon (;).
//
awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
line of rawdatafile starts with #. So with this regex we tell awk not to
consider this line. Besides, with | wc -l we count the number of lines that
the awk command returns
//
#Some characters cannot be included literally in string constants ("foo") or regexp
constants (/foo/). Instead, they should be represented with escape sequences, which
are character sequences beginning with a backslash (\).
//
^@chapter #matches @chapter at the beginning of a string
//
[^awk] #matches any character that is not an a, w, or k.
//
awk '{print $1}' prueba #print number 1 column
//
awk '{if ($2>90) print}' prueba #prints the whole record, but only when column 2 is >90
//
awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the
matching records
//
awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7
//
awk '{print $1 "\t\t" $2}' filename #prints the $1 and $2 columns with two tabs in
between
//
awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
//
awk -F : #sets the field separator
awk -F"\t" '{print $2}' minus_ko_125_FDR.bed
//
awk '{s += $1} END {print s}' prueba.txt #to sum column $1
//
#calculating number of columns in a tab-separated file
awk -F'\t' '{print NF; exit}' filename
//
#skipping first line of a file
awk 'NR!=1{print}' filename
//
awk 'NR==10' file.txt #print only line 10 of file.txt
//
#equal to string or character
awk '{if ($5=="U") print}' filename
//
#replace every run of whitespace with a single tab (note: lines whose first field is empty or 0 are not printed)
awk -v OFS="\t" '$1=$1' file1
//
#regex in AWK
/
# selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
awk '$1 ~ /J/' inventory-shipped
or
awk '{ if ($1 ~ /J/) print }' inventory-shipped
/
#negating the REGEX now:
awk '$1 !~ /J/' inventory-shipped
//
#Tab field separator
awk 'BEGIN { FS = "\t" } ; { print $2 }'
#Using REGEX in AWK
 awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
//
#regex substitution within a field
echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'

02/08/2011 7.33 Shopping

//
#Print all records from some pattern:
awk '/pattern/{f=1}f' file
//
#doing arithmetic operations within AWK
awk '{sum=$1+$2; print}' filename.txt
//
#piping in AWK
cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
//
#string concatenation in awk
awk -F'\t' '{print "string_to_concat" $1}'
//
#printing all columns except the first one:
awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1'  filename
//
#concatenating a string to each line in a file
awk '{print "prefix" $0}' file
//
#modifying a certain column in a file and printing the new columns separated by ;
awk -F'\t'  '{ OFS=";"; $44=$44"something"; print}' file.txt
//
#getting sequence lengths in a FASTQ file:
cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
//
#add single quotes to a comma separated list of words:
awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
//
#getting the max among a set of numbers:

Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.

> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
//
#getting columns names and their position in file
awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
//
#split in awk:
 awk '{split($0, a, ":")}'
 #           ^^  ^  ^^^
 #            |  |   |
 #       string  |   delimiter
 #               |
 #               array to store the pieces

For example:

echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
//
# replace newlines (line breaks) with spaces:
awk '{printf "%s ",$0} END {print ""}' yourfile.txt
//
# change chromosome notations
(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
 1. Remove 'chr' from the chromosome notation:
awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
 2. Add chr before chromosome id
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf
	awk '/gold/' coins.txt #look for all the records with the word gold and shows
	these rows
	//
	awk '{if ($3 < 1980) print $3, "    ",$5,$6,$7,$8}' coins.txt #$3 is a variable
	that stores the 3rd word of each row . "    " introduces 4 whitespaces for the
	printing
	//
	awk '{if ($3 >= 0) print $3}' filename #same as the previous one but we add the equal sign
	//
	NR gives you the total number of records being processed or line number.
	In the following awk NR example, NR variable has line number, in the END section awk NR tells you the total number of records in a file.

	$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
	Processing Record -  1
	Processing Record -  2
	Processing Record -  3
	Processing Record -  4
	Processing Record -  5
	5 Students Records are processed
	//
	awk 'END { print NR }' data #Count the lines in a file
	//
	NF # Number of fields (columns) in a record

	For example, if we have a file like the following:

	cat student-marks
	Jones 2143 78 84 77
	Gondrol 2321 56 58 45
	RinRao 2122 38 37
	Edwin 2537 78 67 45
	Dayan 2415 30 47

	The following awk will generate:

	$ awk '{print NR,"->",NF}' student-marks
	1 -> 5
	2 -> 5
	3 -> 4
	4 -> 5
	5 -> 4

	//
	awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk
	can be told to execute the commands
	//
	awk 'program' input-file1 input-file2.... #If the program is short, we can run the
	program from the command-line
	//
	$example++ #increments the specified variable by one
	//
	Example:
	-rw-r--r-- 1 arnold user 1933 Nov 7 13:05 Makefile
	-rw-r--r-- 1 arnold user 10809 Nov 7 13:03 awk.h
	-rw-r--r-- 1 arnold user 983 Apr 13 12:14 awk.tab.h
	-rw-r--r-- 1 arnold user 31869 Jun 15 12:20 awk.y
	-rw-r--r-- 1 arnold user 22414 Nov 7 13:03 awk1.c
	-rw-r--r-- 1 arnold user 37455 Nov 7 13:03 awk2.c
	-rw-r--r-- 1 arnold user 27511 Dec 9 13:07 awk3.c
	-rw-r--r-- 1 arnold user 7989 Nov 7 13:03 awk4.c

	ls -l | awk '$6 == "Nov" { sum += $5 }
	             END { print sum }'
	#when the 6th field is equal to Nov, the action is executed. In this case it adds the 5th
	field value to the sum variable. At the end we print the value of sum.
	//
	#another arithmetic operation
	awk '{sum+=$3-$2} END {print sum}' test.txt
	//
	/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of
	them on a line. This is accomplished by separating the statements
	with a semicolon (;).
	//
	awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
	line of rawdatafile starts with #. So with this regex we tell awk not to
	consider this line. Besides, with | wc -l we count the number of lines that
	the awk command returns
	//
	#Some characters cannot be included literally in string constants ("foo") or regexp
	constants (/foo/).Instead, they should be represented with escape sequences, which
	are character sequences beginning with a backslash (\).
	//
	^@chapter #matches @chapter at the beginning of a string
	//
	[^awk] #matches any character that is not an a, w, or k.
	//
	awk '{print $1}' prueba #print number 1 column
	//
	awk '{if ($2>90) print}' prueba #print number 2 column but only >90 values
	//
	awk '/ENSP00000339623/ {print}' datafile1008 #searchs for the regex and print the
	record
	//
	awk '$1 !~/7/ {print}' prueba #prints all the records but the number 7 record
	//
	awk '{print $1 "\t\t" $2}' filename #prints $1 and $2 column leaving a tab in
	the middle
	//
	awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
	//
	awk -F : #sets the field separator
	awk -F"\t" '{print $2}' minus_ko_125_FDR.bed
	//
	awk '{s += $1} END {print s}' prueba.txt #to sum column $1
	//
	#calculating number of columns in a tab-separated file
	awk -F'\t' '{print NF; exit}' filename
	//
	#skipping first line of a file
	awk 'NR!=1{print}' filename
	//
	awk 'NR==10' file.txt #jump to line 10 in file.txt
	//
	#equal to string or character
	awk '{if ($5=="U") print}' filename
	//
	#remove all whitespaces by a single tab
	awk -v OFS="\t" '$1=$1' file1
	//
	#regex in AWK
	/
	# selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
	awk '$1 ~ /J/' inventory-shipped
	or
	awk '{ if ($1 ~ /J/) print }' inventory-shipped
	/
	#negating the REGEX now:
	awk '$1 !~ /J/' inventory-shipped
	//
	#Tab field separator
	awk 'BEGIN { FS = "\t" } ; { print $2 }'
	#Using REGEX in AWK
	awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
	//
	#regex substitution within a field
	echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'

	02/08/2011 7.33 Shopping

	//
	#Print all records from some pattern:
	awk '/pattern/{f=1}f' file
	//
	#doing arithmetic operations within AWK
	awk '{sum=$1+$2; print}' filename.txt
	//
	#piping in AWK
	cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
	//
	#string concatenation in awk
	awk -F'\t' '{print "string_to_concat" $1}'
	//
	#printing all columns except the first one:
	awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1' filename
	//
	#concatenating a string to each line in a file
	awk '{print "prefix" $0}' file
	//
	#modifying a certain column in a file and printing the new columns separated by ;
	awk -F'\t' '{ OFS=";"; $44=$44"something"; print}' file.txt
	//
	#getting sequence lengths in a FASTQ file:
	cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
	//
	#add single quotes to a comma separated list of words:
	awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
	//
	#getting the max among a set of numbers:

	Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.

	> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
	//
	#getting columns names and their position in file
	awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
	//
	#split in awk:
	awk '{split($0, a, ":")}'
	#           ^^  ^  ^^^
	#            |  |   |
	#       string  |   delimiter
	#               |
	#               array to store the pieces

	For example:

	echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
	//
	# remove newlines (or breaks) by whitespaces:
	awk '{printf "%s ",$0} END {print ""}' yourfile.txt
	//
	# change chromosome notations
	(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
	1. Remove 'chr' from the chromosome notation:
	awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
	2. Add chr before chromosome id
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf