Mahedi-61/preprocess_go.sh

## preprocess_go.sh
#!/bin/bash
# Preprocess the GOA dataset for research use. According to the original
# author, this reduces over 543M samples (~94 GB) to around 170M samples
# (~6 GB) by keeping only selected columns.

#######################################
# Clean a GAF file in place, extract selected columns, then keep only the
# rows whose extracted type is "protein" and whose aspect is "P".
# Arguments:
#   $1 - GAF file, rewritten in place (default: goa_uniprot_all.gaf)
#   $2 - intermediate comma-separated file (default: goa_uniprot_all.csv)
#   $3 - final tab-separated file (default: final_goa_dataset.csv)
# Outputs:
#   Writes $2 and $3; diagnostics go to stderr.
# Returns:
#   0 on success, non-zero if the input is missing or any step fails.
#######################################
preprocess_goa() {
  local gaf=${1:-goa_uniprot_all.gaf}
  local csv=${2:-goa_uniprot_all.csv}
  local final=${3:-final_goa_dataset.csv}

  if [[ ! -f "$gaf" ]]; then
    printf 'preprocess_goa: input file not found: %s\n' "$gaf" >&2
    return 1
  fi

  # A single in-place pass instead of three full rewrites of a huge file
  # (GNU sed: -i without a suffix and \t in patterns are GNU extensions).
  #   1,8d        remove the first 8 lines.
  #               NOTE(review): assumes the header is exactly 8 lines - confirm.
  #   :fill loop  put NA into empty fields. One s///g pass cannot match
  #               overlapping tab pairs, so "\t\t\t" would keep one empty
  #               field; repeat until nothing changes.
  #   s/\t$/...   put NA into an empty last field as well.
  #   s/ /-/g     turn multi-word values into one single word.
  sed -i \
    -e '1,8d' \
    -e ':fill' \
    -e 's/\t\t/\tNA\t/g' \
    -e 't fill' \
    -e 's/\t$/\tNA/' \
    -e 's/ /-/g' \
    -- "$gaf" || return

  # Build one CSV from columns 2, 5, 7, 9, 12 and 13. Splitting strictly on
  # tabs keeps column numbers stable regardless of field contents.
  awk -F'\t' -v OFS=',' '{ print $2, $5, $7, $9, $12, $13 }' \
    "$gaf" > "$csv" || return

  # Keep only proteins found in biological process (P); emit CSV fields
  # 1, 2, 3 and 6 separated by tabs.
  awk -F',' -v OFS='\t' \
    '$5 == "protein" && $4 == "P" { print $1, $2, $3, $6 }' \
    "$csv" > "$final"
}

preprocess_goa

## Choose each unique protein ID and combine its rows into one row, e.g.
#   protein_ID_1,Go_term_1
#   protein_ID_2,Go_term_2
#   protein_ID_1,Go_term_3
# becomes
#   protein_ID_1,Go_term_1,Go_term_3
#   protein_ID_2,Go_term_2

#######################################
# Group the second comma-separated field by the first one.
# Arguments:
#   $1 - comma-separated input file (default: input_file.csv)
# Outputs:
#   One "id,term[,term...]" line per distinct ID on stdout, in order of
#   first appearance; diagnostics go to stderr.
# Returns:
#   0 on success, non-zero if the input is unreadable or awk fails.
#######################################
combine_terms_by_protein() {
  local input=${1:-input_file.csv}

  if [[ ! -r "$input" ]]; then
    printf 'combine_terms_by_protein: cannot read: %s\n' "$input" >&2
    return 1
  fi

  # One pass with an exact comparison on field 1. Grepping the file once per
  # ID is quadratic, matches IDs as substrings (P1 also hits P12), and
  # repeats an ID whose rows are not adjacent.
  # All groups are held in memory until the end of the input.
  awk -F',' '
    $1 == "" { next }                      # no ID on this row: nothing to group
    !($1 in terms) {                       # first time this ID is seen
      order[++count] = $1
      terms[$1] = $2
      next
    }
    { terms[$1] = terms[$1] "," $2 }       # append to the existing group
    END {
      for (i = 1; i <= count; i++) {
        print order[i] "," terms[order[i]]
      }
    }
  ' "$input"
}

combine_terms_by_protein

#######################################
# Print the rows of a term file that mention any protein ID taken from the
# first comma-separated column of an ID file.
# Arguments:
#   $1 - file whose first column holds the IDs (default: pro_seq.csv)
#   $2 - file to search (default: prot_term.csv)
# Outputs:
#   Matching lines of $2 on stdout, in file order; diagnostics on stderr.
# Returns:
#   grep's status: 0 if something matched, 1 if nothing matched, >1 on error.
#######################################
filter_terms_by_protein() {
  local ids=${1:-pro_seq.csv}
  local terms=${2:-prot_term.csv}

  if [[ ! -r "$ids" || ! -r "$terms" ]]; then
    printf 'filter_terms_by_protein: cannot read %s or %s\n' \
      "$ids" "$terms" >&2
    return 2
  fi

  # The IDs are streamed to grep as a pattern list (-f -) instead of being
  # joined into one huge alternation passed through xargs, which overflows
  # the command-line length limit for large ID sets.
  # -F treats IDs as literal text (a "." in an ID is not a wildcard).
  # -w requires a whole-word match, so P1 does not select P12.
  # Blank IDs are skipped because an empty pattern would match every line.
  # The ID column can equally be extracted with: cut -d ',' -f 1 "$ids"
  # NOTE(review): the match may occur anywhere on the line; if the ID is
  # always the first column of the searched file, an exact first-field
  # comparison would be stricter - confirm.
  awk -F',' '$1 != "" { print $1 }' "$ids" \
    | grep -F -w -f - -- "$terms"
}

filter_terms_by_protein
	#!/bin/bash
	# Preprocess the GOA dataset for research use. According to the original
	# author, this reduces over 543M samples (~94 GB) to around 170M samples
	# (~6 GB) by keeping only selected columns.

	#######################################
	# Clean a GAF file in place, extract selected columns, then keep only the
	# rows whose extracted type is "protein" and whose aspect is "P".
	# Arguments:
	#   $1 - GAF file, rewritten in place (default: goa_uniprot_all.gaf)
	#   $2 - intermediate comma-separated file (default: goa_uniprot_all.csv)
	#   $3 - final tab-separated file (default: final_goa_dataset.csv)
	# Outputs:
	#   Writes $2 and $3; diagnostics go to stderr.
	# Returns:
	#   0 on success, non-zero if the input is missing or any step fails.
	#######################################
	preprocess_goa() {
		local gaf=${1:-goa_uniprot_all.gaf}
		local csv=${2:-goa_uniprot_all.csv}
		local final=${3:-final_goa_dataset.csv}

		if [[ ! -f "$gaf" ]]; then
			printf 'preprocess_goa: input file not found: %s\n' "$gaf" >&2
			return 1
		fi

		# A single in-place pass instead of three full rewrites of a huge file
		# (GNU sed: -i without a suffix and \t in patterns are GNU extensions).
		#   1,8d        remove the first 8 lines.
		#               NOTE(review): assumes the header is exactly 8 lines - confirm.
		#   :fill loop  put NA into empty fields. One s///g pass cannot match
		#               overlapping tab pairs, so "\t\t\t" would keep one empty
		#               field; repeat until nothing changes.
		#   s/\t$/...   put NA into an empty last field as well.
		#   s/ /-/g     turn multi-word values into one single word.
		sed -i \
			-e '1,8d' \
			-e ':fill' \
			-e 's/\t\t/\tNA\t/g' \
			-e 't fill' \
			-e 's/\t$/\tNA/' \
			-e 's/ /-/g' \
			-- "$gaf" || return

		# Build one CSV from columns 2, 5, 7, 9, 12 and 13. Splitting strictly
		# on tabs keeps column numbers stable regardless of field contents.
		awk -F'\t' -v OFS=',' '{ print $2, $5, $7, $9, $12, $13 }' \
			"$gaf" > "$csv" || return

		# Keep only proteins found in biological process (P); emit CSV fields
		# 1, 2, 3 and 6 separated by tabs.
		awk -F',' -v OFS='\t' \
			'$5 == "protein" && $4 == "P" { print $1, $2, $3, $6 }' \
			"$csv" > "$final"
	}

	preprocess_goa

	## Choose each unique protein ID and combine its rows into one row, e.g.
	#   protein_ID_1,Go_term_1
	#   protein_ID_2,Go_term_2
	#   protein_ID_1,Go_term_3
	# becomes
	#   protein_ID_1,Go_term_1,Go_term_3
	#   protein_ID_2,Go_term_2

	#######################################
	# Group the second comma-separated field by the first one.
	# Arguments:
	#   $1 - comma-separated input file (default: input_file.csv)
	# Outputs:
	#   One "id,term[,term...]" line per distinct ID on stdout, in order of
	#   first appearance; diagnostics go to stderr.
	# Returns:
	#   0 on success, non-zero if the input is unreadable or awk fails.
	#######################################
	combine_terms_by_protein() {
		local input=${1:-input_file.csv}

		if [[ ! -r "$input" ]]; then
			printf 'combine_terms_by_protein: cannot read: %s\n' "$input" >&2
			return 1
		fi

		# One pass with an exact comparison on field 1. The earlier form wrote
		# the pipes as "\|", which the shell passes on as a literal argument
		# rather than a pipe; it also grepped the whole file once per ID,
		# matched IDs as substrings (P1 also hits P12), and repeated an ID
		# whose rows are not adjacent.
		# All groups are held in memory until the end of the input.
		awk -F',' '
			$1 == "" { next }                      # no ID on this row
			!($1 in terms) {                       # first time this ID is seen
				order[++count] = $1
				terms[$1] = $2
				next
			}
			{ terms[$1] = terms[$1] "," $2 }       # append to the existing group
			END {
				for (i = 1; i <= count; i++) {
					print order[i] "," terms[order[i]]
				}
			}
		' "$input"
	}

	combine_terms_by_protein

	#######################################
	# Print the rows of a term file that mention any protein ID taken from the
	# first comma-separated column of an ID file.
	# Arguments:
	#   $1 - file whose first column holds the IDs (default: pro_seq.csv)
	#   $2 - file to search (default: prot_term.csv)
	# Outputs:
	#   Matching lines of $2 on stdout, in file order; diagnostics on stderr.
	# Returns:
	#   grep's status: 0 if something matched, 1 if nothing matched, >1 on error.
	#######################################
	filter_terms_by_protein() {
		local ids=${1:-pro_seq.csv}
		local terms=${2:-prot_term.csv}

		if [[ ! -r "$ids" || ! -r "$terms" ]]; then
			printf 'filter_terms_by_protein: cannot read %s or %s\n' \
				"$ids" "$terms" >&2
			return 2
		fi

		# Real pipes are used here: written as "\|", the bar is handed to the
		# first command as a literal argument and no pipeline is created.
		# The IDs are streamed to grep as a pattern list (-f -) instead of
		# being joined into one huge alternation passed through xargs, which
		# overflows the command-line length limit for large ID sets.
		# -F treats IDs as literal text (a "." in an ID is not a wildcard).
		# -w requires a whole-word match, so P1 does not select P12.
		# Blank IDs are skipped because an empty pattern would match every line.
		# The ID column can equally be extracted with: cut -d ',' -f 1 "$ids"
		# NOTE(review): the match may occur anywhere on the line; if the ID is
		# always the first column of the searched file, an exact first-field
		# comparison would be stricter - confirm.
		awk -F',' '$1 != "" { print $1 }' "$ids" \
			| grep -F -w -f - -- "$terms"
	}

	filter_terms_by_protein