Last active
August 21, 2018 09:03
-
-
Save Mahedi-61/644b424bf0a80f9b648f972204261e46 to your computer and use it in GitHub Desktop.
Unix script for preprocessing and preparing the Gene Ontology Consortium dataset (GO annotations)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# This gist preprocesses the GOA dataset of over 543M samples (~94 GB) down to around 170M samples with selected columns (~6 GB) for research
# Clean a GAF file in place with a single sed pass (needs GNU sed for -i and \t).
#   1. Drop the first 8 lines (presumably the file header -- confirm that the
#      header of your download really is 8 lines long).
#   2. Put NA into every empty tab-separated field. A single s///g pass is not
#      enough: matches cannot overlap, so "\t\t\t" would become "\tNA\t\t" and
#      keep one empty field. The ':fill_empty' / 't' loop repeats the
#      substitution until no "\t\t" is left.
#   3. Replace every space with '-' so that each field is one single word.
# Arguments: $1 - path of the file to rewrite in place
clean_gaf() {
  sed -i \
    -e '1,8d' \
    -e ':fill_empty' \
    -e 's/\t\t/\tNA\t/g' \
    -e 't fill_empty' \
    -e 's/ /-/g' \
    "$1"
}

clean_gaf goa_uniprot_all.gaf
# Write columns 2, 5, 7, 9, 12 and 13 of a tab-separated file as comma-separated
# rows on stdout. Fields are split on tabs explicitly: with awk's default
# whitespace splitting a still-empty field would vanish and shift every
# following column.
# Arguments: $1 - tab-separated input file
select_gaf_columns() {
  awk -F'\t' -v OFS=',' '{ print $2, $5, $7, $9, $12, $13 }' "$1"
}

# From the comma-separated rows produced above, keep only those whose 5th
# field is "protein" and whose 4th field is "P" (biological process), and
# print fields 1, 2, 3 and 6 separated by tabs.
# Arguments: $1 - comma-separated input file
filter_protein_bp() {
  awk -F',' -v OFS='\t' \
    '$5 == "protein" && $4 == "P" { print $1, $2, $3, $6 }' "$1"
}

# making one single output csv file from the selected columns
select_gaf_columns goa_uniprot_all.gaf > goa_uniprot_all.csv
# selecting only the samples that are proteins found in a biological process (P)
filter_protein_bp goa_uniprot_all.csv > final_goa_dataset.csv
## Keep one row per unique protein id and merge the second field of all of its
## rows into that single row. For example
#   protein_ID_1,Go_term_1
#   protein_ID_2,Go_term_2
#   protein_ID_1,Go_term_3
# becomes
#   protein_ID_1,Go_term_1,Go_term_3
#   protein_ID_2,Go_term_2
#
# The file is read once. Ids are compared for exact equality on the first
# field, so an id that is a substring of another id (or that appears in another
# column) no longer pulls in foreign rows, and an id whose rows are not adjacent
# is still printed only once. Ids are printed in order of first appearance;
# rows with an empty first field are skipped.
# Arguments: $1 - comma-separated input file (id in field 1, term in field 2)
# Outputs:   one "id,term[,term...]" line per distinct id on stdout
merge_terms_by_protein() {
  awk -F',' '
    $1 == "" { next }
    !($1 in terms) { order[++n] = $1; terms[$1] = $2; next }
    { terms[$1] = terms[$1] "," $2 }
    END {
      for (i = 1; i <= n; i++) {
        print order[i] "," terms[order[i]]
      }
    }
  ' "$1"
}

merge_terms_by_protein input_file.csv
# Print every line of the file given as $1 that contains at least one of the
# ids read from stdin (one id per line).
# The ids are handed to grep as a pattern list on stdin instead of being glued
# into one huge 'a|b|c' regex argument: that keeps the command line short no
# matter how many ids there are, and -F matches each id literally, so
# characters such as '.' or '|' inside an id are not treated as regex syntax.
# Empty ids are dropped first, because an empty pattern would match every line.
# Arguments: $1 - file to search
grep_rows_with_ids() {
  sed '/^$/d' | grep -F -f - -- "$1"
}

cut -d ',' -f 1 pro_seq.csv | grep_rows_with_ids prot_term.csv
# another solution using "awk"
awk -F',' '{print $1}' pro_seq.csv | grep_rows_with_ids prot_term.csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment