Last active
October 3, 2015 09:58
-
-
Save madoke/2439103 to your computer and use it in GitHub Desktop.
Pfam database parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
echo "Super Mega Hyper Pfam database parser started. Sit back and relax :)" | |
echo "Begin parsing " $1 " file.." | |
#remove old output | |
rm -rf parser_output | |
mkdir parser_output | |
#parse command line arguments | |
isParseOrganisms=false | |
for i in $@; | |
do | |
if [[ $i =~ --organisms ]]; | |
then | |
isParseOrganisms=true | |
continue | |
fi | |
if $isParseOrganisms | |
then | |
organisms[$orgCount]=$i | |
orgCount=$[$orgCount+1] | |
fi | |
done | |
echo "Will extract groups with GF TP Family and split them among the following organisms:" | |
echo ${organisms[@]} | |
for i in ${organisms[@]}; | |
do | |
echo "<groups>" >> "parser_output/$i.txt" | |
done | |
cat $1 | while read inputLine; | |
do | |
#identify group ID by searching the string "#=GF ID" | |
if [[ $inputLine =~ \#=GF\ ID.* ]]; | |
then | |
groupId=$inputLine | |
fi | |
#determine whether this group is a family or not by searching the string "#=GF TP Family" | |
if [[ $inputLine =~ \#=GF\ TP\ Family.* ]]; | |
then | |
isFamily=true | |
fi | |
#determine the sequence number in this group by searching the string "#=GF SQ" | |
if [[ $inputLine =~ \#=GF\ SQ.* ]]; | |
then | |
seqNr=$inputLine | |
fi | |
#determine the Pfam-A accession number by searching the string "#=GF AC" | |
if [[ $inputLine =~ \#=GF\ AC.* ]]; | |
then | |
accessionNr=$inputLine | |
fi | |
#determine the Pfam family description by searching the string "#=GF DE" | |
if [[ $inputLine =~ \#=GF\ DE.* ]]; | |
then | |
familyDesc=$inputLine | |
fi | |
#determine the group comments by searching the string "#=GF CC" | |
if [[ $inputLine =~ \#=GF\ CC.* ]]; | |
then | |
comment=$inputLine | |
fi | |
#collect group sequences by searching the string "#=GS" | |
if [[ $inputLine =~ \#=GS.* ]]; | |
then | |
sequences[$seqCount]=$inputLine | |
seqCount=$[$seqCount+1] | |
#detect if the sequence belongs to one of the organisms we specified | |
for i in "${organisms[@]}"; | |
do | |
if [[ $inputLine =~ .*_$i/.* ]]; | |
then | |
match=$(echo "${foundOrganisms[@]:0}" | grep -o $i) | |
if [[ -z $match ]] | |
then | |
foundOrganisms=(${foundOrganisms[@]} $i) | |
fi | |
break | |
fi | |
done | |
fi | |
#detect end of group token. | |
if [[ $inputLine =~ // ]]; | |
then | |
stockholmCount=$[$stockholmCount+1] | |
#if the alignment belongs to a family we want to keep it | |
if $isFamily | |
then | |
for i in "${foundOrganisms[@]}"; | |
do | |
outfile="parser_output/$i.txt" | |
echo "<group hash='$stockholmCount'>" >> $outfile | |
echo "<id hash='$stockholmCount'>${groupId:2}</id>" >> $outfile | |
echo "<accessionNr hash='$stockholmCount'>${accessionNr:2}</accessionNr>" >> $outfile | |
echo "<description hash='$stockholmCount'>${familyDesc:2}</description>" >> $outfile | |
echo "<comment hash='$stockholmCount'>${comment:2}</comment>" >> $outfile | |
echo "<sequenceCount hash='$stockholmCount'>${seqNr:2}</sequenceCount>" >> $outfile | |
echo "<sequences hash='$stockholmCount'>" >> $outfile | |
for j in "${sequences[@]}"; | |
do | |
#print only this organism's sequences | |
if [[ $j =~ .*$i.* ]]; | |
then | |
echo "<sequence>${j:2}</sequence>" >> $outfile | |
fi | |
done | |
echo "</sequences>" >> $outfile | |
echo "</group>" >> $outfile | |
done | |
fi | |
#reset iteration variables | |
unset alignment | |
unset inputLineCount | |
unset isFamily | |
unset groupId | |
unset accessionNr | |
unset seqNr | |
unset familyDesc | |
unset comment | |
unset organismCount | |
unset seqcount | |
unset fileCount | |
unset foundOrganisms | |
unset sequences | |
echo "Processed group: $stockholmCount" | |
fi | |
done | |
for i in ${organisms[@]}; | |
do | |
echo "</groups>" >> "parser_output/$i.txt" | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To use the parser just download it to the same folder where you have your Pfam-A.full database and then type sh parser-sh Pfam-A.full --organisms [ORGANISM 1] [ORGANISM 2], etc..
It'll then create a folder named parser_output with a .txt file per organism. The output is one xml node per group. you can then use such output in a visual XML editor like xmlgrid.net. If your output files are just too big, I recommend splitting them into smaller ones