madoke/pfam-parser.sh

## pfam-parser.sh
#!/bin/bash

echo "Super Mega Hyper Pfam database parser started. Sit back and relax :)"
echo "Begin parsing " $1 " file.."

#remove old output
rm -rf parser_output
mkdir parser_output

#parse command line arguments
isParseOrganisms=false
for i in $@;
do
	if [[ $i =~ --organisms ]];
        then
                isParseOrganisms=true
		continue
        fi

	if $isParseOrganisms
	then
		organisms[$orgCount]=$i
		orgCount=$[$orgCount+1]
	fi
done

echo "Will extract groups with GF TP Family and split them among the following organisms:"
echo ${organisms[@]}

for i in ${organisms[@]};
do
	echo "<groups>" >> "parser_output/$i.txt"
done

cat $1 | while read inputLine;
do
	#identify group ID by searching the string "#=GF ID"
	if [[ $inputLine =~ \#=GF\ ID.* ]];
	then
		groupId=$inputLine
	fi

	#determine whether this group is a family or not by searching the string "#=GF TP Family"
        if [[ $inputLine =~ \#=GF\ TP\ Family.* ]];
        then
                isFamily=true
        fi

	#determine the sequence number in this group by searching the string "#=GF SQ"
	if [[ $inputLine =~ \#=GF\ SQ.* ]];
	then
		seqNr=$inputLine
	fi

	#determine the Pfam-A accession number by searching the string "#=GF AC"
	if [[ $inputLine =~ \#=GF\ AC.* ]];
	then
		accessionNr=$inputLine
	fi

	#determine the Pfam family description by searching the string "#=GF DE"
	if [[ $inputLine =~ \#=GF\ DE.* ]];
	then
		familyDesc=$inputLine
	fi

	#determine the group comments by searching the string "#=GF CC"
	if [[ $inputLine =~ \#=GF\ CC.* ]];
	then
		comment=$inputLine
	fi

	#collect group sequences by searching the string "#=GS"
	if [[ $inputLine =~ \#=GS.* ]];
	then
		sequences[$seqCount]=$inputLine
		seqCount=$[$seqCount+1]

		#detect if the sequence belongs to one of the organisms we specified
		for i in "${organisms[@]}";
		do
			if [[ $inputLine =~ .*_$i/.* ]];
			then
				match=$(echo "${foundOrganisms[@]:0}" | grep -o $i)
				if [[ -z $match ]]
				then
					foundOrganisms=(${foundOrganisms[@]} $i)
				fi
				break
			fi
		done
	fi

	#detect end of group token.
	if [[ $inputLine =~ // ]];
	then
		stockholmCount=$[$stockholmCount+1]

		#if the alignment belongs to a family we want to keep it
		if $isFamily
		then
			for i in "${foundOrganisms[@]}";
			do
				outfile="parser_output/$i.txt"
				echo "<group hash='$stockholmCount'>" >> $outfile
				echo "<id hash='$stockholmCount'>${groupId:2}</id>" >> $outfile
				echo "<accessionNr hash='$stockholmCount'>${accessionNr:2}</accessionNr>" >> $outfile
				echo "<description hash='$stockholmCount'>${familyDesc:2}</description>" >> $outfile
				echo "<comment hash='$stockholmCount'>${comment:2}</comment>" >> $outfile
				echo "<sequenceCount hash='$stockholmCount'>${seqNr:2}</sequenceCount>" >> $outfile
				echo "<sequences hash='$stockholmCount'>" >> $outfile
				for j in "${sequences[@]}";
				do
					#print only this organism's sequences
					if [[ $j =~ .*$i.* ]];
					then
						echo "<sequence>${j:2}</sequence>" >> $outfile
					fi
				done
				echo "</sequences>" >> $outfile
				echo "</group>" >> $outfile
			done
		fi

		#reset iteration variables
		unset alignment
		unset inputLineCount
		unset isFamily
		unset groupId
		unset accessionNr
		unset seqNr
		unset familyDesc
		unset comment
		unset organismCount
		unset seqcount
		unset fileCount
		unset foundOrganisms
		unset sequences

		echo "Processed group: $stockholmCount"
	fi
done

for i in ${organisms[@]};
do
        echo "</groups>" >> "parser_output/$i.txt"
done
	#!/bin/bash

	echo "Super Mega Hyper Pfam database parser started. Sit back and relax :)"
	echo "Begin parsing " $1 " file.."

	#remove old output
	rm -rf parser_output
	mkdir parser_output

	#parse command line arguments
	isParseOrganisms=false
	for i in $@;
	do
	if [[ $i =~ --organisms ]];
	then
	isParseOrganisms=true
	continue
	fi

	if $isParseOrganisms
	then
	organisms[$orgCount]=$i
	orgCount=$[$orgCount+1]
	fi
	done

	echo "Will extract groups with GF TP Family and split them among the following organisms:"
	echo ${organisms[@]}

	for i in ${organisms[@]};
	do
	echo "<groups>" >> "parser_output/$i.txt"
	done

	cat $1 \| while read inputLine;
	do
	#identify group ID by searching the string "#=GF ID"
	if [[ $inputLine =~ \#=GF\ ID.* ]];
	then
	groupId=$inputLine
	fi

	#determine whether this group is a family or not by searching the string "#=GF TP Family"
	if [[ $inputLine =~ \#=GF\ TP\ Family.* ]];
	then
	isFamily=true
	fi

	#determine the sequence number in this group by searching the string "#=GF SQ"
	if [[ $inputLine =~ \#=GF\ SQ.* ]];
	then
	seqNr=$inputLine
	fi

	#determine the Pfam-A accession number by searching the string "#=GF AC"
	if [[ $inputLine =~ \#=GF\ AC.* ]];
	then
	accessionNr=$inputLine
	fi

	#determine the Pfam family description by searching the string "#=GF DE"
	if [[ $inputLine =~ \#=GF\ DE.* ]];
	then
	familyDesc=$inputLine
	fi

	#determine the group comments by searching the string "#=GF CC"
	if [[ $inputLine =~ \#=GF\ CC.* ]];
	then
	comment=$inputLine
	fi

	#collect group sequences by searching the string "#=GS"
	if [[ $inputLine =~ \#=GS.* ]];
	then
	sequences[$seqCount]=$inputLine
	seqCount=$[$seqCount+1]

	#detect if the sequence belongs to one of the organisms we specified
	for i in "${organisms[@]}";
	do
	if [[ $inputLine =~ ._$i/. ]];
	then
	match=$(echo "${foundOrganisms[@]:0}" \| grep -o $i)
	if [[ -z $match ]]
	then
	foundOrganisms=(${foundOrganisms[@]} $i)
	fi
	break
	fi
	done
	fi

	#detect end of group token.
	if [[ $inputLine =~ // ]];
	then
	stockholmCount=$[$stockholmCount+1]

	#if the alignment belongs to a family we want to keep it
	if $isFamily
	then
	for i in "${foundOrganisms[@]}";
	do
	outfile="parser_output/$i.txt"
	echo "<group hash='$stockholmCount'>" >> $outfile
	echo "<id hash='$stockholmCount'>${groupId:2}</id>" >> $outfile
	echo "<accessionNr hash='$stockholmCount'>${accessionNr:2}</accessionNr>" >> $outfile
	echo "<description hash='$stockholmCount'>${familyDesc:2}</description>" >> $outfile
	echo "<comment hash='$stockholmCount'>${comment:2}</comment>" >> $outfile
	echo "<sequenceCount hash='$stockholmCount'>${seqNr:2}</sequenceCount>" >> $outfile
	echo "<sequences hash='$stockholmCount'>" >> $outfile
	for j in "${sequences[@]}";
	do
	#print only this organism's sequences
	if [[ $j =~ .$i. ]];
	then
	echo "<sequence>${j:2}</sequence>" >> $outfile
	fi
	done
	echo "</sequences>" >> $outfile
	echo "</group>" >> $outfile
	done
	fi

	#reset iteration variables
	unset alignment
	unset inputLineCount
	unset isFamily
	unset groupId
	unset accessionNr
	unset seqNr
	unset familyDesc
	unset comment
	unset organismCount
	unset seqcount
	unset fileCount
	unset foundOrganisms
	unset sequences

	echo "Processed group: $stockholmCount"
	fi
	done

	for i in ${organisms[@]};
	do
	echo "</groups>" >> "parser_output/$i.txt"
	done