lmmx/countingfrominterfaces.sh

## countingfrominterfaces.sh
cut -d $'\t' -f 1 interfaces.tsv | sort | uniq | wc -l
# ==> 10,750 successfully downloaded and processed PDB entries in the interfaces file
# from the 26,425-strong shortlist

#  36,425 = 10,750 succesful + 2,327 verified as not found* + 13,348 missing from results without an explanation, later dissected as:
#
#     13,348 missing = 1,128 not found** + 6,676 downloaded with status OK (should've been processed...) + 5,544 not downloaded***
#
# *first time round, in notfoundcodemap.tsv via logfile_notfound.txt from the script pdbshortlist_mapper.sh in this gist - more were found on the...
# ...**second time round, marked as such in missingcodemapstatuses.tsv created by missingcodestatuschecker.sh
# *** 5,544 = 4,253 missed in file downloads that did occur + 1,291 from entirely empty files whose download skipped somehow

## missingcodestatuschecker.sh
#!/usr/bin/sh

# this code can be read as:
#
# for every line in the file (while read do ... done <missingcodemapsorted.tsv)
#	if the line does not contain the column header "query_of_2029" then (it's not the column names)
#		set $linecode as the first tab-separated field and $linefile as the second ('pdb_id' and 'file' colunmns)
#		find the status by getting **every PDB code in the file** and matching against the PDB code on this line, adding 1 to get the following line number containing status...
#		... and grepping the XML specifically at this line (case insensitively, i.e. slow but not for this short string, and obviously quicker to saerch shortlist of <pdb_code> tags)
#		NB this process is vastly sped up by keeping the shortlist in a variable $codelist, and retaining this (reading a list of missed codes sorted by file) until the next file
#		write the line to a report (either for its status or just that it's not downloaded), removing leading whitespace on any <status> tags by leaving it out of the quote marks:
#			"$line\t"$status
while read line
do
if [[ $line != *"query_of_2029"* ]]; then
	linecode=$(echo "$line" | awk '{split($0,a,"\t"); print a[1]}')
	if [[ $linefile != $(echo "$line" | awk '{split($0,a,"\t"); print a[2]}') ]]; then
		linefile=$(echo "$line" | awk '{split($0,a,"\t"); print a[2]}')
		codelist=$(grep -n '<pdb_code>' $linefile)
	fi
	echo -e "$linecode\t~\t$linefile"
	if [[ $codelist != '' ]]; then
		codematch=$(echo $codelist | grep -im 1 $linecode)
		if [[ $codematch != '' ]]; then
			statusline=$(echo $codematch | echo $(awk '{split($0,a,":"); print a[1]}') + 1 | bc)
			status=$(sed $statusline'q;d' $linefile)
			echo -e "$line\t"$status >> missingcodemapstatuses.tsv
		else
			echo -e "$linecode\t$linefile\tMissed download" >> missingcodesnotdownloaded.tsv
		fi
	else
		echo -e "$linecode\t$linefile\tWhole file" >> missingcodesnotdownloaded.tsv
	fi
fi
done <missingcodemapsorted.tsv

# NB this code was used after initial use of pdbshortlist_mapper.sh
# (... I realised during this that symbolic links are cooler than switching directories)

## pdbshortlist_mapper.sh
#!/usr/bin/sh

# . ~/Documents/LMB/PISA/pdbshortlist_mapper.sh

cd ~/Documents/LMB/PISA/

printf 'pdb_id\tfile\tquery_of_2029\n' > missingcodemap.tsv

missingcodes=$(cat pdbshortlist_round2_found.txt)

for pdbcode in ${missingcodes[@]}
do
	n=$(grep -in $pdbcode PISA50.txt | cut -d: -f1)
	floored_n=$(echo "($n - 1) / 50 + 11" | bc)
	fl_n_len=$((${#floored_n}-1))
	if [ ${floored_n:$fl_n_len} -eq 0 ]; then
		(( floored_n -= 9 ))
		floored_n+=0
	fi
	filename=interfaces${floored_n:0:1}-${floored_n:1}.xml
	if [ $filename == 'interfaces5-1.xml' ]; then
		filename=interfaces4-11.xml
	fi
	printf "$pdbcode\t$filename\t$n\n" >> missingcodemap.tsv
done

# To clarify, there were 2,029 queries of 50 (the last with just 12) =  101,412.
# PDB codes are not present in the output file interfaces.tsv as had been expected.
# The shortlist (unfortunately only obtained after download, hence used to target
# the processing step) stores 26,425 PDB codes identifying human protein-protein,
# protein-DNA and/or protein-RNA interface-containing deposited structures.

# “Entry not found” PDB codes in logfile.txt explain some of the 'missing' PDB codes
# >> saved as logfile_notfound.txt

# pdbshortlist.txt contains 26,425 (unique) PDB codes

# codes we have in interfaces.tsv saved >> interface_pdblist.txt...
# Missing IDs are the difference between pdbshortlist.txt and interface_pdblist.txt
# >> saved as pdbshortlist_round2.txt

# Further subset the missing proteins by removing any with "Entry not found" (for now at least)
# >> saved as pdbshortlist_round2_found.txt
	cut -d $'\t' -f 1 interfaces.tsv \| sort \| uniq \| wc -l
	# ==> 10,750 successfully downloaded and processed PDB entries in the interfaces file
	# from the 26,425-strong shortlist

	# 36,425 = 10,750 succesful + 2,327 verified as not found* + 13,348 missing from results without an explanation, later dissected as:
	#
	# 13,348 missing = 1,128 not found + 6,676 downloaded with status OK (should've been processed...) + 5,544 not downloaded*
	#
	# *first time round, in notfoundcodemap.tsv via logfile_notfound.txt from the script pdbshortlist_mapper.sh in this gist - more were found on the...
	# ...**second time round, marked as such in missingcodemapstatuses.tsv created by missingcodestatuschecker.sh
	# *** 5,544 = 4,253 missed in file downloads that did occur + 1,291 from entirely empty files whose download skipped somehow
	#!/usr/bin/sh

	# this code can be read as:
	#
	# for every line in the file (while read do ... done <missingcodemapsorted.tsv)
	# if the line does not contain the column header "query_of_2029" then (it's not the column names)
	# set $linecode as the first tab-separated field and $linefile as the second ('pdb_id' and 'file' colunmns)
	# find the status by getting every PDB code in the file and matching against the PDB code on this line, adding 1 to get the following line number containing status...
	# ... and grepping the XML specifically at this line (case insensitively, i.e. slow but not for this short string, and obviously quicker to saerch shortlist of <pdb_code> tags)
	# NB this process is vastly sped up by keeping the shortlist in a variable $codelist, and retaining this (reading a list of missed codes sorted by file) until the next file
	# write the line to a report (either for its status or just that it's not downloaded), removing leading whitespace on any <status> tags by leaving it out of the quote marks:
	# "$line\t"$status
	while read line
	do
	if [[ $line != "query_of_2029" ]]; then
	linecode=$(echo "$line" \| awk '{split($0,a,"\t"); print a[1]}')
	if [[ $linefile != $(echo "$line" \| awk '{split($0,a,"\t"); print a[2]}') ]]; then
	linefile=$(echo "$line" \| awk '{split($0,a,"\t"); print a[2]}')
	codelist=$(grep -n '<pdb_code>' $linefile)
	fi
	echo -e "$linecode\t~\t$linefile"
	if [[ $codelist != '' ]]; then
	codematch=$(echo $codelist \| grep -im 1 $linecode)
	if [[ $codematch != '' ]]; then
	statusline=$(echo $codematch \| echo $(awk '{split($0,a,":"); print a[1]}') + 1 \| bc)
	status=$(sed $statusline'q;d' $linefile)
	echo -e "$line\t"$status >> missingcodemapstatuses.tsv
	else
	echo -e "$linecode\t$linefile\tMissed download" >> missingcodesnotdownloaded.tsv
	fi
	else
	echo -e "$linecode\t$linefile\tWhole file" >> missingcodesnotdownloaded.tsv
	fi
	fi
	done <missingcodemapsorted.tsv

	# NB this code was used after initial use of pdbshortlist_mapper.sh
	# (... I realised during this that symbolic links are cooler than switching directories)
	#!/usr/bin/sh

	# . ~/Documents/LMB/PISA/pdbshortlist_mapper.sh

	cd ~/Documents/LMB/PISA/

	printf 'pdb_id\tfile\tquery_of_2029\n' > missingcodemap.tsv

	missingcodes=$(cat pdbshortlist_round2_found.txt)

	for pdbcode in ${missingcodes[@]}
	do
	n=$(grep -in $pdbcode PISA50.txt \| cut -d: -f1)
	floored_n=$(echo "($n - 1) / 50 + 11" \| bc)
	fl_n_len=$((${#floored_n}-1))
	if [ ${floored_n:$fl_n_len} -eq 0 ]; then
	(( floored_n -= 9 ))
	floored_n+=0
	fi
	filename=interfaces${floored_n:0:1}-${floored_n:1}.xml
	if [ $filename == 'interfaces5-1.xml' ]; then
	filename=interfaces4-11.xml
	fi
	printf "$pdbcode\t$filename\t$n\n" >> missingcodemap.tsv
	done

	# To clarify, there were 2,029 queries of 50 (the last with just 12) = 101,412.
	# PDB codes are not present in the output file interfaces.tsv as had been expected.
	# The shortlist (unfortunately only obtained after download, hence used to target
	# the processing step) stores 26,425 PDB codes identifying human protein-protein,
	# protein-DNA and/or protein-RNA interface-containing deposited structures.

	# “Entry not found” PDB codes in logfile.txt explain some of the 'missing' PDB codes
	# >> saved as logfile_notfound.txt

	# pdbshortlist.txt contains 26,425 (unique) PDB codes

	# codes we have in interfaces.tsv saved >> interface_pdblist.txt...
	# Missing IDs are the difference between pdbshortlist.txt and interface_pdblist.txt
	# >> saved as pdbshortlist_round2.txt

	# Further subset the missing proteins by removing any with "Entry not found" (for now at least)
	# >> saved as pdbshortlist_round2_found.txt