Last active
August 29, 2015 14:07
Mapping remaining PDB codes to source XML files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cut -d $'\t' -f 1 interfaces.tsv | sort | uniq | wc -l | |
# ==> 10,750 successfully downloaded and processed PDB entries in the interfaces file | |
# from the 26,425-strong shortlist | |
# 36,425 = 10,750 succesful + 2,327 verified as not found* + 13,348 missing from results without an explanation, later dissected as: | |
# | |
# 13,348 missing = 1,128 not found** + 6,676 downloaded with status OK (should've been processed...) + 5,544 not downloaded*** | |
# | |
# *first time round, in notfoundcodemap.tsv via logfile_notfound.txt from the script pdbshortlist_mapper.sh in this gist - more were found on the... | |
# ...**second time round, marked as such in missingcodemapstatuses.tsv created by missingcodestatuschecker.sh | |
# *** 5,544 = 4,253 missed in file downloads that did occur + 1,291 from entirely empty files whose download skipped somehow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/sh | |
# this code can be read as: | |
# | |
# for every line in the file (while read do ... done <missingcodemapsorted.tsv) | |
# if the line does not contain the column header "query_of_2029" then (it's not the column names) | |
# set $linecode as the first tab-separated field and $linefile as the second ('pdb_id' and 'file' colunmns) | |
# find the status by getting **every PDB code in the file** and matching against the PDB code on this line, adding 1 to get the following line number containing status... | |
# ... and grepping the XML specifically at this line (case insensitively, i.e. slow but not for this short string, and obviously quicker to saerch shortlist of <pdb_code> tags) | |
# NB this process is vastly sped up by keeping the shortlist in a variable $codelist, and retaining this (reading a list of missed codes sorted by file) until the next file | |
# write the line to a report (either for its status or just that it's not downloaded), removing leading whitespace on any <status> tags by leaving it out of the quote marks: | |
# "$line\t"$status | |
while read line | |
do | |
if [[ $line != *"query_of_2029"* ]]; then | |
linecode=$(echo "$line" | awk '{split($0,a,"\t"); print a[1]}') | |
if [[ $linefile != $(echo "$line" | awk '{split($0,a,"\t"); print a[2]}') ]]; then | |
linefile=$(echo "$line" | awk '{split($0,a,"\t"); print a[2]}') | |
codelist=$(grep -n '<pdb_code>' $linefile) | |
fi | |
echo -e "$linecode\t~\t$linefile" | |
if [[ $codelist != '' ]]; then | |
codematch=$(echo $codelist | grep -im 1 $linecode) | |
if [[ $codematch != '' ]]; then | |
statusline=$(echo $codematch | echo $(awk '{split($0,a,":"); print a[1]}') + 1 | bc) | |
status=$(sed $statusline'q;d' $linefile) | |
echo -e "$line\t"$status >> missingcodemapstatuses.tsv | |
else | |
echo -e "$linecode\t$linefile\tMissed download" >> missingcodesnotdownloaded.tsv | |
fi | |
else | |
echo -e "$linecode\t$linefile\tWhole file" >> missingcodesnotdownloaded.tsv | |
fi | |
fi | |
done <missingcodemapsorted.tsv | |
# NB this code was used after initial use of pdbshortlist_mapper.sh | |
# (... I realised during this that symbolic links are cooler than switching directories) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/sh | |
# . ~/Documents/LMB/PISA/pdbshortlist_mapper.sh | |
cd ~/Documents/LMB/PISA/ | |
printf 'pdb_id\tfile\tquery_of_2029\n' > missingcodemap.tsv | |
missingcodes=$(cat pdbshortlist_round2_found.txt) | |
for pdbcode in ${missingcodes[@]} | |
do | |
n=$(grep -in $pdbcode PISA50.txt | cut -d: -f1) | |
floored_n=$(echo "($n - 1) / 50 + 11" | bc) | |
fl_n_len=$((${#floored_n}-1)) | |
if [ ${floored_n:$fl_n_len} -eq 0 ]; then | |
(( floored_n -= 9 )) | |
floored_n+=0 | |
fi | |
filename=interfaces${floored_n:0:1}-${floored_n:1}.xml | |
if [ $filename == 'interfaces5-1.xml' ]; then | |
filename=interfaces4-11.xml | |
fi | |
printf "$pdbcode\t$filename\t$n\n" >> missingcodemap.tsv | |
done | |
# To clarify, there were 2,029 queries of 50 (the last with just 12) = 101,412. | |
# PDB codes are not present in the output file interfaces.tsv as had been expected. | |
# The shortlist (unfortunately only obtained after download, hence used to target | |
# the processing step) stores 26,425 PDB codes identifying human protein-protein, | |
# protein-DNA and/or protein-RNA interface-containing deposited structures. | |
# “Entry not found” PDB codes in logfile.txt explain some of the 'missing' PDB codes | |
# >> saved as logfile_notfound.txt | |
# pdbshortlist.txt contains 26,425 (unique) PDB codes | |
# codes we have in interfaces.tsv saved >> interface_pdblist.txt... | |
# Missing IDs are the difference between pdbshortlist.txt and interface_pdblist.txt | |
# >> saved as pdbshortlist_round2.txt | |
# Further subset the missing proteins by removing any with "Entry not found" (for now at least) | |
# >> saved as pdbshortlist_round2_found.txt | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment