Created
January 22, 2020 13:50
-
-
Save rosherbal/bbb4a91e98ca019aee9e032631fa7836 to your computer and use it in GitHub Desktop.
supfam parse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
TITLE : parsing_supfam
AUTHOR : Hernansaiz Ballesteros, Rosa.
         rosa.hernansaiz@bioquant.uni-heidelberg.de
DESCRIPTION : Parsing http://www.supfam.org/ page to extract information
              about superfamily domains of proteins.
              Information retrieved:
              - protein
              - Start/End Residue
              - Superfamily
              - Superfamily E-value
              - Family
              - Family E-value
              - closest structure
LICENSE : GPL-v3
""" | |
#=========== | |
# Packages # | |
#=========== | |
import urllib.request | |
import pandas as pd | |
import time | |
from tqdm import tqdm | |
#=================== | |
# Class definition # | |
#=================== | |
class SuperFam:
    """
    Information on supfam for an entity.

    Plain record holding the fields of one supfam domain assignment.
    Every attribute defaults to None and is stored exactly as given;
    no conversion or validation is performed.

    Attributes:
    - StartResidue
    - EndResidue
    - Superfamily
    - SuperFamEvalue
    - Family
    - FamilyEvalue
    - ClosestStructure
    """

    def __init__(self, StartResidue=None, EndResidue=None,
                 Superfamily=None, SuperFamEvalue=None,
                 Family=None, FamilyEvalue=None,
                 ClosestStructure=None):
        self.StartResidue = StartResidue
        self.EndResidue = EndResidue
        self.Superfamily = Superfamily
        self.SuperFamEvalue = SuperFamEvalue
        self.Family = Family
        self.FamilyEvalue = FamilyEvalue
        self.ClosestStructure = ClosestStructure

    def getSuperFam(self):
        """
        Return the attributes as a dictionary.

        A new dict is built on every call, keyed by attribute name in
        declaration order, so callers may add keys to the result without
        affecting this object.
        """
        return {
            "StartResidue": self.StartResidue,
            "EndResidue": self.EndResidue,
            "Superfamily": self.Superfamily,
            "SuperFamEvalue": self.SuperFamEvalue,
            "Family": self.Family,
            "FamilyEvalue": self.FamilyEvalue,
            "ClosestStructure": self.ClosestStructure,
        }

    def __repr__(self):
        """Return a 'SuperFam(name=value, ...)' string for debugging."""
        fields = ", ".join(
            "{}={!r}".format(name, value)
            for name, value in self.getSuperFam().items()
        )
        return "SuperFam({})".format(fields)
#==================================== | |
# Parameters and required variables # | |
#==================================== | |
network_file = '~/Projects/00-Networks/omnipath_signed_directed_IntAct-Signor-BioGrid_uniprot_Aug2019.tsv'
saveFile = '~/Projects/00-Networks/omnipath_Aug2019_supfam.csv'
supfam_url = 'http://www.supfam.org/genome/up/sequence/'

# Output columns, in the order the cells appear inside one table row of the
# page. These are the same keys SuperFam.getSuperFam() returns.
domain_fields = ("StartResidue", "EndResidue", "Superfamily",
                 "SuperFamEvalue", "Family", "FamilyEvalue",
                 "ClosestStructure")

#===================
# Helper functions #
#===================
def _clean_cell(text):
    """Return the text content of one '<td>...</td>' line of the page."""
    cell = text.strip()
    if cell.startswith("<td>"):
        cell = cell[len("<td>"):]
    if cell.endswith("</td>"):
        cell = cell[:-len("</td>")]
    if "</a>" in cell:
        # Linked cell: keep the text that follows the first '>', i.e. what
        # comes right after the opening <a ...> tag.
        pieces = cell.replace("</a>", "").split(">")
        cell = pieces[1] if len(pieces) > 1 else pieces[0]
    # Quote entities are dropped and commas become ';' in the stored value.
    return cell.replace("&quot;", "").replace(",", ";")


def parse_domain_rows(lines, protein):
    """
    Extract the domain rows from the lines of one supfam sequence page.

    Parameters:
    - lines   : iterable of bytes or str, one item per line of the page
    - protein : identifier stored under the 'protein' key of every row

    Returns a list of dicts, one per complete row, with the keys of
    domain_fields plus 'protein'. Only '<td>' lines located between a
    '\\t<tbody>' line and a '\\t</tbody>' line are read, and lines
    containing 'Alignments' are ignored. A trailing row with fewer cells
    than domain_fields is discarded.
    """
    rows = []
    cells = []
    in_body = False  # local, so no state leaks from one page to the next
    for raw in lines:
        if isinstance(raw, bytes):
            text = raw.decode("utf-8", errors="replace")
        else:
            text = raw
        marker = text.rstrip("\r\n")
        if marker == "\t<tbody>":
            in_body = True
            continue
        if not in_body:
            continue
        if marker == "\t</tbody>":
            in_body = False
            continue
        if "\t<td>" not in text or "Alignments" in text:
            continue
        cells.append(_clean_cell(text))
        if len(cells) == len(domain_fields):
            row = dict(zip(domain_fields, cells))
            row["protein"] = protein
            rows.append(row)
            cells = []
    return rows


def unique_proteins(network):
    """
    Return the distinct identifiers of the 'source' and 'target' columns.

    Missing values are dropped and the order of first appearance is kept,
    so the result is the same on every run.
    """
    identifiers = pd.concat([network["source"], network["target"]])
    return identifiers.dropna().drop_duplicates().tolist()


def fetch_domain_rows(protein):
    """Download the supfam page of *protein* and return its parsed rows."""
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(supfam_url + str(protein)) as response:
        return parse_domain_rows(response.readlines(), protein)


#===============
# Main program #
#===============
def main():
    """Query supfam for every protein of the network and save a CSV."""
    network = pd.read_csv(network_file, sep='\t', header=0)
    proteins = unique_proteins(network)
    supfam = []  # container for all entries
    for protein in tqdm(proteins):
        try:
            supfam.extend(fetch_domain_rows(protein))
        except urllib.error.URLError as err:
            # urllib.error is loaded by 'import urllib.request'. One failed
            # request should not discard the results gathered so far.
            tqdm.write("Skipping {}: {}".format(protein, err))
    # Explicit columns keep the header even when nothing was retrieved.
    sf = pd.DataFrame(supfam, columns=list(domain_fields) + ["protein"])
    sf.to_csv(saveFile, index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment