Skip to content

Instantly share code, notes, and snippets.

@rosherbal
Created January 22, 2020 13:50
Show Gist options
  • Save rosherbal/bbb4a91e98ca019aee9e032631fa7836 to your computer and use it in GitHub Desktop.
Save rosherbal/bbb4a91e98ca019aee9e032631fa7836 to your computer and use it in GitHub Desktop.
supfam parse
"""
TITLE :parsing_supfam
AUTHOR : Hernansaiz Ballesteros, Rosa.
rosa.hernansaiz@bioquant.uni-heidelberg.de
DESCRIPTION : Parsing http://www.supfam.org/ pages to extract information
about the superfamily domains of proteins
Information retrieved:
- protein
- Start/End Residue
- Superfamily
- Superfamily E-value
- Family
- Family E-value
- closest structure
LICENSE : GPL-v3
"""
#===========
# Packages #
#===========
import time
import urllib.error
import urllib.request

import pandas as pd
from tqdm import tqdm
#===================
# Class definition #
#===================
class SuperFam:
    """
    Information on supfam for an entity (one domain assignment).

    Attributes (all default to None; values are stored exactly as given):
    - StartResidue
    - EndResidue
    - Superfamily
    - SuperFamEvalue
    - Family
    - FamilyEvalue
    - ClosestStructure
    """

    def __init__(self, StartResidue=None, EndResidue=None,
                 Superfamily=None, SuperFamEvalue=None,
                 Family=None, FamilyEvalue=None,
                 ClosestStructure=None):
        """Store the seven fields unchanged; no validation or conversion."""
        self.StartResidue = StartResidue
        self.EndResidue = EndResidue
        self.Superfamily = Superfamily
        self.SuperFamEvalue = SuperFamEvalue
        self.Family = Family
        self.FamilyEvalue = FamilyEvalue
        self.ClosestStructure = ClosestStructure

    def getSuperFam(self):
        """
        Return the seven attributes as a new dict keyed by attribute name.

        The dict is built on every call, so callers may add keys to it (or
        modify it) without affecting this object. Key order is fixed:
        StartResidue, EndResidue, Superfamily, SuperFamEvalue, Family,
        FamilyEvalue, ClosestStructure.
        """
        sf_dict = {"StartResidue": self.StartResidue,
                   "EndResidue": self.EndResidue,
                   "Superfamily": self.Superfamily,
                   "SuperFamEvalue": self.SuperFamEvalue,
                   "Family": self.Family,
                   "FamilyEvalue": self.FamilyEvalue,
                   "ClosestStructure": self.ClosestStructure}
        return sf_dict

    def __repr__(self):
        """Return a constructor-like representation, useful when debugging."""
        fields = ", ".join(f"{name}={value!r}"
                           for name, value in self.getSuperFam().items())
        return f"{type(self).__name__}({fields})"
#====================================
# Parameters and required variables #
#====================================
# Tab-separated network table with a header row; the identifiers to look up
# are taken from its 'source' and 'target' columns.
network_file = '~/Projects/00-Networks/omnipath_signed_directed_IntAct-Signor-BioGrid_uniprot_Aug2019.tsv'
network = pd.read_csv(network_file, sep='\t', header=0)
# Unique identifiers in order of first appearance. dict.fromkeys() (unlike
# set()) keeps that order, so the output rows come out the same on every run.
# Missing values are dropped because they cannot be appended to the query URL.
proteins = [
    protein
    for protein in dict.fromkeys(network['source'].tolist() + network['target'].tolist())
    if pd.notna(protein)
]
# Destination of the CSV written once all proteins have been processed.
saveFile = '~/Projects/00-Networks/omnipath_Aug2019_supfam.csv'
supfam = [] # container for all entries
flg = False # control of entries' information
#===============
# Main program #
#===============
# Base URL of the per-sequence page; the protein identifier is appended to it.
SUPFAM_URL = 'http://www.supfam.org/genome/up/sequence/'
# Seconds to wait for the server before giving up on a single protein.
REQUEST_TIMEOUT = 60
# SuperFam attribute filled by the 1st, 2nd, ... 7th data cell of a table row.
CELL_ATTRIBUTES = ('StartResidue', 'EndResidue', 'Superfamily',
                   'SuperFamEvalue', 'Family', 'FamilyEvalue',
                   'ClosestStructure')


def _clean_cell(line):
    """
    Return the text content of one '<td>...</td>' line of the page.

    The surrounding whitespace and the <td>/</td> tags are removed. When the
    cell wraps its text in a link, only the link text is kept. '&quot;'
    entities are dropped and commas become ';', as in the files this script
    has always produced.
    """
    cell = line.strip()
    if cell.startswith('<td>'):
        cell = cell[len('<td>'):]
    if cell.endswith('</td>'):
        cell = cell[:-len('</td>')]
    if '</a>' in cell:
        # '<a href="...">text</a>' -> 'text': drop the closing tag and keep
        # what follows the first '>' (the end of the opening tag).
        pieces = cell.replace('</a>', '').split('>')
        cell = pieces[1] if len(pieces) > 1 else pieces[0]
    return cell.replace('&quot;', '').replace(',', ';')


def _parse_domain_rows(lines, protein):
    """
    Extract the domain assignments of one protein from its page.

    Parameters:
    - lines: decoded text lines of the page, each still ending in its newline.
    - protein: identifier stored under the 'protein' key of every row.

    Returns a list of dicts, one per complete table row (seven data cells),
    each holding the SuperFam fields plus 'protein'. Only lines between
    '\\t<tbody>' and '\\t</tbody>' are considered; cells containing
    'Alignments' are ignored. The table state is local to this call, so a
    malformed page cannot affect the parsing of the next protein.
    """
    rows = []
    domain = SuperFam()
    in_table = False
    cell_no = 0
    for line in lines:
        if line == '\t<tbody>\n':
            in_table = True
            continue
        if not in_table:
            continue
        if line == '\t</tbody>\n':
            in_table = False
            continue
        if '\t<td>' not in line or 'Alignments' in line:
            continue
        setattr(domain, CELL_ATTRIBUTES[cell_no], _clean_cell(line))
        cell_no += 1
        if cell_no == len(CELL_ATTRIBUTES):
            # Row complete: store it and start a fresh record.
            row = domain.getSuperFam()
            row['protein'] = protein
            rows.append(row)
            domain = SuperFam()
            cell_no = 0
    return rows


for protein in tqdm(proteins):
    try:
        # 'with' closes the connection even if reading fails part-way.
        with urllib.request.urlopen(SUPFAM_URL + protein,
                                    timeout=REQUEST_TIMEOUT) as page:
            # Decode real text instead of parsing the repr of the bytes, so
            # quotes, backslashes and non-ASCII characters come out intact.
            charset = page.headers.get_content_charset() or 'utf-8'
            lines = [raw.decode(charset, errors='replace') for raw in page]
    except OSError as err:
        # URLError/HTTPError and socket timeouts are all OSError subclasses.
        # Report the failure and go on, so one bad page does not discard the
        # results already collected for every other protein.
        tqdm.write(f"Skipping {protein}: {err}")
        continue
    supfam.extend(_parse_domain_rows(lines, protein))

# Explicit columns keep the header in the CSV even when nothing was found.
columns = list(SuperFam().getSuperFam()) + ['protein']
sf = pd.DataFrame(supfam, columns=columns)
sf.to_csv(saveFile, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment