Skip to content

Instantly share code, notes, and snippets.

@edraizen
Created January 28, 2016 17:57
Show Gist options
  • Save edraizen/aa4b3cae85ec5b579c2e to your computer and use it in GitHub Desktop.
Save edraizen/aa4b3cae85ec5b579c2e to your computer and use it in GitHub Desktop.
from collections import defaultdict

import requests
from bs4 import BeautifulSoup

# Maps each Pfam family identifier to the count parsed from the "UniProt (N)"
# table header of that family's Pfam page. Families whose page has no such
# header are never stored; the defaultdict reports 0 for them on lookup.
num_sequences = defaultdict(int)
with open("/Users/edraizen/Dropbox/Membrane-Proteins/data/pfam-uniprot/pfam-polytopic_201601.txt") as pfam_f:
    # Skip the first line of the file; the default keeps an empty file from
    # raising StopIteration.
    next(pfam_f, None)
    for line in pfam_f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: there is no second column to read.
            continue
        # The family identifier is the second whitespace-separated column.
        pfam = fields[1]
        r = requests.get("http://pfam.xfam.org/family/{}#tabview=tab3".format(pfam))
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.text, "html.parser")
        for th in soup.find_all('th'):
            if "UniProt" in th.text:
                # Take the text after the first "(" and drop its final
                # character (presumably the closing ")") before converting.
                try:
                    num_sequences[pfam] = int(th.text.split("(")[1][:-1])
                except (IndexError, ValueError) as e:
                    # Header did not have the expected "(N)" shape: report
                    # it and keep going with the remaining headers/families.
                    print("%s %s %s" % (e, pfam, th.text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment