Skip to content

Instantly share code, notes, and snippets.

@suqingdong
Last active March 17, 2017 03:23
Show Gist options
  • Save suqingdong/12d1de1cb8de805e62a6acc3b59b855a to your computer and use it in GitHub Desktop.
Save suqingdong/12d1de1cb8de805e62a6acc3b59b855a to your computer and use it in GitHub Desktop.
A simple example of crawler with requests and BeautifulSoup
#!/usr/bin/env python
# A simple example of a web crawler built with requests and BeautifulSoup.
# Pay attention to encoding: the fetched pages are decoded as windows-1252.
import bs4
import requests
def main(genelist):
    """Save the marker-page header text for every gene listed in a file.

    Args:
        genelist: Path to a text file containing one gene name per line.

    Each non-empty line is stripped of surrounding whitespace and passed to
    ``save_gene``.  Blank lines are skipped: an empty name would otherwise be
    sent to the search page as an empty query and its result written to a
    file named just ``.txt``.
    """
    with open(genelist) as f:
        for line in f:
            gene = line.strip()
            if not gene:
                continue
            save_gene(gene)
def save_gene(gene):
    """Write the header text of a gene's marker page to ``<gene>.txt``.

    The marker page URL is looked up with ``get_real_url`` and parsed with
    ``get_soup``.  The text of every ``div`` whose class attribute is
    ``"sgHeaderDiv bold sgSmooth"`` is written on its own line; if there is
    no such ``div``, a single "Nothing found" line is written instead.

    Args:
        gene: Gene name; also used as the output file's base name.

    The output file is always UTF-8 encoded so that non-ASCII characters in
    the page text cannot raise ``UnicodeEncodeError`` while writing.
    """
    # Local import: io.open offers an explicit encoding on Python 2 and 3.
    import io

    real_url = get_real_url(gene)
    soup = get_soup(real_url)

    # Passing the full string to class_ matches the exact value of the class
    # attribute.  The CSS form div[class="sgHeaderDiv bold sgSmooth"] was
    # noted by the original author as not working on Linux.
    result = soup.find_all('div', class_='sgHeaderDiv bold sgSmooth')

    # io.open in text mode accepts only text, so make sure the name used in
    # the message is text even where str is a byte string (Python 2).
    label = gene.decode('utf-8') if isinstance(gene, bytes) else gene

    with io.open(gene + '.txt', 'w', encoding='utf-8') as out:
        if result:
            for element in result:
                out.write(element.text + u'\n')
        else:
            # Newline-terminated, consistent with the lines written above.
            out.write(u'Nothing found in gene ' + label + u'\n')

    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('Done with gene: %s' % gene)
def get_real_url(gene):
    """Return the URL of the first marker link in the search results for a gene.

    The gene name is sent as the ``query`` parameter of the
    www.informatics.jax.org search tool, and the first ``<a>`` element whose
    ``href`` contains ``marker/MGI`` is taken as the result.

    Args:
        gene: Gene name to search for.

    Returns:
        The link's ``href``, resolved against the search URL.

    Raises:
        IndexError: If the search results contain no matching link.
    """
    try:
        from urllib.parse import quote_plus, urljoin
    except ImportError:  # Python 2
        from urllib import quote_plus
        from urlparse import urljoin

    # Escape the name so characters such as spaces, '&' or '+' cannot
    # corrupt the query string; plain alphanumeric names are unchanged.
    url = ('http://www.informatics.jax.org/searchtool/Search.do?query='
           + quote_plus(gene))
    soup = get_soup(url)

    links = soup.select('a[href*="marker/MGI"]')
    if not links:
        # Same exception type as indexing the empty list, with a clear message.
        raise IndexError('No marker link found for gene %r' % (gene,))

    # urljoin returns an absolute href unchanged and resolves a relative one
    # against the search URL, so the result can always be fetched directly.
    return urljoin(url, links[0]['href'])
def get_soup(url, timeout=30):
    """Download a page and return it parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # The raw bytes are decoded as windows-1252.  'replace' keeps the few
    # byte values that codec leaves undefined from raising UnicodeDecodeError.
    html = response.content.decode('windows-1252', 'replace')
    return bs4.BeautifulSoup(html, 'html.parser')
if __name__ == '__main__':
    import sys

    # main() takes exactly one path.  Reject a missing argument and also
    # extra arguments, which previously failed with a TypeError.
    if len(sys.argv) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit('Usage: python %s <genelist>' % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment