Last active
March 17, 2017 03:23
-
-
Save suqingdong/12d1de1cb8de805e62a6acc3b59b855a to your computer and use it in GitHub Desktop.
A simple example of crawler with requests and BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# A simple example of a crawler using requests and BeautifulSoup
# Pay attention to encoding
import io

try:  # Python 3
    from urllib.parse import quote, urljoin
except ImportError:  # Python 2
    from urllib import quote
    from urlparse import urljoin

import bs4
import requests
def main(genelist):
    """Process every gene named in a list file.

    Args:
        genelist: Path to a text file containing one gene name per line.

    Each line is stripped of surrounding whitespace and handed to
    ``save_gene``. Lines that are empty after stripping are skipped.
    """
    with open(genelist) as handle:
        for line in handle:
            gene = line.strip()
            # A blank line (commonly the trailing one) would otherwise send
            # an empty search query and create a file named ".txt".
            if not gene:
                continue
            save_gene(gene)
def save_gene(gene):
    """Look up one gene and write the matching page text to ``<gene>.txt``.

    The gene is resolved to a page URL with ``get_real_url``, the page is
    parsed with ``get_soup``, and the text of every ``div`` whose class is
    ``sgHeaderDiv bold sgSmooth`` is written to the output file, one element
    per line. If no such element exists, a single "Nothing found" message is
    written instead. A progress line is printed when the gene is finished.

    Args:
        gene: Gene name; also used as the base name of the output file.
    """
    real_url = get_real_url(gene)
    soup = get_soup(real_url)

    # find_all(class_=...) is used rather than the CSS selector
    # div[class="sgHeaderDiv bold sgSmooth"], which the original author noted
    # does not work on Linux.
    result = soup.find_all('div', class_='sgHeaderDiv bold sgSmooth')
    if result:
        lines = [element.text + '\n' for element in result]
    else:
        lines = ['Nothing found in gene ' + gene]

    # Write with an explicit encoding: page text may contain non-ASCII
    # characters, which a default-encoded file can fail to encode.
    with io.open(gene + '.txt', 'w', encoding='utf-8') as out:
        for line in lines:
            # io.open in text mode only accepts text; on Python 2 a native
            # str is bytes and must be decoded first.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            out.write(line)

    # Single-argument call form prints identically on Python 2 and 3.
    print('Done with gene: %s' % gene)
def get_real_url(gene):
    """Resolve a gene name to the URL of its marker page.

    Submits the gene to the informatics.jax.org search tool and returns the
    target of the first link on the result page whose ``href`` contains
    ``marker/MGI``.

    Args:
        gene: Gene name to search for.

    Returns:
        The URL of the first matching link, made absolute against the
        search URL if the page gave it in relative form.

    Raises:
        IndexError: If the result page contains no matching link.
    """
    # Percent-encode the gene so characters such as spaces, '&' or '#'
    # cannot truncate or corrupt the query string.
    url = ('http://www.informatics.jax.org/searchtool/Search.do?query='
           + quote(gene, safe=''))
    soup = get_soup(url)

    links = soup.select('a[href*="marker/MGI"]')
    if not links:
        # Same exception type as the original bare [0] lookup, but with a
        # message that says which gene failed.
        raise IndexError('No marker/MGI link found for gene %r' % gene)

    # urljoin returns absolute hrefs unchanged and resolves relative ones.
    return urljoin(url, links[0]['href'])
def get_soup(url, encoding='windows-1252', timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        encoding: Codec used to decode the response body. Defaults to
            ``windows-1252``, the value the script has always used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were data.
    response.raise_for_status()
    # 'replace' keeps going on bytes the codec does not define rather than
    # aborting the whole run with UnicodeDecodeError.
    html = response.content.decode(encoding, 'replace')
    return bs4.BeautifulSoup(html, 'html.parser')
if __name__ == '__main__':
    # Command-line entry point: each argument is the path of a gene list.
    import sys

    if len(sys.argv) < 2:
        # Single-argument call form prints identically on Python 2 and 3.
        print('Usage: python %s <genelist>' % sys.argv[0])
        sys.exit(1)

    # main() takes exactly one path, so unpacking several arguments into it
    # would raise TypeError; process each list in turn instead.
    for genelist_path in sys.argv[1:]:
        main(genelist_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment