@kzinglzy
Last active August 29, 2015 14:02
Simple Huxiu Spider
#!/usr/bin/python3
from __future__ import print_function  # keep the python 2.x fallback usable
import requests
from bs4 import BeautifulSoup

try:
    from urllib.parse import urljoin  # python 3.x
except ImportError:
    from urlparse import urljoin      # python 2.x


class Spider:

    def __init__(self, URL):
        self.URL = URL  # e.g. http://www.huxiu.com/

    def start_crawl(self):
        """ mission start :)
        """
        for t_name, t_url in self.get_tag_list():
            if not t_name:
                continue
            print('\n', t_name)
            self.get_article_from_tag(t_url)

    def get_tag_list(self):
        """ yield every tag and its corresponding url
        """
        target = "tagslist/all.html"  # this url contains all of the tags :)
        url = urljoin(self.URL, target)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        for each in soup.find_all('dl'):
            for t in each('li'):
                yield t('a')[0].string, t('a')[0]['href']  # tag_name, tag_url

    def get_article_from_tag(self, tag_url):
        """ print every article filed under the given tag,
            following the pagination links page by page
        """
        url = urljoin(self.URL, tag_url)
        index = 1
        while True:
            try:
                r = requests.get(url)
            except requests.RequestException:
                print('Failed to fetch URL: {}'.format(url))
                return

            # print the article titles on the current page
            soup = BeautifulSoup(r.text, 'html.parser')
            for each in soup.find_all('dl'):
                print('{}. {}'.format(index, each('h3')[0].string))
                index += 1

            # follow the "next page" link; the pager marks the current
            # page with a <b> tag, and the link to the next page is the
            # <a> tag right after it
            current = soup.find('div', 'pull-right pgs')
            if current:  # some pages have no pager, so make sure current is not None
                marker = current.find('b')
                next_url = marker.find_next_sibling('a') if marker else None
                if next_url:
                    url = urljoin(self.URL, next_url['href'])
                    continue
            break  # every article under this tag has been indexed


if __name__ == '__main__':
    sp = Spider('http://www.huxiu.com/')
    sp.start_crawl()
@kzinglzy (Author)

This code is used to find all of the tags and their corresponding articles from www.huxiu.com.

Just for fun :)
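
If you only want the articles for a single tag instead of the whole site, you can call the method directly. A minimal sketch, assuming the Spider class above; the tag path 'tags/1.html' is a made-up placeholder, real paths come from get_tag_list():

sp = Spider('http://www.huxiu.com/')
# 'tags/1.html' is a hypothetical tag path, used here only for illustration
sp.get_article_from_tag('tags/1.html')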
