@kzinglzy
Last active August 29, 2015 14:02
Simple Huxiu Spider
#!/usr/bin/python3
from __future__ import print_function  # keep the python 2.x fallback usable
import requests
from bs4 import BeautifulSoup

try:
    from urllib.parse import urljoin  # python 3.x
except ImportError:
    from urlparse import urljoin      # python 2.x


class Spider:

    def __init__(self, URL):
        self.URL = URL  # e.g. http://www.huxiu.com/

    def start_crawl(self):
        """ mission start :)
        """
        for t_name, t_url in self.get_tag_list():
            if not t_name:
                continue
            print('\n', t_name)
            self.get_article_from_tag(t_url)

    def get_tag_list(self):
        """ yield every tag and its corresponding url
        """
        target = "tagslist/all.html"  # this url contains all of the tags :)
        url = urljoin(self.URL, target)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        for each in soup.find_all('dl'):
            for t in each('li'):
                yield t('a')[0].string, t('a')[0]['href']  # tag_name, tag_url

    def get_article_from_tag(self, tag_url):
        """ print every article filed under the given tag,
            following the pagination links page by page
        """
        url = urljoin(self.URL, tag_url)
        index = 1
        while True:
            try:
                r = requests.get(url)
            except requests.RequestException:
                print('Failed to fetch URL: {}'.format(url))
                return

            # print the article titles on the current page
            soup = BeautifulSoup(r.text, 'html.parser')
            for each in soup.find_all('dl'):
                print('{}. {}'.format(index, each('h3')[0].string))
                index += 1

            # follow the "next page" link; the pager marks the current
            # page with a <b> tag, and the link to the next page is the
            # <a> tag right after it
            current = soup.find('div', 'pull-right pgs')
            if current:  # some pages have no pager, so make sure current is not None
                marker = current.find('b')
                next_url = marker.find_next_sibling('a') if marker else None
                if next_url:
                    url = urljoin(self.URL, next_url['href'])
                    continue
            break  # every article under this tag has been indexed


if __name__ == '__main__':
    sp = Spider('http://www.huxiu.com/')
    sp.start_crawl()
@kzinglzy (Author)

This code is used to find all of the tags and their corresponding articles from www.huxiu.com.

Just for fun :)
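
If you only want the articles for a single tag instead of the whole site, you can call the method directly. A minimal sketch, assuming the Spider class above; the tag path 'tags/1.html' is a made-up placeholder, real paths come from get_tag_list():

sp = Spider('http://www.huxiu.com/')
# 'tags/1.html' is a hypothetical tag path, used here only for illustration
sp.get_article_from_tag('tags/1.html')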
