Skip to content

Instantly share code, notes, and snippets.

@wjx
Created April 30, 2015 11:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wjx/6091f3656297c693f9dd to your computer and use it in GitHub Desktop.
Save wjx/6091f3656297c693f9dd to your computer and use it in GitHub Desktop.
Archive hi.baidu.com articles.
#!/usr/bin/env python
import os
import sys
import io
import re
import requests
from bs4 import BeautifulSoup
from time import sleep
import random
ARCHIVE_ROOT = './archives'
ERROR_URL = 'http://hi.baidu.com/com/error'
# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def year_from_href(year_href):
    """Return the part of *year_href* after its last '='.

    The whole string is returned when it contains no '='.
    """
    return year_href[year_href.rfind('=') + 1:]


def tag_text(tag, default=''):
    """Return the text inside a BeautifulSoup *tag*, or *default* if it is None.

    ``find``/``find_next``/``find_previous`` return None when nothing matches,
    and ``.string`` is None for tags with several children, so ``get_text()``
    is used behind a None check instead of ``.string``.
    """
    if tag is None:
        return default
    return tag.get_text()


def make_filename(title, item_time):
    """Build the file name (no directory part) an article is saved under.

    The title is stripped and its spaces become underscores, then *item_time*
    is appended. Path separators are replaced with underscores so the name
    cannot point into another directory. An empty result falls back to
    'untitled'.
    """
    name = title.strip().replace(' ', '_') + item_time
    for separator in ('/', '\\'):
        name = name.replace(separator, '_')
    return (name or 'untitled') + '.html'


def ensure_dir(path):
    """Create *path* and any missing parents; do nothing if it already exists.

    Raises:
        OSError: if creation fails and *path* is not an existing directory.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def archive_article(item, item_year):
    """Download the article linked by *item* and save its HTML to disk.

    The page is written to ARCHIVE_ROOT/<item_year>/<month>/<title><time>.html,
    where month and time are read from the elements surrounding *item* on the
    year page and the title is read from the article page itself.

    Returns:
        True if the article was saved, False if the request did not return 200.
    """
    item_month = tag_text(item.find_previous('h3', class_='month-tip'), 'unknown')
    item_time = tag_text(item.find_next('span', class_='info-time'))
    item_href = item.get('href')

    item_r = requests.get(item_href, timeout=REQUEST_TIMEOUT)
    if item_r.status_code != 200:
        print("requests for {0} failed!".format(item_href))
        # Do not archive an error response as if it were the article.
        return False
    item_r.encoding = 'utf8'
    item_s = BeautifulSoup(item_r.text)
    title = tag_text(item_s.find(class_='title content-title'))

    archive_dir = os.path.join(ARCHIVE_ROOT, item_year, item_month)
    ensure_dir(archive_dir)
    dst_filename = os.path.join(archive_dir, make_filename(title, item_time))
    with io.open(dst_filename, 'w', encoding='utf8') as f:
        f.write(item_r.text)
    return True


def main():
    """Prompt for a hi.baidu.com user name and archive that user's articles.

    Reads the list of per-year archive links from the user's archive page,
    then saves every article linked from each year page. Exits with status 1
    if the archive page cannot be fetched or has no year list.
    """
    # Python 2 calls the line-reading builtin raw_input; Python 3 calls it input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    username = read_line('Enter user name:')
    archive_base = 'http://hi.baidu.com/' + username + '/archive'

    # get year list
    r = requests.get(archive_base, timeout=REQUEST_TIMEOUT)
    if r.status_code != 200:
        print("Request archive base failed.")
        sys.exit(1)
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin one here if results must be identical across machines.
    s = BeautifulSoup(r.text)
    year_list = s.find('div', id='fiList')
    if year_list is None:
        print("Year list not found on archive page.")
        sys.exit(1)
    year_links = year_list.find_all('a', recursive=False)
    year_hrefs = [a.get('href') for a in year_links if a.get('href')]

    for year_href in year_hrefs:
        item_year = year_from_href(year_href)
        r = requests.get(year_href, timeout=REQUEST_TIMEOUT)
        if r.url == ERROR_URL:
            # Landing on the error page stops the whole run, not just this year.
            print("requests for {0} failed!".format(year_href))
            break
        r.encoding = 'utf8'
        year_soup = BeautifulSoup(r.text)
        items = year_soup.find_all('a', class_='blog-item blog-text')
        print("Year {0} ,{1} articles.".format(item_year, len(items)))
        for item in items:
            archive_article(item, item_year)
            # Random pause between article requests.
            sleep(random.randint(3, 10))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment