Skip to content

Instantly share code, notes, and snippets.

@yangyaofei
Last active October 11, 2016 08:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save yangyaofei/69137c0ed4dc65900808 to your computer and use it in GitHub Desktop.
Save yangyaofei/69137c0ed4dc65900808 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import requests
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def getHtml(url):
    """Fetch *url* and return the response body.

    The body is returned exactly as read from the response; no decoding is
    applied here. Any exception raised by ``urllib2`` while opening or
    reading the URL propagates to the caller.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    req = urllib2.Request(url)
    # closing() guarantees the response (and its socket) is released even
    # if read() raises, instead of waiting for garbage collection.
    with closing(urllib2.urlopen(req)) as resp:
        return resp.read()
def getCateUrl(html):
    """Return the category paths found in *html*.

    Each result is the ``href`` value of an empty anchor whose target has
    the form ``/dict/cate/index/<digits>``, in document order. An empty
    list is returned when nothing matches.
    """
    cate_link = re.compile(r'<a href="(/dict/cate/index/[0-9]+)"></a>')
    return cate_link.findall(html)
def getCorpusUrl(html):
    """Return the download links found in *html*.

    A link is the ``href`` of an anchor that immediately follows an opening
    ``<div class="dict_dl_btn">`` tag. Results are in document order; an
    empty list is returned when the page has no such buttons.
    """
    download_btn = re.compile(r'<div class="dict_dl_btn"><a href="([^<]+)">')
    return [hit.group(1) for hit in download_btn.finditer(html)]
def getCorpus(url,referer):
    """Download one dictionary file and save it as ``<name>.scel``.

    ``url`` is the download link; the file name is taken from the text
    following ``name=`` in it. ``referer`` is the listing page the link was
    found on and is sent as the HTTP ``Referer`` header.

    This is best-effort: any I/O failure (network error, HTTP error status,
    timeout, unwritable file, or a URL without ``name=``) is appended to
    ``errorLog.txt`` as one tab-separated line and the function returns
    normally so the caller can continue with the next link.
    """
    try:
        names = re.findall(r'name=([^<]+)',url)
        if not names:
            # Route this through the same logging path as other failures
            # instead of letting an IndexError abort the whole crawl.
            raise IOError("no 'name=' parameter in download URL")
        # The name comes from scraped HTML; neutralise path separators so
        # the file always lands in the current directory.
        name = names[0].replace("/","_").replace("\\","_")
        print("get:"+name)
        # The timeout keeps one stalled connection from hanging the crawl.
        resp = requests.get(url,headers={"Referer":referer},stream=True,timeout=30)
        try:
            # Do not save an HTTP error page as if it were a dictionary.
            resp.raise_for_status()
            with open(name+".scel","wb") as f:
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            resp.close()
    except IOError as e:
        # requests' exceptions derive from IOError, so network failures and
        # file errors both end up here.
        with open("errorLog.txt","a") as log:
            log.write("%s\t%s\t%s\n" % (url,referer,e))
# Site root; the category paths scraped from the HTML are appended to it.
root_url = "http://pinyin.sogou.com"
# Page whose category links seed the crawl.
main_url = "http://pinyin.sogou.com/dict/cate/index/167"
html = getHtml(main_url)
cate_url_list = getCateUrl(html)
for cate_url in cate_url_list:
    print("下载:"+cate_url)
    page_no = 0
    # Walk the category's listing pages (1, 2, 3, ...) until one of them
    # contains no download links.
    while True:
        page_no += 1
        page_label = str(page_no)
        print("第"+page_label+"页")
        page_url = root_url+cate_url+"/default/"+page_label
        download_links = getCorpusUrl(getHtml(page_url))
        if not download_links:
            # Ran past the last page of this category.
            print("下载完")
            break
        for link in download_links:
            # The listing page is passed along as the referer of each download.
            getCorpus(link,page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment