Last active
October 11, 2016 08:54
-
-
Save yangyaofei/69137c0ed4dc65900808 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import re | |
import requests | |
import urllib2 | |
import sys | |
reload(sys) | |
sys.setdefaultencoding('utf8') | |
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The body exactly as read from the connection. No decoding is
        applied here (the original author was unsure whether decoding to
        UTF-8 was necessary), so callers receive the undecoded content.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request cannot be completed.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Close explicitly so the socket is released even if read() fails;
        # try/finally is used because Python 2 urllib2 responses cannot be
        # used in a ``with`` statement.
        response.close()
def getCateUrl(html):
    """Collect category index paths from a listing page.

    Args:
        html: HTML text to scan.

    Returns:
        A list of relative paths of the form ``/dict/cate/index/<digits>``,
        one per empty ``<a href="..."></a>`` tag that matches, in the order
        they occur in *html*. Empty when nothing matches.
    """
    category_link = re.compile(r'<a href="(/dict/cate/index/[0-9]+)"></a>')
    return category_link.findall(html)
def getCorpusUrl(html):
    """Collect dictionary download links from a category page.

    Args:
        html: HTML text to scan.

    Returns:
        A list with the ``href`` value of every anchor that directly
        follows a ``<div class="dict_dl_btn">`` opening tag, in the order
        they occur in *html*. Empty when the page has no such buttons.
    """
    download_link = re.compile(r'<div class="dict_dl_btn"><a href="([^<]+)">')
    return download_link.findall(html)
def getCorpus(url,referer):
    """Download one dictionary file and save it as ``<name>.scel``.

    The file name is taken from the ``name=`` parameter embedded in *url*
    and the file is written to the current working directory. Failures are
    not raised to the caller: the URL and referer are appended to
    ``errorLog.txt`` instead, one tab-separated entry per line.

    Args:
        url: Download URL of the dictionary; must contain ``name=``.
        referer: URL of the listing page the link was found on. It is sent
            as the ``Referer`` request header and recorded on failure.

    Returns:
        None.
    """
    try:
        names = re.findall(r'name=([^<]+)', url)
        if not names:
            # Route a malformed link through the same logging path as a
            # failed download instead of crashing with IndexError.
            raise IOError("no name= parameter in " + url)
        name = names[0]
        print("get:" + name)
        # The listing page is the referer, not the download URL itself.
        resp = requests.get(url, headers={"Referer": referer}, stream=True)
        try:
            # Avoid saving an HTTP error page under a .scel name.
            resp.raise_for_status()
            with open(name + ".scel", "wb") as f:
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            # A streamed response holds its connection until closed.
            resp.close()
    except IOError:
        # requests' exception classes derive from IOError, so this covers
        # both network failures and local file errors.
        with open("errorLog.txt", "a") as log:
            log.write(url + "\t" + referer + "\n")
# Crawl driver: read the category list from the main page, then walk each
# category's numbered pages and download every dictionary linked there.
root_url = "http://pinyin.sogou.com"
main_url = "http://pinyin.sogou.com/dict/cate/index/167"

html = getHtml(main_url)
cate_url_list = getCateUrl(html)

for cate_url in cate_url_list:
    print("下载:" + cate_url)
    page = 1
    while True:
        print("第" + str(page) + "页")
        # The listing page URL doubles as the referer handed to getCorpus.
        referer = "".join((root_url, cate_url, "/default/", str(page)))
        corpus_urls = getCorpusUrl(getHtml(referer))
        if not corpus_urls:
            # No download links on this page: we are past the last page.
            print("下载完")
            break
        for corpus_url in corpus_urls:
            getCorpus(corpus_url, referer)
        page += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment