Crawl Sohu (爬搜狐)
# -*- coding: utf-8 -*-
# @Author: jason-xuan
# @Date: 2017-08-07 09:33:12
# @Last Modified by: jason-xuan
# @Last Modified time: 2017-08-07 11:25:45
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
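# NOTE: requests, bs4, lxml and tqdm are third-party packages; a minimal setup
# sketch, assuming the usual pip package names, is:
#     pip install requests beautifulsoup4 lxml tqdm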
if not os.path.exists('./Download'):
    os.mkdir('./Download')


def download(urls):
    # reuse TCP connections across requests
    sess = requests.Session()
    failed_list = []
    for url in tqdm(urls):
        try:
            r = sess.get(url)
            r.encoding = 'gbk'
            soup = BeautifulSoup(r.text, 'lxml')
            # article title and body paragraphs live under <div class="article_area">
            title = soup.find('div', 'article_area').find('h1').string
            title = title.replace('"', '').replace('/', '').replace('\\', '')
            texts = [''.join(s.strings) for s in soup.find('div', 'article_area').find_all('p')]
            text = '\n'.join(texts).replace('\u3000', '')  # strip full-width spaces
            with open('Download/{0}.txt'.format(title), 'w', encoding='utf-8') as f:
                f.write(text)
        except BaseException as e:
            # print("url:{0} parse failed..".format(url))
            print(e)
            failed_list.append(url)
    print('{0} urls failed to download'.format(len(failed_list)))
    with open('failed urls.txt', 'w') as f:
        f.write('\n'.join(failed_list))
    return len(failed_list)


def first_download():
    # read the seed URL list: one tab-separated record per line, URL in the first field
    with open('SogouTDTE.txt', 'r') as f:
        lines = [line.split('\t') for line in f.read().split('\n')][:-1]  # drop the trailing empty line
    urls = [url for url, _ in lines]
    return download(urls)


def re_download():
    # retry the URLs that failed in the previous run, skipping empty lines
    with open('failed urls.txt', 'r') as f:
        lines = [line for line in f.read().split('\n') if line]
    return download(lines)


# for _ in range(10):
#     re_download()

if __name__ == '__main__':
    last = first_download()
    current = re_download()
    # keep re-crawling the failed links until two consecutive runs
    # fail on the same number of links
    while current != last:
        last = current
        current = re_download()
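The layout of SogouTDTE.txt is not included in the gist; judging from first_download, each line holds two tab-separated fields with the article URL first. A minimal sketch with a hypothetical record (placeholder URL and label, not real data):

    # hypothetical tab-separated record; only the first field (the URL) is used
    sample = 'http://news.sohu.com/<article-id>.shtml\tsome-label'
    url, _ = sample.split('\t')
    download([url])  # fetches the page as gbk, extracts div.article_area, saves to Download/<title>.txt

Failed URLs are written to failed urls.txt, one per line, which re_download then retries.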