url.txt 格式见 url.txt~
代码存在一个问题:爬取 www.qq.com 会出错,原因在于网页编码。代码直接读取网页声明的编码,再用这个编码进行解码;但有的网页没有设置编码,因此读取不到编码,导致乱码。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: tennc
# date: 2016/3/20
# filename: paer.py
# 检测一堆二级域名的 200 状态,并爬取页面标题保存。
# url.txt 为二级域名保存文件,save.txt 为结果文件。
# The MIT License | |
# NAME | |
# Copyright (c) 2015 | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a | |
# copy of this software and associated documentation files (the "Software"), | |
# to deal in the Software without restriction, including without limitation | |
# the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
# and/or sell copies of the Software, and to permit persons to whom the | |
# Software is furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
# DEALINGS IN THE SOFTWARE. | |
import sys

import requests
from bs4 import BeautifulSoup
# Probe every URL listed in url.txt (one per line) and, for each one that
# answers with HTTP 200, print its page title and append a "<title>,<url>"
# line to save.txt.

# Seconds to wait for a host before giving up; without a timeout a single
# unresponsive domain would block the whole run indefinitely.
REQUEST_TIMEOUT = 10

# Context managers guarantee both files are flushed and closed, even if the
# loop is interrupted. UTF-8 is set explicitly so non-ASCII titles can be
# written regardless of the platform's default encoding.
with open("url.txt", encoding="utf-8") as url, \
        open("save.txt", "w", encoding="utf-8") as saveurl:
    for i in url:
        i = i.strip()
        if not i:
            # Blank lines are not URLs; skip them instead of issuing a
            # request that can only fail.
            continue
        try:
            r = requests.get(i, timeout=REQUEST_TIMEOUT)
            if r.status_code != 200:
                continue
            # Hand BeautifulSoup the raw bytes and let it detect the charset
            # (e.g. from <meta> tags). Re-encoding r.text with r.encoding
            # fails when the server declares no encoding (r.encoding is
            # None) and can garble the text when the declared one is wrong.
            issue = BeautifulSoup(r.content, "html.parser")
            if issue.title is None:
                print("skipped {}: page has no <title>".format(i),
                      file=sys.stderr)
                continue
            titlename = issue.title.string
            print(titlename, i, "\n")
            saveurl.write("{},{}\n".format(titlename, i))
        except Exception as e:
            # Best-effort batch job: one failing URL must not abort the
            # remaining ones, but the failure is reported, not hidden.
            print("skipped {}: {!r}".format(i, e), file=sys.stderr)
http://www.qq.com
http://ww1.qq.com
http://lol.qq.com
http://110.qq.com
http://guanjia.qq.com