Scrape Sina hot news (爬取新浪热点新闻)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import pandas as pd

# Build the paginated JSON API URLs (pages 1-30).
appendurl_new = []
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1491844351564'
for i in range(1, 31):
    appendurl_new.append(url.format(i))

# Collect the article URLs contained in each API response.
url_js = []
for url_new in appendurl_new:
    res = requests.get(url_new)
    # Strip the "newsloadercallback(...);" JSONP wrapper before parsing the JSON.
    jd = json.loads(res.text.lstrip(' newsloadercallback(').rstrip(');'))
    for a in jd['result']['data']:
        url_js.append(a['url'])

# Parse one article page and return its fields.
def content(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    header = soup.select('h1')[0].text                                    # headline
    paragraphs = soup.select('#artibody p')
    text = '\n\t'.join([p.text.strip() for p in paragraphs[:-1]])         # article body
    editor = paragraphs[-1].text.lstrip('责任编辑:')                       # editor in charge
    timesource = soup.select('.time-source')[0].contents[0].strip()       # publication time
    dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    dt = dt.strftime('%Y-%m-%d\t%H:%M')
    madesource = soup.select('.time-source')[0].contents[1].text.strip()  # publishing source
    print(header, '\n', dt, madesource, '\n\t', text, '\n责任编辑:', editor, '\n\n\n\n')
    return {'title': header, 'time': dt, 'source': madesource,
            'article': text, 'editor': editor, 'url': newsurl}

# Append one article's fields as a row to the CSV file.
def pandas_to_csv(pd_dict):
    pd_look = pd.DataFrame([pd_dict])
    pd_look.to_csv('房天下.csv', mode='a+', header=False)

# Scrape the article URLs listed on the channel front page.
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
newslist = []
soup = BeautifulSoup(res.text, 'html5lib')
for news in soup.select('.news-item'):
    if len(news.select('h2')) > 0:
        htmlurl = news.select('a')[0]['href']
        newslist.append(htmlurl)

# Parse every collected URL and append it to the CSV.
for end in url_js + newslist:
    pandas_to_csv(content(end))
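As a quick sanity check, the rows appended by pandas_to_csv() can be read back with pandas. This is a minimal sketch and not part of the original gist: it assumes the script above has already run and that 房天下.csv sits in the working directory; because the rows are written without a header, the columns are read positionally.

import pandas as pd

# Load the headerless CSV produced by pandas_to_csv(). The column order depends
# on how pandas serialised the dict returned by content(), so just inspect rows.
df = pd.read_csv('房天下.csv', header=None)
print(df.shape)   # (number of saved articles, number of fields per article)
print(df.head())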