Created
May 1, 2017 07:39
-
-
Save Tsukasa007/d7bb8e9aa9f4376428cd3d22925e81bf to your computer and use it in GitHub Desktop.
猫眼练手
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Author: Tsukasa
import json | |
import re | |
import pandas as pd | |
import requests | |
from requests.exceptions import RequestException | |
from multiprocessing import Pool | |
# HTTP header mapping carrying a desktop Chrome User-Agent string.
User_Agent = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/57.0.2987.133 Safari/537.36'
    ),
}
def get_one_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``User_Agent`` headers and a
    timeout so a stalled connection cannot block the script forever.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200. ``None``
        when the status code is anything else (the code is printed) or when
        ``requests`` raises a ``RequestException`` (including timeouts).
    """
    try:
        # Headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` and would be added to the query string.
        res = requests.get(url, headers=User_Agent, timeout=10)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    print('请求错误代码:', res.status_code)
    return None
def _strip_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``; unchanged if absent."""
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def get_re_one_page(html_text):
    """Yield one dict per ``<dd>`` movie entry matched in ``html_text``.

    Args:
        html_text: HTML source of a board page. ``None`` or an empty string
            yields nothing.

    Yields:
        dict: Keys ``排名``, ``片名``, ``封面链接``, ``主演``, ``上映时间`` and
        ``评分``, all string values. ``评分`` is the concatenation of the
        ``integer`` and ``fraction`` fragments captured from the page.
    """
    if not html_text:
        # A failed download hands us None; there is nothing to parse.
        return
    # Raw string so that ``\d`` reaches the regex engine untouched.
    re_set = re.compile(
        r'<dd>.*?">(\d+)</i>.*?title="(.*?)".*?data-src="(.*?)"'
        r'.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>'
        r'.*?class="integer">(.*?)</i>.*?class="fraction">(.*?)</i>',
        re.S,
    )
    for rank, title, cover, star, release, integer, fraction in re_set.findall(html_text):
        yield {
            '排名': rank,
            '片名': title,
            '封面链接': cover,
            # Remove the label as a whole prefix. str.lstrip(chars) strips a
            # character set and would also eat the start of a name that
            # begins with one of the label's characters.
            '主演': _strip_prefix(star.lstrip(), '主演:').rstrip(),
            '上映时间': _strip_prefix(release, '上映时间:'),
            '评分': integer + fraction,
        }
def writer_to_text(text):
    """Append ``text`` to ``TOP 100.text`` as a single JSON line.

    Args:
        text: JSON-serialisable object; non-ASCII characters are written
            as-is (``ensure_ascii=False``) in UTF-8.
    """
    # The ``with`` statement closes the file when the block exits.
    with open('TOP 100.text', 'a', encoding='utf-8') as out:
        line = json.dumps(text, ensure_ascii=False)
        out.write(line + '\n')
def pandas_to_xlsx(pd_list):
    """Write ``pd_list`` to ``Top 100.xlsx`` on a sheet named ``猫眼Top100``.

    Args:
        pd_list: Data accepted by ``pandas.DataFrame`` (the script passes a
            list of dicts).
    """
    pd.DataFrame(pd_list).to_excel('Top 100.xlsx', sheet_name='猫眼Top100')
# Module-level accumulator: main() appends every parsed record here.
data = []


def main(offset):
    """Fetch one board page, then store each parsed record.

    Every record is appended to the text file via ``writer_to_text`` and to
    the module-level ``data`` list.

    Args:
        offset: Value of the ``offset`` query parameter for the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    print(url)
    html_text = get_one_page(url)
    if html_text is None:
        # get_one_page returns None on any failure; skip this page instead of
        # passing None to the regex parser, which would raise TypeError.
        print('skip page, fetch failed:', url)
        return
    for record in get_re_one_page(html_text):
        writer_to_text(record)
        data.append(record)
if __name__ == '__main__':
    # Offsets 0, 10, ..., 90: ten pages in total.
    for offset in range(0, 100, 10):
        main(offset)
    # Export everything collected in ``data`` once all pages are processed.
    pandas_to_xlsx(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment