Skip to content

Instantly share code, notes, and snippets.

@Tsukasa007
Created May 1, 2017 07:39
Show Gist options
  • Save Tsukasa007/d7bb8e9aa9f4376428cd3d22925e81bf to your computer and use it in GitHub Desktop.
Save Tsukasa007/d7bb8e9aa9f4376428cd3d22925e81bf to your computer and use it in GitHub Desktop.
猫眼练手
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa
import json
import re
import pandas as pd
import requests
from requests.exceptions import RequestException
from multiprocessing import Pool
# HTTP header dict carrying a desktop Chrome User-Agent string.
User_Agent = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/57.0.2987.133 Safari/537.36'
    ),
}
def get_one_page(url):
    """Fetch ``url`` and return the response body as text.

    Returns the page text when the server answers with HTTP 200. For any
    other status code the code is printed and ``None`` is returned. ``None``
    is also returned when the request raises a ``RequestException``.
    """
    try:
        # The second positional argument of requests.get is ``params``, so
        # the former 'lxml' value ended up in the query string. Send the
        # module-level User_Agent header instead, and bound the wait with a
        # timeout (timeouts raise a RequestException subclass, caught below).
        res = requests.get(url, headers=User_Agent, timeout=10)
    except RequestException:
        return None
    else:
        if res.status_code == 200:
            return res.text
        print('请求错误代码:', res.status_code)
        return None
# Matches one <dd> entry and captures, in order: rank, title, cover URL,
# cast text, release-time text, integer part and fraction part of the score.
# re.S lets '.' cross the newlines inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?">(\d+)</i>.*?title="(.*?)".*?data-src="(.*?)"'
    r'.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>'
    r'.*?class="integer">(.*?)</i>.*?class="fraction">(.*?)</i>',
    re.S,
)


def _drop_label(text, label):
    """Return ``text`` without a leading ``label``; unchanged if absent."""
    # str.lstrip(label) would strip any of label's *characters*, which eats
    # the start of values that happen to begin with one of them.
    if text.startswith(label):
        return text[len(label):]
    return text


def get_re_one_page(html_text):
    """Yield one dict per movie entry found in ``html_text``.

    Each dict has the keys '排名', '片名', '封面链接', '主演', '上映时间' and
    '评分', all with string values taken from the regex captures. The cast
    text is whitespace-stripped and loses its '主演:' label; the release
    text loses its '上映时间:' label; the score is the integer and fraction
    captures concatenated.

    Yields nothing when ``html_text`` is ``None`` or empty.
    """
    if not html_text:
        return
    for rank, title, cover, star, release, integer, fraction in \
            _MOVIE_PATTERN.findall(html_text):
        yield {
            '排名': rank,
            '片名': title,
            '封面链接': cover,
            '主演': _drop_label(star.strip(), '主演:'),
            '上映时间': _drop_label(release, '上映时间:'),
            '评分': integer + fraction,
        }
def writer_to_text(text):
    """Append ``text`` as one JSON line to the file 'TOP 100.text'.

    ``text`` is serialized with ``json.dumps`` using ``ensure_ascii=False``,
    so non-ASCII characters are written as-is in UTF-8, followed by a
    newline. The file is opened in append mode on every call.
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('TOP 100.text', 'a', encoding='utf-8') as f:
        f.write(json.dumps(text, ensure_ascii=False) + '\n')
def pandas_to_xlsx(pd_list):
    """Write ``pd_list`` to 'Top 100.xlsx' on a sheet named '猫眼Top100'.

    ``pd_list`` is passed unchanged to ``pd.DataFrame``, and the resulting
    frame is exported with ``DataFrame.to_excel``.
    """
    pd.DataFrame(pd_list).to_excel('Top 100.xlsx', sheet_name='猫眼Top100')
# Module-level accumulator: main() appends every parsed record here.
data = []


def main(offset):
    """Fetch the board page at ``offset`` and record each parsed entry.

    Builds the page URL from ``offset``, prints it, downloads the page, then
    for every record yielded by ``get_re_one_page`` writes it out with
    ``writer_to_text`` and appends it to the module-level ``data`` list.
    Does nothing further when the download yields ``None``.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    print(url)
    html_text = get_one_page(url)
    if html_text is None:
        # get_one_page returns None on a non-200 status or a request error;
        # skip this page rather than hand None to the regex parser.
        return
    for i in get_re_one_page(html_text):
        writer_to_text(i)
        data.append(i)
if __name__ == '__main__':
    # Visit offsets 0, 10, ..., 90 in order.
    for offset in range(0, 100, 10):
        main(offset)
    # Export everything accumulated in the module-level list.
    pandas_to_xlsx(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment