Skip to content

Instantly share code, notes, and snippets.

@queensferryme queensferryme/charts.py
Last active Feb 15, 2019

Embed
What would you like to do?
Crawling, analyzing & visualizing #WanderingEarth comments on movie.douban.com
#!/usr/bin/env python
import json
import os
from collections import Counter, defaultdict
import jieba.analyse
from pyecharts import Bar, Line, WordCloud
class Charts:
    """Analyze raw comment data and render echarts HTML files.

    Reads ``data/src/data.json`` (produced by parser.py) and writes the
    rendered charts into ``data/output/``.
    """

    # item['rate'] (0-5) -> category; 0 means "unknown" and is counted
    # straight into 'all', 1-2 -> low, 3 -> middle, 4-5 -> high.
    # (Previously duplicated as a tuple and a list in two methods.)
    KEY_MAP = ('all', 'low', 'low', 'middle', 'high', 'high')
    # canonical category order used by every chart
    CATEGORIES = ('all', 'high', 'middle', 'low')
    # frequent but uninformative words excluded from the word clouds;
    # a frozenset gives O(1) membership tests inside the loop
    STOPWORDS = frozenset([
        '电影', '什么', '可以', '一个', '这部', '一群', '以为', '本来', '为什么',
        '看过', '有点', '还是', '这种', '就是', '卧槽', '觉得', '真的', '一部',
        '这个', '不是', '不行', '本片', '这么', '确实', '看到', '一样', '不要',
    ])

    def __init__(self):
        # context manager closes the handle; json.load(open(...)) leaked it
        with open('data/src/data.json', 'rt', encoding='utf-8') as fin:
            self.data = json.load(fin)['data']
        # exist_ok avoids the racy exists()/makedirs() pair
        os.makedirs('data/output', exist_ok=True)

    def render(self):
        '''run all echart rendering tasks'''
        self.render_chart_comment_per_day()
        self.render_chart_word_clouds()

    def render_chart_comment_per_day(self):
        '''render comment number per day as a line chart'''
        # prepare data: rate-0 (unknown) comments land directly in 'all';
        # afterwards 'all' is topped up with the high/middle/low counts
        data = {key: defaultdict(int) for key in self.CATEGORIES}
        for item in self.data:
            data[self.KEY_MAP[item['rate']]][item['date']] += 1
        for date in {item['date'] for item in self.data}:
            data['all'][date] += (data['high'][date]
                                  + data['middle'][date]
                                  + data['low'][date])
        # render chart
        chart = Line('Comment Number Per Day')
        for category in self.CATEGORIES:
            dates = sorted(data[category])
            numbers = [data[category][date] for date in dates]
            chart.add(category, dates, numbers, mark_point=['average', 'max'])
        chart.render('data/output/comment-per-day.html')

    def render_chart_word_clouds(self):
        '''render word cloud graphs for high/middle/low/all comments'''
        # prepare data: per-category word frequencies over jieba keywords
        counters = {key: Counter() for key in self.CATEGORIES}
        for item in self.data:
            temp = Counter(
                word for word in jieba.analyse.extract_tags(item['content'], topK=10)
                if len(word) > 1 and word not in self.STOPWORDS
            )
            counters[self.KEY_MAP[item['rate']]] += temp
            # rate-0 comments were already added to 'all' via KEY_MAP above
            counters['all'] += (temp if item['rate'] != 0 else {})
        # render one cloud per category, keeping the 150 most common words
        for category in self.CATEGORIES:
            cloud = WordCloud(width=1200, height=600)
            common = counters[category].most_common(n=150)
            words = [word for word, _ in common]
            counts = [count for _, count in common]
            cloud.add('', words, counts)
            cloud.render(f'data/output/word-cloud-{category}.html')
# script entry point: render all charts from the previously parsed data
if __name__ == '__main__':
    Charts().render()
#!/usr/bin/env python
import os
import random
import time
from requests import RequestException, Session
# query-parameter sets for the three comment tiers: h(igh)/m(iddle)/l(ow);
# loop variable renamed from `type`, which shadowed the builtin
PARAMS = [
    {'percent_type': percent_type, 'sort': 'new_score'}
    for percent_type in ('h', 'm', 'l')
]
# endpoints used by the scraper; '/comment' targets The Wandering Earth
# (subject 26266893), 20 comments per page, watched ("P") only
URLS = {
    '/': 'https://www.douban.com',
    '/login': 'https://accounts.douban.com/j/mobile/login/basic',
    '/comment': 'https://movie.douban.com/subject/26266893/comments?limit=20&status=P'
}
# user-agent strings rotated between sessions to look less like a bot
USER_AGENTS = [
    'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
]
class Client(Session):
    """requests.Session subclass that logs in to douban and downloads
    comment pages into ``pages/<tier>/<start>.html``.

    Credentials come from environment variables:
    ``name`` (account, usually phone/email), ``password``, and ``login``
    (a string that appears on the homepage only when logged in,
    e.g. the username).
    """

    def __init__(self):
        super().__init__()
        self.agent = random.choice(USER_AGENTS)
        self.login()

    def get(self, url, **kwargs):
        '''override GET with a default user-agent header'''
        # merge rather than pass a second ``headers`` kwarg, so callers
        # supplying their own headers no longer trigger
        # "got multiple values for keyword argument 'headers'"
        headers = {'user-agent': self.agent, **kwargs.pop('headers', {})}
        return super().get(url, headers=headers, **kwargs)

    def login(self):
        '''authenticate & login current user'''
        self.get(URLS['/'])  # prime cookies before posting credentials
        self.post(URLS['/login'], data={
            'name': os.getenv('name'),
            'password': os.getenv('password'),
        })
        self.raise_for_login()

    def raise_for_login(self):
        '''raise RequestException if the session is not logged in'''
        marker = os.getenv('login')
        if marker is None:
            # without this guard, str.find(None) raised an opaque TypeError
            raise RequestException('environment variable "login" is not set')
        resp = self.get(URLS['/'])
        if marker not in resp.text:
            raise RequestException('user login failed')

    def reset(self):
        '''reset the session: new user-agent, fresh cookies, re-login'''
        self.agent = random.choice(USER_AGENTS)
        self.cookies.clear()
        self.login()

    def run(self, begin=0):
        '''scrape all comment tiers, optionally resuming at index *begin*'''
        for index, param in enumerate(PARAMS):
            if index < begin:
                # skip BEFORE reset(): the original performed a full login
                # round-trip even for tiers it was about to skip
                continue
            self.reset()
            # exist_ok avoids the racy exists()/makedirs() pair
            os.makedirs(f'pages/{index}', exist_ok=True)
            for start in range(0, 500, 20):
                resp = self.get(URLS['/comment'], params={**param, 'start': start})
                with open(f'pages/{index}/{start}.html', 'wt', encoding='utf-8') as fout:
                    fout.write(resp.text)
                time.sleep(random.randint(0, 3))  # throttle to dodge rate limiting
# script entry point: scrape every comment tier from scratch
if __name__ == '__main__':
    Client().run()
#!/usr/bin/env python
import json
import os
from csv import DictWriter
from bs4 import BeautifulSoup
class Parser:
    """Parse the downloaded comment pages and export csv/json data.

    Reads ``pages/<tier>/<start>.html`` (written by client.py) and
    writes ``data/src/data.csv`` and ``data/src/data.json``.
    """

    # output column order shared by the csv and json exports
    FIELDS = ['name', 'rate', 'date', 'votes', 'content']
    # rating label -> numeric score; '未知' ("unknown") is used when the
    # comment has no rating node
    RATE_MAP = {'力荐': 5, '推荐': 4, '还行': 3, '较差': 2, '很差': 1, '未知': 0}

    def __init__(self):
        # exist_ok avoids the racy exists()/makedirs() pair
        os.makedirs('data/src', exist_ok=True)

    @property
    def data(self):
        '''yield one dict per comment parsed from pages/<tier>/<start>.html'''
        for direc in range(3):
            for name in range(0, 500, 20):
                with open(f'pages/{direc}/{name}.html', 'rt', encoding='utf-8') as fin:
                    page = BeautifulSoup(fin.read(), 'html.parser')
                for comment in page.find_all(class_='comment-item'):
                    # hoisted: the original ran this selector twice per comment
                    rating = comment.find(class_='rating')
                    yield {
                        'name': comment.a.attrs['title'],
                        'rate': self.RATE_MAP[rating['title'] if rating else '未知'],
                        'date': comment.find(class_='comment-time').get_text(strip=True),
                        'votes': int(comment.find(class_='votes').get_text(strip=True)),
                        'content': comment.p.get_text(strip=True),
                    }

    def export(self):
        '''export comment data in csv and json format'''
        self._export_csv()
        self._export_json()

    def _export_csv(self):
        '''export comment data in csv format'''
        # newline='' keeps the csv module from writing blank rows on Windows
        with open('data/src/data.csv', 'wt', encoding='utf-8', newline='') as fout:
            writer = DictWriter(fout, self.FIELDS)
            writer.writeheader()
            writer.writerows(self.data)  # writerows accepts the generator directly

    def _export_json(self):
        '''export comment data in json format'''
        with open('data/src/data.json', 'wt', encoding='utf-8') as fout:
            json.dump({'data': list(self.data)}, fout, ensure_ascii=False, indent=2)
# script entry point: parse downloaded pages and write csv/json exports
if __name__ == '__main__':
    Parser().export()
beautifulsoup4 >= 4.7
jieba >= 0.38
pyecharts >= 0.5
requests >= 2.20
@queensferryme

This comment has been minimized.

Copy link
Owner Author

queensferryme commented Feb 15, 2019

Usage

  1. Prepare: set a few environment variables for the program to access your douban account information:

    • login: your douban username
    • name: your douban account, usually phone/email
    • password: your douban password

    you also need to install dependencies with pip install -r requirements.txt

  2. Execute: execute client.py, parser.py, charts.py in exactly this order

  3. Wait: after a minute or two you should see the output HTML charts in data/output under the current directory

Screenshots

image
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.