Adapted from https://github.com/lzjun567/crawler_html2pdf
# -*- coding:utf-8 -*-
import os
import codecs
import re

import jieba.analyse
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud

__author__ = 'liuzhijun & wittyfilter'

# Please replace these fields with your own user id and container id!
uid = "1998412373"
container_id = "1076031998412373"
# Possibly unnecessary for this endpoint
luicode = "10000011"

url = "https://m.weibo.cn/api/container/getIndex"
# uid, containerid, and page are (re)filled per request in fetch_data()
params = {"uid": uid,
          "luicode": luicode,
          "type": "uid",
          "value": uid,
          "containerid": container_id,
          "page": "1"}
headers = {
    "Host": "m.weibo.cn",
    "Referer": "https://m.weibo.cn/u/" + uid,
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/68.0.3440.106 Safari/537.36",
}
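# To sanity-check the endpoint by hand, the same query can be issued directly
# (a hypothetical example request; m.weibo.cn may also require the mobile
# User-Agent header set above):
#
#   https://m.weibo.cn/api/container/getIndex?type=uid&value=1998412373&containerid=1076031998412373&page=1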
def clean_html(raw_html):
    """Strip HTML tags and Weibo boilerplate (repost markers, link text, share notices) from a post."""
    # HTML tags, stray quotes, and repost/share boilerplate phrases
    pattern1 = re.compile(r'<.*?>|>|"|转发微博|网页链接|Repost|(分享|查看|刚刚).*(图片|专辑|单曲|歌曲|照片|视频)')
    # "//@user" repost chains
    pattern2 = re.compile(r'/@[^\s]+/|/+@[^\n]+')
    # Parenthesized @user credits and full/half-width punctuation
    pattern3 = re.compile(r'[((].*@[^\s]+[))]|[??]|、|[!!]')
    text = re.sub(pattern3, '', re.sub(pattern2, '', re.sub(pattern1, '', raw_html)))
    return text
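# A minimal sanity check of clean_html (hypothetical input, not part of the
# original script): the tag pair, the "网页链接" link marker, and the "//@user"
# repost chain should all be stripped, leaving only the post's own text:
#
#   >>> clean_html('看电影<a href="#">网页链接</a> //@friend:好看!')
#   '看电影 '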
def fetch_data():
    """Page through the user's Weibo timeline and save cleaned post text to <uid>.txt."""
    page = 0
    total = 2000  # rough upper bound on posts to fetch (about 10 per page)
    blogs = []
    for i in range(total // 10):
        params['uid'] = uid
        params['page'] = str(page)
        params['containerid'] = container_id
        res = requests.get(url, params=params, headers=headers)
        # Guard against responses that lack "data" or "cards"
        cards = (res.json().get("data") or {}).get("cards") or []
        for card in cards:
            # card_type 9 is an actual Weibo post
            if card.get("card_type") == 9:
                text = card.get("mblog", {}).get("text", "")
                blogs.append(clean_html(text))
        page += 1
        print("Fetching page {page}. {count} weibo posts in total.".format(page=page, count=len(blogs)))
    with codecs.open(uid + '.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(blogs))
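# For reference, fetch_data() only relies on this slice of the getIndex JSON
# (a sketch of the assumed shape; other fields are omitted):
#
#   {"data": {"cards": [{"card_type": 9, "mblog": {"text": "..."}}, ...]}}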
def generate_image():
    """Run jieba keyword extraction over the saved posts and render a word cloud."""
    data = []
    jieba.analyse.set_stop_words("stopwords.txt")  # expects stopwords.txt in the working directory
    with codecs.open(uid + '.txt', 'r', encoding="utf-8") as f:
        for text in f.readlines():
            # Top 20 keywords per post, ranked by TF-IDF
            data.extend(jieba.analyse.extract_tags(text, topK=20))
    data = " ".join(data)
    wordcloud = WordCloud(width=640,
                          height=480,
                          collocations=False,
                          # Must resolve to a font file with CJK glyphs,
                          # e.g. '/System/Library/Fonts/Songti.ttc' on macOS
                          font_path='Songti',
                          background_color='white',
                          ).generate(data)
    # plt.figure(figsize=(20, 10))
    plt.title(uid)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.savefig(uid + '.jpg', dpi=1600)
if __name__ == '__main__':
    # Reuse a previous dump if one exists; otherwise fetch first
    if not os.path.isfile(uid + '.txt'):
        fetch_data()
    generate_image()
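# How to run (a sketch, assuming the dependencies below and a stopwords.txt
# file next to the script; the file name weibo_wordcloud.py is hypothetical):
#
#   pip install requests jieba wordcloud matplotlib
#   python weibo_wordcloud.py
#
# The first run fetches posts into <uid>.txt; subsequent runs reuse that file
# and only regenerate the <uid>.jpg word cloud.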