Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# -*- coding:utf-8 -*-
import os
import codecs
import re
import jieba.analyse
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud
__author__ = 'liuzhijun & wittyfilter'
# Target account: replace both values with your own user id and container id!
uid = "1998412373"
container_id = "1076031998412373"
# Maybe useless
luicode = "10000011"
# Endpoint queried for the account's posts.
url = "https://m.weibo.cn/api/container/getIndex"
# Query string sent with every request.  The values are taken from the
# settings above rather than left as "{name}" template text, which would
# otherwise be transmitted literally.  fetch_data() sets "page" per request.
params = {
    "uid": uid,
    "luicode": luicode,
    "type": "uid",
    "value": uid,
    "containerid": container_id,
    "page": "0",
}
# Request headers: Host, a Referer pointing at the user's profile page, and
# a desktop Chrome User-Agent string.
headers = {
    "Host": "m.weibo.cn",
    "Referer": 'https://m.weibo.cn/u/' + uid,
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/68.0.3440.106 Safari/537.36",
}
def clean_html(raw_html):
    """Strip markup and Weibo boilerplate from the text of one post.

    Three substitutions are applied in order:

    1. HTML tags, the ``&gt;`` / ``&quot;`` entities, and stock phrases
       ("转发微博", "网页链接", "Repost", and "分享/查看/刚刚 ... 图片/视频"-style
       captions).
    2. Repost chains: ``/@user/`` or one-or-more slashes followed by
       ``@...`` up to the end of the line.
    3. Parenthesised mentions such as ``(via @user)`` and every question
       or exclamation mark, ASCII or full-width.

    Args:
        raw_html: Raw post text; ``None`` and ``''`` are accepted.

    Returns:
        The cleaned text, or ``''`` when there is nothing to clean.
    """
    if not raw_html:
        return ''

    boilerplate = re.compile(
        r'<.*?>|&gt;|&quot;|转发微博|网页链接|Repost|(分享|查看|刚刚).*(图片|专辑|单曲|歌曲|照片|视频)')
    repost_chain = re.compile(r'/@[^\s]+/|/+@[^\n]+')
    # The full-width forms are written as \uXXXX escapes (U+FF08 '(',
    # U+FF09 ')', U+FF1F '?', U+FF01 '!') so they survive text normalisation.
    # There must be no empty alternative here: an empty branch always matches
    # and would make every branch after it unreachable.
    mentions_and_marks = re.compile(
        r'[(\uff08].*@[^\s]+[)\uff09]|[?\uff1f]|[!\uff01]')

    text = boilerplate.sub('', raw_html)
    text = repost_chain.sub('', text)
    return mentions_and_marks.sub('', text)
def fetch_data():
    """Download the account's posts page by page and save them to ``<uid>.txt``.

    Up to ``total // 10`` pages are requested from ``url``.  For every card
    whose ``card_type`` is 9 the ``mblog.text`` field is cleaned with
    ``clean_html`` and collected.  Paging stops early once a response
    contains no cards.  After the loop the collected posts are written to
    ``<uid>.txt`` as UTF-8, joined with newlines.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    total = 2000  # upper bound on posts; the page count presumes ~10 posts per page
    blogs = []

    # Build the query on a copy so the module-level dict is not mutated, and
    # fill in every field explicitly so that no "{name}" template text from
    # the shared dict is ever sent to the server.
    query = dict(params)
    query.update({
        "uid": uid,
        "luicode": luicode,
        "value": uid,
        "containerid": container_id,
    })

    for page in range(total // 10):
        query['page'] = str(page)
        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url, params=query, headers=headers, timeout=10)
        res.raise_for_status()

        # "data" or "cards" may be absent (e.g. past the last page).
        cards = (res.json().get("data") or {}).get("cards") or []
        if not cards:
            break

        for card in cards:
            # Weibo post content
            if card.get("card_type") != 9:
                continue
            text = (card.get("mblog") or {}).get("text")
            if text is None:
                continue
            blogs.append(clean_html(text))

        print("Fetching Page.{page}. {count} weibo posts in total.".format(page=page + 1, count=len(blogs)))

    with codecs.open(uid+'.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(blogs))
def generate_image():
    """Render a word cloud from ``<uid>.txt`` and save it as ``<uid>.jpg``.

    Stop words are loaded from ``stopwords.txt``.  Each line of the posts
    file is passed to ``jieba.analyse.extract_tags`` with ``topK=20``; all
    resulting tags are joined with spaces and fed to ``WordCloud``.  The
    image is drawn with matplotlib, titled with the user id, and saved at
    1600 dpi.
    """
    jieba.analyse.set_stop_words("stopwords.txt")

    with codecs.open(uid+'.txt', 'r', encoding="utf-8") as posts_file:
        keywords = [
            tag
            for line in posts_file.readlines()
            for tag in jieba.analyse.extract_tags(line, topK=20)
        ]
    corpus = " ".join(keywords)

    cloud = WordCloud(
        width=640,
        height=480,
        collocations=False,
        font_path='Songti',
        background_color='white',
    )
    cloud.generate(corpus)

    # plt.figure( figsize=(20,10) )
    plt.title(uid)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.savefig(uid+'.jpg', dpi=1600)
if __name__ == '__main__':
    # Only hit the network when no previously saved dump of the posts exists;
    # the image is (re)generated on every run.
    posts_already_saved = os.path.isfile(uid+'.txt')
    if not posts_already_saved:
        fetch_data()
    generate_image()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.