Skip to content

Instantly share code, notes, and snippets.

@kinoko3
Created March 17, 2018 02:40
Show Gist options
  • Save kinoko3/35d56aa34743d1cb123bb2b87310f183 to your computer and use it in GitHub Desktop.
Save kinoko3/35d56aa34743d1cb123bb2b87310f183 to your computer and use it in GitHub Desktop.
solidot-wordcloud
import requests
from concurrent import futures
import pymongo
from lxml import etree
import datetime
# Worker-thread count setting that get_list() consults when sizing its pool.
MAX_WORKERS = 8
def get(url, timeout=30):
    """Download one page and append its extracted text to the corpus file.

    The page is parsed as HTML, the text nodes matched by the XPath below
    are stripped of surrounding whitespace, and the result is appended to
    ``cloud_main_data.txt``. Pages that yield no matching text are skipped
    silently.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        page_source = session.get(url=url, timeout=timeout).text

    html = etree.HTML(page_source)
    # NOTE(review): some lxml versions appear to return None for empty
    # markup; guard so we skip the page instead of failing on `.xpath`.
    if html is None:
        return

    fragments = html.xpath('//*[@id="center"]/div/div[3]/div/text()')
    if not fragments:
        return

    # Join first and write once: the file is opened a single time per page,
    # and a page's text goes out in one write call rather than many small
    # appends that other worker threads could interleave with.
    text = ''.join(fragment.strip() for fragment in fragments)
    with open('cloud_main_data.txt', 'at', encoding='UTF-8') as f:
        f.write(text)
    print('OK')
def get_list(url_list):
    """Fetch every URL in *url_list* concurrently with a bounded thread pool.

    Each URL is handed to ``get``. The pool never holds more than
    ``MAX_WORKERS`` threads and never more threads than there are URLs.
    A failure on one URL is reported on stdout and does not stop the
    remaining downloads.

    Args:
        url_list: Sequence of page URLs to download.
    """
    # ThreadPoolExecutor rejects a worker count of zero, so bail out early.
    if not url_list:
        return

    # min(), not max(): MAX_WORKERS is an upper bound on the pool size.
    workers = min(MAX_WORKERS, len(url_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        future_to_url = {executor.submit(get, url): url for url in url_list}
        # Inspect each future so that exceptions raised inside worker
        # threads are surfaced instead of being dropped silently.
        for future in futures.as_completed(future_to_url):
            error = future.exception()
            if error is not None:
                print('FAILED', future_to_url[future], error)
if __name__ == '__main__':
    # Inclusive date range of the issues to scrape.
    begin = datetime.date(2018, 1, 1)
    end = datetime.date(2018, 3, 12)

    # One YYYYMMDD issue id per day; computed once and shared by both lists.
    issue_ids = [
        (begin + datetime.timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range((end - begin).days + 1)
    ]

    # NOTE(review): url_list is not consumed below; presumably it was used
    # in a separate run against the main site — confirm before removing.
    url_list = ['https://www.solidot.org/?issue=' + issue_id
                for issue_id in issue_ids]
    cloud_url_list = ['https://cloud.solidot.org/?issue=' + issue_id
                      for issue_id in issue_ids]

    get_list(url_list=cloud_url_list)
import jieba
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from pprint import pprint
# Read the scraped corpus; the context manager closes the file handle
# as soon as the text has been loaded.
with open('main_data.txt', 'r', encoding='UTF-8') as corpus_file:
    text_from_file = corpus_file.read()

# Custom vocabulary and stop words are configured before keyword extraction.
jieba.load_userdict("userdict.txt")
jieba.analyse.set_stop_words('stop_words.txt')

# With withWeight=True each tag is a (word, weight) pair, so the list
# converts directly into the word -> weight mapping the cloud needs.
tags = jieba.analyse.extract_tags(text_from_file, topK=100, withWeight=True)
data_dict = dict(tags)

wc = WordCloud(
    font_path='simkai.ttf',
    background_color='white',
    max_words=100,
    max_font_size=180,
    random_state=42,
    width=1500, height=1500,
)
wc.generate_from_frequencies(data_dict)

# Display the rendered cloud without axes, then save it and dump the weights.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('100.png')
pprint(data_dict)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment