Last active
July 19, 2017 10:58
-
-
Save martinweigert/020abe5e1e40393eb62abc42a8b427a5 to your computer and use it in GitHub Desktop.
analyze_hn_frontpage_csv.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import matplotlib.pyplot as plt | |
| import statistics | |
| from collections import Counter | |
| from collections import defaultdict | |
| import requests | |
| import json | |
| import datetime | |
| from wordcloud import WordCloud | |
| import random | |
# Plot styling shared by every chart in this script.
plt.style.use('ggplot')
color = '#0580ae'

# Read the semicolon-separated export: one list of column strings per line.
with open('hn_entries.csv', 'r') as csv_file:
    hn_result_list_with_post = [row.strip().split(';') for row in csv_file]

# Skip the first row (presumably the CSV header -- confirm against the file),
# then keep only rows whose first column does not contain "YC", whose second
# column is not exactly "1", and whose second column parses to an int below 500.
hn_result_list_with_post = [
    post
    for post in hn_result_list_with_post[1:]
    if "YC" not in post[0] and post[1] != "1" and int(post[1]) < 500
]
| # functions for plotting and data preparation | |
def count_posts():
    """Return the number of rows in the filtered ``hn_result_list_with_post``."""
    # len() replaces the hand-written counting loop; the result is the same.
    return len(hn_result_list_with_post)
def highest_score():
    """Rebuild the module-level ``score_dict_aggregate`` histogram.

    After the call, ``score_dict_aggregate`` maps every integer score from 0
    up to the highest score found in column 1 of ``hn_result_list_with_post``
    to the number of posts with exactly that score. Keys are inserted in
    ascending order. With no posts the histogram is an empty dict instead of
    raising ``ValueError`` from ``max()`` on an empty sequence.

    Returns:
        None. The result is published through the global variable.
    """
    global score_dict_aggregate
    scores = [int(post[1]) for post in hn_result_list_with_post]
    # default=-1 makes range(0) below, i.e. an empty histogram for empty input.
    top_score = max(scores, default=-1)
    score_dict_aggregate = dict.fromkeys(range(top_score + 1), 0)
    for score in scores:
        score_dict_aggregate[score] += 1
def score_distribution():
    """Show a bar chart of the number of posts per score.

    Scores up to and including 20 each get their own bar; all higher scores
    are summed into one trailing bar labelled '21+'.
    """
    highest_score()  # (re)builds the module-level score_dict_aggregate

    cutoff = 20
    tick_labels = [score for score in score_dict_aggregate if score <= cutoff]
    bar_heights = [score_dict_aggregate[score] for score in tick_labels]
    # Posts whose score lies beyond the x-axis scale.
    overflow = sum(
        posts for score, posts in score_dict_aggregate.items() if score > cutoff
    )
    tick_labels.append('21+')
    bar_heights.append(overflow)

    positions = range(len(bar_heights))
    plt.xlabel('Score', fontsize=8)
    plt.ylabel('Number of posts out of total: %s' % count_posts(), fontsize=8)
    plt.bar(positions, bar_heights, align='center', color=color)
    plt.xticks(positions, tick_labels, size='small')
    plt.title('Score at which an item hit Hacker News frontpage, July 2017', fontsize=10)
    plt.show()
def minute_distribution():
    """Show a bar chart of posts bucketed by the integer in column 3.

    Column 3 is labelled on the chart as minutes from posting to front page.
    Buckets are half-open ``(low, high]`` ranges: 1-15, 16-60, 61-120,
    121-300 and 301+. Values of 0 or less fall into no bucket and are not
    drawn, although they are still included in the total shown on the y-axis
    label.
    """
    minutes_list = [int(post[3]) for post in hn_result_list_with_post]

    # The last bucket is open-ended. Using infinity instead of
    # max(minutes_list) gives the same counts and avoids a ValueError when
    # there are no posts at all.
    bin_boundaries = [(0, 15), (15, 60), (60, 120), (120, 300), (300, float('inf'))]
    my_xticks = ["1-15", "16-60", "61-120", "121-300", "301+"]

    bin_counts = dict.fromkeys(bin_boundaries, 0)
    for value in minutes_list:
        for low, high in bin_boundaries:
            if low < value <= high:
                bin_counts[(low, high)] += 1
                break  # buckets are disjoint, so at most one can match

    # Bars are drawn at x = 1..number of buckets.
    bin_count_list = list(range(1, len(bin_boundaries) + 1))
    count_per_bin_for_graph = [bin_counts[bounds] for bounds in bin_boundaries]

    plt.xlabel('Minutes from posting to frontpage', fontsize=8)
    plt.ylabel('Number of posts (total analyzed: %s)' % count_posts(), fontsize=8)
    plt.bar(bin_count_list, count_per_bin_for_graph, align='center', color=color)
    plt.xticks(bin_count_list, my_xticks, size='small')
    plt.title('Time until an article hit the Hacker News frontpage, July 2017', fontsize=10)
    plt.show()
def contributor_distribution():
    """Show a pie chart of users grouped by their number of front-page posts.

    Column 4 of each row is treated as the user name. The three slices hold
    the number of users with exactly 1, exactly 2, and 3 or more rows in
    ``hn_result_list_with_post``.
    """
    posts_per_user = Counter(post[4] for post in hn_result_list_with_post)
    users_per_post_count = Counter(posts_per_user.values())

    # Look the buckets up by key. Taking dict values by position depends on
    # insertion order, so slices could be attached to the wrong label, and it
    # raises IndexError when fewer than two distinct post counts exist.
    three_or_more = sum(
        users for n_posts, users in users_per_post_count.items() if n_posts >= 3
    )
    new_contribution_list = [
        users_per_post_count[1],
        users_per_post_count[2],
        three_or_more,
    ]

    labels = [r'By user with 1 front page contribution', r'By user with 2 front page contributions',
              r'By user with 3 or more front page contributions']
    plt.pie(new_contribution_list,
            colors=(color, '#75c9e8', '#0d546e'),
            startangle=90,
            explode=(0.1, 0.1, 0.1),
            autopct='%1.1f%%',
            pctdistance=0.625)
    plt.title('Number of front page posts per user, July 2017', fontsize=10)
    plt.legend(labels, fontsize=8)
    plt.text(0, -1.5, 'Based on %s Hacker News front page posts.' % count_posts(),
             horizontalalignment='center',
             verticalalignment='center', fontsize=8)
    plt.show()
def wordcloud_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string with a randomly chosen lightness.

    Hue (196) and saturation (94%) are fixed; lightness is a random integer
    from 20 to 99 inclusive. None of the arguments are read; the signature
    exists so the function can be passed as ``color_func`` to
    ``WordCloud.recolor`` elsewhere in this file.
    """
    lightness = random.randint(20, 99)
    return "hsl(196, 94%%, %d%%)" % lightness
def wordcloud_all():
    """Show a word cloud built from column 0 of every filtered row.

    Column 0 is presumably the post title -- confirm against the CSV.
    """
    # Join with a space. Plain concatenation fuses the last word of one entry
    # with the first word of the next, creating bogus words in the cloud.
    text = ' '.join(post[0] for post in hn_result_list_with_post)
    wordcloud = WordCloud(max_font_size=40).generate(text)
    plt.figure()
    plt.imshow(wordcloud.recolor(color_func=wordcloud_color, random_state=3), interpolation="bilinear")
    plt.axis("off")
    plt.show()
def wordcloud_15_minutes():
    """Show a word cloud for rows whose column 3 value is between 1 and 15.

    Column 3 is used elsewhere in this file as minutes from posting to front
    page; column 0 is presumably the post title -- confirm against the CSV.
    The number of matching rows is printed to stdout before plotting.
    """
    fast_titles = [
        post[0] for post in hn_result_list_with_post if 0 < int(post[3]) <= 15
    ]
    print(len(fast_titles))
    # Join with a space. Plain concatenation fuses the last word of one entry
    # with the first word of the next, creating bogus words in the cloud.
    wordcloud = WordCloud(max_font_size=40).generate(' '.join(fast_titles))
    plt.figure()
    plt.imshow(wordcloud.recolor(color_func=wordcloud_color, random_state=3), interpolation="bilinear")
    plt.axis("off")
    plt.show()
### Run every plot function, in this order.
for make_plot in (
    score_distribution,
    minute_distribution,
    contributor_distribution,
    wordcloud_all,
    wordcloud_15_minutes,
):
    make_plot()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment