Skip to content

Instantly share code, notes, and snippets.

@jon1scr
Forked from VirenMohindra/daily-hackernews.py
Created March 26, 2021 11:01
Show Gist options
  • Save jon1scr/7b5a900d6993c166091534df8ba6be8a to your computer and use it in GitHub Desktop.
Save jon1scr/7b5a900d6993c166091534df8ba6be8a to your computer and use it in GitHub Desktop.
from github import Github
import csv
import re
import requests
from datetime import datetime, timedelta
# ---------------------------------------------------------------------------
# Script configuration and shared state.
# ---------------------------------------------------------------------------

# Name of the CSV file the script writes its results to.
FILE_NAME = "hn25.csv"

# Hacker News Firebase API endpoints; item/user ids plus '.json' are appended.
BASE_URL = 'https://hacker-news.firebaseio.com/v0/'
ITEM_URL = '{}{}'.format(BASE_URL, 'item/')
USER_URL = '{}{}'.format(BASE_URL, 'user/')

# Threshold, in days, used when deciding whether a submitter's account is new.
NOOB_ACCOUNT_TRESHOLD = 15

# GitHub repository whose open issues are read as the input data.
REPO_NAME = 'xueyuanl/daily-hackernews'

# NOTE(review): "access_token" looks like a placeholder -- presumably it has to
# be replaced with a real GitHub token before running; confirm.
g = Github("access_token")
repo = g.get_repo(REPO_NAME)
issues = repo.get_issues(state='open')

# Header row of the CSV; every entry appended to `rows` follows this order.
fields = [
    'Date',
    'Title',
    'Link',
    'userID',
    'userID Age',
    'New Account?',
    'Number of Points',
    'Number of Comments',
    'HN Link',
]

# Accumulates one list per parsed post.
rows = []
# Patterns are raw strings (so the backslashes never trigger invalid-escape
# warnings) and are compiled once instead of on every line.
_TITLE_RE = re.compile(r"\[(.*?)\]")
# IGNORECASE replaces lower-casing the whole line: URL paths are
# case-sensitive, so the link written to the CSV must keep its casing.
_LINK_RE = re.compile(r"(?P<url>https?://[^\s]+)", re.IGNORECASE)

# Walk every open issue and turn each post line of its body into one CSV row:
# [date, title, link, user id, account creation timestamp, new-account flag,
#  points, comment count, HN link].
for issue in issues:
    date = ''
    try:
        # The date is the fourth space-separated word of the issue title.
        date = issue.title.split(' ')[3]
    except IndexError:
        print('bad title format: ', issue.title)
        # TODO

    # An issue without a body yields no posts instead of an AttributeError.
    post = (issue.body or '').split('\n')
    del post[:3]  # first 3 elements are empty

    for line in post:
        if not line.strip():
            # Blank separator lines carry no post.
            continue

        # Reset per line so a failed lookup never reuses the values that
        # were computed for the previous post.
        title = link = hn_link = user_id = ''
        user_id_age = is_new = number_of_points = number_of_comments = ''

        # regex for title
        all_square_brackets = _TITLE_RE.findall(line)
        # regex for Link, HN Link
        all_links = _LINK_RE.findall(line)
        if not all_square_brackets or len(all_links) < 2:
            # Without a bracketed title and two URLs there is nothing to parse.
            print('bad line format: ', line)
            continue

        title = all_square_brackets[0].strip('*')
        link = all_links[0].strip(')')
        hn_link = all_links[1].strip(')')

        # The item id is whatever follows the first '=' of the HN link.
        hn_link_parts = hn_link.split('=')
        if len(hn_link_parts) < 2:
            print('bad line format: ', line)
            continue
        item_id = hn_link_parts[1]

        # Parse the response once. It may be JSON null or lack some fields,
        # so fall back to blanks instead of raising.
        item = requests.get(ITEM_URL + item_id + '.json').json() or {}
        user_id = item.get('by', '')
        number_of_points = item.get('score', '')
        # The HN API documents 'descendants' as the total comment count.
        number_of_comments = item.get('descendants', '')

        if user_id:
            user = requests.get(USER_URL + user_id + '.json')
            try:
                # A null user payload makes the subscript raise TypeError.
                user_id_age = user.json()['created']
                account_created_date = datetime.fromtimestamp(user_id_age)
                try:
                    submission_date = datetime.strptime(date, '%d-%m-%Y')
                    # An account is new when it was created less than
                    # NOOB_ACCOUNT_TRESHOLD days before the submission date.
                    cutoff_date = submission_date - timedelta(days=NOOB_ACCOUNT_TRESHOLD)
                    is_new = account_created_date > cutoff_date
                except ValueError:
                    print('date parsing issue: ', line)
                    # TODO
            except TypeError:
                print('Automated submission.')

        row = [date, title, link, user_id, user_id_age, is_new,
               number_of_points, number_of_comments, hn_link]
        rows.append(row)
# Write the header followed by every collected row to FILE_NAME.
# newline='' hands line-ending control to the csv module; without it an extra
# blank line appears after every row on platforms that translate '\n'.
# The 'utf-8-sig' codec prefixes the file with a UTF-8 byte-order mark.
with open(FILE_NAME, 'w', encoding='utf-8-sig', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment