Skip to content

Instantly share code, notes, and snippets.

@VirenMohindra
Created March 25, 2021 23:22
Show Gist options
  • Save VirenMohindra/ced3318325e8a4ea7dae65b6eb486baa to your computer and use it in GitHub Desktop.
Save VirenMohindra/ced3318325e8a4ea7dae65b6eb486baa to your computer and use it in GitHub Desktop.
from github import Github
import csv
import re
import requests
from datetime import datetime, timedelta
# --- Configuration ----------------------------------------------------------
# Repository whose open issues are exported, and the CSV file that is written.
REPO_NAME = 'headllines/hackernews-daily'
FILE_NAME = "hackernews-daily.csv"
# Base URL for Hacker News user lookups; "<user id>.json" is appended per user.
HN_API_URL = 'https://hacker-news.firebaseio.com/v0/user/'
# Number of days used as the "new account" window (passed to timedelta(days=...)).
# The identifier's spelling is kept as-is because the rest of the script uses it.
NOOB_ACCOUNT_TRESHOLD = 15

# --- GitHub access ----------------------------------------------------------
# NOTE(review): "access_token" looks like a placeholder for a real token.
g = Github("access_token")
repo = g.get_repo(REPO_NAME)
issues = repo.get_issues(state='open')

# --- CSV layout -------------------------------------------------------------
# Header row; each entry appended to `rows` supplies one value per column.
fields = [
    'Date',
    'Title',
    'Link',
    'userID',
    'userID Age',
    'New Account?',
    'Number of Points',
    'Number of Comments',
    'HN Link',
]
rows = []
# Patterns are compiled once instead of on every post line. Raw strings avoid
# the invalid-escape warnings that "\[" / "\s" / "\w" raise in plain literals.
_BRACKET_TEXT_RE = re.compile(r"\[(.*?)\]")  # title, user id, "<n> comments"
# IGNORECASE matches any scheme casing without lowercasing the URL itself,
# so case-sensitive links keep their original spelling in the CSV.
_URL_RE = re.compile(r"https?://\S+", re.IGNORECASE)
_POINTS_RE = re.compile(r"(\w+) point")
_GMT_SUFFIX = ' GMT+0000 (Coordinated Universal Time)'
_HN_TIMEOUT_SECONDS = 10
# user id -> `created` epoch from the HN API (None when no profile is returned).
_account_created_cache = {}


def _parse_issue_date(raw_date):
    """Parse the date text taken from an issue title into a naive datetime.

    Accepts either 'YYYY-MM-DD' or a '%a %b %d %Y %H:%M:%S' timestamp that may
    carry the ' GMT+0000 (Coordinated Universal Time)' suffix.

    Raises:
        ValueError: if the text matches neither format.
    """
    try:
        return datetime.strptime(raw_date, '%Y-%m-%d')
    except ValueError:
        cleaned = raw_date.replace(_GMT_SUFFIX, '')
        return datetime.strptime(cleaned, '%a %b %d %Y %H:%M:%S')


def _fetch_account_created(user_id):
    """Return the `created` epoch for *user_id*, or None if HN has no profile.

    Results are cached so a user who appears in several posts is fetched once.

    Raises:
        requests.RequestException: on timeout, connection or HTTP errors.
    """
    if user_id not in _account_created_cache:
        response = requests.get(
            HN_API_URL + user_id + '.json', timeout=_HN_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        profile = response.json()  # the API answers `null` for unknown users
        _account_created_cache[user_id] = profile['created'] if profile else None
    return _account_created_cache[user_id]


# Build one CSV row per post found in every open issue.
for issue in issues:
    # Issues titled exactly 'npm ci' are skipped (presumably not daily digests).
    if issue.title == 'npm ci':
        continue

    # The date follows an '@' in the title, or a '之' when there is no '@'.
    try:
        raw_date = issue.title.split('@')[1].strip(" ")
    except IndexError:
        raw_date = issue.title.split('之')[1].strip(" ")

    # Posts are separated by blank lines; the final chunk is not a post.
    posts = (issue.body or '').split('\n\n')[:-1]
    if not posts:
        continue

    # Parsed once per issue so every post is compared against the same instant.
    submission_date = _parse_issue_date(raw_date)
    date = submission_date.strftime('%Y-%m-%d')  # all dates conform to YYYY-MM-DD
    # Accounts created after this instant count as "new" for this issue.
    new_account_cutoff = submission_date - timedelta(days=NOOB_ACCOUNT_TRESHOLD)

    for line in posts:
        all_square_brackets = _BRACKET_TEXT_RE.findall(line)
        all_links = _URL_RE.findall(line)

        title = all_square_brackets[0]
        user_id = all_square_brackets[1]
        number_of_comments = all_square_brackets[2].split(' ')[0]
        number_of_points = _POINTS_RE.findall(line)[0]

        user_id_age = _fetch_account_created(user_id)
        if user_id_age is None:
            # No profile available: leave both account columns blank.
            user_id_age = is_new = ''
        else:
            # NOTE(review): fromtimestamp() yields local time while the issue
            # titles appear to be GMT; the skew is at most hours, against a
            # window measured in days.
            account_created_date = datetime.fromtimestamp(user_id_age)
            is_new = account_created_date > new_account_cutoff

        if len(all_links) >= 3:
            # Story link, user link, HN discussion link (in that order).
            link = all_links[0].strip("*").strip(" )")
            hn_link = all_links[2].strip(" )")
        else:
            # Only the user link and HN link are present: no external story URL.
            link = ''
            hn_link = all_links[1].strip(" )")

        rows.append([
            date, title, link, user_id, user_id_age, is_new,
            number_of_points, number_of_comments, hn_link,
        ])
# Write the header row followed by every collected row to FILE_NAME.
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write end up with an extra blank line after each row.
# The 'utf-8-sig' codec prepends a BOM to the file.
with open(FILE_NAME, 'w', encoding='utf-8-sig', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment