Skip to content

Instantly share code, notes, and snippets.

@verdverm
Created October 14, 2020 18:49
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save verdverm/23aefb64ee981e17452e95dd5c491d26 to your computer and use it in GitHub Desktop.
Save verdverm/23aefb64ee981e17452e95dd5c491d26 to your computer and use it in GitHub Desktop.
HN Upvotes Backup and JSONify
#!/usr/bin/env bash
# Back up the Hacker News "upvoted" listing of USERNAME as html/page-NNNNNN.html.
# Pages are fetched one per second, authenticated with COOKIE, until a page
# comes back that contains no item rows.
set -euo pipefail
COOKIE="user=verdverm&..."
USERNAME="verdverm"
BASEURL="https://news.ycombinator.com/upvoted"
PAGE=0
mkdir -p html
while true; do
  URL="${BASEURL}?id=${USERNAME}&p=${PAGE}"
  NUM=$(printf "%06d\n" "$PAGE")
  OUT="html/page-${NUM}.html"
  echo "fetching page $NUM -- $(date)"
  # --fail turns HTTP errors into a non-zero exit so an error body is never
  # kept as if it were a real page.
  if ! curl -s --fail -b "${COOKIE}" "$URL" > "$OUT"; then
    echo "request for page $NUM failed -- aborting" >&2
    rm -f "$OUT"
    exit 1
  fi
  # Item rows carry the "athing" class (the same marker the JSON converter
  # searches for). A page without any is treated as the end of the listing;
  # it is removed so no empty page is left behind in html/.
  if ! grep -q "athing" "$OUT"; then
    echo "no items on page $NUM -- stopping"
    rm -f "$OUT"
    break
  fi
  # loop update
  PAGE=$((PAGE+1))
  sleep 1
done
#!/usr/bin/env python3
import json
import os
from os import walk

from bs4 import BeautifulSoup
# Module-level accumulator: saveItem() appends one dict per parsed item, and
# the list is dumped to data.json at the end of the script.
AllItems = []
def parseFile(fn):
    """Parse one saved HTML page and record every item found in it.

    Reads the file at ``fn``, looks up the element with class ``itemlist``
    and passes it to ``parseItems``. A page that has no such element is
    reported and skipped instead of raising ``AttributeError`` further down.

    Args:
        fn: Path of the HTML file to parse.
    """
    # Decode explicitly instead of relying on the locale default.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm.
    with open(fn, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
    soup = BeautifulSoup(content, 'html.parser')
    upvotes = soup.find(class_="itemlist")
    if upvotes is None:
        # find() returns None when nothing matches (e.g. an empty or error page).
        print("no item list found in: " + fn)
        return
    parseItems(upvotes)
def parseItems(upvotes):
    """Pass every ``athing`` element found under ``upvotes`` to ``parseItem``.

    Args:
        upvotes: Parsed element whose descendants are searched for rows.
    """
    for row in upvotes.find_all(class_="athing"):
        parseItem(row)
def parseItem(item):
    """Extract id, link, title and site from one item row and record it.

    The row's ``id`` attribute, the ``href`` and first child of its
    ``storylink`` element, and the first child of its ``sitestr`` element are
    passed to ``saveItem``. Rows without a usable ``storylink`` element are
    reported and skipped.

    Args:
        item: Parsed element for one row (an element with class ``athing``).
    """
    itemID = item.get("id")
    story = item.find(class_="storylink")
    if story is None or not story.contents:
        # Without a story link there is neither an href nor a title to record.
        print("skipping item without story link: " + str(itemID))
        return
    href = story.get("href")
    title = story.contents[0]

    # Rows lacking a site label fall back to "user" (presumably posts hosted
    # on the site itself -- verify). An explicit check replaces the former
    # bare ``except:``, which also swallowed KeyboardInterrupt/SystemExit.
    site_tag = item.find(class_="sitestr")
    if site_tag is not None and site_tag.contents:
        site = site_tag.contents[0]
    else:
        site = "user"

    saveItem(itemID, href, title, site)
def saveItem(itemID, href, title, site):
    """Append one item record to the module-level ``AllItems`` list.

    Args:
        itemID: Value stored under the ``"id"`` key.
        href: Value stored under the ``"href"`` key.
        title: Value stored under the ``"title"`` key.
        site: Value stored under the ``"site"`` key.
    """
    AllItems.append({"id": itemID, "href": href, "title": title, "site": site})
# Read data: parse every file found under html/.
# Paths are built from dirpath (not a hard-coded "html/" prefix) so files that
# walk() finds in subdirectories resolve correctly, and both directories and
# files are visited in sorted order so the output order is deterministic.
for (dirpath, dirnames, filenames) in walk("html"):
    dirnames.sort()  # in-place sort steers the order walk() descends in
    for fn in sorted(filenames):
        print("parsing: " + fn)
        parseFile(os.path.join(dirpath, fn))

# Write data: dump everything collected in AllItems as one JSON array.
with open('data.json', 'w') as outfile:
    json.dump(AllItems, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment