verdverm/backup.sh

## backup.sh
#!/usr/bin/env bash
#
# Download every page of a Hacker News user's upvoted list into
# html/page-NNNNNN.html, waiting one second between requests, and stop at the
# first page that contains no item rows.
#
# Optional environment overrides (defaults are the values hard-coded below):
#   HN_COOKIE    cookie string passed to curl -b
#   HN_USERNAME  account whose upvoted list is fetched
set -euo pipefail

COOKIE="${HN_COOKIE:-user=verdverm&...}"
USERNAME="${HN_USERNAME:-verdverm}"
readonly BASEURL="https://news.ycombinator.com/upvoted"

PAGE=0

mkdir -p html

# Each response is staged here and copied into html/ only once it is known to
# contain items, so html/ never receives an empty or error page.
TMPFILE="$(mktemp)"
trap 'rm -f -- "$TMPFILE"' EXIT

while true; do
  URL="${BASEURL}?id=${USERNAME}&p=${PAGE}"
  printf -v NUM '%06d' "$PAGE"
  echo "fetching page $NUM  --  $(date)"
  # -f turns HTTP errors into a non-zero exit (aborting via set -e) instead of
  # saving the error body as a page; -S keeps the error message visible.
  curl -fsS -b "${COOKIE}" "$URL" > "$TMPFILE"

  # html2json.py looks for rows with class "athing"; a response without any is
  # past the end of the list (or was not a listing at all), so stop here
  # rather than requesting pages forever.
  if ! grep -q 'athing' "$TMPFILE"; then
    echo "no items on page $NUM, stopping"
    break
  fi
  cat "$TMPFILE" > "html/page-${NUM}.html"

  # loop update
  PAGE=$((PAGE + 1))
  sleep 1
done

## html2json.py
#!/usr/bin/env python3

from os import walk
from bs4 import BeautifulSoup
import json

# Accumulates one dict per parsed story; saveItem appends to it and the
# "Write data" step at the bottom of the script dumps it to data.json.
AllItems = []

def parseFile(fn):
    """Parse one saved HTML page and pass its item table to parseItems.

    fn is the path of the HTML file to read. A page that has no element with
    class "itemlist" is reported and skipped instead of raising.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding (assumes the saved pages are UTF-8).
    with open(fn, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
    soup = BeautifulSoup(content, 'html.parser')
    upvotes = soup.find(class_="itemlist")
    if upvotes is None:
        # find() returns None when nothing matches; without this guard
        # parseItems would fail with AttributeError on None.find_all.
        print("skipping (no itemlist): " + fn)
        return
    parseItems(upvotes)

def parseItems(upvotes):
    """Call parseItem on every element under `upvotes` with class "athing"."""
    for row in upvotes.find_all(class_="athing"):
        parseItem(row)

def parseItem(item):
    """Extract id, link, title and site from one "athing" row and save it.

    item is a parsed element; its "id" attribute, the href and first child of
    its "storylink" element, and the first child of its "sitestr" element are
    passed to saveItem. The row is expected to contain a "storylink" element;
    no check is made for its absence.
    """
    itemID = item.get("id")
    story = item.find(class_="storylink")
    href = story.get("href")
    title = story.contents[0]
    # Rows with no "sitestr" element (or an empty one) are recorded with the
    # site "user". An explicit check replaces the former bare `except:`, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    site_tag = item.find(class_="sitestr")
    if site_tag is not None and site_tag.contents:
        site = site_tag.contents[0]
    else:
        site = "user"
    saveItem(itemID, href, title, site)

def saveItem(itemID, href, title, site):
    """Append one record with keys id, href, title and site to AllItems."""
    AllItems.append({"id": itemID, "href": href, "title": title, "site": site})


# Read data
# walk() yields directory entries in arbitrary order, so sort them to parse
# page-000000.html, page-000001.html, ... in sequence. Each path is built from
# dirpath so files found in nested directories resolve correctly.
for (dirpath, dirnames, filenames) in walk("html"):
    dirnames.sort()
    for fn in sorted(filenames):
        print("parsing: " + fn)
        parseFile(dirpath + "/" + fn)

# Write data
with open('data.json', 'w') as outfile:
    json.dump(AllItems, outfile)
	#!/usr/bin/env bash
	#
	# Download every page of a Hacker News user's upvoted list into
	# html/page-NNNNNN.html, waiting one second between requests, and stop at the
	# first page that contains no item rows.
	#
	# Optional environment overrides (defaults are the values hard-coded below):
	#   HN_COOKIE    cookie string passed to curl -b
	#   HN_USERNAME  account whose upvoted list is fetched
	set -euo pipefail

	COOKIE="${HN_COOKIE:-user=verdverm&...}"
	USERNAME="${HN_USERNAME:-verdverm}"
	readonly BASEURL="https://news.ycombinator.com/upvoted"

	PAGE=0

	mkdir -p html

	# Each response is staged here and copied into html/ only once it is known to
	# contain items, so html/ never receives an empty or error page.
	TMPFILE="$(mktemp)"
	trap 'rm -f -- "$TMPFILE"' EXIT

	while true; do
	  URL="${BASEURL}?id=${USERNAME}&p=${PAGE}"
	  printf -v NUM '%06d' "$PAGE"
	  echo "fetching page $NUM -- $(date)"
	  # -f turns HTTP errors into a non-zero exit (aborting via set -e) instead of
	  # saving the error body as a page; -S keeps the error message visible.
	  curl -fsS -b "${COOKIE}" "$URL" > "$TMPFILE"

	  # The parser script looks for rows with class "athing"; a response without
	  # any is past the end of the list (or was not a listing at all), so stop
	  # here rather than requesting pages forever.
	  if ! grep -q 'athing' "$TMPFILE"; then
	    echo "no items on page $NUM, stopping"
	    break
	  fi
	  cat "$TMPFILE" > "html/page-${NUM}.html"

	  # loop update
	  PAGE=$((PAGE + 1))
	  sleep 1
	done
	#!/usr/bin/env python3

	from os import walk
	from bs4 import BeautifulSoup
	import json

	# Accumulates one dict per parsed story; saveItem appends to it and the
	# "Write data" step at the bottom of the script dumps it to data.json.
	AllItems = []

	def parseFile(fn):
	    """Parse one saved HTML page and pass its item table to parseItems.

	    fn is the path of the HTML file to read. A page that has no element with
	    class "itemlist" is reported and skipped instead of raising.
	    """
	    # Read as UTF-8 explicitly so the result does not depend on the platform's
	    # default locale encoding (assumes the saved pages are UTF-8).
	    with open(fn, 'r', encoding='utf-8') as content_file:
	        content = content_file.read()
	    soup = BeautifulSoup(content, 'html.parser')
	    upvotes = soup.find(class_="itemlist")
	    if upvotes is None:
	        # find() returns None when nothing matches; without this guard
	        # parseItems would fail with AttributeError on None.find_all.
	        print("skipping (no itemlist): " + fn)
	        return
	    parseItems(upvotes)

	def parseItems(upvotes):
	    """Call parseItem on every element under `upvotes` with class "athing"."""
	    for row in upvotes.find_all(class_="athing"):
	        parseItem(row)

	def parseItem(item):
	    """Extract id, link, title and site from one "athing" row and save it.

	    item is a parsed element; its "id" attribute, the href and first child of
	    its "storylink" element, and the first child of its "sitestr" element are
	    passed to saveItem. The row is expected to contain a "storylink" element;
	    no check is made for its absence.
	    """
	    itemID = item.get("id")
	    story = item.find(class_="storylink")
	    href = story.get("href")
	    title = story.contents[0]
	    # Rows with no "sitestr" element (or an empty one) are recorded with the
	    # site "user". An explicit check replaces the former bare `except:`, which
	    # also swallowed unrelated errors such as KeyboardInterrupt.
	    site_tag = item.find(class_="sitestr")
	    if site_tag is not None and site_tag.contents:
	        site = site_tag.contents[0]
	    else:
	        site = "user"
	    saveItem(itemID, href, title, site)

	def saveItem(itemID, href, title, site):
	    """Append one record with keys id, href, title and site to AllItems."""
	    AllItems.append({"id": itemID, "href": href, "title": title, "site": site})


	# Read data
	# walk() yields directory entries in arbitrary order, so sort them to parse
	# page-000000.html, page-000001.html, ... in sequence. Each path is built from
	# dirpath so files found in nested directories resolve correctly.
	for (dirpath, dirnames, filenames) in walk("html"):
	    dirnames.sort()
	    for fn in sorted(filenames):
	        print("parsing: " + fn)
	        parseFile(dirpath + "/" + fn)

	# Write data
	with open('data.json', 'w') as outfile:
	    json.dump(AllItems, outfile)