Skip to content

Instantly share code, notes, and snippets.

@mcrowe
Created August 20, 2015 16:31
Show Gist options
  • Save mcrowe/ea5101a1b8442838ce72 to your computer and use it in GitHub Desktop.
Save mcrowe/ea5101a1b8442838ce72 to your computer and use it in GitHub Desktop.
# Fetches stories from Hacker News and stores them in an sqlite db.
#
# Uses a thread pool of 25 workers to do the HTTP requests to make things
# faster.
#
# Usage:
# python fetch_hn_stories 1 1000
#
from hackernews import HackerNews
from threading import Thread
from Queue import Queue
from time import gmtime, strftime
import sys
import sqlite3
import pyprind
import time
# Command line: <first_id> <last_id>, both required.
if len(sys.argv) < 3:
    raise Exception('first_id and last_id must be provided')

FIRST_ID = int(sys.argv[1])
LAST_ID = int(sys.argv[2])
if FIRST_ID > LAST_ID:
    # A reversed range would otherwise yield an empty id list and fail later
    # with an unhelpful IndexError.
    raise Exception('first_id must not be greater than last_id')

# Ids are walked newest-first, from last_id down to first_id. The stop value
# is first_id - 1 so that first_id itself is included.
STORY_IDS = range(LAST_ID, FIRST_ID - 1, -1)
NUM_STORY_IDS = len(STORY_IDS)

# SQLite database file the stories are written to.
DB_NAME = 'hackernews.db'

# Number of concurrent fetcher threads.
NUM_FETCHERS = 25

# Rows are bound from plain tuples (see build_item_row), so positional "?"
# placeholders are used; the values must follow the column order below.
# INSERT OR IGNORE skips rows that would violate a uniqueness constraint.
INSERT_ITEM_SQL = """
INSERT OR IGNORE INTO items (item_id, item_type, by, score, comments, url, title, submission_time)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
def build_item_row(item):
    """Flatten a fetched item into a tuple suitable for INSERT_ITEM_SQL.

    Args:
        item: Object exposing item_id, item_type, by, score, kids, url,
            title and submission_time attributes.

    Returns:
        An 8-tuple in the column order of INSERT_ITEM_SQL. The "comments"
        slot holds the number of entries in item.kids, or 0 when kids is
        empty or None.
    """
    kids = item.kids
    comment_count = 0
    if kids:
        comment_count = len(kids)

    row = (
        item.item_id,
        item.item_type,
        item.by,
        item.score,
        comment_count,
        item.url,
        item.title,
        item.submission_time,
    )
    return row
def fetcher():
    """Worker loop: fetch Hacker News items for the ids waiting in id_queue.

    Runs forever, so it is meant to be the target of a daemon thread (see
    start_worker). Exactly one result is put on item_queue for every id taken
    from id_queue: the fetched item, or None when the fetch failed. The
    consumer loop below relies on this one-to-one pairing to know when every
    id has been handled.
    """
    while True:
        story_id = id_queue.get()
        try:
            item = hn.get_item(story_id)
        except Exception:
            # Best effort: a failed fetch must not kill the worker. Report it
            # to the consumer as None instead of dropping it silently.
            item = None
        item_queue.put(item)
        # Mark the id as done only once its result has been queued.
        id_queue.task_done()


def start_worker(job):
    """Run job in a new daemon thread.

    Args:
        job: Callable taking no arguments; used as the thread target.

    The thread is a daemon so that a never-ending job does not keep the
    process alive after the main thread has finished.
    """
    t = Thread(target=job)
    t.daemon = True
    t.start()


if not STORY_IDS:
    # Fail with a clear message rather than an IndexError in the print below.
    raise Exception('no story ids to fetch; check first_id and last_id')

hn = HackerNews()
# Sized to hold every id, so the enqueue loop below never blocks.
id_queue = Queue(NUM_STORY_IDS)
# Bounded: fetchers block on put() if the consumer falls this far behind.
item_queue = Queue(5000)

# Start fetcher workers.
for _ in range(NUM_FETCHERS):
    start_worker(fetcher)

# Add ids to fetch queue
print('Enqueuing ids ' + str(STORY_IDS[0]) + '-' + str(STORY_IDS[-1]))
for story_id in STORY_IDS:
    id_queue.put(story_id)

bar = pyprind.ProgBar(NUM_STORY_IDS, width=100)

# Every id yields exactly one result on item_queue (see fetcher), so counting
# results is a reliable completion test. Checking only whether both queues
# are empty would end the run while fetches are still in flight.
remaining = NUM_STORY_IDS
failed = 0

# NOTE: the "items" table is not created by the code shown here; it is
# assumed to already exist in DB_NAME.
with sqlite3.connect(DB_NAME) as conn:
    while remaining > 0:
        # Drain whatever has arrived and insert it as one batch.
        rows = []
        while not item_queue.empty():
            item = item_queue.get()
            item_queue.task_done()
            remaining -= 1
            if item is None:
                # Failed fetch: still advance the bar so it can reach 100%.
                failed += 1
                bar.update()
                continue
            bar.update(item_id=item.item_id)
            # Only stories are stored; other item types are skipped.
            if item.item_type == 'story':
                rows.append(build_item_row(item))
        if rows:
            conn.executemany(INSERT_ITEM_SQL, rows)
            conn.commit()
        if remaining > 0:
            # Let the fetchers accumulate the next batch.
            time.sleep(4)

if failed:
    print('Failed to fetch %d of %d ids' % (failed, NUM_STORY_IDS))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment