Last active
October 22, 2022 02:40
-
-
Save matias-pg/041af42b10a6c520843c0cb356f98732 to your computer and use it in GitHub Desktop.
Generates a CSV containing stories from Hacker News
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Generate a CSV file containing stories from Hacker News.

Stories are streamed from ``hn.search_by_date`` and written one row per
story until ``max_stories`` rows have been written or the source is
exhausted. Progress and timing information is printed to stdout.
"""
import csv
from datetime import datetime

from hn import search_by_date

# Edit this line to change how many stories you want in the CSV
# Note that fetching too many stories may trigger a rate limit
max_stories = 1_000_000

filename = f'stories_{max_stories}.csv'

# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear on Windows); an explicit UTF-8 encoding keeps
# non-ASCII titles from failing under a non-UTF-8 default locale.
with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
    fields = ['ID', 'Title', 'Author', 'Created At',
              'URL', 'Points', 'Number of Comments']

    # You could also use DictWriter, but the CSV would end with a different header
    writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL, escapechar='\\')
    writer.writerow(fields)

    start = datetime.now()
    print(f'Start: {start}')

    # written_count is 1-based: it equals the number of rows written so far.
    for written_count, story in enumerate(search_by_date(stories=True), start=1):
        row = [story['objectID'], story['title'],
               story['author'], story['created_at'], story['url'],
               story['points'], story['num_comments']]
        writer.writerow(row)

        # Avoid excessive printing, since the library fetches pages of 1000 stories anyway
        if written_count % 1_000 == 0:
            print(written_count)

        # Stop fetching stories
        if written_count >= max_stories:
            break

end = datetime.now()
# Report the actual end time (the original printed the start time again).
print(f'End: {end}')
print(f'Took: {end - start}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To run this script, I recommend creating a virtual environment with venv. To do that, run the venv creation command.
After that, enter the virtual environment by running one of the following commands, depending on your operating system:
Once you are in the virtual environment, install the dependency using the following command:
After that, generate the CSV using the following command: