# matias-pg/generate_hackernews_stories_csv.py

## generate_hackernews_stories_csv.py
import csv
from datetime import datetime

from hn import search_by_date

# Script body: stream Hacker News stories from `search_by_date` into a CSV
# file named after the requested story count, printing progress and timing.

# Edit this line to change how many stories you want in the CSV.
# Note that fetching too many stories may trigger a rate limit.
max_stories = 1_000_000
filename = f'stories_{max_stories}.csv'

# newline='' is what the csv module requires of the file object handed to
# csv.writer; without it, platforms that translate '\n' on write emit an extra
# '\r' per row. The explicit UTF-8 encoding makes the output identical on every
# platform instead of depending on the locale's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as csvFile:
    fields = ['ID', 'Title', 'Author', 'Created At',
              'URL', 'Points', 'Number of Comments']

    # You could also use DictWriter, but the CSV would end up with a
    # different header (the raw dictionary keys instead of these titles).
    writer = csv.writer(csvFile, quoting=csv.QUOTE_MINIMAL, escapechar='\\')

    writer.writerow(fields)
    written_count = 0

    start = datetime.now()
    print(f'Start: {start}')

    for story in search_by_date(stories=True):
        # Column order must match `fields` above.
        row = [story['objectID'], story['title'],
               story['author'], story['created_at'], story['url'],
               story['points'], story['num_comments']]

        writer.writerow(row)
        written_count += 1

        # Avoid excessive printing, since the library fetches pages of 1000
        # stories anyway.
        if written_count % 1_000 == 0:
            print(written_count)

        # Stop fetching stories once the requested amount has been written.
        if written_count >= max_stories:
            break

    end = datetime.now()
    # Report the actual end time (this used to print `start` a second time).
    print(f'End: {end}')
    print(f'Took: {end - start}')
	import csv
	from datetime import datetime

	from hn import search_by_date

	# Script body: stream Hacker News stories from `search_by_date` into a CSV
	# file named after the requested story count, printing progress and timing.
	# NOTE(review): this section keeps its original one-tab baseline indent;
	# the nesting below was reconstructed from the statement syntax.

	# Edit this line to change how many stories you want in the CSV.
	# Note that fetching too many stories may trigger a rate limit.
	max_stories = 1_000_000
	filename = f'stories_{max_stories}.csv'

	# newline='' is what the csv module requires of the file object handed to
	# csv.writer; without it, platforms that translate '\n' on write emit an
	# extra '\r' per row. The explicit UTF-8 encoding makes the output identical
	# on every platform instead of depending on the locale's default encoding.
	with open(filename, 'w', newline='', encoding='utf-8') as csvFile:
		fields = ['ID', 'Title', 'Author', 'Created At',
			'URL', 'Points', 'Number of Comments']

		# You could also use DictWriter, but the CSV would end up with a
		# different header (the raw dictionary keys instead of these titles).
		writer = csv.writer(csvFile, quoting=csv.QUOTE_MINIMAL, escapechar='\\')

		writer.writerow(fields)
		written_count = 0

		start = datetime.now()
		print(f'Start: {start}')

		for story in search_by_date(stories=True):
			# Column order must match `fields` above.
			row = [story['objectID'], story['title'],
				story['author'], story['created_at'], story['url'],
				story['points'], story['num_comments']]

			writer.writerow(row)
			written_count += 1

			# Avoid excessive printing, since the library fetches pages of
			# 1000 stories anyway.
			if written_count % 1_000 == 0:
				print(written_count)

			# Stop fetching stories once the requested amount has been written.
			if written_count >= max_stories:
				break

		end = datetime.now()
		# Report the actual end time (this used to print `start` a second time).
		print(f'End: {end}')
		print(f'Took: {end - start}')