Last active
May 2, 2020 00:57
-
-
Save brienna/1ad72691a1a8ae15eeab7a9aeccd876a to your computer and use it in GitHub Desktop.
Collects details for each article in The New York Times over a period of time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect details for every article in `responses` that falls inside the
# date range and has a headline, then save them to NYT.csv.
# Relies on names defined outside this section: `responses`, `start`, `end`,
# `dateutil` and `pd`.
# NOTE(review): `start` and `end` are presumably datetime.date objects, since
# they are compared against a `.date()` value -- confirm against the caller.
data = {'headline': [],
        'date': [],
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []}

for response in responses:  # For each response, get all the articles
    articles = response['response']['docs']
    for article in articles:
        # Make sure the article falls within our date range (both bounds
        # are exclusive).
        date = dateutil.parser.parse(article['pub_date']).date()
        is_in_range = start < date < end
        # Collect its details, only if it has a headline.
        if is_in_range and article['headline']['main']:
            data['date'].append(date)
            data['headline'].append(article['headline']['main'])
            data['section'].append(article['section_name'])
            data['doc_type'].append(article['document_type'])
            # Look up the same key that is read. Testing for 'material_type'
            # while reading 'type_of_material' stored None whenever the
            # article lacked a 'material_type' key.
            data['material_type'].append(article.get('type_of_material'))
            # Keep only the keywords whose name is 'subject'.
            keywords = [keyword['value']
                        for keyword in article['keywords']
                        if keyword['name'] == 'subject']
            data['keywords'].append(keywords)

# One row per collected article; every column list has the same length
# because each accepted article appends to all six lists.
df = pd.DataFrame(data)
df.to_csv('NYT.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment