Skip to content

Instantly share code, notes, and snippets.

@brienna
Last active May 2, 2020 00:57
Show Gist options
  • Save brienna/1ad72691a1a8ae15eeab7a9aeccd876a to your computer and use it in GitHub Desktop.
Save brienna/1ad72691a1a8ae15eeab7a9aeccd876a to your computer and use it in GitHub Desktop.
Collects details for each article in The New York Times over a period of time
# Collect per-article details from the fetched responses and export them to CSV.
#
# `responses`, `start` and `end` are defined outside this section:
#   - each response is indexed as response['response']['docs'] to get its
#     list of article dicts (presumably decoded NYT API JSON -- confirm
#     against the code that builds `responses`);
#   - `start` / `end` are compared against `datetime.date` values, so they
#     are expected to be dates themselves.

# Column-oriented accumulator: one list per output column. Every accepted
# article appends exactly one value to each list, keeping them aligned so
# pandas can build the frame directly from the dict.
data = {
    'headline': [],
    'date': [],
    'doc_type': [],
    'material_type': [],
    'section': [],
    'keywords': [],
}

for response in responses:
    for article in response['response']['docs']:
        # Reduce the publication timestamp to a calendar date before
        # comparing it with the requested range.
        date = dateutil.parser.parse(article['pub_date']).date()

        # Both bounds are exclusive: articles dated exactly `start` or
        # `end` are skipped.
        if not (start < date < end):
            continue

        # Only keep articles that actually have a (non-empty) headline.
        headline = article['headline']['main']
        if not headline:
            continue

        data['date'].append(date)
        data['headline'].append(headline)
        data['section'].append(article['section_name'])
        data['doc_type'].append(article['document_type'])

        # The source field is named 'type_of_material'; the original code
        # tested for a 'material_type' key instead, so the lookup was never
        # reached and this column was always None. `.get` still yields None
        # when the field is genuinely absent.
        data['material_type'].append(article.get('type_of_material'))

        # Keep only keyword entries tagged as subjects; an article with no
        # subject keywords contributes an empty list.
        keywords = [
            keyword['value']
            for keyword in article['keywords']
            if keyword['name'] == 'subject'
        ]
        data['keywords'].append(keywords)

df = pd.DataFrame(data)
# index=False: the positional row index carries no information worth saving.
df.to_csv('NYT.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment