@ifnull · Last active February 2, 2021 21:42
MIT xPRO: DSx Data Science and Big Data Analytics: Making Data-Driven Decisions - News Downloader
import os
import requests
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from bs4 import BeautifulSoup

# Configuration
newsapi = NewsApiClient(api_key='****')
output_path = './corpus/'

# Setting True will scrape each article URL for topics and full content.
# Scraping is specific to the AP news source.
scrape_topics_and_extended_content = True

# Create the output folder if it doesn't exist already.
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Get the most recent 100 articles from AP since yesterday.
yesterday = datetime.now() - timedelta(1)
articles = newsapi.get_everything(sources='associated-press',
                                  from_param=datetime.strftime(yesterday, '%Y-%m-%d'),
                                  sort_by='publishedAt',
                                  page_size=100)['articles']

# Iterate over articles and write corpus files.
for idx, article in enumerate(articles):
    # Create title file.
    fh = open('{}title-{}.txt'.format(output_path, idx), 'w')
    fh.write(article['title'])
    fh.close()

    # Create topic file.
    fh = open('{}topic-{}.txt'.format(output_path, idx), 'w')

    if scrape_topics_and_extended_content:
        # Scrape the article page itself for related topics and the full body.
        print('Scraping: {}'.format(article['url']))

        # Fetch the article URL and parse its HTML.
        raw_body = requests.get(article['url']).text
        body = BeautifulSoup(raw_body, 'html.parser')

        # Collect the related-topic tags from the page.
        related_topics = body.findAll('a', {'data-key': 'related-tag'})
        topics = []
        for topic in related_topics:
            topics.append(topic.text)

        # Write topics to file as a comma-separated list.
        fh.write(','.join(topics))
        fh.close()

        # Parse the article content, iterating over paragraphs and joining
        # them into one string.
        plain_content = ''
        article_content_paragraphs = body.find('div', {'data-key': 'article'}).findAll('p')
        for child in article_content_paragraphs:
            # Ignore paragraphs containing links to keep the data clean.
            if 'http' not in child.text:
                plain_content = ''.join((plain_content, ' ', child.text))

        # Create article file from scraped data.
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(plain_content)
        fh.close()
    else:
        # Otherwise use the basic dataset provided by NewsAPI: leave an empty
        # topic file to be filled in manually as a comma-separated list.
        fh.write('')
        fh.close()

        # Create article file from the NewsAPI article summary.
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(article['content'])
        fh.close()
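Note that the scrape path assumes every `requests.get` succeeds and that the `div[data-key="article"]` element exists on each page; on a live run either can fail and abort the loop. A minimal hardening sketch, assuming a hypothetical helper name `fetch_article_html` and an arbitrary 10-second timeout (neither is part of the original gist):

import requests

def fetch_article_html(url, timeout=10):
    """Fetch a page, returning None instead of raising on network/HTTP errors."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as exc:
        print('Skipping {}: {}'.format(url, exc))
        return None

Inside the loop you could skip the article (or fall back to the NewsAPI summary, as the else branch does) whenever this returns None, and similarly check that `body.find('div', {'data-key': 'article'})` is not None before calling `findAll`.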
requirements.txt:

requests==2.24.0
newsapi-python==0.2.6
beautifulsoup4==4.9.3
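After installing the pins with `pip install -r requirements.txt`, the script leaves a flat corpus of numbered title/topic/article triples in ./corpus/. A small sketch of loading those files back for analysis; the `load_corpus` helper is hypothetical, not part of the gist:

import glob
import os

def load_corpus(path='./corpus/'):
    """Load the title/topic/article triples written by the downloader."""
    documents = []
    for title_file in sorted(glob.glob(os.path.join(path, 'title-*.txt'))):
        # Recover the numeric index from e.g. 'title-3.txt'.
        idx = title_file.rsplit('-', 1)[1].split('.')[0]
        doc = {}
        for part in ('title', 'topic', 'article'):
            with open(os.path.join(path, '{}-{}.txt'.format(part, idx))) as fh:
                doc[part] = fh.read()
        # Topics were written as a comma-separated list; drop empty entries.
        doc['topics'] = [t for t in doc.pop('topic').split(',') if t]
        documents.append(doc)
    return documents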