@ifnull · Last active February 2, 2021 21:42
MIT xPRO: DSx Data Science and Big Data Analytics: Making Data-Driven Decisions - News Downloader
import os
import requests
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from bs4 import BeautifulSoup

# Configuration
newsapi = NewsApiClient(api_key='****')
output_path = './corpus/'

# Setting True will scrape each article URL for topics and full content.
# Scraping is specific to the AP news source.
scrape_topics_and_extended_content = True

# Create the output folder if it doesn't exist already.
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Get the most recent 100 articles from AP since yesterday.
yesterday = datetime.now() - timedelta(1)
articles = newsapi.get_everything(sources='associated-press',
                                  from_param=datetime.strftime(yesterday, '%Y-%m-%d'),
                                  sort_by='publishedAt',
                                  page_size=100)['articles']

# Iterate over articles and write corpus files.
for idx, article in enumerate(articles):
    # Create title file.
    fh = open('{}title-{}.txt'.format(output_path, idx), 'w')
    fh.write(article['title'])
    fh.close()

    # Create topic file.
    fh = open('{}topic-{}.txt'.format(output_path, idx), 'w')

    if scrape_topics_and_extended_content:
        # Scrape the article page itself for related topics and the full body.
        print('Scraping: {}'.format(article['url']))

        # Fetch the article URL and parse its HTML.
        raw_body = requests.get(article['url']).text
        body = BeautifulSoup(raw_body, 'html.parser')

        # Collect the related-topic tags from the page.
        related_topics = body.findAll('a', {'data-key': 'related-tag'})
        topics = []
        for topic in related_topics:
            topics.append(topic.text)

        # Write topics to file as a comma-separated list.
        fh.write(','.join(topics))
        fh.close()

        # Parse the article content, iterating over paragraphs and joining
        # them into one string.
        plain_content = ''
        article_content_paragraphs = body.find('div', {'data-key': 'article'}).findAll('p')
        for child in article_content_paragraphs:
            # Ignore paragraphs containing links to keep the data clean.
            if 'http' not in child.text:
                plain_content = ''.join((plain_content, ' ', child.text))

        # Create article file from scraped data.
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(plain_content)
        fh.close()
    else:
        # Otherwise use the basic dataset provided by NewsAPI: leave an empty
        # topic file to be filled in manually as a comma-separated list.
        fh.write('')
        fh.close()

        # Create article file from the NewsAPI article summary.
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(article['content'])
        fh.close()
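Note that the scrape path assumes every `requests.get` succeeds and that the `div[data-key="article"]` element exists on each page; on a live run either can fail and abort the loop. A minimal hardening sketch, assuming a hypothetical helper name `fetch_article_html` and an arbitrary 10-second timeout (neither is part of the original gist):

import requests

def fetch_article_html(url, timeout=10):
    """Fetch a page, returning None instead of raising on network/HTTP errors."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as exc:
        print('Skipping {}: {}'.format(url, exc))
        return None

Inside the loop you could skip the article (or fall back to the NewsAPI summary, as the else branch does) whenever this returns None, and similarly check that `body.find('div', {'data-key': 'article'})` is not None before calling `findAll`.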
requirements.txt:

requests==2.24.0
newsapi-python==0.2.6
beautifulsoup4==4.9.3
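After installing the pins with `pip install -r requirements.txt`, the script leaves a flat corpus of numbered title/topic/article triples in ./corpus/. A small sketch of loading those files back for analysis; the `load_corpus` helper is hypothetical, not part of the gist:

import glob
import os

def load_corpus(path='./corpus/'):
    """Load the title/topic/article triples written by the downloader."""
    documents = []
    for title_file in sorted(glob.glob(os.path.join(path, 'title-*.txt'))):
        # Recover the numeric index from e.g. 'title-3.txt'.
        idx = title_file.rsplit('-', 1)[1].split('.')[0]
        doc = {}
        for part in ('title', 'topic', 'article'):
            with open(os.path.join(path, '{}-{}.txt'.format(part, idx))) as fh:
                doc[part] = fh.read()
        # Topics were written as a comma-separated list; drop empty entries.
        doc['topics'] = [t for t in doc.pop('topic').split(',') if t]
        documents.append(doc)
    return documents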