Skip to content

Instantly share code, notes, and snippets.

@davidlenz
Last active April 26, 2018 13:26
Show Gist options
  • Save davidlenz/961b5cf501b8f89a8c10cbd45306001b to your computer and use it in GitHub Desktop.
Save davidlenz/961b5cf501b8f89a8c10cbd45306001b to your computer and use it in GitHub Desktop.
Scrape the top headlines (and their article texts) of all German and English newsapi sources every 12 hours. https://newsapi.org/
import justext, time
import pandas as pd
import requests, urllib
import utils_func
def get_sources(key):
    """
    Retrieve all sources from newsapi, keep the German- and English-language
    ones and return them as a dataframe.

    :param key: newsapi API key
    :return: dataframe of the sources whose 'language' is 'de' or 'en',
        indexed by the source 'id'
    :raises requests.HTTPError: if newsapi answers with an error status
    :raises requests.Timeout: if newsapi does not answer within the timeout
    """
    # Pass the key through ``params`` so requests URL-encodes it, and bound
    # the wait: without a timeout requests can block indefinitely.
    response = requests.get(
        'https://newsapi.org/v2/sources',
        params={'apiKey': key},
        timeout=30,
    )
    # Surface a rejected request as an HTTP error instead of a KeyError on
    # the missing 'sources' entry further down.
    response.raise_for_status()
    sources = pd.DataFrame(response.json()['sources']).set_index('id')
    sources = sources[sources.language.isin(['de', 'en'])]
    print('found {} sources'.format(sources.shape[0]))
    return sources
def request_newsheader_from_source(source, key):
    """
    Retrieve the top headlines for a single source from newsapi.

    :param source: newsapi source id, for example 'bbc-sport'
    :param key: newsapi API key
    :return: dataframe built from the 'articles' of the response, with an
        added 'source' column holding the requested source id
    :raises requests.HTTPError: if newsapi answers with an error status
    :raises requests.Timeout: if newsapi does not answer within the timeout
    """
    # ``params`` URL-encodes both values; the timeout keeps a stalled
    # connection from blocking the caller forever.
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'sources': source, 'apiKey': key},
        timeout=30,
    )
    # Report a rejected request as an HTTP error rather than as a KeyError
    # on the missing 'articles' entry.
    response.raise_for_status()
    df = pd.DataFrame(response.json()['articles'])
    df['source'] = source
    return df
def english_justtext(url):
    """
    Download a page and extract its text with jusText's English stoplist.

    :param url: address of the page to download
    :return: the text of all paragraphs jusText classifies as 'good', joined
        by single spaces (an empty string if there are none)
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing so.
    import urllib.request

    # open page; the context manager closes the connection after reading and
    # the timeout keeps an unresponsive server from blocking forever
    with urllib.request.urlopen(url, timeout=30) as response:
        page = response.read()
    # apply justtext on page
    paragraphs = justext.justext(page, justext.get_stoplist('English'))
    # keep only the relevant paragraphs and join them into one string
    return ' '.join(
        paragraph.text
        for paragraph in paragraphs
        if paragraph.class_type == 'good'
    )
def german_justtext(url):
    """
    Download a page and extract its text with jusText's German stoplist.

    :param url: address of the page to download
    :return: the text of all paragraphs jusText classifies as 'good', joined
        by single spaces (an empty string if there are none)
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing so.
    import urllib.request

    # open page; the context manager closes the connection after reading and
    # the timeout keeps an unresponsive server from blocking forever
    with urllib.request.urlopen(url, timeout=30) as response:
        page = response.read()
    # apply justtext on page
    paragraphs = justext.justext(page, justext.get_stoplist('German'))
    # keep only the relevant paragraphs and join them into one string
    return ' '.join(
        paragraph.text
        for paragraph in paragraphs
        if paragraph.class_type == 'good'
    )
def get_jusText_text(frame_in):
    """
    Use the jusText library to scrape news article text bodies from websites.

    For every row the English extraction is tried first and the German one
    only if that yields an empty string. A row whose download or extraction
    raises is reported on stdout and skipped; the other rows are still
    processed.

    :param frame_in: dataframe with a column named 'url'; it is not modified
    :return: copy of frame_in with a 'text' column; rows for which no text
        could be stored keep a missing value (or their previous 'text')
    """
    frame = frame_in.copy()
    # Create the column up front so the result has the same columns even when
    # the frame is empty or every single row fails.
    if 'text' not in frame.columns:
        frame['text'] = None
    for i, url in frame['url'].items():
        try:
            text = english_justtext(url)
            # if empty string, do german justtext
            if text == "":
                text = german_justtext(url)
            # if still empty, notify
            if text == '':
                print(i, url, 'nope')
            # add the text to the entry in the dataframe, with single quotes
            # stripped from it
            frame.loc[i, 'text'] = text.replace("'", "")
        except Exception as e:
            # Deliberately broad: one unreachable or unparsable page must not
            # abort the scraping of all remaining rows.
            print(i, url, e)
    return frame
if __name__ == '__main__':
    import os

    # Read the newsapi key from the environment so it never has to be written
    # into the source file.
    key = os.environ.get('NEWSAPI_KEY')
    if not key:
        raise SystemExit('Set the NEWSAPI_KEY environment variable to your newsapi key')

    # seconds to wait between two scraping runs (12 hours)
    sleep_time = int(3600*12) #int(sys.argv[1])

    # to_csv does not create missing directories, so make sure the output
    # folder exists before the first (long) scraping run starts.
    out_dir = 'newsapi_v2'
    os.makedirs(out_dir, exist_ok=True)

    while True:
        # NOTE(review): presumably a timestamp string that is safe to use in
        # a file name - confirm against utils_func.get_timestamp
        date_n_time = utils_func.get_timestamp()
        print()
        print('Time:', date_n_time)

        print('Requesting News Sources from Newsapi...')
        sources = get_sources(key)

        print('Scraping Top Headlines..')
        ddf = [request_newsheader_from_source(source, key) for source in sources.index.unique()]
        # one frame over all sources, renumbered 0..n-1 so that every row has
        # a unique index label
        news = pd.concat(ddf).reset_index(drop=True)

        print('Scraping article texts for top headlines...')
        out = get_jusText_text(news)

        outpath = '{}/{}.csv'.format(out_dir, date_n_time)
        print('Saving result to csv: ', outpath)
        out.to_csv(outpath, encoding='utf-8-sig')

        print('Sleeping for {} minutes'.format(sleep_time/60))
        time.sleep(sleep_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment