Skip to content

Instantly share code, notes, and snippets.

Last active April 26, 2018 13:26
Show Gist options
  • Save davidlenz/961b5cf501b8f89a8c10cbd45306001b to your computer and use it in GitHub Desktop.
Save davidlenz/961b5cf501b8f89a8c10cbd45306001b to your computer and use it in GitHub Desktop.
Every 12 hours, fetch the top headlines of all German and English newsapi sources and scrape the article text behind each headline.
import justext, time
import pandas as pd
import requests, urllib
import utils_func
def get_sources(key):
    """
    Retrieve all sources from newsapi, keep the German- and English-language
    ones and return them as a dataframe.

    :param key: newsapi API key
    :return: dataframe indexed by source id, restricted to rows whose
             ``language`` is 'de' or 'en'
    :raises requests.HTTPError: if newsapi answers with an HTTP error status
    """
    # NOTE(review): the endpoint was missing from the original format string
    # (only the key was being requested); restored to the v2 sources endpoint
    # -- confirm against the newsapi documentation.
    response = requests.get(
        'https://newsapi.org/v2/sources',
        params={'apiKey': key},
        timeout=30,  # never hang the scraping loop on a stalled connection
    )
    # Fail with the HTTP error instead of a KeyError on the missing
    # 'sources' field further down.
    response.raise_for_status()
    sources = pd.DataFrame(response.json()['sources']).set_index('id')
    sources = sources[sources.language.isin(['de', 'en'])]
    print('found {} sources'.format(sources.shape[0]))
    return sources
def request_newsheader_from_source(source, key):
    """
    Retrieve the top headlines for a single source from newsapi.

    :param source: newsapi source id, for example 'bbc-sport'
    :param key: newsapi API key
    :return: dataframe holding the top headlines with meta information; its
             'source' column is set to the requested source id
    :raises requests.HTTPError: if newsapi answers with an HTTP error status
    """
    # NOTE(review): the URL prefix was missing from the original format
    # string ('{}&apiKey={}'); restored to the v2 top-headlines endpoint --
    # confirm against the newsapi documentation.
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        # passing the query through `params` lets requests URL-encode it
        params={'sources': source, 'apiKey': key},
        timeout=30,  # never hang the scraping loop on a stalled connection
    )
    # Fail with the HTTP error instead of a KeyError on the missing
    # 'articles' field further down.
    response.raise_for_status()
    df = pd.DataFrame(response.json()['articles'])
    # Overwrite whatever the API put into 'source' with the plain source id.
    df['source'] = source
    return df
def _justext_text(url, language, timeout=30):
    """
    Download a page and return the text of its 'good' jusText paragraphs.

    :param url: address of the page to download
    :param language: name of the jusText stoplist to use, e.g. 'English'
    :param timeout: seconds to wait for the server before giving up
    :return: the 'good' paragraphs joined by single spaces; '' if there are none
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        page = response.read()
    paragraphs = justext.justext(page, justext.get_stoplist(language))
    # keep only the paragraphs jusText classified as 'good'
    return ' '.join(
        paragraph.text for paragraph in paragraphs
        if paragraph.class_type == 'good'
    )


def english_justtext(url):
    """
    Extract the article text of a page using the English jusText stoplist.

    :param url: address of the page to scrape
    :return: extracted text as a single string, '' if nothing was classified 'good'
    """
    return _justext_text(url, 'English')


def german_justtext(url):
    """
    Extract the article text of a page using the German jusText stoplist.

    :param url: address of the page to scrape
    :return: extracted text as a single string, '' if nothing was classified 'good'
    """
    return _justext_text(url, 'German')
def get_jusText_text(frame_in, extractors=None):
    """
    Use the jusText library to scrape news article text bodies from websites.

    Each row's url is passed to the extractors in order until one returns a
    non-empty string. Rows whose extraction raises are reported on stdout and
    left without a text value; the input dataframe is not modified.

    :param frame_in: dataframe with a column named 'url'
    :param extractors: optional sequence of callables ``url -> str`` tried in
                       order; defaults to English jusText, then German jusText
    :return: copy of the dataframe with the scraped text in column 'text'
    """
    if extractors is None:
        # resolved lazily so the default follows the module-level functions
        extractors = (english_justtext, german_justtext)

    frame = frame_in.copy()
    for i, row in frame.iterrows():
        # One unreachable or malformed page must not abort the whole run.
        try:
            text = ''
            for extract in extractors:
                text = extract(row.url)
                # stop at the first extractor that yields any text
                if text != '':
                    break
            # if still empty, notify
            if text == '':
                print(i, row.url, 'nope')
            # add the text to the entry in the dataframe, with single
            # quotes stripped
            frame.loc[i, 'text'] = text.replace("'", "")
        except Exception as e:
            print(i, row.url, e)
    return frame
if __name__ == '__main__':
    # Script entry point: poll newsapi in an endless loop and write one csv
    # snapshot of the scraped top headlines per round.
    import os

    # The key is read from the environment so it never lands in the source;
    # the original line was an unfilled placeholder (`key = # newsapi key`).
    key = os.environ.get('NEWSAPI_KEY', '')  # newsapi key
    if not key:
        raise SystemExit('Set the NEWSAPI_KEY environment variable to your newsapi key')

    # Create the output directory up front; otherwise to_csv would fail only
    # after a complete scraping round.
    os.makedirs('newsapi_v2', exist_ok=True)

    sleep_time = int(3600*12) #int(sys.argv[1])

    while True:
        date_n_time = utils_func.get_timestamp()
        print('Time:', date_n_time)

        print('Requesting News Sources from Newsapi...')
        sources = get_sources(key)

        print('Scraping Top Headlines..')
        ddf = [request_newsheader_from_source(source, key) for source in sources.index.unique()]
        news = pd.concat(ddf).reset_index(drop=True)

        print('Scraping article texts for top headlines...')
        out = get_jusText_text(news)

        outpath = 'newsapi_v2/{}.csv'.format(date_n_time)
        print('Saving result to csv: ', outpath)
        out.to_csv(outpath, encoding='utf-8-sig')

        print('Sleeping for {} minutes'.format(sleep_time/60))
        # The original announced the pause but never slept, so the loop
        # would have hit the API again immediately.
        time.sleep(sleep_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment