Last active
April 26, 2018 13:26
-
-
Save davidlenz/961b5cf501b8f89a8c10cbd45306001b to your computer and use it in GitHub Desktop.
Scrape the top headlines, including the article texts, of all German and English newsapi sources every 12 hours. https://newsapi.org/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import justext, time | |
import pandas as pd | |
import requests, urllib | |
import utils_func | |
def get_sources(key, timeout=30):
    """
    Retrieve all sources from newsapi, keep the German- and English-language
    ones and return them as a dataframe indexed by the source id.

    :param key: newsapi API key
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: dataframe of the sources whose 'language' is 'de' or 'en',
             indexed by 'id'
    :raises requests.HTTPError: if newsapi answers with an HTTP error status
    """
    # Pass the key through ``params`` so requests URL-encodes it, and always
    # set a timeout: without one a stalled connection blocks the caller forever.
    response = requests.get(
        'https://newsapi.org/v2/sources',
        params={'apiKey': key},
        timeout=timeout,
    )
    # Surface an HTTP error status directly instead of an opaque KeyError
    # on the missing 'sources' entry below.
    response.raise_for_status()
    sources = pd.DataFrame(response.json()['sources']).set_index('id')
    sources = sources[sources.language.isin(['de', 'en'])]
    print('found {} sources'.format(sources.shape[0]))
    return sources
def request_newsheader_from_source(source, key, timeout=30):
    """
    Retrieve the top headlines for a single source from newsapi.

    :param source: newsapi source id, for example 'bbc-sport'
    :param key: newsapi API key
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: dataframe built from the 'articles' entry of the response, with
             an added 'source' column holding ``source``
    :raises requests.HTTPError: if newsapi answers with an HTTP error status
    """
    # ``params`` URL-encodes both values; the timeout keeps a stalled
    # connection from blocking the caller forever.
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'sources': source, 'apiKey': key},
        timeout=timeout,
    )
    # Surface an HTTP error status directly instead of an opaque KeyError
    # on the missing 'articles' entry below.
    response.raise_for_status()
    df = pd.DataFrame(response.json()['articles'])
    df['source'] = source
    return df
def english_justtext(url, timeout=30):
    """
    Download a page and extract its text with jusText's English stoplist.

    :param url: address of the page to scrape
    :param timeout: seconds to wait on the connection before giving up
    :return: text of all paragraphs jusText classifies as 'good', joined by
             single spaces ('' if there are none)
    """
    # Open the page inside a ``with`` block so the connection is closed even
    # if reading fails; the timeout keeps a dead server from hanging the run.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        page = response.read()
    # Apply jusText to the raw page.
    paragraphs = justext.justext(page, justext.get_stoplist('English'))
    # Keep only the relevant paragraphs and join them into one string.
    return ' '.join(
        paragraph.text for paragraph in paragraphs
        if paragraph.class_type == 'good'
    )
def german_justtext(url, timeout=30):
    """
    Download a page and extract its text with jusText's German stoplist.

    :param url: address of the page to scrape
    :param timeout: seconds to wait on the connection before giving up
    :return: text of all paragraphs jusText classifies as 'good', joined by
             single spaces ('' if there are none)
    """
    # Open the page inside a ``with`` block so the connection is closed even
    # if reading fails; the timeout keeps a dead server from hanging the run.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        page = response.read()
    # Apply jusText to the raw page.
    paragraphs = justext.justext(page, justext.get_stoplist('German'))
    # Keep only the relevant paragraphs and join them into one string.
    return ' '.join(
        paragraph.text for paragraph in paragraphs
        if paragraph.class_type == 'good'
    )
def get_jusText_text(frame_in):
    """
    Use the jusText library to scrape news article text bodies from websites.

    Every URL is first processed with the English stoplist; if that yields
    no text, the German stoplist is tried. Rows whose page cannot be fetched
    or parsed are reported on stdout and keep a missing value in 'text'.

    :param frame_in: dataframe with a column named 'url'; it is not modified
    :return: copy of ``frame_in`` with a 'text' column holding the scraped
             text (single quotes removed)
    """
    frame = frame_in.copy()
    # Create the column up front so the result always has it, even when the
    # frame is empty or every single download fails. An existing 'text'
    # column is left alone.
    if 'text' not in frame.columns:
        frame['text'] = pd.Series(index=frame.index, dtype=object)
    for i, row in frame.iterrows():
        try:
            text = english_justtext(row.url)
            # if empty string, do german justtext
            if not text:
                text = german_justtext(row.url)
            # if still empty, notify
            if not text:
                print(i, row.url, 'nope')
            # add the text to the entry in the dataframe
            frame.loc[i, 'text'] = text.replace("'", "")
        except Exception as e:
            # Deliberately broad: scraping is best effort, so one bad URL
            # must not abort the whole batch. Report it and move on.
            print(i, row.url, e)
    return frame
if __name__ == '__main__':
    # Script entry point: every 12 hours, fetch the newsapi sources, their
    # top headlines and the article texts, and store them as one CSV file.
    import os

    # The API key is a secret, so it is read from the environment instead of
    # being hard-coded in the script.
    key = os.environ.get('NEWSAPI_KEY')
    if not key:
        raise SystemExit('Set the NEWSAPI_KEY environment variable to your newsapi key')

    # to_csv does not create missing directories, so make sure it exists.
    out_dir = 'newsapi_v2'
    os.makedirs(out_dir, exist_ok=True)

    # Pause between two scraping rounds, in seconds.
    sleep_time = int(3600*12)

    while True:
        date_n_time = utils_func.get_timestamp()
        print()
        print('Time:', date_n_time)

        print('Requesting News Sources from Newsapi...')
        sources = get_sources(key)

        print('Scraping Top Headlines..')
        ddf = [request_newsheader_from_source(source, key) for source in sources.index.unique()]
        # Fresh 0..n-1 index so every row label is unique across sources.
        news = pd.concat(ddf).reset_index(drop=True)

        print('Scraping article texts for top headlines...')
        out = get_jusText_text(news)

        outpath = '{}/{}.csv'.format(out_dir, date_n_time)
        print('Saving result to csv: ', outpath)
        out.to_csv(outpath, encoding='utf-8-sig')

        print('Sleeping for {} minutes'.format(sleep_time/60))
        time.sleep(sleep_time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment