# davidlenz/scrape_newsapi.py

## scrape_newsapi.py
import justext, time
import pandas as pd
import requests, urllib
import utils_func


def get_sources(key, languages=('de', 'en'), timeout=30):
    """
    Retrieve all sources from newsapi, keep those published in one of the
    wanted languages and return them as dataframe indexed by source id.

    :param key: newsapi API key
    :param languages: iterable of language codes to keep
                      (default: german and english)
    :param timeout: seconds to wait for the server before giving up
    :return: dataframe of the matching sources, indexed by their 'id'
    :raises requests.HTTPError: if the server answers with an error status
    """
    # hand the key over via ``params`` so requests URL-encodes it
    response = requests.get('https://newsapi.org/v2/sources',
                            params={'apiKey': key},
                            timeout=timeout)
    # surface HTTP errors directly instead of a later KeyError on 'sources'
    response.raise_for_status()
    sources = pd.DataFrame(response.json()['sources']).set_index('id')

    sources = sources[sources.language.isin(list(languages))]
    print('found {} sources'.format(sources.shape[0]))
    return sources


def request_newsheader_from_source(source, key, timeout=30):
    """
    Retrieve the top headlines for a single source from newsapi

    :param source: newsapi source id, for example 'bbc-sport'
    :param key: newsapi API key
    :param timeout: seconds to wait for the server before giving up
    :return: dataframe holding the top headlines with meta information;
             its 'source' column is set to ``source`` for every row
    :raises requests.HTTPError: if the server answers with an error status
    """
    # ``params`` lets requests build and URL-encode the query string
    response = requests.get('https://newsapi.org/v2/top-headlines',
                            params={'sources': source, 'apiKey': key},
                            timeout=timeout)
    # surface HTTP errors directly instead of a later KeyError on 'articles'
    response.raise_for_status()

    df = pd.DataFrame(response.json()['articles'])
    # set (or overwrite) the column with the plain source id
    df['source'] = source
    return df


def _fetch_justext_text(url, language, timeout=30):
    """
    Download ``url`` and return the text of all paragraphs that jusText
    classifies as 'good', joined by single spaces.

    :param url: address of the page to download
    :param language: name of the jusText stoplist to use, e.g. 'English'
    :param timeout: seconds to wait for the server before giving up
    :return: extracted text, '' if no paragraph was classified as 'good'
    """
    # imported here so the ``urllib.request`` submodule is guaranteed to be
    # loaded; a plain ``import urllib`` does not import it
    import urllib.request

    # open page; the context manager closes the connection even on errors
    with urllib.request.urlopen(url, timeout=timeout) as response:
        page = response.read()

    # apply justtext on page
    paragraphs = justext.justext(page, justext.get_stoplist(language))

    # keep the relevant paragraphs and turn the list into one string
    return ' '.join(p.text for p in paragraphs if p.class_type == 'good')


def english_justtext(url, timeout=30):
    """
    Extract the main text of the page at ``url`` with the English stoplist.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: extracted text, '' if nothing was classified as 'good'
    """
    return _fetch_justext_text(url, 'English', timeout)


def german_justtext(url, timeout=30):
    """
    Extract the main text of the page at ``url`` with the German stoplist.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: extracted text, '' if nothing was classified as 'good'
    """
    return _fetch_justext_text(url, 'German', timeout)


def get_jusText_text(frame_in):
    """
    Use the jusText library to scrape news article text bodies from websites.

    Every url is first extracted with the English stoplist; if that yields
    an empty string the German stoplist is tried. Rows whose page cannot be
    fetched or parsed are printed and keep their existing 'text' value
    (None if the column was created here).

    :param frame_in: dataframe with a column named 'url' (left unmodified)
    :return: copy of ``frame_in`` with a 'text' column holding the texts
    """
    frame = frame_in.copy()

    # create the column up front so the result always has it, even for an
    # empty frame or when every single download fails
    if 'text' not in frame.columns:
        frame['text'] = None

    for i, row in frame.iterrows():
        try:
            text = english_justtext(row.url)
            # if empty string, do german justtext
            if not text:
                text = german_justtext(row.url)

            # if still empty, notify
            if not text:
                print(i, row.url, 'nope')

            # add the text to the entry in the dataframe
            frame.loc[i, 'text'] = text.replace("'", "")
        except Exception as e:
            # best effort: one broken page must not abort the whole scrape
            print(i, row.url, e)
    return frame


if __name__ == '__main__':
    # Script entry point: every 12 hours fetch the newsapi sources, their top
    # headlines and the article texts, and store the result as one csv file.
    import os

    # read the key from the environment so it does not live in the source
    key = os.environ.get('NEWSAPI_KEY')  # newsapi key
    if not key:
        raise SystemExit('Please set the NEWSAPI_KEY environment variable')
    methods = ['none', 'latest', 'top', 'popular']  # NOTE(review): unused in the visible code

    # loop invariant, so computed once
    sleep_time = int(3600*12) #int(sys.argv[1])

    # make sure the output folder exists before hours of scraping are spent
    os.makedirs('newsapi_v2', exist_ok=True)

    while True:

        date_n_time = utils_func.get_timestamp()

        print()
        print('Time:', date_n_time)

        print('Requesting News Sources from Newsapi...')
        sources = get_sources(key)

        print('Scraping Top Headlines..')
        ddf = [request_newsheader_from_source(source, key) for source in sources.index.unique()]
        news = pd.concat(ddf).reset_index(drop=True)

        print('Scraping article texts for top headlines...')
        out = get_jusText_text(news)

        outpath = 'newsapi_v2/{}.csv'.format(date_n_time)
        print('Saving result to csv: ', outpath)
        out.to_csv(outpath, encoding='utf-8-sig')

        print('Sleeping for {} minutes'.format(sleep_time/60))
        time.sleep(sleep_time)
	import justext, time
	import pandas as pd
	import requests, urllib
	import utils_func


	def get_sources(key, languages=('de', 'en'), timeout=30):
		"""
		Retrieve all sources from newsapi, keep those published in one of the
		wanted languages and return them as dataframe indexed by source id.

		:param key: newsapi API key
		:param languages: iterable of language codes to keep
		                  (default: german and english)
		:param timeout: seconds to wait for the server before giving up
		:return: dataframe of the matching sources, indexed by their 'id'
		:raises requests.HTTPError: if the server answers with an error status
		"""
		# hand the key over via ``params`` so requests URL-encodes it
		response = requests.get('https://newsapi.org/v2/sources',
		                        params={'apiKey': key},
		                        timeout=timeout)
		# surface HTTP errors directly instead of a later KeyError on 'sources'
		response.raise_for_status()
		sources = pd.DataFrame(response.json()['sources']).set_index('id')

		sources = sources[sources.language.isin(list(languages))]
		print('found {} sources'.format(sources.shape[0]))
		return sources


	def request_newsheader_from_source(source, key, timeout=30):
		"""
		Retrieve the top headlines for a single source from newsapi

		:param source: newsapi source id, for example 'bbc-sport'
		:param key: newsapi API key
		:param timeout: seconds to wait for the server before giving up
		:return: dataframe holding the top headlines with meta information;
		         its 'source' column is set to ``source`` for every row
		:raises requests.HTTPError: if the server answers with an error status
		"""
		# ``params`` lets requests build and URL-encode the query string
		response = requests.get('https://newsapi.org/v2/top-headlines',
		                        params={'sources': source, 'apiKey': key},
		                        timeout=timeout)
		# surface HTTP errors directly instead of a later KeyError on 'articles'
		response.raise_for_status()

		df = pd.DataFrame(response.json()['articles'])
		# set (or overwrite) the column with the plain source id
		df['source'] = source
		return df


	def _fetch_justext_text(url, language, timeout=30):
		"""
		Download ``url`` and return the text of all paragraphs that jusText
		classifies as 'good', joined by single spaces.

		:param url: address of the page to download
		:param language: name of the jusText stoplist to use, e.g. 'English'
		:param timeout: seconds to wait for the server before giving up
		:return: extracted text, '' if no paragraph was classified as 'good'
		"""
		# imported here so the ``urllib.request`` submodule is guaranteed to be
		# loaded; a plain ``import urllib`` does not import it
		import urllib.request

		# open page; the context manager closes the connection even on errors
		with urllib.request.urlopen(url, timeout=timeout) as response:
			page = response.read()

		# apply justtext on page
		paragraphs = justext.justext(page, justext.get_stoplist(language))

		# keep the relevant paragraphs and turn the list into one string
		return ' '.join(p.text for p in paragraphs if p.class_type == 'good')


	def english_justtext(url, timeout=30):
		"""
		Extract the main text of the page at ``url`` with the English stoplist.

		:param url: address of the page to download
		:param timeout: seconds to wait for the server before giving up
		:return: extracted text, '' if nothing was classified as 'good'
		"""
		return _fetch_justext_text(url, 'English', timeout)


	def german_justtext(url, timeout=30):
		"""
		Extract the main text of the page at ``url`` with the German stoplist.

		:param url: address of the page to download
		:param timeout: seconds to wait for the server before giving up
		:return: extracted text, '' if nothing was classified as 'good'
		"""
		return _fetch_justext_text(url, 'German', timeout)



	def get_jusText_text(frame_in):
		"""
		Use the jusText library to scrape news article text bodies from websites.

		Every url is first extracted with the English stoplist; if that yields
		an empty string the German stoplist is tried. Rows whose page cannot be
		fetched or parsed are printed and keep their existing 'text' value
		(None if the column was created here).

		:param frame_in: dataframe with a column named 'url' (left unmodified)
		:return: copy of ``frame_in`` with a 'text' column holding the texts
		"""
		frame = frame_in.copy()

		# create the column up front so the result always has it, even for an
		# empty frame or when every single download fails
		if 'text' not in frame.columns:
			frame['text'] = None

		for i, row in frame.iterrows():
			try:
				text = english_justtext(row.url)
				# if empty string, do german justtext
				if not text:
					text = german_justtext(row.url)

				# if still empty, notify
				if not text:
					print(i, row.url, 'nope')

				# add the text to the entry in the dataframe
				frame.loc[i, 'text'] = text.replace("'", "")
			except Exception as e:
				# best effort: one broken page must not abort the whole scrape
				print(i, row.url, e)
		return frame


	if __name__ == '__main__':
		# Script entry point: every 12 hours fetch the newsapi sources, their top
		# headlines and the article texts, and store the result as one csv file.
		import os

		# read the key from the environment so it does not live in the source
		key = os.environ.get('NEWSAPI_KEY')  # newsapi key
		if not key:
			raise SystemExit('Please set the NEWSAPI_KEY environment variable')
		methods = ['none', 'latest', 'top', 'popular']  # NOTE(review): unused in the visible code

		# loop invariant, so computed once
		sleep_time = int(3600*12) #int(sys.argv[1])

		# make sure the output folder exists before hours of scraping are spent
		os.makedirs('newsapi_v2', exist_ok=True)

		while True:

			date_n_time = utils_func.get_timestamp()

			print()
			print('Time:', date_n_time)

			print('Requesting News Sources from Newsapi...')
			sources = get_sources(key)

			print('Scraping Top Headlines..')
			ddf = [request_newsheader_from_source(source, key) for source in sources.index.unique()]
			news = pd.concat(ddf).reset_index(drop=True)

			print('Scraping article texts for top headlines...')
			out = get_jusText_text(news)

			outpath = 'newsapi_v2/{}.csv'.format(date_n_time)
			print('Saving result to csv: ', outpath)
			out.to_csv(outpath, encoding='utf-8-sig')

			print('Sleeping for {} minutes'.format(sleep_time/60))
			time.sleep(sleep_time)