Scrape the heise newsticker archive (https://www.heise.de/newsticker/archiv) using BeautifulSoup. The script first collects all article URLs from the monthly archive pages, then downloads each article and extracts its text with jusText, falling back to a direct BeautifulSoup parse of the article body when jusText returns nothing.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import datetime
def get_timestamp():
    """Return the current local time as a filesystem-safe string."""
    return datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S')
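# Example (output depends on the call time): get_timestamp() returns a
# string like '2019-08-06 18-12-00'; colons are replaced by dashes so the
# value can be used directly in filenames.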
def get_soup(quote_page):
    """Fetch a URL and return the raw response together with its parsed soup."""
    page = requests.get(quote_page)
    soup = BeautifulSoup(page.content, 'html.parser')
    return page, soup
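# Minimal usage sketch, fetching a single archive page from the range
# scraped below:
# page, soup = get_soup('https://www.heise.de/newsticker/archiv/2019/01')
# print(page.status_code)  # expect 200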
## Download all relevant links from all archive pages
years = list(range(1997, 2020))
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

dfs = []
for year in years:
    print(year)
    for month in months:
        print(month)
        url = 'https://www.heise.de/newsticker/archiv/{}/{}'.format(year, month)
        page, soup = get_soup(url)
        # every archive entry is an <a class="archiv-liste__text"> carrying href and title
        news = soup.find_all('a', {'class': 'archiv-liste__text'}, href=True)
        urls = [new['href'] for new in news]
        titles = [new['title'].strip() for new in news]
        df = pd.DataFrame([titles, urls]).T
        df.columns = ['title', 'url']
        df['month'] = month
        df['year'] = year
        dfs.append(df)

out = pd.concat(dfs)
# create folder (exist_ok avoids a crash when the folder already exists)
os.makedirs('heise_archiv/', exist_ok=True)

# save urls
out.to_csv('heise_archiv/urls.csv', encoding='utf-8-sig')

# read urls (index_col=0 restores the saved index instead of adding an 'Unnamed: 0' column)
out = pd.read_csv('heise_archiv/urls.csv', encoding='utf-8-sig', index_col=0)
## Iterate over the resulting dataframe to download all the texts from the stored urls
out['url'] = 'https://www.heise.de' + out['url']  # hrefs in the archive are relative
out = out.reset_index(drop=True)  # fresh index, old one discarded
import justext

frame = out.copy()
print(out.shape)
for i, row in frame.iterrows():
    print(i)
    try:
        response, soup = get_soup(row.url)

        # primary extraction: jusText strips boilerplate paragraphs from the article
        paragraphs = justext.justext(response.content, justext.get_stoplist('German'))
        b = [paragraph.text.replace('\n', ' ') for paragraph in paragraphs if not paragraph.is_boilerplate]
        text2 = ' '.join(b)

        # fallback extraction: grab the <p> tags of the article body directly
        text1 = ''
        if text2 == '':
            print(i, row.url, 'nope')
            text1 = soup.find_all('div', class_='article-layout__content')[0].find_all('p')
            text1 = ' '.join([art.get_text() for art in text1]).strip()

        author = soup.find_all('meta', {'name': 'author'})[0]['content']
        description = soup.find_all('meta', {'property': 'og:description'})[0]['content']
        date = soup.find_all('meta', {'name': 'date'})[0]['content']

        # append one row per article; header=False/index=False keep repeated appends clean
        tmpdf = pd.DataFrame([text1, text2, author, description, date])
        tmpdf.T.to_csv('heise_archiv/heise_archiv.csv', encoding='utf-8', mode='a', header=False, index=False)
    except Exception as e:
        print('ERROR', e)
        print(i, row.url)
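# Read the scraped articles back in for a quick check. A minimal sketch:
# the column names below are ad-hoc labels matching the write order above,
# since the CSV is appended without a header.
articles = pd.read_csv('heise_archiv/heise_archiv.csv', encoding='utf-8',
                       names=['text_fallback', 'text', 'author', 'description', 'date'])
print(articles.shape)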