Scrape the heise newsticker archive (https://www.heise.de/newsticker/archiv) using BeautifulSoup. The script first collects all article URLs from the monthly archive pages, then downloads each article and extracts its text with jusText, falling back to a direct BeautifulSoup parse of the article body when jusText returns nothing.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import datetime
def get_timestamp():
    """Return the current local time as a filesystem-safe string."""
    return datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S')
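# Example (output depends on the call time): get_timestamp() returns a
# string like '2019-08-06 18-12-00'; colons are replaced by dashes so the
# value can be used directly in filenames.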
def get_soup(quote_page):
    """Fetch a URL and return the raw response together with its parsed soup."""
    page = requests.get(quote_page)
    soup = BeautifulSoup(page.content, 'html.parser')
    return page, soup
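# Minimal usage sketch, fetching a single archive page from the range
# scraped below:
# page, soup = get_soup('https://www.heise.de/newsticker/archiv/2019/01')
# print(page.status_code)  # expect 200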
## Download all relevant links from all archive pages
years = list(range(1997, 2020))
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

dfs = []
for year in years:
    print(year)
    for month in months:
        print(month)
        url = 'https://www.heise.de/newsticker/archiv/{}/{}'.format(year, month)
        page, soup = get_soup(url)
        # every archive entry is an <a class="archiv-liste__text"> carrying href and title
        news = soup.find_all('a', {'class': 'archiv-liste__text'}, href=True)
        urls = [new['href'] for new in news]
        titles = [new['title'].strip() for new in news]
        df = pd.DataFrame([titles, urls]).T
        df.columns = ['title', 'url']
        df['month'] = month
        df['year'] = year
        dfs.append(df)

out = pd.concat(dfs)
# create folder (exist_ok avoids a crash when the folder already exists)
os.makedirs('heise_archiv/', exist_ok=True)

# save urls
out.to_csv('heise_archiv/urls.csv', encoding='utf-8-sig')

# read urls (index_col=0 restores the saved index instead of adding an 'Unnamed: 0' column)
out = pd.read_csv('heise_archiv/urls.csv', encoding='utf-8-sig', index_col=0)
## Iterate over the resulting dataframe to download all the texts from the stored urls
out['url'] = 'https://www.heise.de' + out['url']  # hrefs in the archive are relative
out = out.reset_index(drop=True)  # fresh index, old one discarded
import justext

frame = out.copy()
print(out.shape)
for i, row in frame.iterrows():
    print(i)
    try:
        response, soup = get_soup(row.url)

        # primary extraction: jusText strips boilerplate paragraphs from the article
        paragraphs = justext.justext(response.content, justext.get_stoplist('German'))
        b = [paragraph.text.replace('\n', ' ') for paragraph in paragraphs if not paragraph.is_boilerplate]
        text2 = ' '.join(b)

        # fallback extraction: grab the <p> tags of the article body directly
        text1 = ''
        if text2 == '':
            print(i, row.url, 'nope')
            text1 = soup.find_all('div', class_='article-layout__content')[0].find_all('p')
            text1 = ' '.join([art.get_text() for art in text1]).strip()

        author = soup.find_all('meta', {'name': 'author'})[0]['content']
        description = soup.find_all('meta', {'property': 'og:description'})[0]['content']
        date = soup.find_all('meta', {'name': 'date'})[0]['content']

        # append one row per article; header=False/index=False keep repeated appends clean
        tmpdf = pd.DataFrame([text1, text2, author, description, date])
        tmpdf.T.to_csv('heise_archiv/heise_archiv.csv', encoding='utf-8', mode='a', header=False, index=False)
    except Exception as e:
        print('ERROR', e)
        print(i, row.url)
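# Read the scraped articles back in for a quick check. A minimal sketch:
# the column names below are ad-hoc labels matching the write order above,
# since the CSV is appended without a header.
articles = pd.read_csv('heise_archiv/heise_archiv.csv', encoding='utf-8',
                       names=['text_fallback', 'text', 'author', 'description', 'date'])
print(articles.shape)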