Skip to content

Instantly share code, notes, and snippets.

Last active August 6, 2019 18:12
Show Gist options
  • Save davidlenz/2f14b28ead67ee1522c2d5fcbf76a45e to your computer and use it in GitHub Desktop.
Save davidlenz/2f14b28ead67ee1522c2d5fcbf76a45e to your computer and use it in GitHub Desktop.
Scrape the heise newsticker archive using BeautifulSoup.
import requests
import bs4 as bs
from bs4 import BeautifulSoup
import pandas as pd
import os
def get_timestamp():
    """Return the current local time as a filesystem-safe string 'YYYY-MM-DD HH-MM-SS'.

    Hyphens (not colons) separate the time fields so the value can be used
    in file names on all platforms.
    """
    import datetime
    # datetime.now() is equivalent to fromtimestamp(time.time()) but simpler,
    # and drops the need for the extra `time` import.
    return datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
def get_soup(quote_page):
    """Fetch *quote_page* via HTTP GET and parse the body as HTML.

    Returns a ``(response, soup)`` tuple so callers can inspect both the raw
    HTTP response and the parsed document.
    """
    response = requests.get(quote_page)
    parsed = BeautifulSoup(response.content, 'html.parser')
    return response, parsed
## Download all relevant links from all archive pages
years = list(range(1997, 2020))
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dfs = []
for year in years:
    for month in months:
        # NOTE(review): the archive base URL was lost in the original — it
        # formatted only 'YYYY/MM', which is not fetchable. Confirm the base.
        url = 'https://www.heise.de/newsticker/archiv/{}/{}'.format(year, month)
        page, soup = get_soup(url)
        # Each archive entry is an <a class="archiv-liste__text" href=... title=...>.
        news = soup.find_all('a', {'class': 'archiv-liste__text'}, href=True)
        urls = [item['href'] for item in news]
        titles = [item['title'].strip() for item in news]
        df = pd.DataFrame([titles, urls]).T
        df.columns = ["title", "url"]
        df['month'] = month
        df['year'] = year
        # bug fix: df was never collected, so pd.concat(dfs) below would
        # raise on an empty list.
        dfs.append(df)
out = pd.concat(dfs)
# create folder
# NOTE(review): the original code under these comments was lost in extraction;
# reconstructed as the obvious makedirs + to_csv pair — confirm against intent.
os.makedirs('heise_archiv', exist_ok=True)
# save urls
out.to_csv('heise_archiv/urls.csv', encoding='utf-8-sig', index=False)
# read urls
out = pd.read_csv('heise_archiv/urls.csv', encoding='utf-8-sig')
# ## Iterate over the resulting dataframe to download all the texts from the stored urls
# NOTE(review): archive hrefs are site-relative, so a host prefix is needed here;
# the original prefix string was lost ('' + ... is a no-op) — confirm the base URL.
out['url'] = 'https://www.heise.de' + out['url']
out.reset_index(inplace=True)  # set new index
out = out.drop('index', axis=1)  # drop the old index
out = out.rename(columns={'0': 'titel', '1': 'link'})  # rename columns
import justext  # boilerplate-removal library used to extract article text below
# out = pd.read_csv('out_tmp.csv', index_col='Unnamed: 0')
import numpy as np
# Work on a copy so the original url table in `out` stays untouched.
frame = out.copy()
for i, row in frame.iterrows():
    # bug fix: the `except` below had no matching `try:` (line lost in the
    # gist extraction) — restored so one failing article doesn't abort the run.
    try:
        response, soup = get_soup(row.url)
        # Primary extraction: jusText strips boilerplate paragraphs from the HTML.
        paragraphs = justext.justext(response.content, justext.get_stoplist("German"))
        b = [paragraph.text.replace('\n', ' ') for paragraph in paragraphs
             if not paragraph.is_boilerplate]
        text2 = "".join(b)
        if text2 == '':
            # jusText found nothing; flag the URL (text1 below still captures the body).
            print(i, row.url, 'nope')
        # Fallback extraction straight from the article markup.
        text1 = soup.find_all('div', class_='article-layout__content')[0].find_all('p')
        text1 = " ".join([art.get_text() for art in text1]).strip()
        author = soup.find_all('meta', {'name': 'author'})[0]['content']
        description = soup.find_all('meta', {'property': 'og:description'})[0]['content']
        date = soup.find_all('meta', {'name': 'date'})[0]['content']
        tmpdf = pd.DataFrame([text1, text2, author, description, date])
        # Append one row per article; mode='a' accumulates across iterations.
        tmpdf.T.to_csv('heise_archiv/heise_archiv.csv', encoding='utf-8', mode='a')
    except Exception as e:
        # best-effort scrape: log the failing URL and the error, then continue
        print(i, row.url, e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment