Skip to content

Instantly share code, notes, and snippets.

@balzer82
Last active February 24, 2021 07:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save balzer82/e09c4e23410f7ec5ff38 to your computer and use it in GitHub Desktop.
Save balzer82/e09c4e23410f7ec5ff38 to your computer and use it in GitHub Desktop.
Top 10 Video views of CCC events from media.ccc.de
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <headingcell level=1>
# Crawls 'media.ccc.de' and ranks the videos of an event by number of views
# <codecell>
from bs4 import BeautifulSoup
import requests
# <codecell>
#%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# <headingcell level=2>
# Some URLs you can use
# <codecell>
#url = 'http://media.ccc.de/browse/conferences/datenspuren/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/eh2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/fiffkon/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/vcfb/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/hackover/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/mrmcd/mrmcd14/index.html'
#url = 'http://media.ccc.de/browse/conferences/froscon/2014/index.html'
#url = 'http://media.ccc.de/browse/congress/2014/index.html'
# Or you can even try all years for the congresses:
# Root of the site; the talk links on an index page are joined onto this.
BASE_URL = 'http://media.ccc.de/'
# Number of talks kept for the ranking plot.
TOP_N = 10


# <codecell>
def absolute_url(href):
    """Return *href* as an absolute media.ccc.de URL.

    Already-absolute http(s) links are returned unchanged. Anything else is
    treated as relative to the site root; a leading '/' is dropped so the
    result never contains a double slash after the host.
    """
    if href.startswith(('http://', 'https://')):
        return href
    return BASE_URL + href.lstrip('/')


# <codecell>
def parse_view_count(text):
    """Return the integer view count found in *text*, or None if there is none.

    Only the first whitespace-delimited token is considered; non-digit
    characters inside it (e.g. thousands separators) are ignored.
    """
    tokens = text.split()
    if not tokens:
        return None
    digits = ''.join(ch for ch in tokens[0] if '0' <= ch <= '9')
    return int(digits) if digits else None


# <codecell>
def safe_filename(name):
    """Return *name* with characters that are invalid in file names replaced by '_'."""
    return ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name).strip()


# <codecell>
def top_talks(talks, column, n=TOP_N):
    """Return a DataFrame of the *n* most viewed talks, sorted ascending by views.

    talks  -- dict mapping talk title to view count
    column -- name of the single column holding the view counts
    n      -- number of rows to keep (the highest counts end up last)
    """
    # Materialise the dict views: DataFrame wants real sequences on Python 3.
    frame = pd.DataFrame({column: list(talks.values())}, index=list(talks.keys()))
    # DataFrame.sort() was removed from pandas; sort_values() is its replacement.
    return frame.sort_values(column).tail(n)


# <codecell>
def fetch_soup(url):
    """Download *url* and return the parsed page, or None if it is unavailable.

    Connection errors and non-200 responses both yield None so that callers
    can skip a missing page instead of parsing an error document.
    """
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print('Could not fetch %s: %s' % (url, err))
        return None
    if response.status_code != 200:
        print('Skipping %s (HTTP %i)' % (url, response.status_code))
        return None
    # Name the parser explicitly so results do not depend on what is installed.
    return BeautifulSoup(response.content, 'html.parser')


# <codecell>
def event_name(soup):
    """Return the text of the first non-empty <h2>, else <h1>, else ''."""
    for tag_name in ('h2', 'h1'):
        tag = soup.find(tag_name)
        if tag is not None:
            text = tag.get_text().strip()
            if text:
                return text
    return ''


# <codecell>
def crawl_event(url):
    """Crawl one event index page and save a bar chart of its most viewed talks.

    Every link with class 'event-preview' on the index page is followed; the
    talk title is read from the talk page's <h1> and the view count from the
    third <li> of its 'metadata' list. The chart is written to
    '<event name>-Top10-Talks.png' in the current directory. Pages that
    cannot be fetched or lack the expected elements are skipped.
    """
    soup = fetch_soup(url)
    if soup is None:
        return

    eventname = event_name(soup)
    print('%s...' % eventname)

    talkurllist = []
    for link in soup.find_all('a', attrs={'class': 'event-preview'}):
        href = link.get('href')
        if href:
            talkurllist.append(href)

    talks = {}
    for talkurl in talkurllist:
        talk_soup = fetch_soup(absolute_url(talkurl))
        if talk_soup is None:
            continue
        title = talk_soup.find('h1')
        ul = talk_soup.find('ul', attrs={'class': 'metadata'})
        items = ul.find_all('li') if ul is not None else []
        if title is None or len(items) < 3:
            continue
        # The third metadata entry is taken to be the view counter.
        views = parse_view_count(items[2].get_text())
        if views is None:
            continue
        # Titles are cut to 60 characters to keep the axis labels readable;
        # talks sharing the same first 60 characters overwrite each other.
        talks[title.get_text().strip()[:60]] = views

    if not talks:
        # Plotting an empty frame raises, so there is nothing to do here.
        print('No view counts found for %s' % url)
        return

    name = 'Views \'media.ccc.de\''
    df = top_talks(talks, name)

    df.plot(kind='barh', figsize=(11, 5),
            title='Top 10 Most Viewed \'%s\' Talks' % eventname,
            color='#94C600')
    plt.tight_layout()
    plt.savefig('%s-Top10-Talks.png' % safe_filename(eventname), dpi=150)
    # The image is on disk; release the figure so the per-year loop does not
    # accumulate open figures.
    plt.close()


# <codecell>
# NOTE(review): the whole pipeline is assumed to run once per year; confirm
# against the original notebook, whose indentation is not visible here.
if __name__ == '__main__':
    for year in range(2000, 2015):
        crawl_event('http://media.ccc.de/browse/congress/%i/index.html' % year)
# <codecell>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment