Last active
February 24, 2021 07:47
-
-
Save balzer82/e09c4e23410f7ec5ff38 to your computer and use it in GitHub Desktop.
Top 10 Video views of CCC events from media.ccc.de
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <headingcell level=1>
# Crawls 'media.ccc.de' and ranks the videos by view count
# <codecell>
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
# <codecell>
#%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# <headingcell level=2>
# Some URLs you can use
# <codecell>
#url = 'http://media.ccc.de/browse/conferences/datenspuren/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/eh2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/fiffkon/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/vcfb/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/hackover/2014/index.html'
#url = 'http://media.ccc.de/browse/conferences/mrmcd/mrmcd14/index.html'
#url = 'http://media.ccc.de/browse/conferences/froscon/2014/index.html'
#url = 'http://media.ccc.de/browse/congress/2014/index.html'
# Or you can even try all years for the congresses:
# Root of the site; hrefs found on index pages are resolved against it.
BASE_URL = 'http://media.ccc.de/'
# Column label of the ranking table (also shown as the plot legend).
VIEWS_COLUMN = 'Views \'media.ccc.de\''
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30


def parse_view_count(text):
    """Return the integer view count contained in *text*, or None.

    Surrounding/inner whitespace and ',' / '.' thousands separators are
    tolerated; anything that still is not an integer yields None instead
    of raising, so one odd page cannot abort the whole crawl.
    """
    cleaned = ''.join(text.split()).replace(',', '').replace('.', '')
    try:
        return int(cleaned)
    except ValueError:
        return None


def safe_filename(name):
    """Return *name* with path separators replaced so it is a plain file name."""
    return name.replace('/', '_').replace('\\', '_')


def top_talks(talks, count=10, column=VIEWS_COLUMN):
    """Rank talks by views and return the *count* most viewed ones.

    talks  -- dict mapping talk title to its number of views
    count  -- how many rows to keep
    column -- label of the single views column

    The returned DataFrame is sorted ascending, i.e. the most viewed talk
    is the last row (and therefore the top bar of a 'barh' plot).
    """
    frame = pd.DataFrame({column: list(talks.values())},
                         index=list(talks.keys()))
    return frame.sort_values(column).tail(count)


def fetch_soup(url):
    """Download *url* and return the parsed page, or None on a failed request."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print('Skipping %s (%s)' % (url, err))
        return None
    # Name the parser explicitly: without it BeautifulSoup picks whatever
    # happens to be installed and warns about it.
    return BeautifulSoup(response.content, 'html.parser')


def event_name(soup):
    """Return the event name from the first non-empty <h2> or <h1>, else None."""
    for tag in ('h2', 'h1'):
        heading = soup.find(tag)
        if heading is not None and heading.text.strip():
            return heading.text.strip()
    return None


def talk_urls(soup):
    """Return absolute URLs of all 'event-preview' links on an index page."""
    links = soup.find_all('a', attrs={'class': 'event-preview'})
    return [urljoin(BASE_URL, link.get('href'))
            for link in links if link.get('href')]


def talk_views(talk_url):
    """Return (title, views) for one talk page, or None if it cannot be read.

    The title is the page's <h1>, stripped and cut to 60 characters; the
    views are read from the third <li> of the 'metadata' list.
    """
    soup = fetch_soup(talk_url)
    if soup is None:
        return None
    heading = soup.find('h1')
    metadata = soup.find('ul', attrs={'class': 'metadata'})
    if heading is None or metadata is None:
        return None
    entries = metadata.find_all('li')
    if len(entries) < 3:
        return None
    views = parse_view_count(entries[2].text)
    if views is None:
        return None
    return heading.text.strip()[:60], views


def plot_event(url):
    """Crawl the event index at *url* and save a top-10 bar chart as PNG.

    Writes '<event name>-Top10-Talks.png' into the current directory.
    Events whose page, heading or talks cannot be read are skipped with a
    short message instead of aborting the run.
    """
    soup = fetch_soup(url)
    if soup is None:
        return
    name = event_name(soup)
    if name is None:
        print('Skipping %s (no event heading found)' % url)
        return
    print('%s...' % name)

    talks = {}
    for talk_url in talk_urls(soup):
        result = talk_views(talk_url)
        if result is not None:
            title, views = result
            talks[title] = views
    if not talks:
        print('Skipping %s (no talks with a view count found)' % url)
        return

    axes = top_talks(talks).plot(
        kind='barh', figsize=(11, 5),
        title='Top 10 Most Viewed \'%s\' Talks' % name, color='#94C600')
    plt.tight_layout()
    plt.savefig('%s-Top10-Talks.png' % safe_filename(name), dpi=150)
    # One figure is created per event; close it so they do not pile up.
    plt.close(axes.get_figure())


if __name__ == '__main__':
    # All congress years; call plot_event(url) directly for a single event.
    for year in range(2000, 2015):
        plot_event('http://media.ccc.de/browse/congress/%i/index.html' % year)
# <codecell>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment