@zelark
Created April 24, 2014 13:42
Easy web scraping with Python
# Scrape PyCon US 2014 talk statistics (views, likes, dislikes) from
# pyvideo.org and YouTube and print them as a sorted table.
# Usage: python <this script> [--sort FIELD] [--max MAX] [--workers N]
# Note: pyvideo.org and YouTube have changed their markup since 2014,
# so the URLs and CSS selectors below may need updating.
import argparse
import re
from multiprocessing import Pool

import bs4
import requests

root_url = 'http://pyvideo.org'
index_url = root_url + '/category/50/pycon-us-2014'


def parse_args():
    parser = argparse.ArgumentParser(description='Show PyCon 2014 video statistics.')
    parser.add_argument(
        '--sort',
        metavar='FIELD',
        choices=['views', 'likes', 'dislikes'],
        default='views',
        help='sort by the specified field. Options are views, likes and dislikes.')
    parser.add_argument(
        '--max',
        metavar='MAX',
        type=int,
        help='show the top MAX entries only.')
    parser.add_argument(
        '--workers',
        type=int,
        default=8,
        help='number of workers to use, 8 by default.')
    return parser.parse_args()


def get_video_page_urls():
    # Collect the relative URLs of all talk pages listed on the index page.
    response = requests.get(index_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    links = [a.attrs.get('href') for a in soup.select('div.video-summary-data a[href^="/video"]')]
    return links


def get_video_data(video_page_url):
    # Scrape one talk page for its title, speakers and YouTube link,
    # then read the view/like/dislike counts from the YouTube watch page.
    video_data = {}
    response = requests.get(root_url + video_page_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    video_data['title'] = soup.select('div#videobox h3')[0].get_text()
    video_data['speakers'] = [a.get_text() for a in soup.select('div#sidebar a[href^="/speaker"]')]
    video_data['youtube_url'] = soup.select('div#sidebar a[href^="http://www.youtube.com"]')[0].get_text()

    response = requests.get(video_data['youtube_url'])
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    raw_views = soup.select('.watch-view-count')[0].get_text()
    video_data['views'] = int(re.sub('[^0-9]', '', raw_views))
    raw_likes = soup.select('.likes-count')[0].get_text()
    video_data['likes'] = int(re.sub('[^0-9]', '', raw_likes))
    raw_dislikes = soup.select('.dislikes-count')[0].get_text()
    video_data['dislikes'] = int(re.sub('[^0-9]', '', raw_dislikes))
    return video_data


def show_video_stats(options):
    # Fetch the talk pages in parallel, sort by the requested field,
    # and print the top entries as a fixed-width table.
    pool = Pool(options.workers)
    video_page_urls = get_video_page_urls()
    results = sorted(
        pool.map(get_video_data, video_page_urls),
        key=lambda video: video[options.sort],
        reverse=True)
    max_entries = options.max
    if max_entries is None or max_entries > len(results):
        max_entries = len(results)
    print('Views +1 -1 Title (Speakers)')
    for i in range(max_entries):
        print('{0:5d} {1:3d} {2:3d} {3} {4}'.format(
            results[i]['views'],
            results[i]['likes'],
            results[i]['dislikes'],
            results[i]['title'],
            ', '.join(results[i]['speakers'])))


if __name__ == '__main__':
    show_video_stats(parse_args())
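
The int(re.sub('[^0-9]', '', ...)) pattern used above simply strips every non-digit character from the scraped count text before converting it to an integer. A minimal, self-contained illustration (the sample string is made up, not real YouTube output):

import re

raw_views = '12,345 views'                    # hypothetical scraped text, not real data
views = int(re.sub('[^0-9]', '', raw_views))  # non-digits removed, then converted -> 12345
print(views)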