Quick and dirty chill.com scraper, for anyone who wants to keep a copy of their collection links
# Author: Federico Mendez
# =============================================
#   quick and dirty chill.com scraper
# =============================================
#
# Usage: python chillscraper.py <username> <path_to_save_txt>
# Example: python chillscraper.py someUser ./my_videos.txt
#
# Notes: it can only retrieve the URLs of videos hosted on YouTube and Vimeo;
# it doesn't work with Vevo because of the way its URLs are formatted
# (www.vevo.com/watch/<artist_name>/<song_name>/). Also note that this
# script depends on Selenium to retrieve the data, since chill.com generates
# its content in the browser via the Dojo JS framework (making urllib worthless).
# A short standalone sketch of the embed-data-to-link mapping is included at
# the end of this gist.
#
# Dependencies: selenium 2.29
import sys
import json
import time
from selenium import webdriver
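
# One Firefox session is shared by every function below; the 3-second implicit
# wait gives chill.com's JS-rendered elements time to appear before the
# find_elements_* lookups give up.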
wd = webdriver.Firefox()
wd.implicitly_wait(3)
def get_page(url):
    wd.get(url)
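
# Open the user's profile page and map each collection's name to the URL of
# its collection page.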
def get_collections(username):
    get_page('http://www.chill.com/%s' % username)
    collections = {}
    div_collection = wd.find_elements_by_class_name('collection-name-container')
    for e in div_collection:
        collections[e.text] = e.find_element_by_class_name('name').get_attribute('href')
    return collections
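
# chill.com lazy-loads items as you scroll, so keep scrolling to the bottom
# and waiting until no new 'watch-video-item' elements show up.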
def get_items(items):
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    while len(wd.find_elements_by_class_name('watch-video-item')) > len(items):
        items = wd.find_elements_by_class_name('watch-video-item')
        time.sleep(5)
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    return items
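
# For every collection, open its page, load all of its items, and build a
# {title: link} dict from the JSON stored in each item's data-embeddata
# attribute.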
def download_links(username):
    collections = get_collections(username)
    for name, url in collections.iteritems():
        get_page(url)
        links = {}
        div_items = get_items(wd.find_elements_by_class_name('watch-video-item'))
        for e in div_items:
            temp = e.get_attribute('data-embeddata')
            metadata = json.loads(temp)
            title = e.find_element_by_css_selector('div[style="overflow:hidden"]').text
            if metadata['source'] == 'embedly':
                links[title] = metadata['videoId']
            elif metadata['source'] == 'youtube':
                links[title] = 'http://youtu.be/%s' % metadata['videoId']
            elif metadata['source'] == 'vimeo':
                links[title] = 'http://www.vimeo.com/%s' % metadata['videoId']
        collections[name] = links
    return collections
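
# Entry point: scrape the collections of the user given as the first argument
# and write them to the text file given as the second, one "title - link"
# line per video.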
if __name__ == '__main__':
    collections = download_links(sys.argv[1])
    f = open(sys.argv[2], 'w')
    for k, ls in collections.iteritems():
        f.write("Collection: %s (%d items)\n~~~~~~~~~~\n" % (k, len(ls)))
        for name, url in ls.iteritems():
            f.write("\t%s - %s\n" % (name.encode('utf-8'), url.encode('utf-8')))
        f.write("\n\n")
    f.close()
    wd.quit()
selenium==2.29.0
wsgiref==0.1.2
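
For reference, here is a minimal standalone sketch of the mapping download_links performs on each item's data-embeddata JSON. The sample payload and the video id in it are made up; the real attribute value comes from chill.com and may carry more fields than the two keys ('source' and 'videoId') the script reads.

import json

# Hypothetical payload for illustration only.
sample = '{"source": "youtube", "videoId": "abc123XYZ"}'

metadata = json.loads(sample)
if metadata['source'] == 'embedly':
    # for 'embedly' items the script stores videoId as-is
    link = metadata['videoId']
elif metadata['source'] == 'youtube':
    link = 'http://youtu.be/%s' % metadata['videoId']
elif metadata['source'] == 'vimeo':
    link = 'http://www.vimeo.com/%s' % metadata['videoId']

print link  # -> http://youtu.be/abc123XYZ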