@darvell
Created March 22, 2012 05:12
GH Downloader
# I am totally aware BeautifulSoup is slow.
# Please do not inform me, this is just for fun.
import urllib2
import BeautifulSoup
import re
import os
import sys
from threading import Thread
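
# Helper: return the path separator for the current OS ("\\" on Windows, "/" elsewhere).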
def slash():
    if os.name == "nt":
        return "\\"
    else:
        return "/"
def replace_all(text, dic):
    for i, j in dic.iteritems():
        text = text.replace(i, j)
    return text
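
# Worker thread: streams a single file URL to disk in 16 KB chunks inside the soundtrack folder.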
class download(Thread):
    def __init__(self, url):
        Thread.__init__(self)
        self.url = url

    def run(self):
        req = urllib2.urlopen(self.url)
        CHUNK = 16 * 1024
        with open(os.getcwd() + slash() + SOUNDTRACK_NAME + slash() + self.url.split('/')[-1], 'wb') as fp:
            while True:
                chunk = req.read(CHUNK)
                if not chunk:
                    break
                fp.write(chunk)
        print self.url.split('/')[-1], 'complete.'
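
# Prompt for the soundtrack ID and set up the percent-encoded characters we need to decode later.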
BASE_URL = "http://gh.ffshrine.org/"
SOUNDTRACK_ID = int(raw_input('Soundtrack ID (e.g. http://gh.ffshrine.org/soundtracks/NUMBER/): '))
html_dict = {'%3A':':','%2F':'/','%27':"'",'%22':'"','%3B':';','%28':'(','%29':')'}
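
# Fetch the soundtrack index page, parse it, and take the soundtrack name from the first <h1> tag (stripping the surrounding markup with a fixed slice).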
page_data = urllib2.urlopen(BASE_URL + "soundtracks/" + str(SOUNDTRACK_ID)).read()
soup = BeautifulSoup.BeautifulSoup(page_data)
SOUNDTRACK_NAME = str(soup.findAll('h1')[0])[4:-19]
url_list = []
print 'Found soundtrack:',SOUNDTRACK_NAME
print 'Parsing all pages for download links. This may take a while.'
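
# Visit each /song/ page linked from the index and pull the real file URL out of its inline JavaScript.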
for a in soup.findAll('a', attrs={'href': re.compile("^/song/")}):
    song_url = a['href']
    download_page_data = urllib2.urlopen(BASE_URL + song_url[1:]).read()
    temp_soup = BeautifulSoup.BeautifulSoup(download_page_data)
    for script in temp_soup.findAll('script'):
        if 'var data' in str(script):
            # The part we actually want is always between chars 42 and len(script) - 38.
            java = replace_all(str(script)[42:-38], html_dict)
            url_list.append(java[0:java.index('";')])
print "List built, starting download."
# The directory may already exist; if so, just reuse it.
try:
    os.mkdir(SOUNDTRACK_NAME)
except OSError:
    pass
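
# Start one download thread per URL.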
threadlist = []
for i in range(0, len(url_list)):
    cur_thread = download(url_list[i])
    threadlist.append(cur_thread)
    cur_thread.start()
    print 'Started thread for:', url_list[i].split('/')[-1]