#!/usr/bin/python
'''
Mirrors every "General" thread from a 2ch.hk board into SAVEDIR.
Python 2 only. BeautifulSoup 3 is required:
http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.1.tar.gz
'''
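# A minimal install sketch (an assumption, not part of the original gist:
# PyPI also carries BeautifulSoup 3 under the name below, so pip works as
# an alternative to the tarball linked above):
#   pip install BeautifulSoup==3.2.1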
from BeautifulSoup import BeautifulSoup as soup
from urllib import urlopen as uopen, urlretrieve
import re, os, errno, threading
from time import sleep
from urlparse import urljoin
from contextlib import closing
ROOT_PAGE = 'http://2ch.hk/'
GRAB_RULE = re.compile('General')  # threads containing text matching this are grabbed
BOARD = 'mlp'
SAVEDIR = '/media/storage/dev/mlp_general/'  # one subdirectory per thread is created here
def mkdir_p(path):
    '''Create path like `mkdir -p`, ignoring the error if it already exists.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def get_threads():
    '''Return the numeric ids of the threads on BOARD that match GRAB_RULE.'''
    s = soup(uopen(urljoin(ROOT_PAGE, BOARD)))
    def finder(tag):
        # truthy when the thread contains any text matching GRAB_RULE
        return tag(True, 'oppost', text=GRAB_RULE)
    # thread containers carry ids like 'thread_123456'
    return [int(i['id'].split('_')[-1]) for i in s(finder, 'thread')]
def get_links(thread):
    '''Return the image links of a thread page, plus the page itself.'''
    url = urljoin(ROOT_PAGE, '/'.join([BOARD, 'res', str(thread) + '.html']))
    page = uopen(url)
    if page.code != 200:
        print "ERROR", page.code, "in thread", url
        return []
    s = soup(page)
    # 'expandfunc' anchors point at the full-size versions of the thumbnails
    links = [a['href']
             for post in s('div', 'oppost') + s('table', 'post')
             for a in post('a', {'name': 'expandfunc'})]
    # save the thread page itself as well, so the mirror stays browsable
    links.append('/'.join([BOARD, 'res', str(thread) + '.html']))
    return links
def dwnl(end_url, end_fname):
    '''Fetch one file; runs in a worker thread.'''
    print "Downloading...", end_url, end_fname
    with closing(uopen(end_url)) as rem_file:
        data = rem_file.read()
    # an HTML body where an image was expected means the file is gone
    if '<html' in data and not end_url.endswith('html'):
        print "FAILED!", end_url
        return
    # download to a .part file and rename, so an interrupted run never
    # leaves a truncated file under the final name
    urlretrieve(end_url, end_fname + '.part')
    os.rename(end_fname + '.part', end_fname)
    print "DONE", end_fname, "LEFT:", threading.active_count() - 2
def download(thread, links):
    '''Download the not-yet-saved links of a thread, up to ~30 at a time.'''
    thread_fs_root = os.path.join(SAVEDIR, str(thread))
    mkdir_p(thread_fs_root)
    # skip files already on disk; HTML pages are always refreshed
    new_links = []
    for l in links:
        fname = l.split('/')[-1]
        end_fname = os.path.join(thread_fs_root, fname)
        if not os.path.exists(end_fname) or end_fname.endswith('html'):
            new_links.append(l)
    print "Thread:", thread, "({}/{})".format(len(new_links), len(links))
    for l in new_links:
        end_url = urljoin(ROOT_PAGE, l)
        end_fname = os.path.join(thread_fs_root, l.split('/')[-1])
        t = threading.Thread(target=dwnl, args=(end_url, end_fname))
        t.start()
        # throttle: cap the number of live threads (main + workers) at 30
        while threading.active_count() > 30:
            sleep(0.5)
if __name__ == '__main__':
    threads = get_threads()
    mkdir_p(SAVEDIR)
    print "Working on", threads
    for t in threads:
        l = get_links(t)
        download(t, l)
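# Illustrative run (the script name and the numbers below are made up;
# the real output follows the print statements above):
#   $ python2 grab_generals.py
#   Working on [123456, 123789]
#   Thread: 123456 (3/57)
#   Downloading... http://2ch.hk/mlp/res/123456.html /media/storage/dev/mlp_general/123456/123456.html
#   DONE /media/storage/dev/mlp_general/123456/123456.html LEFT: 2
# Every matched thread is mirrored into SAVEDIR/<thread id>/, with its HTML
# page saved alongside and re-fetched on each run to pick up new posts.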