#!/usr/bin/python
'''
Mirrors every "General" thread from a 2ch.hk board into SAVEDIR.
Python 2 only. BeautifulSoup 3 is required:
http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.1.tar.gz
'''
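# A minimal install sketch (an assumption, not part of the original gist:
# PyPI also carries BeautifulSoup 3 under the name below, so pip works as
# an alternative to the tarball linked above):
#   pip install BeautifulSoup==3.2.1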
from BeautifulSoup import BeautifulSoup as soup
from urllib import urlopen as uopen, urlretrieve
import re, os, errno, threading
from time import sleep
from urlparse import urljoin
from contextlib import closing
ROOT_PAGE = 'http://2ch.hk/'
GRAB_RULE = re.compile('General')  # threads containing text matching this are grabbed
BOARD = 'mlp'
SAVEDIR = '/media/storage/dev/mlp_general/'  # one subdirectory per thread is created here
def mkdir_p(path):
    '''Create path like `mkdir -p`, ignoring the error if it already exists.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def get_threads():
    '''Return the numeric ids of the threads on BOARD that match GRAB_RULE.'''
    s = soup(uopen(urljoin(ROOT_PAGE, BOARD)))
    def finder(tag):
        # truthy when the thread contains any text matching GRAB_RULE
        return tag(True, 'oppost', text=GRAB_RULE)
    # thread containers carry ids like 'thread_123456'
    return [int(i['id'].split('_')[-1]) for i in s(finder, 'thread')]
def get_links(thread):
    '''Return the image links of a thread page, plus the page itself.'''
    url = urljoin(ROOT_PAGE, '/'.join([BOARD, 'res', str(thread) + '.html']))
    page = uopen(url)
    if page.code != 200:
        print "ERROR", page.code, "in thread", url
        return []
    s = soup(page)
    # 'expandfunc' anchors point at the full-size versions of the thumbnails
    links = [a['href']
             for post in s('div', 'oppost') + s('table', 'post')
             for a in post('a', {'name': 'expandfunc'})]
    # save the thread page itself as well, so the mirror stays browsable
    links.append('/'.join([BOARD, 'res', str(thread) + '.html']))
    return links
def dwnl(end_url, end_fname):
    '''Fetch one file; runs in a worker thread.'''
    print "Downloading...", end_url, end_fname
    with closing(uopen(end_url)) as rem_file:
        data = rem_file.read()
    # an HTML body where an image was expected means the file is gone
    if '<html' in data and not end_url.endswith('html'):
        print "FAILED!", end_url
        return
    # download to a .part file and rename, so an interrupted run never
    # leaves a truncated file under the final name
    urlretrieve(end_url, end_fname + '.part')
    os.rename(end_fname + '.part', end_fname)
    print "DONE", end_fname, "LEFT:", threading.active_count() - 2
def download(thread, links):
    '''Download the not-yet-saved links of a thread, up to ~30 at a time.'''
    thread_fs_root = os.path.join(SAVEDIR, str(thread))
    mkdir_p(thread_fs_root)
    # skip files already on disk; HTML pages are always refreshed
    new_links = []
    for l in links:
        fname = l.split('/')[-1]
        end_fname = os.path.join(thread_fs_root, fname)
        if not os.path.exists(end_fname) or end_fname.endswith('html'):
            new_links.append(l)
    print "Thread:", thread, "({}/{})".format(len(new_links), len(links))
    for l in new_links:
        end_url = urljoin(ROOT_PAGE, l)
        end_fname = os.path.join(thread_fs_root, l.split('/')[-1])
        t = threading.Thread(target=dwnl, args=(end_url, end_fname))
        t.start()
        # throttle: cap the number of live threads (main + workers) at 30
        while threading.active_count() > 30:
            sleep(0.5)
if __name__ == '__main__':
    threads = get_threads()
    mkdir_p(SAVEDIR)
    print "Working on", threads
    for t in threads:
        l = get_links(t)
        download(t, l)
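# Illustrative run (the script name and the numbers below are made up;
# the real output follows the print statements above):
#   $ python2 grab_generals.py
#   Working on [123456, 123789]
#   Thread: 123456 (3/57)
#   Downloading... http://2ch.hk/mlp/res/123456.html /media/storage/dev/mlp_general/123456/123456.html
#   DONE /media/storage/dev/mlp_general/123456/123456.html LEFT: 2
# Every matched thread is mirrored into SAVEDIR/<thread id>/, with its HTML
# page saved alongside and re-fetched on each run to pick up new posts.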