@moluapple
Created August 26, 2012 14:07
[python2]cchere user posts downloader
#!/usr/bin/python
# -*- coding:utf-8 -*-
""" download all cchere posts of the author from pre-collected postlist
currently the list was collected manually
TODO: get the postlist from cchere user home automatically
NOTE: Fanqiang is needed as the exists of GFW
code based on: https://code.google.com/p/cchere-thread-saver/
"""
import urllib2
from lxml import html
import cookielib
import re
def getAuthor(ele):
    """ get author from element
    """
    r = ele.xpath('div/div/a')
    return r[0].text
def zJ_PE(alltext):
    """ from the entire page source, take the meaningful center part
    and decode it the way the site's javascript function zJ_PE does
    return a unicode version of the center part
    """
    # get the center part of the page, which is obfuscated
    texts = alltext.partition('ls=\"')
    alltext = texts[2][3:]
    texts2 = alltext.partition('\";')
    alltext = texts2[0]
    # decode: re-implementation of the js zJ_PE function
    list1 = ['~', '#', '<', '@', '>', '!', '&', '*', '(', ')', ':', ';', '=', ',', '|', '+']
    list2 = ['%1', '%2', '%3', '%4', '%5', '%6', '%7', '%8', '%9', '%A', '%B', '%C', '%D', '%E', 'e', '%20']
    for i in range(len(list1)):
        alltext = alltext.replace(list1[i], list2[i])
    decodedtext = urllib2.unquote(alltext)  # the decoded text is utf-8, but still a str
    decodedtext = unicode(decodedtext, 'utf8')  # convert it to unicode
    return decodedtext
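# A minimal, made-up example of the decoding above (not real cchere data):
# the substitution table turns '@e' into '%4e', and urllib2.unquote('%4e')
# then yields 'N'.  On real pages the encoded payload sits inside 'var ls="..."'.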
def parseHTML(url):
    """ fetch the page source; the cookie was captured with firebug
    return the entire html source string
    """
    cookie = 'cchome=***** use your cookie here *****; expires=Sunday, 02-Sep-2012 09:28:55; path=/; domain=.cchere.com'
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header('Cookie', cookie)
    alltext = urllib2.urlopen(req).read()
    return alltext
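# Note: the cchome cookie placeholder above must be replaced with a cookie
# from your own logged-in session (captured with firebug, as the docstring says).
# Illustrative call with a made-up thread id:
#   page_source = parseHTML("https://www.ccthere.com/thread/123456/1")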
def getPageContents(htmlstring, baseurl="http://www.cchere.com"):
    """ get the list of "pContent" post elements from htmlstring
    """
    doc = html.fromstring(htmlstring)
    doc.make_links_absolute(baseurl)
    alllist = doc.find_class("pContent")  # list of posts
    # drop the recycled (not shown) posts, which have no author link
    alllist = [ele for ele in alllist if len(ele.xpath('div/div/a')) > 0]
    return alllist
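# Illustrative offline use of the two helpers above ('saved_page.html' is a
# hypothetical file holding a page source fetched earlier):
#   posts = getPageContents(open('saved_page.html').read())
#   print [getAuthor(p) for p in posts]  # author of every visible post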
def generateHTML(allposts, outfilename):
    """ put all posts in the given list into a simple
    html file
    """
    begstr = "<html><body>"
    allstr = ""
    for p in allposts:
        allstr = allstr + html.tostring(p)
    endstr = "</body></html>"
    with open(outfilename, "w") as f:
        f.write(begstr + allstr + endstr)
def downloadPostList(postlist, authorToPick):
    """ postlist: list of [thread_id, page_count] pairs
    authorToPick: extract only this author's posts
    """
    for post in postlist:
        for i in range(1, post[1] + 1):
            url = "https://www.ccthere.com/thread/%s/%s" % (post[0], i)
            outfilename = str(post[0]) + '_' + str(i) + '.html'
            htmlstring = parseHTML(url)
            # if the obfuscation javascript is found, switch to cchere and decode
            if 'var ls="' in htmlstring:
                url = "https://www.cchere.com/thread/%s/%s" % (post[0], i)
                htmlstring = parseHTML(url)
                htmlstring = zJ_PE(htmlstring)
            allposts = getPageContents(htmlstring)
            sameauthorlist = [ele for ele in allposts if authorToPick == getAuthor(ele)]
            # if this page has posts by the author, save them to a file
            if len(sameauthorlist):
                generateHTML(sameauthorlist, outfilename)
if __name__ == '__main__':
    postlist = [[307479, 1], [307786, 4], [307863, 2], [308916, 2], [310336, 2], [312480, 3], [329532, 4]]
    authorToPick = u"HeiDaoRen"  # the real ID was replaced with its pinyin when this code was posted
    downloadPostList(postlist, authorToPick)
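# For example, the entry [307786, 4] makes downloadPostList fetch
# https://www.ccthere.com/thread/307786/1 through .../307786/4 (switching to
# www.cchere.com and decoding whenever the obfuscated 'var ls=' variant is
# served), and any page that contains the author's posts is written out as
# 307786_1.html ... 307786_4.html.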