@moluapple
Created August 26, 2012 14:07
[python2]cchere user posts downloader
#!/usr/bin/python
# -*- coding:utf-8 -*-
""" download all cchere posts of the author from pre-collected postlist
currently the list was collected manually
TODO: get the postlist from cchere user home automatically
NOTE: Fanqiang is needed as the exists of GFW
code based on: https://code.google.com/p/cchere-thread-saver/
"""
import urllib2
from lxml import html
import cookielib
import re
def getAuthor(ele):
    """ get author from element
    """
    r = ele.xpath('div/div/a')
    return r[0].text
def zJ_PE(alltext):
    """ from the entire page source, take the meaningful center part
    and decode it the way the site's javascript function zJ_PE does
    return a unicode version of the center part
    """
    # get the center part of the page, which is obfuscated
    texts = alltext.partition('ls=\"')
    alltext = texts[2][3:]
    texts2 = alltext.partition('\";')
    alltext = texts2[0]
    # decode: re-implementation of the js zJ_PE function
    list1 = ['~', '#', '<', '@', '>', '!', '&', '*', '(', ')', ':', ';', '=', ',', '|', '+']
    list2 = ['%1', '%2', '%3', '%4', '%5', '%6', '%7', '%8', '%9', '%A', '%B', '%C', '%D', '%E', 'e', '%20']
    for i in range(len(list1)):
        alltext = alltext.replace(list1[i], list2[i])
    decodedtext = urllib2.unquote(alltext)  # the decoded text is utf-8, but still a str
    decodedtext = unicode(decodedtext, 'utf8')  # convert it to unicode
    return decodedtext
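# A minimal, made-up example of the decoding above (not real cchere data):
# the substitution table turns '@e' into '%4e', and urllib2.unquote('%4e')
# then yields 'N'.  On real pages the encoded payload sits inside 'var ls="..."'.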
def parseHTML(url):
    """ fetch the page source; the cookie was captured with firebug
    return the entire html source string
    """
    cookie = 'cchome=***** use your cookie here *****; expires=Sunday, 02-Sep-2012 09:28:55; path=/; domain=.cchere.com'
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header('Cookie', cookie)
    alltext = urllib2.urlopen(req).read()
    return alltext
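# Note: the cchome cookie placeholder above must be replaced with a cookie
# from your own logged-in session (captured with firebug, as the docstring says).
# Illustrative call with a made-up thread id:
#   page_source = parseHTML("https://www.ccthere.com/thread/123456/1")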
def getPageContents(htmlstring, baseurl="http://www.cchere.com"):
    """ get the list of "pContent" post elements from htmlstring
    """
    doc = html.fromstring(htmlstring)
    doc.make_links_absolute(baseurl)
    alllist = doc.find_class("pContent")  # list of posts
    # drop the recycled (not shown) posts, which have no author link
    alllist = [ele for ele in alllist if len(ele.xpath('div/div/a')) > 0]
    return alllist
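# Illustrative offline use of the two helpers above ('saved_page.html' is a
# hypothetical file holding a page source fetched earlier):
#   posts = getPageContents(open('saved_page.html').read())
#   print [getAuthor(p) for p in posts]  # author of every visible post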
def generateHTML(allposts, outfilename):
    """ put all posts in the given list into a simple
    html file
    """
    begstr = "<html><body>"
    allstr = ""
    for p in allposts:
        allstr = allstr + html.tostring(p)
    endstr = "</body></html>"
    with open(outfilename, "w") as f:
        f.write(begstr + allstr + endstr)
def downloadPostList(postlist, authorToPick):
    """ postlist: list of [thread_id, page_count] pairs
    authorToPick: extract only this author's posts
    """
    for post in postlist:
        for i in range(1, post[1] + 1):
            url = "https://www.ccthere.com/thread/%s/%s" % (post[0], i)
            outfilename = str(post[0]) + '_' + str(i) + '.html'
            htmlstring = parseHTML(url)
            # if the obfuscation javascript is found, switch to cchere and decode
            if 'var ls="' in htmlstring:
                url = "https://www.cchere.com/thread/%s/%s" % (post[0], i)
                htmlstring = parseHTML(url)
                htmlstring = zJ_PE(htmlstring)
            allposts = getPageContents(htmlstring)
            sameauthorlist = [ele for ele in allposts if authorToPick == getAuthor(ele)]
            # if this page has posts by the author, save them to a file
            if len(sameauthorlist):
                generateHTML(sameauthorlist, outfilename)
if __name__ == '__main__':
    postlist = [[307479, 1], [307786, 4], [307863, 2], [308916, 2], [310336, 2], [312480, 3], [329532, 4]]
    authorToPick = u"HeiDaoRen"  # the real ID was replaced with its pinyin when this code was posted
    downloadPostList(postlist, authorToPick)
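# For example, the entry [307786, 4] makes downloadPostList fetch
# https://www.ccthere.com/thread/307786/1 through .../307786/4 (switching to
# www.cchere.com and decoding whenever the obfuscated 'var ls=' variant is
# served), and any page that contains the author's posts is written out as
# 307786_1.html ... 307786_4.html.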