Skip to content

Instantly share code, notes, and snippets.

@morrah
Created December 19, 2012 11:41
Show Gist options
  • Save morrah/4336117 to your computer and use it in GitHub Desktop.
Save morrah/4336117 to your computer and use it in GitHub Desktop.
2ch.hk parser that exposes a board page as Thread and Post objects
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re, urllib, urllib2
from cookielib import CookieJar, DefaultCookiePolicy
class WebPage:
    """Download a single URL using a cookie jar shared by all pages.

    The jar ``cj`` is a class attribute, so cookies received by one
    instance are sent by every other instance (and subclass instance).
    """

    # Request headers used when the caller does not supply any.
    DEFAULT_HEADERS = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13",
        "Connection": "Keep-Alive", "Keep-Alive": "115"
    }
    cj = CookieJar(DefaultCookiePolicy(rfc2965=True, strict_ns_domain=DefaultCookiePolicy.DomainStrict))
    _source = None
    _headers = None
    _url = None

    def __init__(self, url, headers=None):
        """Remember the target ``url`` and the request ``headers``.

        ``headers`` is a dict of HTTP request headers; when omitted,
        ``DEFAULT_HEADERS`` is used.
        """
        # Caller-supplied headers used to be dropped (``_headers`` stayed
        # None), which made the later Request() call fail.
        self._headers = self.DEFAULT_HEADERS if headers is None else headers
        self._url = url

    def clear_cookies(self):
        """Remove every cookie from the shared jar."""
        self.cj.clear_session_cookies()
        self.cj.clear()

    def download(self):
        """Fetch the page and return its body, or None on any failure.

        On success the body is also cached and available through
        ``get_source()``. On failure the error is printed and the
        previously cached body (if any) is left untouched.
        """
        req = urllib2.Request(self._url, None, self._headers)
        self.cj.add_cookie_header(req)
        try:
            response = urllib2.urlopen(req)
            try:
                self.cj.extract_cookies(response, req)
                self._source = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
            return self._source
        except Exception as err:
            print('[!] Exception: ' + str(err))
            return None

    def get_source(self):
        """Return the body stored by the last successful ``download()``."""
        return self._source

    def get_url(self):
        """Return the URL this page was created with."""
        return self._url
class Post:
    """Value holder for the fields of a single post.

    Values are stored exactly as passed to the constructor and are read
    back through the ``get_*`` accessors.
    """

    def __init__(self, id, type, name, posttime, subject, postmessage):
        """Store the six post fields unchanged."""
        self._id, self._type, self._name = id, type, name
        self._posttime, self._subject = posttime, subject
        self._postmessage = postmessage

    def get_id(self):
        """Return the post id."""
        return self._id

    def get_type(self):
        """Return the post type."""
        return self._type

    def get_name(self):
        """Return the poster name."""
        return self._name

    def get_posttime(self):
        """Return the post time."""
        return self._posttime

    def get_subject(self):
        """Return the post subject."""
        return self._subject

    def get_postmessage(self):
        """Return the post message body."""
        return self._postmessage
class Thread:
    """A thread id, the source offset where it was found, and its posts."""

    def __init__(self, thread_id, matchPos):
        """Create an empty thread.

        ``matchPos`` is the offset in the page source at which the thread
        was matched.
        """
        self._posts = []
        self._matchPos = matchPos
        self._thread_id = thread_id

    def add_post(self, Post):
        """Append a post to the end of this thread."""
        posts = self._posts
        posts.append(Post)

    def get_post(self, index):
        """Return the post stored at ``index``."""
        posts = self._posts
        return posts[index]

    def get_post_all(self):
        """Return the internal list of posts (not a copy)."""
        return self._posts

    def get_thread_id(self):
        """Return the thread id."""
        return self._thread_id

    def get_match_pos(self):
        """Return the source offset this thread was matched at."""
        return self._matchPos

    def get_post_count(self):
        """Return how many posts have been added."""
        posts = self._posts
        return len(posts)
class BoardPage(WebPage):
    """A downloaded board (or thread) page parsed into Thread/Post objects.

    The page is downloaded and parsed in the constructor; the result is
    available through ``get_thread_all()``.
    """

    _threads = []

    def __init__(self, url, headers=None):
        """Download ``url`` and parse it into threads.

        ``headers`` is a dict of HTTP request headers; when omitted,
        ``DEFAULT_HEADERS`` is used.
        """
        # Caller-supplied headers used to be dropped (``_headers`` stayed
        # None), which made the download fail.
        self._headers = self.DEFAULT_HEADERS if headers is None else headers
        self._url = url
        # Per-instance list so the class-level default is never shared.
        self._threads = []
        self._parse_source(self.download())

    def get_thread_all(self):
        """Return the list of parsed Thread objects."""
        return self._threads

    def _parse_source(self, source):
        """Populate ``self._threads`` from the page ``source``.

        ``source`` may be None (``download()`` returns None on failure);
        in that case the page simply has no threads.
        """
        if source is None:
            self._threads = []
            return

        THREAD_PATTERN = r'<div id="thread_(?P<thread_id>\d*?)" class="thread">.*?'
        threads = self._parse_threads(THREAD_PATTERN, source)

        # NOTE: the opening post (<div id="post_N" class="oppost">) seems to
        # be duplicated further down the source in the regular post format,
        # so no separate pattern is applied for it.
        POST_PATTERN = r'<table id="post_(?P<post_id>\d*?)" class="(?P<type>post)">'
        POST_PATTERN += r'.*?<span class="name">(?P<post_name>.*?)</span>'
        POST_PATTERN += r'.*?<span class="subject">(?P<post_subject>.*?)</span>'
        POST_PATTERN += r'.*?<span class="posttime">(?P<post_posttime>.*?)</span>'
        POST_PATTERN += r'.*?<blockquote id="m\d*?" class="postMessage">(?P<post_postmessage>.*?)</blockquote>'
        self._threads = self._parse_posts(POST_PATTERN, threads, source)

    def _parse_threads(self, thread_pattern, source):
        """Return a Thread(thread_id, match_start) for every pattern match.

        ``thread_pattern`` must define a ``thread_id`` named group.
        """
        pattern = re.compile(thread_pattern)
        return [Thread(match.group('thread_id'), match.start())
                for match in pattern.finditer(source)]

    def _parse_posts(self, post_pattern, threads, source):
        """Attach the posts found in ``source`` to their threads.

        Each thread owns the slice of ``source`` from its own match
        position up to the next thread's match position (or the end of
        the source for the last thread). ``post_pattern`` must define the
        named groups ``post_id``, ``type``, ``post_name``,
        ``post_posttime``, ``post_subject`` and ``post_postmessage``.

        Returns the same ``threads`` list, with posts added in place.
        """
        pattern = re.compile(post_pattern, re.DOTALL)
        last_index = len(threads) - 1
        for index, thread in enumerate(threads):
            start = thread.get_match_pos()
            if index == last_index:
                end = len(source)
            else:
                end = threads[index + 1].get_match_pos()
            for match in pattern.finditer(source, start, end):
                thread.add_post(Post(match.group('post_id'),
                                     match.group('type'),
                                     match.group('post_name'),
                                     match.group('post_posttime'),
                                     match.group('post_subject'),
                                     match.group('post_postmessage')))
        return threads
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from huikach_parser import *
def _dump_board(board, path):
    """Write a plain-text summary of every thread and post in ``board``.

    ``board`` is a BoardPage; ``path`` is the output file, which is
    overwritten. The file is closed when the function returns (the
    original script referenced ``dump_page.close`` without calling it).
    """
    with open(path, 'w') as dump_page:
        for thrd in board.get_thread_all():
            dump_page.write('thread id%s with %s posts inside\n' % (thrd.get_thread_id(), thrd.get_post_count()))
            for pst in thrd.get_post_all():
                dump_page.write(' > post id%s by %s - %s %s\n' % (pst.get_id(), pst.get_name(), pst.get_subject(), pst.get_posttime()))
                dump_page.write(' > %s\n' % (pst.get_postmessage()))


# Dump a whole board page.
b = BoardPage('http://2ch.hk/b')
_dump_board(b, 'b.txt')

# Dump a single thread page.
pr = BoardPage('http://2ch.hk/pr/res/221414.html')
_dump_board(pr, '221414.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment