Last active
August 29, 2015 14:10
-
-
Save KaiCode2/e2a09ddcf529f4cdb2f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
from urllib2 import urlopen | |
import re | |
import bs4 | |
from bs4 import BeautifulSoup | |
import cookielib, urllib2 | |
from cookielib import CookieJar | |
import datetime | |
import xml.etree.ElementTree as ET | |
from xml.etree.ElementTree import Element, tostring, SubElement | |
import xml.dom.minidom as xml | |
from xml.dom.minidom import parse, parseString | |
import sys | |
# Python 2 only: re-import ``sys`` and switch the interpreter-wide default
# string encoding to UTF-8 before any scraped text is handled.
reload(sys)
sys.setdefaultencoding('utf8')

# Forum index page that the crawl starts from.
url = 'http://asyl-bilim.kz/forum/3'

# Cookie jar shared by every opener built in this module.
cj = CookieJar()

# Module-level accumulators (four independent lists):
#   forumLinks          - not written to by the functions in this file
#   links               - thread URLs collected by getInitialForum()
#   contentFromArticles - (message, author info, url) tuples from getForumInstaces()
#   forumArticles       - not written to by the functions in this file
forumLinks, links, contentFromArticles, forumArticles = [], [], [], []
def getInitialForum():
    """Fetch the forum index page and collect every thread link it lists.

    Opens the module-level ``url`` through a cookie-aware opener that shares
    the module-level ``cj`` jar, scans the returned HTML for
    ``<a class="threadLink" href="...">`` anchors and appends each ``href``
    value to the module-level ``links`` list.

    Returns:
        None. Results are accumulated in ``links``.
    """
    print('Getting initial forum content...')
    print('--------------------------------')
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # Every request made through this opener carries this User-agent header.
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    response = opener.open(url)
    try:
        sourceCode = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # Each match is an (href, anchor text) pair; only the href is kept.
    wrappedLinks = re.findall(r'<a class="threadLink" href="(.*?)">(.*?)</a>', sourceCode)
    links.extend(href for href, _anchorText in wrappedLinks)
def getForumInstaces():
    """Download every collected thread page and extract its message and author.

    For each URL in the module-level ``links`` list the page is fetched,
    decoded as UTF-8 and parsed with BeautifulSoup. One 3-tuple per link is
    appended to the module-level ``contentFromArticles`` list:

    * element 0 -- list of ``(post id, message text)`` regex matches taken
      from the first ``td.posttdMessage`` cell, or an error string when
      nothing matched or the page was not valid UTF-8;
    * element 1 -- list of ``(class value, inner text)`` matches for the
      ``<div>`` elements of the first ``td.postTdInfo`` cell (an empty list
      when the page was not valid UTF-8);
    * element 2 -- the thread URL.

    Returns:
        None.
    """
    errorText = 'error getting message, either non-UTF-8 or no message in page'
    # Compiled once; both patterns are reused for every page.
    messagePattern = re.compile(
        r'<span class="ucoz-forum-post" edit-url="" id="ucoz-forum-post-(.*?)">(.*?)</span>')
    authorPattern = re.compile(r'<div class="(.*?)">(.*?)</div>')

    forumOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    forumOpener.addheaders = [('User-agent', 'Mozilla/5.0')]
    print('Grabing individual posts...')
    print('--------------------------------')
    for link in links:
        rawPage = forumOpener.open(link).read()
        try:
            page = rawPage.decode('utf-8')
        except UnicodeDecodeError:
            # The error text already promises to cover non-UTF-8 pages, so
            # record the failure for this link instead of aborting the crawl.
            contentFromArticles.append((errorText, [], link))
            continue

        soup = BeautifulSoup(page)
        message = soup.find("td", {"class": "posttdMessage"})
        author = soup.find("td", {"class": "postTdInfo"})

        # str(None) is 'None', so a missing cell simply yields no matches.
        messageInfo = messagePattern.findall(str(message))
        authorInfo = authorPattern.findall(str(author))
        if not messageInfo:
            messageInfo = errorText
        contentFromArticles.append((messageInfo, authorInfo, link))
def makeXML():
    """Serialise the scraped posts as a ``<Corpus>`` XML document and print it.

    Builds a ``Corpus`` root element carrying language, name, generation
    timestamp and source URL attributes, then adds one ``<entry>`` child per
    tuple in the module-level ``contentFromArticles`` list:

    * when the message part is a non-empty list of matches, the first match
      supplies the ``article-ID`` attribute and the element text; otherwise
      (error string) ``article-ID`` is ``'N/A'`` and no text is set;
    * up to the first three ``(name, value)`` author pairs become attributes
      (the original author's notes label the first two ``postRankIco`` and
      ``postName``); with no author pairs a placeholder ``User`` attribute is
      written instead;
    * the thread URL is stored in the ``url`` attribute.

    The UTF-8 encoded document is printed to standard output.

    Returns:
        None.
    """
    print('Creating XML')
    print('--------------------------------')
    generated_on = str(datetime.datetime.now())
    root = Element('Corpus')
    root.set('language', 'Russian')
    root.set('name', 'asyl-bilim')
    # XML attribute names cannot contain spaces; ElementTree does not check
    # this, so a name with spaces would serialise as malformed XML.
    root.set('content-as-of', generated_on)
    root.set('xmlns', url)

    for article in contentFromArticles:
        message, authorInfo, link = article
        entry = SubElement(root, 'entry')

        # A list holds real regex matches; anything else is the error string.
        if isinstance(message, list) and message:
            postId, postText = message[0]
            entry.set('article-ID', postId)
            entry.text = postText
        else:
            entry.set('article-ID', 'N/A')

        if not authorInfo:
            entry.set('User', 'User`s info is annonymous')
        else:
            # Slice rather than index so pages with fewer than three author
            # <div>s no longer raise IndexError.
            # NOTE(review): names come straight from scraped class values; a
            # class containing whitespace would still give an invalid
            # attribute name -- confirm against real pages.
            for attrName, attrValue in authorInfo[:3]:
                entry.set(attrName, attrValue)

        entry.set('url', link)

    xmlString = ET.tostring(root, encoding="utf-8", method="xml")
    print(xmlString)
    print('Thanks for using my own custom parser :)')
def main():
    """Run the scraper: collect thread links, fetch each post, print the XML."""
    # The stages hand data to each other through module-level lists, so they
    # must run in exactly this order.
    for stage in (getInitialForum, getForumInstaces, makeXML):
        stage()


# Executed unconditionally when the module is loaded.
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment