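"""Forum scraper gist by @KaiCode2 (last active August 29, 2015).

Crawls http://asyl-bilim.kz/forum/3, follows every thread link found on the
index page, extracts each post's message, author details and URL, and prints
the collected posts as a simple <Corpus> XML document. Written for Python 2
(urllib2/cookielib, print statements).
"""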
import re
import sys
import datetime
import urllib2
from cookielib import CookieJar
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement

# Make UTF-8 the default encoding so the Cyrillic forum text survives str() calls (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf8')
# Forum section to crawl and shared state between the scraping passes.
url = 'http://asyl-bilim.kz/forum/3'
cj = CookieJar()
links = []                 # thread URLs collected from the forum index
contentFromArticles = []   # (message matches, author matches, url) per thread
def getInitialForum():
    """Fetch the forum index page and collect the links to individual threads."""
    print 'Getting initial forum content...'
    print '--------------------------------'
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    sourceCode = opener.open(url).read()
    # Thread table rows (currently unused) and the thread links themselves.
    titles = re.findall(r'<tr id="tt(.*?)">(.*?)</tr>', sourceCode)
    wrappedLinks = re.findall(r'<a class="threadLink" href="(.*?)">(.*?)</a>', sourceCode)
    for href, _title in wrappedLinks:
        links.append(href)
def getForumInstances():
    """Download each thread and extract the post message, author info and URL."""
    forumOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    forumOpener.addheaders = [('User-agent', 'Mozilla/5.0')]
    print 'Grabbing individual posts...'
    print '--------------------------------'
    for link in links:
        soup = BeautifulSoup(forumOpener.open(link).read().decode('utf-8'))
        message = soup.find("td", {"class": "posttdMessage"})
        author = soup.find("td", {"class": "postTdInfo"})
        returnTuple = (re.findall(r'<span class="ucoz-forum-post" edit-url="" id="ucoz-forum-post-(.*?)">(.*?)</span>', str(message)),
                       re.findall(r'<div class="(.*?)">(.*?)</div>', str(author)),
                       link)
        if returnTuple[0] == []:
            # No post body matched: either a non-UTF-8 page or an empty thread.
            returnTuple = ('error getting message, either non-UTF-8 or no message in page',
                           re.findall(r'<div class="(.*?)">(.*?)</div>', str(author)),
                           link)
        contentFromArticles.append(returnTuple)
def makeXML():
    """Build a <Corpus> XML document from the scraped posts and print it."""
    print 'Creating XML'
    print '--------------------------------'
    generated_on = str(datetime.datetime.now())
    root = Element('Corpus')
    root.set('language', 'Russian')
    root.set('name', 'asyl-bilim')
    root.set('content-as-of', generated_on)  # XML attribute names cannot contain spaces
    root.set('xmlns', url)
    for article in contentFromArticles:
        entry = SubElement(root, 'entry')
        if isinstance(article[0], str):
            # article[0] holds the error string instead of a list of (id, text) matches.
            entry.set('article-ID', 'N/A')
            entry.text = article[0]
        else:
            entry.set('article-ID', article[0][0][0])
            entry.text = article[0][0][1]
        if article[1] == []:
            entry.set('User', "User's info is anonymous")
        else:
            entry.set(article[1][0][0], article[1][0][1])  # postRankIco
            entry.set(article[1][1][0], article[1][1][1])  # postName
            entry.set(article[1][2][0], article[1][2][1])
        entry.set('url', article[2])
    xmlString = ET.tostring(root, encoding="utf-8", method="xml")
    print xmlString
    print 'Thanks for using my own custom parser :)'
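
# --- Optional sketch (not part of the original gist) -------------------------
# makeXML() only prints the serialized corpus. If the output should also be
# saved to disk, a helper along these lines could pretty-print it with
# xml.dom.minidom (imported but unused in the original gist) and write it out.
# The function name and the 'corpus.xml' path are assumptions for illustration;
# nothing above calls this function.
def saveCorpus(xmlString, path='corpus.xml'):
    import xml.dom.minidom
    pretty = xml.dom.minidom.parseString(xmlString).toprettyxml(indent='  ')
    with open(path, 'w') as f:
        f.write(pretty.encode('utf-8'))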
def main():
    getInitialForum()
    getForumInstances()
    makeXML()


if __name__ == '__main__':
    main()