# KaiCode2/forumToXMLScraper.py

## forumToXMLScraper.py
import urllib2
from urllib2 import urlopen
import re
import bs4
from bs4 import BeautifulSoup
import cookielib, urllib2
from cookielib import CookieJar
import datetime
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, tostring, SubElement
import xml.dom.minidom as xml
from xml.dom.minidom import parse, parseString
import sys

# Python 2 idiom: reload(sys) makes sys.setdefaultencoding available again
# (it is removed during interpreter start-up) so that the default codec used
# for implicit str/unicode conversions can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Forum index page downloaded by getInitialForum(); also written as the
# xmlns attribute of the root element in makeXML().
url = 'http://asyl-bilim.kz/forum/3'
# Cookie jar handed to the HTTPCookieProcessor of both openers.
cj = CookieJar()
# Not referenced elsewhere in the visible source.
forumLinks = []
# Thread hrefs: appended by getInitialForum(), iterated by getForumInstaces().
links = []
# (message, author, link) tuples: appended by getForumInstaces(), read by makeXML().
contentFromArticles = []
# Not referenced elsewhere in the visible source.
forumArticles = []


def getInitialForum():
	"""Download the forum index page and collect its thread links.

	Fetches the module-level ``url`` through a cookie-aware opener and appends
	the ``href`` of every ``<a class="threadLink">`` anchor found in the page
	to the module-level ``links`` list, in page order.

	Each href is resolved against ``url`` so that a relative link still yields
	an address the opener can fetch; an absolute href already names its own
	scheme and host and is kept.
	"""
	from urlparse import urljoin  # Python 2 stdlib, same generation as urllib2

	print('Getting initial forum content...')
	print('--------------------------------')
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	# Send a browser-like User-agent instead of urllib2's default one.
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
	sourceCode = opener.open(url).read()

	# Group 1 is the href, group 2 the anchor's inner markup (not needed here).
	threadAnchors = re.findall(r'<a class="threadLink" href="(.*?)">(.*?)</a>', sourceCode)
	links.extend(urljoin(url, href) for href, _title in threadAnchors)


def getForumInstaces():
	"""Fetch every collected thread page and record its first post.

	For each address in the module-level ``links`` list the page is downloaded,
	parsed with BeautifulSoup, and a ``(message, author, link)`` tuple is
	appended to the module-level ``contentFromArticles`` list:

	* ``message`` -- list of ``(post id, post markup)`` pairs matched inside
	  the first ``td.posttdMessage`` cell, or an error string when the page
	  is not valid UTF-8 or no post could be matched.
	* ``author`` -- list of ``(class, inner markup)`` pairs for the ``div``
	  elements matched inside the first ``td.postTdInfo`` cell (empty when
	  the page could not be decoded).
	* ``link`` -- the address the page was fetched from.
	"""
	noMessage = 'error getting message, either non-UTF-8 or no message in page'
	# Compiled once for the whole run; DOTALL lets a post body that spans
	# several lines match.
	postPattern = re.compile(
		r'<span class="ucoz-forum-post" edit-url="" id="ucoz-forum-post-(.*?)">(.*?)</span>',
		re.DOTALL)
	authorPattern = re.compile(r'<div class="(.*?)">(.*?)</div>')

	forumOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	forumOpener.addheaders = [('User-agent', 'Mozilla/5.0')]
	print('Grabing individual posts...')
	print('--------------------------------')
	for link in links:
		rawPage = forumOpener.open(link).read()
		try:
			page = rawPage.decode('utf-8')
		except UnicodeDecodeError:
			# Record the failure for this thread instead of aborting the run.
			contentFromArticles.append((noMessage, [], link))
			continue

		# NOTE(review): no parser is named, so bs4 picks whichever is installed.
		soup = BeautifulSoup(page)
		message = soup.find("td", {"class": "posttdMessage"})
		author = soup.find("td", {"class": "postTdInfo"})
		# find() returns None for a missing cell; str(None) matches nothing.
		posts = postPattern.findall(str(message))
		authorInfo = authorPattern.findall(str(author))
		contentFromArticles.append((posts or noMessage, authorInfo, link))


def makeXML():
	"""Serialise the scraped posts as a ``Corpus`` XML document and print it.

	Builds one ``entry`` element per tuple in the module-level
	``contentFromArticles`` list:

	* a matched post supplies the ``article-ID`` attribute and the element
	  text; a tuple without a matched post gets ``article-ID="N/A"`` and no
	  text;
	* up to the first three author ``(class, markup)`` pairs become
	  attributes named after the class; with no pairs a ``User`` attribute
	  is set instead;
	* the ``url`` attribute holds the address the post came from.

	The document is encoded as UTF-8 and written to standard output.
	"""
	print('Creating XML')
	print('--------------------------------')
	root = Element('Corpus')
	root.set('language', 'Russian')
	root.set('name', 'asyl-bilim')
	# XML attribute names may not contain spaces and ElementTree does not
	# check them, so the name is hyphenated to keep the output parseable.
	root.set('content-as-of', str(datetime.datetime.now()))
	root.set('xmlns', url)
	for message, authorInfo, link in contentFromArticles:
		entry = SubElement(root, 'entry')
		# A list holds matched (id, markup) pairs; anything else is the error text.
		if isinstance(message, list) and message:
			postId, postMarkup = message[0]
			entry.set('article-ID', postId)
			entry.text = postMarkup
		else:
			entry.set('article-ID', 'N/A')

		if not authorInfo:
			entry.set('User', 'User`s info is annonymous')
		else:
			# Only the first three divs are used (the original notes name the
			# first two postRankIco and postName); slicing avoids an
			# IndexError when fewer were matched.
			for className, markup in authorInfo[:3]:
				entry.set(className, markup)

		entry.set('url', link)

	xmlString = ET.tostring(root, encoding="utf-8", method="xml")
	print(xmlString)
	print('Thanks for using my own custom parser :)')

def main():
	"""Run the three scraper stages in order: index page, thread pages, XML."""
	for stage in (getInitialForum, getForumInstaces, makeXML):
		stage()

main()
	import urllib2
	from urllib2 import urlopen
	import re
	import bs4
	from bs4 import BeautifulSoup
	import cookielib, urllib2
	from cookielib import CookieJar
	import datetime
	import xml.etree.ElementTree as ET
	from xml.etree.ElementTree import Element, tostring, SubElement
	import xml.dom.minidom as xml
	from xml.dom.minidom import parse, parseString
	import sys

	# Python 2 idiom: reload(sys) makes sys.setdefaultencoding available again
	# (it is removed during interpreter start-up) so that the default codec
	# used for implicit str/unicode conversions can be switched to UTF-8.
	reload(sys)
	sys.setdefaultencoding('utf8')

	# Forum index page downloaded by getInitialForum(); also written as the
	# xmlns attribute of the root element in makeXML().
	url = 'http://asyl-bilim.kz/forum/3'
	# Cookie jar handed to the HTTPCookieProcessor of both openers.
	cj = CookieJar()
	# Not referenced elsewhere in the visible source.
	forumLinks = []
	# Thread hrefs: appended by getInitialForum(), iterated by getForumInstaces().
	links = []
	# (message, author, link) tuples: appended by getForumInstaces(), read by makeXML().
	contentFromArticles = []
	# Not referenced elsewhere in the visible source.
	forumArticles = []


	def getInitialForum():
		"""Download the forum index page and collect its thread links.

		Fetches ``url`` through a cookie-aware opener and appends the ``href``
		of every ``<a class="threadLink">`` anchor found in the page to the
		``links`` list, in page order.

		Each href is resolved against ``url`` so that a relative link still
		yields an address the opener can fetch; an absolute href already names
		its own scheme and host and is kept.
		"""
		from urlparse import urljoin  # Python 2 stdlib, same generation as urllib2

		print('Getting initial forum content...')
		print('--------------------------------')
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		# Send a browser-like User-agent instead of urllib2's default one.
		opener.addheaders = [('User-agent', 'Mozilla/5.0')]
		sourceCode = opener.open(url).read()

		# Lazy ``.*?`` groups: group 1 is the href, group 2 the anchor's inner
		# markup (not needed here).
		threadAnchors = re.findall(r'<a class="threadLink" href="(.*?)">(.*?)</a>', sourceCode)
		links.extend(urljoin(url, href) for href, _title in threadAnchors)


	def getForumInstaces():
		"""Fetch every collected thread page and record its first post.

		For each address in the ``links`` list the page is downloaded, parsed
		with BeautifulSoup, and a ``(message, author, link)`` tuple is appended
		to the ``contentFromArticles`` list:

		* ``message`` -- list of ``(post id, post markup)`` pairs matched
		  inside the first ``td.posttdMessage`` cell, or an error string when
		  the page is not valid UTF-8 or no post could be matched.
		* ``author`` -- list of ``(class, inner markup)`` pairs for the ``div``
		  elements matched inside the first ``td.postTdInfo`` cell (empty when
		  the page could not be decoded).
		* ``link`` -- the address the page was fetched from.
		"""
		noMessage = 'error getting message, either non-UTF-8 or no message in page'
		# Compiled once for the whole run; DOTALL lets a post body that spans
		# several lines match.
		postPattern = re.compile(
			r'<span class="ucoz-forum-post" edit-url="" id="ucoz-forum-post-(.*?)">(.*?)</span>',
			re.DOTALL)
		authorPattern = re.compile(r'<div class="(.*?)">(.*?)</div>')

		forumOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		forumOpener.addheaders = [('User-agent', 'Mozilla/5.0')]
		print('Grabing individual posts...')
		print('--------------------------------')
		for link in links:
			rawPage = forumOpener.open(link).read()
			try:
				page = rawPage.decode('utf-8')
			except UnicodeDecodeError:
				# Record the failure for this thread instead of aborting the run.
				contentFromArticles.append((noMessage, [], link))
				continue

			# NOTE(review): no parser is named, so bs4 picks whichever is installed.
			soup = BeautifulSoup(page)
			message = soup.find("td", {"class": "posttdMessage"})
			author = soup.find("td", {"class": "postTdInfo"})
			# find() returns None for a missing cell; str(None) matches nothing.
			posts = postPattern.findall(str(message))
			authorInfo = authorPattern.findall(str(author))
			contentFromArticles.append((posts or noMessage, authorInfo, link))


	def makeXML():
		"""Serialise the scraped posts as a ``Corpus`` XML document and print it.

		Builds one ``entry`` element per tuple in the ``contentFromArticles``
		list:

		* a matched post supplies the ``article-ID`` attribute and the element
		  text; a tuple without a matched post gets ``article-ID="N/A"`` and
		  no text;
		* up to the first three author ``(class, markup)`` pairs become
		  attributes named after the class; with no pairs a ``User``
		  attribute is set instead;
		* the ``url`` attribute holds the address the post came from.

		The document is encoded as UTF-8 and written to standard output.
		"""
		print('Creating XML')
		print('--------------------------------')
		root = Element('Corpus')
		root.set('language', 'Russian')
		root.set('name', 'asyl-bilim')
		# XML attribute names may not contain spaces and ElementTree does not
		# check them, so the name is hyphenated to keep the output parseable.
		root.set('content-as-of', str(datetime.datetime.now()))
		root.set('xmlns', url)
		for message, authorInfo, link in contentFromArticles:
			entry = SubElement(root, 'entry')
			# A list holds matched (id, markup) pairs; anything else is the error text.
			if isinstance(message, list) and message:
				postId, postMarkup = message[0]
				entry.set('article-ID', postId)
				entry.text = postMarkup
			else:
				entry.set('article-ID', 'N/A')

			if not authorInfo:
				entry.set('User', 'User`s info is annonymous')
			else:
				# Only the first three divs are used (the original notes name
				# the first two postRankIco and postName); slicing avoids an
				# IndexError when fewer were matched.
				for className, markup in authorInfo[:3]:
					entry.set(className, markup)

			entry.set('url', link)

		xmlString = ET.tostring(root, encoding="utf-8", method="xml")
		print(xmlString)
		print('Thanks for using my own custom parser :)')

	def main():
		"""Run the three scraper stages in order: index page, thread pages, XML."""
		getInitialForum()
		getForumInstaces()
		makeXML()

	main()