# organisciak/process.py

## process.py
'''
Author: Peter Organisciak

Convert Day of DH (or other Wordpress) export to Mallet import format.
[url] [user] [post text]

Use in the following way:
>> python process.py input-file output-file --split [post|author]

For the split argument, choose either post (a document representation is the words of a post) or author (a document representation is the words that an author has written).

'''
import re
from bs4 import BeautifulSoup
import argparse
import codecs

def main():
	'''
	Convert a Wordpress XML export into a Mallet import file.

	Reads the input path, output path and --split mode from the command line,
	collects the cleaned text of every item whose status is 'publish' and whose
	post type is 'post', and writes one tab-separated line per document:
	document id, author, text.

	With --split post the document id is the post link and each post is its
	own document. With --split author the document id is the author name and
	the texts of all of that author's posts are joined with single spaces.

	Raises:
		Exception: if --split is neither 'post' nor 'author'. Raised when the
			first published post is processed.
	'''
	parser = argparse.ArgumentParser(description=__doc__)
	parser.add_argument(dest="input", help="Location of Wordpress XML export file.")
	parser.add_argument(dest="output",   help="Location of file export. Overwrites existing.")
	parser.add_argument('--split', dest="split_on", type=str, default='post',
			help="[post|author] Define what constitutes a document. Default 'post'.")
	args = parser.parse_args()

	docs = {}

	#Import file; the context manager closes it even if reading fails
	with open(args.input, 'rb') as source:
		#NOTE(review): no parser is named, so bs4 picks one itself; the
		#lowercase prefixed tag names below presumably rely on an HTML-style
		#parser - confirm against the installed parser
		dump = BeautifulSoup(source.read())

	for post in dump.find_all('item'):
		doc_id = post.find('link').get_text()
		author = post.find('dc:creator').get_text()
		text = post.find('content:encoded').get_text(strip=True)
		status = post.find('wp:status').get_text()
		post_type = post.find('wp:post_type').get_text()
		#Only published posts become documents; skip drafts, pages, etc.
		if not (status == 'publish' and post_type == 'post'):
			continue
		#Since Beautiful Soup doesn't strip tags from CDATA, create a new object
		text = BeautifulSoup(text).get_text(strip=True)
		text = clean_text_for_mallet(text)
		if args.split_on == 'post':
			docs[doc_id] = {"author":author, "text":text}
		elif args.split_on == 'author':
			#If splitting by author, accumulate every post by that author
			#into one document keyed by the author name
			entry = docs.setdefault(author, {"author":author, "text":u""})
			entry["text"] = (entry["text"] + u" " + text).strip()
		else:
			raise Exception("invalid split_on argument. Requires either 'post' or author")

	errors = 0
	total = 0
	#codecs.open encodes unicode to UTF-8 on write, so lines are written as-is
	with codecs.open(args.output, encoding='utf-8', mode='w+') as output:
		for doc_id, doc in docs.items():
			try:
				new_line = u"{0}\t{1}\t{2}".format(doc_id, doc["author"], doc["text"])
			except Exception:
				#Best effort: report the document and move on without
				#writing a stale or undefined line
				errors +=1
				print("Error {2} ({3} processed) - problem with {0} by {1}".format(doc_id, doc["author"], errors, total))
			else:
				output.write(new_line + u"\n")
			total += 1

def clean_text_for_mallet(text):
	'''
	Flatten post text into a single line suitable for Mallet import.

	Drops bracketed spans such as Wordpress shortcodes ([...]), removes
	apostrophes that sit between two word characters ("don't" -> "dont"),
	and collapses every run of whitespace, line breaks included, into a
	single space. Returns the cleaned string.
	'''
	#Replace bracketed spans with a space rather than nothing, so the words on
	#either side of the span are not fused together
	text = re.sub(r"\[.*?\]", " ", text)
	#replace contraction apostrophes with a blank string; re.UNICODE makes \w
	#cover non-ASCII letters on Python 2 as well
	text = re.sub(r"(\w)'(\w)", r"\1\2", text, flags=re.UNICODE)
	#Standardize whitespace (get around Mallet import issues, especially u\2028).
	#split() treats line breaks as separators too, so they become single spaces
	#instead of being deleted
	text = ' '.join(text.split())
	return text
#Script entry point: run the conversion only when executed directly,
#not when the module is imported
if __name__=='__main__':
	main()