Skip to content

Instantly share code, notes, and snippets.

@organisciak
Created February 24, 2016 19:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save organisciak/14af35a4e93e75c18286 to your computer and use it in GitHub Desktop.
Save organisciak/14af35a4e93e75c18286 to your computer and use it in GitHub Desktop.
Script to process a Wordpress Export for Mallet
'''
Author: Peter Organisciak
Convert Day of DH (or other Wordpress) export to Mallet import format.
[url] [user] [post text]
Use in the following way:
>> python process.py input-file output-file --split [post|author]
For the split argument, choose either post (a document representation is the words of a post) or author (a document representation is the words that an author has written).
'''
import re
from bs4 import BeautifulSoup
import argparse
import codecs
def main():
    """Convert a Wordpress XML export into a Mallet one-document-per-line file.

    Command-line arguments:
        input:   path of the Wordpress XML export to read.
        output:  path of the file to write (overwritten if it exists).
        --split: ``post`` (default) makes every published post a document,
                 keyed by its link; ``author`` concatenates all published
                 posts of one author into a single document keyed by the
                 author name.

    Only items whose ``wp:status`` is ``publish`` and whose ``wp:post_type``
    is ``post`` are exported. Each output line has the form
    ``[doc id] [author] [text]``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(dest="input", help="Location of Wordpress XML export file.")
    parser.add_argument(dest="output", help="Location of file export. Overwrites existing.")
    # `choices` rejects an unknown value before any work is done, instead of
    # failing part-way through the export.
    parser.add_argument('--split', dest="split_on", type=str, default='post',
                        choices=['post', 'author'],
                        help="[post|author] Define what constitutes a document. Default 'post'.")
    args = parser.parse_args()

    # Read the export as bytes and let Beautiful Soup detect the encoding.
    with open(args.input, 'rb') as source:
        # NOTE(review): no parser is named, so Beautiful Soup uses whichever
        # one is installed; results may differ between machines - confirm.
        dump = BeautifulSoup(source.read())

    docs = {}
    for post in dump.find_all('item'):
        status = post.find('wp:status').get_text()
        post_type = post.find('wp:post_type').get_text()
        if status != 'publish' or post_type != 'post':
            continue

        author = post.find('dc:creator').get_text()
        raw_content = post.find('content:encoded').get_text(strip=True)
        # Since Beautiful Soup doesn't strip tags from CDATA, parse the
        # content a second time to get rid of the markup.
        text = BeautifulSoup(raw_content).get_text(strip=True)
        text = clean_text_for_mallet(text)

        if args.split_on == 'post':
            doc_id = post.find('link').get_text()
            docs[doc_id] = {"author": author, "text": text}
        else:
            # Splitting by author: append this post to the author's running
            # document rather than replacing what was collected so far.
            entry = docs.setdefault(author, {"author": author, "text": u""})
            entry["text"] = (entry["text"] + u" " + text).strip()

    errors = 0
    total = 0
    # codecs.open encodes to UTF-8 on write, so unicode lines are written as-is.
    with codecs.open(args.output, encoding='utf-8', mode='w') as output:
        for doc_id, doc in docs.items():
            try:
                new_line = u"{0} {1} {2}".format(doc_id, doc["author"], doc["text"])
            except UnicodeError:
                errors += 1
                # !r keeps the report itself from failing on undecodable text.
                print(u"Error {2} ({3} processed) - problem with {0!r} by {1!r}".format(
                    doc_id, doc["author"], errors, total))
            else:
                # Only write documents that were formatted successfully.
                output.write(new_line + u"\n")
            total += 1
def clean_text_for_mallet(text):
    """Return *text* flattened to a single line suitable for a Mallet import.

    Steps, in order:
      1. Remove square-bracketed spans such as ``[gallery ids=1]``
         (matched within a single line only).
      2. Drop apostrophes that sit between two word characters, so
         ``don't`` becomes ``dont``.
      3. Collapse every run of whitespace - including line breaks and
         Unicode separators such as U+2028 - into one space and trim the ends.

    Line breaks are turned into spaces rather than deleted, so words on
    adjacent lines are not fused together.
    """
    # Strip bracketed shortcodes; '.' does not cross line breaks here.
    text = re.sub(r"\[.*?\]", "", text)
    # Replace contraction apostrophes with a blank string.
    text = re.sub(r"(\w)'(\w)", r"\1\2", text)
    # Standardize whitespace (gets around Mallet import issues, especially
    # U+2028); this also takes care of \n and \r.
    return ' '.join(text.split())
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment