Created
February 24, 2016 19:05
-
-
Save organisciak/14af35a4e93e75c18286 to your computer and use it in GitHub Desktop.
Script to process a Wordpress Export for Mallet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Author: Peter Organisciak

Convert a Day of DH (or other Wordpress) export to Mallet import format,
one document per line:
    [url] [user] [post text]

Use in the following way:
    >> python process.py input-file output-file --split [post|author]

For the split argument, choose either post (a document is the words of a
single post) or author (a document is all of the words that an author has
written).
'''
import re | |
from bs4 import BeautifulSoup | |
import argparse | |
import codecs | |
def _collect_documents(posts, split_on):
    """Build the document table from the <item> elements of an export.

    Only items whose status is 'publish' and whose post type is 'post' are
    kept; everything else (drafts, pages, attachments, ...) is skipped.

    Args:
        posts: iterable of Beautiful Soup tags, one per <item>.
        split_on: 'post' to create one document per post (keyed by the post
            link), or 'author' to merge every post by the same author into
            a single document (keyed by the author name).

    Returns:
        dict mapping document id to {"author": ..., "text": ...}.

    Raises:
        ValueError: if split_on is neither 'post' nor 'author'.
    """
    if split_on not in ('post', 'author'):
        raise ValueError("invalid split_on argument. Requires either 'post' or author")

    docs = {}
    for post in posts:
        # Filter first so unpublished / non-post items are never parsed.
        status = post.find('wp:status').get_text()
        post_type = post.find('wp:post_type').get_text()
        if status != 'publish' or post_type != 'post':
            continue

        doc_id = post.find('link').get_text()
        author = post.find('dc:creator').get_text()
        raw_html = post.find('content:encoded').get_text(strip=True)

        # Since Beautiful Soup doesn't strip tags from CDATA, parse the post
        # body again as its own document. A " " separator is passed because
        # strip=True otherwise glues neighbouring text nodes together
        # ("<p>one</p><p>two</p>" would become "onetwo").
        text = BeautifulSoup(raw_html).get_text(" ", strip=True)
        text = clean_text_for_mallet(text)

        if split_on == 'post':
            docs[doc_id] = {"author": author, "text": text}
        elif author in docs:
            # Append to what this author has already written instead of
            # replacing it, so the document holds all of their posts.
            docs[author]["text"] += " " + text
        else:
            docs[author] = {"author": author, "text": text}
    return docs


def _write_documents(docs, path):
    """Write one "<id> <author> <text>" line per document to path as UTF-8.

    A document whose line cannot be built is reported on stdout and skipped;
    nothing is written for it.

    Returns:
        The number of documents that were skipped because of an error.
    """
    errors = 0
    # codecs.open already encodes unicode to UTF-8 on write, so no separate
    # stream writer is needed. The context manager closes the file even if
    # writing fails part-way through.
    with codecs.open(path, encoding='utf-8', mode='w+') as output:
        # `total` is the number of documents handled before the current one.
        for total, (doc_id, doc) in enumerate(docs.items()):
            try:
                new_line = u"{0} {1} {2}".format(doc_id, doc["author"], doc["text"])
            except Exception:
                # Best effort: report the problem document and carry on with
                # the rest rather than writing a stale or undefined line.
                errors += 1
                print("Error {2} ({3} processed) - problem with {0} by {1}".format(doc_id, doc["author"], errors, total))
                continue
            output.write(new_line + u"\n")
    return errors


def main():
    """Command-line entry point: convert a Wordpress XML export for Mallet.

    Reads the export named by the `input` argument, keeps the published
    posts, cleans their text and writes one document per line to the
    `output` argument (overwriting any existing file). `--split` selects
    whether a document is a single post or everything one author wrote.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(dest="input", help="Location of Wordpress XML export file.")
    parser.add_argument(dest="output", help="Location of file export. Overwrites existing.")
    # `choices` rejects a bad value up front instead of failing half-way
    # through the export.
    parser.add_argument('--split', dest="split_on", type=str, default='post',
                        choices=['post', 'author'],
                        help="[post|author] Define what constitutes a document. Default 'post'.")
    args = parser.parse_args()

    # Import file
    with open(args.input, 'rb') as export_file:
        raw_export = export_file.read()

    # NOTE(review): no parser is named, so Beautiful Soup chooses one from
    # whatever is installed; confirm the tag lookups behave the same on the
    # target machine before pinning a specific parser.
    soup = BeautifulSoup(raw_export)
    docs = _collect_documents(soup.find_all('item'), args.split_on)
    _write_documents(docs, args.output)
#posts = re.findall(r"\<item\>.*?\<\/item\>",text, flags=re.DOTALL) | |
#print len(posts) | |
#for post in posts: | |
# author = re.findall(r"\<dc\:creator\>.*?\<\/dc:creator\>",post, flags=re.DOTALL)[0] | |
# print author | |
def clean_text_for_mallet(text):
    """Flatten a post's text onto a single, whitespace-normalized line.

    Steps, in order:
      1. Remove bracketed spans such as Wordpress shortcodes ("[caption ...]").
         A span is only removed when both brackets are on the same line.
      2. Remove apostrophes that sit between two word characters, so a
         contraction like "don't" becomes the single token "dont".
      3. Collapse every run of whitespace (including line breaks and
         separators such as U+2028, which trip up Mallet's importer) into
         one space and trim both ends.

    Args:
        text: the text to clean.

    Returns:
        The cleaned text, containing no line breaks.
    """
    # Drop bracketed shortcodes. Line breaks are deliberately NOT deleted
    # here: deleting them would fuse the last word of one line with the first
    # word of the next ("foo\nbar" -> "foobar"). They are turned into single
    # spaces by the whitespace normalization below instead.
    text = re.sub(r"\[.*?\]", "", text)
    # Replace contraction apostrophes with a blank string
    text = re.sub(r"(\w)'(\w)", r"\1\2", text)
    # Standardize whitespace (get around Mallet import issues, especially u\2028)
    return ' '.join(text.split())
# Run the conversion only when this file is executed as a script, so that
# importing it (e.g. to reuse clean_text_for_mallet) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment