Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extracts all sentences from posts and pages in exported WordPress XML.
#!/usr/bin/env python
import sys
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from nltk import tokenize
from xml.etree import ElementTree
# XML namespace prefixes mapped to their URIs, so ElementTree lookups can use
# the short 'prefix:tag' form (e.g. 'wp:post_type', 'content:encoded').
NAMESPACES = dict(
    content='http://purl.org/rss/1.0/modules/content/',
    wp='http://wordpress.org/export/1.2/',
)
def main(filename):
    """Print every sentence found in the posts and pages of an XML export.

    Parses ``filename`` and, for each ``item`` under the ``channel`` element
    whose ``wp:post_type`` is ``post`` or ``page``, strips the markup from its
    ``content:encoded`` body, splits the remaining text into sentences and
    writes each sentence to standard output encoded as UTF-8, one per line.

    Items without a post type, or with a missing or empty body, are skipped.

    Args:
        filename: The export file, passed straight to ``ElementTree.parse``.
    """
    root = ElementTree.parse(filename).getroot()
    # Write bytes directly: Python 3 exposes the binary stream as
    # sys.stdout.buffer, while on Python 2 sys.stdout itself accepts bytes.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for post in root.find('channel').findall('item'):
        # findtext returns None when the element is absent, so such items
        # simply fail the membership test below instead of raising.
        post_type = post.findtext('wp:post_type', namespaces=NAMESPACES)
        if post_type not in ('post', 'page'):
            continue
        # A missing or empty <content:encoded> has nothing to extract, and
        # concatenating None into the HTML wrapper would raise TypeError.
        content = post.findtext('content:encoded', namespaces=NAMESPACES)
        if not content:
            continue
        soup = BeautifulSoup('<html>' + content + '</html>', 'html.parser')
        # Remove <pre>, <h1> and <h2> elements so their text is not tokenized.
        for element in soup.find_all(['pre', 'h1', 'h2']):
            element.decompose()
        for sentence in tokenize.sent_tokenize(soup.get_text()):
            # Skip fragments that span a line break or contain the
            # contact-form shortcode.
            if '\n' in sentence or '[contact-form]' in sentence:
                continue
            out.write(sentence.encode('utf8') + b'\n')
if __name__ == "__main__":
    # Require the export file path; exit with a usage message rather than an
    # IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s WORDPRESS_EXPORT.xml' % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment