Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Extracts all sentences from posts and pages in exported WordPress XML.
#!/usr/bin/env python
import sys
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from nltk import tokenize
from xml.etree import ElementTree
# Prefix -> URI map handed to ElementTree's find() so that the prefixed tags
# 'content:encoded' and 'wp:post_type' resolve against the export file.
# NOTE(review): the 'wp' URI carries the WXR version (1.0 / 1.1 / 1.2); it
# must match the xmlns:wp declared at the top of the export being parsed.
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
}
def main(filename):
    """Print every sentence found in the posts and pages of a WordPress export.

    Parses the XML at ``filename`` and walks each ``<item>`` under
    ``<channel>``. Items whose ``wp:post_type`` is not ``post`` or ``page``
    are skipped, as are items with no ``content:encoded`` text. The remaining
    HTML is parsed with BeautifulSoup, its text is split into sentences with
    NLTK, and each sentence is written to stdout as one UTF-8 encoded line.
    Sentences containing a newline or the ``[contact-form]`` shortcode are
    dropped.

    Args:
        filename: Source accepted by ``ElementTree.parse`` for the export XML.
    """
    root = ElementTree.parse(filename).getroot()

    # Emit raw UTF-8 bytes: Python 3 exposes the byte stream as
    # sys.stdout.buffer, while on Python 2 sys.stdout itself accepts bytes.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    for post in root.find('channel').findall('item'):
        type_node = post.find('wp:post_type', namespaces=NAMESPACES)
        if type_node is None or type_node.text not in ('post', 'page'):
            continue

        content_node = post.find('content:encoded', namespaces=NAMESPACES)
        content = content_node.text if content_node is not None else None
        if not content:
            # Empty elements have text None; nothing to tokenize.
            continue

        soup = BeautifulSoup('<html>' + content + '</html>', 'html.parser')
        # NOTE(review): the loop body was missing from the source; dropping
        # code blocks and headings before text extraction is the evident
        # intent -- confirm.
        for element in soup.find_all(['pre', 'h1', 'h2']):
            element.extract()

        for sentence in tokenize.sent_tokenize(soup.get_text()):
            if '\n' in sentence or '[contact-form]' in sentence:
                continue
            out.write(sentence.encode('utf8') + b'\n')
if __name__ == "__main__":
    # The script takes exactly one argument: the path to the export XML.
    if len(sys.argv) != 2:
        sys.exit("usage: %s <wordpress-export.xml>" % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment