@wolfmanstout
Created July 30, 2017 22:46
Extracts all sentences from posts and pages in exported WordPress XML.
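For reference, a minimal sketch of the WordPress export (WXR) structure the script below expects, exercised with the same namespace-qualified lookups the script uses. The XML fragment is an illustrative assumption, not taken from a real export.

from xml.etree import ElementTree

NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
}

# Hypothetical single-post export wrapped in the WXR envelope.
WXR_SAMPLE = """<rss version="2.0"
     xmlns:content="http://purl.org/rss/1.0/modules/content/"
     xmlns:wp="http://wordpress.org/export/1.2/">
  <channel>
    <item>
      <wp:post_type>post</wp:post_type>
      <content:encoded><![CDATA[<p>Hello world. This is a post.</p>]]></content:encoded>
    </item>
  </channel>
</rss>"""

root = ElementTree.fromstring(WXR_SAMPLE)
item = root.find('channel').find('item')
print(item.find('wp:post_type', namespaces=NAMESPACES).text)
# -> post
print(item.find('content:encoded', namespaces=NAMESPACES).text)
# -> <p>Hello world. This is a post.</p>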
#!/usr/bin/env python
# Written for Python 2 (uses the print statement).
import sys

from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from nltk import tokenize
from xml.etree import ElementTree

# XML namespaces used by the WordPress export (WXR) format.
NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
}


def main(filename):
    root = ElementTree.parse(filename).getroot()
    for post in root.find('channel').findall('item'):
        # Only process posts and pages, skipping attachments, nav menu
        # items, and other exported item types.
        post_type = post.find('wp:post_type', namespaces=NAMESPACES).text
        if post_type not in ('post', 'page'):
            continue
        content = post.find('content:encoded', namespaces=NAMESPACES).text
        soup = BeautifulSoup('<html>' + content + '</html>', 'html.parser')
        # Drop code blocks and headings, which are not prose sentences.
        for element in soup.find_all(['pre', 'h1', 'h2']):
            element.decompose()
        sentences = tokenize.sent_tokenize(soup.get_text())
        for sentence in sentences:
            # Skip fragments spanning paragraph breaks and shortcode leftovers.
            if '\n' in sentence or '[contact-form]' in sentence:
                continue
            print sentence.encode('utf8')


if __name__ == "__main__":
    main(sys.argv[1])
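Usage note (not part of the original gist): tokenize.sent_tokenize depends on NLTK's punkt sentence tokenizer data, which has to be downloaded once before the script will run.

# One-time setup: fetch the punkt model used by sent_tokenize.
import nltk
nltk.download('punkt')

After that, run the script against an exported WordPress XML file and redirect stdout to collect the sentences, e.g. python extract_sentences.py wordpress-export.xml > sentences.txt (both filenames here are hypothetical examples; use whatever name the gist file was saved under).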