wolfmanstout/extract_sentences.py

## extract_sentences.py
#!/usr/bin/env python

import sys
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from nltk import tokenize
from xml.etree import ElementTree

# Prefix -> namespace URI table handed to ElementTree lookups through their
# ``namespaces=`` argument, so paths can be spelled 'wp:...' / 'content:...'.
NAMESPACES = dict(
    content='http://purl.org/rss/1.0/modules/content/',
    wp='http://wordpress.org/export/1.2/',
)


def main(filename):
    """Print the single-line sentences found in an export file's posts and pages.

    Parses ``filename`` as XML and walks every ``item`` under ``channel``,
    keeping only items whose ``wp:post_type`` is ``post`` or ``page``. Each
    kept item's ``content:encoded`` markup is parsed as HTML, its ``pre``,
    ``h1`` and ``h2`` elements are removed, and the remaining text is split
    into sentences. Sentences are written to standard output as UTF-8, one
    per line; any sentence containing a newline or the literal text
    ``[contact-form]`` is dropped.

    Items with no ``wp:post_type`` element, or with missing or empty
    ``content:encoded``, are skipped.

    Args:
        filename: Path of the XML export file to read.
    """
    root = ElementTree.parse(filename).getroot()
    # Emit raw UTF-8 bytes. Python 3 text streams reject bytes, so use the
    # underlying binary buffer when there is one; Python 2's sys.stdout has
    # no ``buffer`` attribute and accepts byte strings directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for post in root.find('channel').findall('item'):
        # findtext() gives None for a missing element instead of raising.
        post_type = post.findtext('wp:post_type', namespaces=NAMESPACES)
        if post_type not in ('post', 'page'):
            continue
        # None (element missing) or '' (element empty): nothing to extract,
        # and concatenating None below would raise TypeError.
        content = post.findtext('content:encoded', namespaces=NAMESPACES)
        if not content:
            continue
        soup = BeautifulSoup('<html>' + content + '</html>', 'html.parser')
        for element in soup.find_all(['pre', 'h1', 'h2']):
            element.decompose()
        for sentence in tokenize.sent_tokenize(soup.get_text()):
            if '\n' in sentence or '[contact-form]' in sentence:
                continue
            out.write(sentence.encode('utf8') + b'\n')


if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # export file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s WORDPRESS_EXPORT_XML' % sys.argv[0])
    main(sys.argv[1])
	#!/usr/bin/env python

	import sys
	from bs4 import BeautifulSoup
	from bs4 import SoupStrainer
	from nltk import tokenize
	from xml.etree import ElementTree

	# Prefix -> namespace URI table handed to ElementTree lookups through their
	# ``namespaces=`` argument, so paths can be spelled 'wp:...' / 'content:...'.
	NAMESPACES = dict(
	    content='http://purl.org/rss/1.0/modules/content/',
	    wp='http://wordpress.org/export/1.2/',
	)


	def main(filename):
	    """Print the single-line sentences found in an export file's posts and pages.

	    Parses ``filename`` as XML and walks every ``item`` under ``channel``,
	    keeping only items whose ``wp:post_type`` is ``post`` or ``page``. Each
	    kept item's ``content:encoded`` markup is parsed as HTML, its ``pre``,
	    ``h1`` and ``h2`` elements are removed, and the remaining text is split
	    into sentences. Sentences are written to standard output as UTF-8, one
	    per line; any sentence containing a newline or the literal text
	    ``[contact-form]`` is dropped.

	    Items with no ``wp:post_type`` element, or with missing or empty
	    ``content:encoded``, are skipped.

	    Args:
	        filename: Path of the XML export file to read.
	    """
	    root = ElementTree.parse(filename).getroot()
	    # Emit raw UTF-8 bytes. Python 3 text streams reject bytes, so use the
	    # underlying binary buffer when there is one; Python 2's sys.stdout has
	    # no ``buffer`` attribute and accepts byte strings directly.
	    out = getattr(sys.stdout, 'buffer', sys.stdout)
	    for post in root.find('channel').findall('item'):
	        # findtext() gives None for a missing element instead of raising.
	        post_type = post.findtext('wp:post_type', namespaces=NAMESPACES)
	        if post_type not in ('post', 'page'):
	            continue
	        # None (element missing) or '' (element empty): nothing to extract,
	        # and concatenating None below would raise TypeError.
	        content = post.findtext('content:encoded', namespaces=NAMESPACES)
	        if not content:
	            continue
	        soup = BeautifulSoup('<html>' + content + '</html>', 'html.parser')
	        for element in soup.find_all(['pre', 'h1', 'h2']):
	            element.decompose()
	        for sentence in tokenize.sent_tokenize(soup.get_text()):
	            if '\n' in sentence or '[contact-form]' in sentence:
	                continue
	            out.write(sentence.encode('utf8') + b'\n')


	if __name__ == "__main__":
	    # Exit with a usage message rather than an IndexError traceback when the
	    # export file argument is missing.
	    if len(sys.argv) < 2:
	        sys.exit('usage: %s WORDPRESS_EXPORT_XML' % sys.argv[0])
	    main(sys.argv[1])