Skip to content

Instantly share code, notes, and snippets.

@peregrinogris
Created March 4, 2013 22:08
Show Gist options
  • Save peregrinogris/5086139 to your computer and use it in GitHub Desktop.
Save peregrinogris/5086139 to your computer and use it in GitHub Desktop.
Split text into sentences and output the result as JSON.
from BeautifulSoup import BeautifulSoup
import json
import sys
def split_sentences(text):
    """Split *text* on '.' and return the non-empty sentences as a list.

    Each fragment is stripped of surrounding whitespace *before* it is
    tested for emptiness, so whitespace-only fragments never yield a bare
    "." entry and one-character sentences are not silently dropped.

    Every fragment that was followed by a '.' in the input gets its period
    back. A trailing fragment with no closing period is returned unchanged.
    """
    fragments = text.strip().split('.')
    # Only the final fragment was not terminated by a '.' in the input.
    # When the text does end with '.', that final fragment is empty and
    # is skipped below, so no separate "ends with a dot" flag is needed.
    last_index = len(fragments) - 1
    sentences = []
    for index, fragment in enumerate(fragments):
        fragment = fragment.strip()
        if not fragment:
            continue
        if index < last_index:
            fragment += "."
        sentences.append(fragment)
    return sentences


def main(argv):
    """Print the sentences found in the chosen tags of an HTML file as JSON.

    argv[1] is the input file path. argv[2] is the optional tag name to
    search for and defaults to 'div'. With no input file the usage line
    is printed and nothing else happens.
    """
    if len(argv) < 2:
        print(argv[0] + ' <input file>')
        return

    in_file = argv[1]
    tag = argv[2] if len(argv) > 2 else 'div'

    # The path is used as given (no './' prefix) so absolute paths work.
    # The constructor is handed the open file inside the with-block, and
    # the handle is closed on exit instead of being leaked.
    with open(in_file) as handle:
        soup = BeautifulSoup(handle)

    out = []
    for element in soup.findAll(tag):
        # getText(" ") joins the element's text nodes with single spaces.
        out.extend(split_sentences(element.getText(" ")))
    print(json.dumps(out))


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment