#!/usr/bin/env python
"""
Usage:
  url_to_sentence_collector_sentences <url>

Description:
  Extract candidate sentences for the Mozilla Common Voice Sentence Collector
  from a webpage, see
  https://commonvoice.mozilla.org/sentence-collector/#/how-to
"""
from docopt import docopt
from requests import get
from lxml.etree import HTML
from re import sub
from nltk import sent_tokenize, word_tokenize
arguments = docopt(__doc__)
url = arguments["<url>"]
# Retsinformation.dk now uses Javascript to get the body text.
if url.startswith('https://www.retsinformation.dk/'):
    url = url[:31] + "api/document/" + url[31:]
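# For illustration (hypothetical path): a URL of the form
# "https://www.retsinformation.dk/<path>" is rewritten to
# "https://www.retsinformation.dk/api/document/<path>", whose JSON response
# contains the document HTML.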
# Get the webpage
response = get(url)
if not response.ok:
    print(response.status_code)
    exit(1)
# Get the HTML from the response
if url.startswith('https://www.retsinformation.dk/'):
    data = response.json()
    html = data[0]['documentHtml']
else:
    html = response.content
# Extract raw text
tree = HTML(html)
texts = tree.xpath("//text()")
text = " ".join(texts)
cleaned_text = sub(r"\s+", " ", text)
# Parse into sentences. Keep only sentences with at most 14 words and
# no digits.
sentences = sent_tokenize(cleaned_text)
for sentence in sentences:
    words = word_tokenize(sentence)
    if len(words) <= 14 and not any(char.isdigit() for char in sentence):
        print(sentence)
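For reference, the docopt usage string means the script takes a single positional URL argument. A hypothetical invocation (assuming the file is saved as url_to_sentence_collector_sentences.py and <document-path> stands in for a real page) would be:

python url_to_sentence_collector_sentences.py 'https://www.retsinformation.dk/<document-path>'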
@fnielsen (Author):
Abbreviations and symbol-containing sentences are not excluded, e.g., "EU" and "/".
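A follow-up filter could drop such candidates before printing. The sketch below is not part of the gist: the looks_clean helper, the abbreviation regex, and the set of allowed punctuation are assumptions about what should be rejected.

import re

ALLOWED_PUNCTUATION = set(".,;:?!'\"- ")

def looks_clean(sentence):
    # Reject runs of two or more uppercase letters, treated here as
    # abbreviations such as "EU" (hypothetical heuristic).
    if re.search(r"[A-ZÆØÅ]{2,}", sentence):
        return False
    # Reject any character that is neither a letter nor simple punctuation,
    # which drops sentences containing symbols such as "/".
    return all(char.isalpha() or char in ALLOWED_PUNCTUATION for char in sentence)

The final loop could then additionally test looks_clean(sentence) before printing a candidate.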
