#!/usr/bin/env python
"""
Usage:
  url_to_sentence_collector_sentences <url>

Description:
  Extract candidate sentences for the Mozilla Common Voice Sentence Collector
  from a webpage, see
  https://commonvoice.mozilla.org/sentence-collector/#/how-to
"""
from docopt import docopt
from requests import get
from lxml.etree import HTML
from re import sub
from nltk import sent_tokenize, word_tokenize
arguments = docopt(__doc__)
url = arguments["<url>"]
# Retsinformation.dk now uses Javascript to get the body text.
if url.startswith('https://www.retsinformation.dk/'):
    url = url[:31] + "api/document/" + url[31:]
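# For illustration (hypothetical path): a URL of the form
# "https://www.retsinformation.dk/<path>" is rewritten to
# "https://www.retsinformation.dk/api/document/<path>", whose JSON response
# contains the document HTML.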
# Get the webpage
response = get(url)
if not response.ok:
    print(response.status_code)
    exit(1)
# Get the HTML from the response
if url.startswith('https://www.retsinformation.dk/'):
    data = response.json()
    html = data[0]['documentHtml']
else:
    html = response.content
# Extract raw text
tree = HTML(html)
texts = tree.xpath("//text()")
text = " ".join(texts)
cleaned_text = sub(r"\s+", " ", text)
# Parse into sentences. Keep only sentences with at most 14 words and
# no digits.
sentences = sent_tokenize(cleaned_text)
for sentence in sentences:
    words = word_tokenize(sentence)
    if len(words) <= 14 and not any(char.isdigit() for char in sentence):
        print(sentence)
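For reference, the docopt usage string means the script takes a single positional URL argument. A hypothetical invocation (assuming the file is saved as url_to_sentence_collector_sentences.py and <document-path> stands in for a real page) would be:

python url_to_sentence_collector_sentences.py 'https://www.retsinformation.dk/<document-path>'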
@fnielsen (Author):
Abbreviations and symbol-containing sentences are not excluded, e.g., "EU" and "/".
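A follow-up filter could drop such candidates before printing. The sketch below is not part of the gist: the looks_clean helper, the abbreviation regex, and the set of allowed punctuation are assumptions about what should be rejected.

import re

ALLOWED_PUNCTUATION = set(".,;:?!'\"- ")

def looks_clean(sentence):
    # Reject runs of two or more uppercase letters, treated here as
    # abbreviations such as "EU" (hypothetical heuristic).
    if re.search(r"[A-ZÆØÅ]{2,}", sentence):
        return False
    # Reject any character that is neither a letter nor simple punctuation,
    # which drops sentences containing symbols such as "/".
    return all(char.isalpha() or char in ALLOWED_PUNCTUATION for char in sentence)

The final loop could then additionally test looks_clean(sentence) before printing a candidate.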
