Skip to content

Instantly share code, notes, and snippets.

@dkav6
Created October 25, 2021 23:39
Show Gist options
  • Save dkav6/ce0ebf9a7ff4736cb94a792054b16f0c to your computer and use it in GitHub Desktop.
Save dkav6/ce0ebf9a7ff4736cb94a792054b16f0c to your computer and use it in GitHub Desktop.
def get_sentences(url):
    """
    Return the unique, cleaned sentences found in the text of the page at ``url``.

    The page is loaded through ``Article`` (download + parse) and its text is
    split with ``tokenize.sent_tokenize``; both names must be provided by the
    surrounding module (presumably newspaper's ``Article`` and
    ``nltk.tokenize`` -- confirm against the file's imports).

    Each sentence has digits and every character that is neither a word
    character nor whitespace removed. Sentences of 20 characters or fewer
    after cleaning are dropped, and duplicates are collapsed, keeping the
    first occurrence so the result follows document order.

    Args:
        url: Address of the page to read.

    Returns:
        A list of cleaned sentence strings; an empty list when the article
        could not be loaded.
    """
    import logging

    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        # Best effort: a page that cannot be fetched or parsed yields no
        # sentences, but the failure is logged instead of silently ignored.
        logging.getLogger(__name__).warning(
            "Could not load article from %s", url, exc_info=True
        )
        return []

    # Strip digits and punctuation in a single pass over each sentence.
    unwanted = re.compile(r'\d|[^\w\s]')
    stripped = (
        unwanted.sub('', sentence)
        for sentence in tokenize.sent_tokenize(article.text)
    )

    # Very short fragments (headings, captions, leftovers) are discarded.
    long_enough = [sentence for sentence in stripped if len(sentence) > 20]

    # dict keys keep insertion order, so this removes duplicates
    # deterministically while preserving the order of first appearance.
    return list(dict.fromkeys(long_enough))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment