Created
October 25, 2021 23:39
-
-
Save dkav6/ce0ebf9a7ff4736cb94a792054b16f0c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_sentences(url):
    """
    Takes a url and outputs a list of sentences from the text of the website.

    The page is fetched and parsed through ``Article`` and its text is split
    with ``tokenize.sent_tokenize``. Each sentence then has its digits
    removed, followed by every character that is neither a word character
    nor whitespace. Sentences that are 20 characters or shorter after
    cleaning are dropped, and duplicates are removed while keeping the
    order in which sentences first appear.

    NOTE(review): ``Article`` and ``tokenize`` are presumably
    ``newspaper.Article`` and ``nltk.tokenize`` -- confirm against the
    file's imports.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of unique cleaned sentences. The list is empty when the page
        could not be downloaded or parsed, or when no sentence survives the
        length filter.
    """
    import logging

    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        # Best-effort scraping: a page that cannot be fetched or parsed
        # yields no sentences instead of crashing the caller, but the
        # failure is logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            "Could not download or parse %s", url, exc_info=True
        )
        return []

    cleaned_sentences = []
    for sentence in tokenize.sent_tokenize(article.text):
        sentence = re.sub(r'\d+', '', sentence)
        sentence = re.sub(r'[^\w\s]', '', sentence)
        # The length check runs on the cleaned text, so fragments that were
        # mostly digits or punctuation are filtered out here.
        if len(sentence) > 20:
            cleaned_sentences.append(sentence)

    # dict.fromkeys de-duplicates while preserving first-seen order, which
    # keeps the result deterministic (a set would scramble the order).
    return list(dict.fromkeys(cleaned_sentences))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment