This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
nlp = spacy.load('en_core_web_md') | |
def annotate_text(doc): | |
ls = [] | |
for ent in doc.ents: | |
entry = dict() | |
entry['text'] = ent.text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def filter_tweets(selected): | |
''' | |
Filter out any tweet that ends with three dots (indicating it is linking to an external source and thus lacking in information) | |
And also filter out any tweet that is longer than 200 characters. | |
''' | |
filtered = [] | |
url_pattern = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" | |
for text in selected.text: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the title of every post yielded by subreddit.hot(limit=5), followed by
# an empty line after each title. `subreddit` is defined outside this fragment.
# NOTE(review): the scraped source lost its indentation; the bare print() is
# assumed to belong inside the loop (blank line between titles) -- confirm.
for post in subreddit.hot(limit=5):
    print(post.title)
    print()
Newer Older