This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the title of each of the five posts returned by subreddit.hot().
# NOTE(review): the original indentation was lost in extraction; the bare
# print() is assumed to sit inside the loop (one blank line per title) — confirm.
for post in subreddit.hot(limit=5):
    print(post.title)
    print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def filter_tweets(selected): | |
''' | |
Filter out any tweet that ends with three dots (indicating it is linking to an external source and thus lacking in information) | |
And also filter out any tweet that is longer than 200 characters. | |
''' | |
filtered = [] | |
url_pattern = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" | |
for text in selected.text: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
nlp = spacy.load('en_core_web_md') | |
def annotate_text(doc): | |
ls = [] | |
for ent in doc.ents: | |
entry = dict() | |
entry['text'] = ent.text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from google.colab import drive | |
drive.mount('/content/drive') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def import_documents_set_iob(train_file_path): | |
with open(train_file_path, encoding="utf8") as f: | |
tokens_in_file = f.readlines() | |
# construct list of list train set format | |
new_train_set = [] | |
for index_token,token in enumerate(tokens_in_file): | |
# detect new document | |
is_new_document = False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Utility functions to extract features
def word2features(sent, i): | |
word = sent[i][0] | |
#postag = sent[i][1] | |
features = { | |
'bias': 1.0, | |
'word.lower()': word.lower(), | |
'word[-3:]': word[-3:], | |
'word[-2:]': word[-2:], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Configure the CRF sequence tagger and train it on the prepared features.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',              # gradient descent using the L-BFGS method
    c1=0.1,                         # L1 regularization coefficient
    c2=0.1,                         # L2 regularization coefficient
    max_iterations=100,             # cap on optimizer iterations
    all_possible_transitions=True,  # also generate transitions unseen in training
)
# X_train / y_train are built elsewhere; presumably one feature-dict list and
# one label list per sentence — verify against the feature-extraction cell.
crf.fit(X_train, y_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evaluation of the trained model.
# Score every learned label except 'O'. Filtering (instead of list.remove)
# avoids a ValueError when the training data never produced an 'O' label.
# NOTE(review): 'O' is presumably the "outside any entity" tag — confirm.
labels = [label for label in crf.classes_ if label != 'O']
print("trained labels :", labels)

# Predict on the held-out set and report the weighted F1 over those labels.
# zero_division=1 is the documented numeric form of the original `True`:
# a label with no predicted/true samples scores 1 instead of warning.
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                            average='weighted', labels=labels,
                            zero_division=1))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert raw sentences into lists of (token, '') tuples.
def sents2tuples(sents):
    """Tokenize each sentence and pair every token with an empty string.

    Args:
        sents: iterable of raw sentence strings.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(token, '')`` tuples in the order produced by ``word_tokenize``.
        An empty input yields an empty list.
    """
    return [
        [(token, '') for token in word_tokenize(sent)]
        for sent in sents
    ]
# With sents2tuples, preprocessing will work just fine with new text
def preprocess( texts): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the `ner` pipeline (defined elsewhere) over two sample sentences and
# print the label and text of every entity it reports.
samples = ["Facebook has a price target of $ 20 for this quarter",
           "$ AAPL is gaining a new momentum"]
for doc in ner.pipe(samples):
    for ent in doc.ents:
        print(ent.label_, ent.text)
    # NOTE(review): the original indentation was lost in extraction; this blank
    # line is assumed to separate documents (outer loop) — confirm.
    print()
OlderNewer