@khaledadrani
khaledadrani / praw_hot_posts.py
Last active December 13, 2021 09:15
starter code for praw python
for post in subreddit.hot(limit=5):
    print(post.title)
    print()
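The snippet assumes an authenticated subreddit handle already exists. A minimal setup sketch, assuming placeholder credentials and an arbitrary subreddit name (none of these values appear in the gist):

import praw

# Placeholder credentials: register an app at https://www.reddit.com/prefs/apps
reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",
                     client_secret="YOUR_CLIENT_SECRET",
                     user_agent="my-gist-demo/0.1")
subreddit = reddit.subreddit("python")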
@khaledadrani
khaledadrani / filter_tweets_quality.py
Last active December 21, 2021 17:52
Filter_bad_tweets
import re

def filter_tweets(selected):
    '''
    Filter out any tweet that ends with three dots (indicating it links to an
    external source and is therefore lacking in information), and also filter
    out any tweet that is longer than 200 characters.
    '''
    filtered = []
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    for text in selected.text:
        # Preview truncated here; the loop body below is an assumed completion
        # that applies the two rules stated in the docstring.
        text = re.sub(url_pattern, "", text).strip()
        if not text.endswith("...") and len(text) <= 200:
            filtered.append(text)
    return filtered
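A usage sketch, assuming selected is a pandas DataFrame with a text column (the .text attribute access suggests this, but the object is not shown in the gist):

import pandas as pd

selected = pd.DataFrame({"text": [
    "Markets rallied today as tech stocks rebounded",
    "Breaking: full story here... https://example.com/article",
]})
print(filter_tweets(selected))  # keeps only the first tweet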
import spacy

nlp = spacy.load('en_core_web_md')

def annotate_text(doc):
    # Build one dict per named entity found in the doc
    ls = []
    for ent in doc.ents:
        entry = dict()
        entry['text'] = ent.text
        # Preview truncated here; the label field and return are an assumed completion
        entry['label'] = ent.label_
        ls.append(entry)
    return ls
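A usage sketch for the annotator, on an arbitrary English sentence:

doc = nlp("Apple is looking at buying a U.K. startup for $1 billion")
print(annotate_text(doc))
# e.g. [{'text': 'Apple', 'label': 'ORG'}, ...]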
@khaledadrani
khaledadrani / mount_drive_colab.py
Created January 17, 2022 13:30
mount google drive to google colab
from google.colab import drive
drive.mount('/content/drive')
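Once mounted, the drive behaves like an ordinary directory under /content/drive. A sketch, assuming a hypothetical file placed in My Drive:

# 'data.txt' is a hypothetical file used only for illustration
with open('/content/drive/My Drive/data.txt', encoding='utf8') as f:
    print(f.read())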
def import_documents_set_iob(train_file_path):
    with open(train_file_path, encoding="utf8") as f:
        tokens_in_file = f.readlines()
    # construct list of list train set format
    new_train_set = []
    document = []
    for index_token, token in enumerate(tokens_in_file):
        # detect new document (assumed completion: a blank line separates documents)
        is_new_document = token.strip() == ""
        if is_new_document and document:
            new_train_set.append(document)
            document = []
        elif not is_new_document:
            document.append(tuple(token.split()[:2]))  # assumed "token tag" lines
    if document:
        new_train_set.append(document)
    return new_train_set
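A usage sketch, assuming a hypothetical train.txt in the assumed blank-line-separated "token tag" layout:

# train.txt (hypothetical contents):
#   Apple B-ORG
#   rises O
#   <blank line>
train_sents = import_documents_set_iob("train.txt")
print(train_sents[0])  # [('Apple', 'B-ORG'), ('rises', 'O')]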
# Utils functions to extract features
def word2features(sent, i):
    word = sent[i][0]
    # postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        # preview truncated here; the remaining features are assumed from the
        # standard sklearn-crfsuite tutorial recipe
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    return features
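crf.fit below expects per-sentence feature and label sequences. A sketch of the usual companion helpers, assuming the standard sklearn-crfsuite tutorial shape (they are not shown in the gist):

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

# e.g. X_train = [sent2features(s) for s in train_sents]
#      y_train = [sent2labels(s) for s in train_sents]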
import sklearn_crfsuite
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# Evaluation of the trained model
# Remove the 'O' label so the score reflects entity classes only
labels = list(crf.classes_)
labels.remove('O')
print("trained labels:", labels)

# Predict and compute the weighted F1-score
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                            average='weighted', labels=labels, zero_division=True))
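A single weighted F1 can hide weak classes. A follow-up sketch using flat_classification_report from the same metrics module for a per-label breakdown:

# Group B-/I- tags of the same entity type next to each other
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3))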
# convert raw sentences into a list of (token, empty-label) tuples
from nltk.tokenize import word_tokenize

def sents2tuples(sents):
    res = []
    for sent in sents:
        tokens = word_tokenize(sent)
        res.append([(token, '') for token in tokens])
    return res

# with sents2tuples, preprocessing will work just fine with new text
def preprocess(texts):
    # preview truncated here; body assumed: run the same feature
    # extraction used for training over the tokenized sentences
    return [sent2features(s) for s in sents2tuples(texts)]
# 'ner' below is assumed to be a loaded spaCy pipeline; it is not defined in the preview
samples = ["Facebook has a price target of $ 20 for this quarter",
           "$ AAPL is gaining a new momentum"]
for doc in ner.pipe(samples):
    for ent in doc.ents:
        print(ent.label_, ent.text)
    print()