Khaled Adrani khaledadrani

## praw_hot_posts.py
# Print the title of every post yielded by subreddit.hot(limit=5),
# leaving one blank line after each title.
for post in subreddit.hot(limit=5):
    print(post.title, end="\n\n")

## filter_tweets_quality.py
import re

def filter_tweets(selected):
  '''
  Filter out any tweet that ends with three dots (indicating it is linking to an external source and thus lacking in information)
  And also filter out any tweet that is longer than 200 characters.
  '''
  filtered = []
  url_pattern = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
  for text in selected.text:

## pre-annotation-spacy.py
import spacy

nlp = spacy.load('en_core_web_md')

def annotate_text(doc):
  ls = []
  for ent in doc.ents:

    entry = dict()
    entry['text'] = ent.text

## mount_drive_colab.py
# Mount Google Drive at /content/drive so later cells can read and write
# files there.
from google.colab import drive
drive.mount('/content/drive')

## import_document_set_iob.py
def import_documents_set_iob(train_file_path):
    """Read a token file and start building a list-of-lists training set.

    Args:
        train_file_path: Path of the file to read; it is opened as UTF-8 text.

    NOTE(review): only the beginning of this function is visible here. The
    loop body stops right after resetting ``is_new_document`` and nothing is
    returned. Presumably the remainder groups the tokens into documents;
    confirm against the full source.
    """
    with open(train_file_path,  encoding="utf8") as f:
        # One entry per line of the file (line endings are kept by readlines).
        tokens_in_file = f.readlines()

    # construct list of list train set format
    new_train_set = []

    for index_token,token in enumerate(tokens_in_file):
        # detect new document
        is_new_document = False

## crf_features.py
# Utils functions to extract features
def word2features(sent, i):
    word = sent[i][0]
    #postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],

## train_crf.py
# Configure the CRF tagger and fit it on the training features and labels.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',              # L-BFGS training algorithm
    c1=0.1,                         # L1 regularisation coefficient
    c2=0.1,                         # L2 regularisation coefficient
    max_iterations=100,             # cap on optimiser iterations
    all_possible_transitions=True,  # also generate transitions unseen in training
)
crf.fit(X_train, y_train)

## evaluate_crf.py
# Evaluation of the trained model.
# Score only the entity labels: leave out the 'O' label so it does not take
# part in the average. Filtering (instead of list.remove) keeps this working
# when 'O' is not among the classes the model was trained on.
labels = [label for label in crf.classes_ if label != 'O']
print("trained labels :", labels)

# Predict on the test set and print the weighted F1 score over those labels.
# zero_division=1 is the documented spelling of the value previously passed
# as True; it is the score returned when a zero division occurs.
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                            average='weighted', labels=labels,
                            zero_division=1))

## crf_model_usage.py
#convert raw sentences into list of tuples (token and empty)
def sents2tuples(sents):
    """Tokenize each sentence and pair every token with an empty string.

    Args:
        sents: Iterable of raw sentence strings.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(token, '')`` tuples in token order.
    """
    return [
        [(tok, '') for tok in word_tokenize(sentence)]
        for sentence in sents
    ]

#with sents2tuples, preprocessing will work just fine with new text
def preprocess( texts):

## trf_usage.py
# Sample sentences to run through the `ner` pipeline.
samples = [
    "Facebook has a price target of $ 20 for this quarter",
    "$ AAPL is gaining a new momentum",
]

# For every processed sample, print "<label> <text>" for each entity found,
# then a blank line to separate the samples.
for doc in ner.pipe(samples):
    for ent in doc.ents:
        print(ent.label_, ent.text)
    print()
	import re

	def filter_tweets(selected):
	'''
	Filter out any tweet that ends with three dots (indicating it is linking to an external source and thus lacking in information)
	And also filter out any tweet that is longer than 200 characters.
	'''
	filtered = []
	url_pattern = "http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\(\),]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
	for text in selected.text:
	import spacy

	nlp = spacy.load('en_core_web_md')

	def annotate_text(doc):
	ls = []
	for ent in doc.ents:

	entry = dict()
	entry['text'] = ent.text
	def import_documents_set_iob(train_file_path):
	with open(train_file_path, encoding="utf8") as f:
	tokens_in_file = f.readlines()

	# construct list of list train set format
	new_train_set = []

	for index_token,token in enumerate(tokens_in_file):
	# detect new document
	is_new_document = False
	# Utils functions to extract features
	def word2features(sent, i):
	word = sent[i][0]
	#postag = sent[i][1]

	features = {
	'bias': 1.0,
	'word.lower()': word.lower(),
	'word[-3:]': word[-3:],
	'word[-2:]': word[-2:],
	# Configure the CRF tagger and fit it on the training features and labels.
	crf = sklearn_crfsuite.CRF(
	    algorithm='lbfgs',              # L-BFGS training algorithm
	    c1=0.1,                         # L1 regularisation coefficient
	    c2=0.1,                         # L2 regularisation coefficient
	    max_iterations=100,             # cap on optimiser iterations
	    all_possible_transitions=True,  # also generate transitions unseen in training
	)
	crf.fit(X_train, y_train)
	# Evaluation of the trained model.
	# Score only the entity labels: leave out the 'O' label so it does not take
	# part in the average. Filtering (instead of list.remove) keeps this working
	# when 'O' is not among the classes the model was trained on.
	labels = [label for label in crf.classes_ if label != 'O']
	print("trained labels :", labels)

	# Predict on the test set and print the weighted F1 score over those labels.
	# zero_division=1 is the documented spelling of the value previously passed
	# as True; it is the score returned when a zero division occurs.
	y_pred = crf.predict(X_test)
	print(metrics.flat_f1_score(y_test, y_pred,
	                            average='weighted', labels=labels,
	                            zero_division=1))
	#convert raw sentences into list of tuples (token and empty)
	def sents2tuples(sents):
	    """Tokenize each sentence and pair every token with an empty string.

	    The body's indentation had been flattened, so the function could not
	    compile; the nesting is restored here.

	    Args:
	        sents: Iterable of raw sentence strings.

	    Returns:
	        A list with one entry per sentence; each entry is a list of
	        ``(token, '')`` tuples in token order.
	    """
	    return [
	        [(tok, '') for tok in word_tokenize(sentence)]
	        for sentence in sents
	    ]

	#with sents2tuples, preprocessing will work just fine with new text
	def preprocess( texts):
	# Sample sentences to run through the `ner` pipeline.
	samples = [
	    "Facebook has a price target of $ 20 for this quarter",
	    "$ AAPL is gaining a new momentum",
	]

	# The loop nesting had been flattened; it is restored here. For every
	# processed sample, print "<label> <text>" for each entity found, then a
	# blank line to separate the samples.
	for doc in ner.pipe(samples):
	    for ent in doc.ents:
	        print(ent.label_, ent.text)
	    print()