Created
January 17, 2022 13:40
-
-
Save khaledadrani/10306a4f909ec5d045688111ebd38f50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Utility functions for extracting per-token features
def _neighbor_features(word, prefix):
    """Return the lowercase/casing features of a neighbouring word.

    Every key is ``prefix`` followed by ``':word.<check>()'``, e.g.
    ``'-1:word.lower()'`` for the previous word.
    """
    return {
        prefix + ':word.lower()': word.lower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
    }


def word2features(sent, i: int) -> dict:
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Indexable sentence whose items hold the token string at
            index 0 (the sibling helpers treat items as
            ``(token, label)`` pairs).
        i: Position of the token to describe.

    Returns:
        A dict with a constant ``'bias'``, the token's lowercase form, its
        last three and last two characters, and its upper/title/digit
        flags. It also holds the same lowercase/title/upper features for
        the previous word (``'-1:'`` keys) and the next word (``'+1:'``
        keys). ``'BOS'`` is set to True instead when ``i`` is not greater
        than 0, and ``'EOS'`` when ``i`` is not below ``len(sent) - 1``.
    """
    word = sent[i][0]

    # NOTE: part-of-speech features ('postag', 'postag[:2]' and their
    # -1:/+1: variants) are intentionally not emitted; only index 0 of each
    # sentence item (the token) is read here.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        features.update(_neighbor_features(sent[i - 1][0], '-1'))
    else:
        # No previous word: mark beginning of sentence.
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1][0], '+1'))
    else:
        # No next word: mark end of sentence.
        features['EOS'] = True

    return features
def sent2features(sent):
    """Return one feature dict per position of ``sent``.

    Calls ``word2features(sent, i)`` for every index ``i`` and collects the
    results in order, so the output has the same length as ``sent``.
    """
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent) -> list:
    """Return the label of every item in ``sent``, in order.

    Each item must unpack into exactly two values, ``(token, label)``.
    For ``(token, postag, label)`` triples the unpacking would have to
    become ``for _, _, label in sent``.
    """
    return [label for _, label in sent]
def sent2tokens(sent) -> list:
    """Return the token of every item in ``sent``, in order.

    Each item must unpack into exactly two values, ``(token, label)``.
    For ``(token, postag, label)`` triples the unpacking would have to
    become ``for token, _, _ in sent``.
    """
    return [token for token, _ in sent]
# Show the feature dict extracted for the first token of the first sentence
# in train_sents (defined elsewhere in the file).
print("example extracted features from single word :", sent2features(train_sents[0])[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment