def get_word_features(word, word_id):
    """Build the list of string features describing a single token.

    Every feature is rendered as ``"<word_id><suffix>=<value>"`` where the
    value is either the lower-cased token itself or ``True``/``False``.

    Args:
        word: The token text. May be empty; all boolean features are then
            ``False``.
        word_id: Prefix prepended to every feature name (the callers in this
            file pass 'curr_word', 'prev_word' and 'next_word').

    Returns:
        A list of nine feature strings, always in the same order.
    """
    word_lower = word.lower()
    age_groups = ('kid', 'child', 'newborn', 'adult', 'teen', 'toddler',
                  'tween', 'baby', 'infant', 'senior')
    number_words = ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                    'eight', 'nine', 'ten')
    return [
        word_id + '=' + word_lower,
        word_id + '.is_digit=%s' % word_lower.isdigit(),
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        word_id + '.has_digit=%s' % bool(re.search(r'\d+', word_lower)),
        # re.match only anchors at the start, so this is True whenever the
        # token *begins* with a standalone run of digits (e.g. '3-5', '18+').
        word_id + '.is_number=%s' % bool(re.match(r'\b\d+\b', word_lower)),
        # endswith() instead of [-1] so an empty token does not raise.
        word_id + '.has_plus=%s' % word_lower.endswith('+'),
        word_id + '.is_range=%s' % bool(
            re.match(r'\b[0-9]+-[0-9]+\b', word_lower)),
        word_id + '.is_age=%s' % (word_lower[:3] == 'age'),
        word_id + '.is_age_group=%s' % (word_lower in age_groups),
        word_id + '.is_numeric=%s' % (word_lower in number_words),
    ]
def word2features(sentence, pos):
    """Collect features for the token at ``pos`` plus its direct neighbours.

    The result starts with the 'curr_word' features of ``sentence[pos]``,
    followed by either the 'prev_word' features of the preceding token or the
    marker 'BOS' when there is none, followed by either the 'next_word'
    features of the following token or the marker 'EOS' when there is none.

    Args:
        sentence: Indexable sequence of token strings.
        pos: Index of the token to describe.

    Returns:
        A flat list of feature strings.
    """
    collected = get_word_features(sentence[pos], 'curr_word')

    # Left context: a real neighbour, or the beginning-of-sentence marker.
    if pos <= 0:
        collected.append('BOS')
    else:
        collected.extend(get_word_features(sentence[pos - 1], 'prev_word'))

    # Right context: a real neighbour, or the end-of-sentence marker.
    last_index = len(sentence) - 1
    if pos >= last_index:
        collected.append('EOS')
    else:
        collected.extend(get_word_features(sentence[pos + 1], 'next_word'))

    return collected
def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order.

    Args:
        sent: Sequence of token strings.

    Returns:
        A list whose i-th element is ``word2features(sent, i)``.
    """
    per_token = []
    for index in range(len(sent)):
        per_token.append(word2features(sent, index))
    return per_token
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment