Skip to content

Instantly share code, notes, and snippets.

@Ab1992ao
Created May 17, 2021 09:07
Show Gist options
  • Save Ab1992ao/49f93e75fb2f1a0db7b4e17b9ad52ba7 to your computer and use it in GitHub Desktop.
Save Ab1992ao/49f93e75fb2f1a0db7b4e17b9ad52ba7 to your computer and use it in GitHub Desktop.
Prepare NER data for a multitask learning pipeline
def load_ner_data(ner_path, seq_len=24):
    """Load a token-per-row NER CSV and build padded one-hot tag targets.

    The CSV is read with columns 'Sentence #', 'Word' and 'Tag'. Rows are
    grouped by 'Sentence #'; the words of each group are joined with single
    spaces into one sentence string, and the tags are collected into a list.
    Sentences whose whitespace-separated word count or tag count exceeds
    ``seq_len`` are dropped.

    Args:
        ner_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        seq_len: Maximum number of tokens per sentence; also the length every
            tag sequence is padded to.

    Returns:
        A tuple ``(ptexts, ptargs, num_tags, tag2idx, ner_tr)``:

        * ``ptexts`` -- NumPy array of the kept sentence strings.
        * ``ptargs`` -- list with one ``to_categorical`` (one-hot) encoding
          per kept sentence, built from its padded tag-index sequence.
        * ``num_tags`` -- number of distinct tags in the data.
        * ``tag2idx`` -- dict mapping each tag to its integer index, in
          order of first appearance.
        * ``ner_tr`` -- DataFrame with columns 'sentence' (str) and 'tag'
          (list of tag strings) for the kept sentences.

    Raises:
        KeyError: If the data contains no "O" tag, which is used as the
            padding value.
    """
    data = pd.read_csv(ner_path, encoding='unicode_escape', sep=',')
    # Forward-fill empty cells. `fillna(method='ffill')` is deprecated in
    # recent pandas; `ffill()` is the equivalent supported spelling.
    # NOTE(review): presumably 'Sentence #' is only populated on the first
    # token row of each sentence -- confirm against the dataset.
    data = data.ffill()

    by_sentence = data.groupby('Sentence #', as_index=True)
    joined_words = by_sentence['Word'].apply(' '.join)
    joined_tags = by_sentence['Tag'].apply(' '.join)

    # Both series come from the same groupby, so they are index-aligned.
    # Decide keep/drop once per sentence so that sentences and tag lists can
    # never drift apart (filtering them independently misaligns the two
    # columns whenever a word or tag contains whitespace).
    kept_sentences = []
    kept_tags = []
    for sentence, tag_string in zip(joined_words.values, joined_tags.values):
        tags = tag_string.split()
        if len(sentence.split()) <= seq_len and len(tags) <= seq_len:
            kept_sentences.append(sentence)
            kept_tags.append(tags)

    ner_tr = pd.DataFrame(
        {'sentence': kept_sentences, 'tag': kept_tags},
        columns=['sentence', 'tag'],
    )

    # Index tags over the whole file (not just kept sentences) so the mapping
    # does not depend on seq_len.
    tag2idx = {tag: idx for idx, tag in enumerate(data['Tag'].unique())}
    num_tags = len(tag2idx)

    y = [[tag2idx[tag] for tag in tags] for tags in kept_tags]
    # Pad at the end of each sequence with the index of the "O" tag.
    y = pad_sequences(
        maxlen=seq_len, sequences=y, padding='post', value=tag2idx["O"]
    )
    ptargs = [to_categorical(row, num_classes=num_tags) for row in y]
    ptexts = np.array(ner_tr['sentence'])
    return ptexts, ptargs, num_tags, tag2idx, ner_tr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment