Skip to content

Instantly share code, notes, and snippets.

@rloredo
Last active December 9, 2021 13:11
Show Gist options
  • Save rloredo/e8fd44ef5f80c98820eb3b2990e7097e to your computer and use it in GitHub Desktop.
Save rloredo/e8fd44ef5f80c98820eb3b2990e7097e to your computer and use it in GitHub Desktop.
Use Stanza library to tokenize and lemmatize texts
import pandas as pd
import stanza
from tqdm import tqdm
#Load a dataframe with text in one column
df = pd.DataFrame({'label':[1], 'text' : ['Hi Juan Carlos'] })
#Initialize the engine. In this case in Portuguese
nlp_pt = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
#Tokenize, lemmatize and POS
doc_list = []        # (pipeline output, position) for every text that was processed
error_elements = []  # (raw text, position) for every text that raised
print('Processing')
texts = df.text.values
# enumerate() has no length, so pass total explicitly to get a real progress bar.
# NOTE: i is the position within df.text.values, not the DataFrame index label.
for i, e in tqdm(enumerate(texts), total=len(texts)):
    try:
        doc = nlp_pt(e)
    except Exception as err:
        # Catch Exception rather than using a bare except, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        print('Error in element {}: {!r}'.format(i, err))
        error_elements.append((e, i))
    else:
        doc_list.append((doc, i))
#Extract all lemmas
def get_lemma(tuple_list):
    """
    Collect the [upos, lemma] pair of every word in every document.

    tuple_list is an iterable of (doc, i) tuples, where doc exposes
    ``sentences`` and each sentence exposes ``words`` carrying ``upos`` and
    ``lemma`` attributes, and i is an identifier passed through untouched.

    Returns a list with one tuple per input element:
    ([[upos, lemma], ...], i), with words in sentence order.
    Use extract_pos on the first element to keep only some parts of speech.
    """
    return [
        (
            [[word.upos, word.lemma] for sent in doc.sentences for word in sent.words],
            i,
        )
        for doc, i in tqdm(tuple_list)
    ]
# One row per processed text: its [upos, lemma] pairs and its original position
pos = get_lemma(doc_list)
df_temp = pd.DataFrame(
    data=pos,
    columns=['user_pos', 'ind_original'],
)
#Extract a specific part of speech
def extract_pos(list_of_pos, list_pos_names, to_set=False):
    """
    Extract the lemmas whose part of speech is in list_pos_names.

    list_of_pos is an iterable of [pos, lemma] pairs (element 0 is the tag,
    element 1 the lemma). list_pos_names is the collection of tags to keep.

    Returns a list of lemmas in input order, duplicates included. When
    to_set is truthy, returns a set of the unique lemmas instead.
    """
    # Index rather than unpack so entries longer than two items still work.
    lemmas = [entry[1] for entry in list_of_pos if entry[0] in list_pos_names]
    return set(lemmas) if to_set else lemmas
# Derived columns: each one keeps only the lemmas of the listed parts of speech
pos_groups = {
    'user_noun_verbs': ['NOUN', 'VERB', 'PROPN'],
    'user_adjs': ['ADJ'],
    'user_advs': ['ADV'],
}
for column, tags in pos_groups.items():
    df_temp[column] = [extract_pos(pairs, tags) for pairs in df_temp['user_pos']]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment