Skip to content

Instantly share code, notes, and snippets.

@avriiil
Created May 5, 2021 10:30
Show Gist options
  • Save avriiil/34e3c27243ecd533dab5985ec20f8b6a to your computer and use it in GitHub Desktop.
Save avriiil/34e3c27243ecd533dab5985ec20f8b6a to your computer and use it in GitHub Desktop.
# define function to map across dask partitions
def map_lemmas(df, mle):
    """Replace each row of ``df.tweet_text`` with the lemmas of its tokens.

    Written to be mapped across dask partitions, so ``df`` is expected to
    be a single pandas partition.

    Args:
        df: DataFrame with a ``tweet_text`` column. Each cell is passed
            unchanged to ``mle.disambiguate`` (presumably a list of
            tokens -- confirm against the caller).
        mle: Object exposing ``disambiguate(tokens)``; each returned item
            must provide ``.analyses[0].analysis['lex']``.

    Returns:
        A new DataFrame equal to ``df`` except that ``tweet_text`` holds,
        per row, the list of lemmas, or ``np.nan`` when a lemma could not
        be extracted for that row. The input frame is left unmodified.
    """
    def get_lemmas_nested(tokenized_text):
        disambig = mle.disambiguate(tokenized_text)
        try:
            # Take the 'lex' entry of the first analysis of every token.
            return [d.analyses[0].analysis['lex'] for d in disambig]
        except Exception:
            # Best effort: a token without analyses (IndexError) or without
            # a 'lex' entry (KeyError) marks the whole row as missing.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the task can still be interrupted.
            return np.nan

    # assign() builds a new frame rather than writing into the partition
    # that was handed in, which may still be referenced elsewhere.
    return df.assign(tweet_text=df.tweet_text.apply(get_lemmas_nested))
# Build the lazy, lemmatized dataframe. The pretrained disambiguator is
# wrapped in dask.delayed and handed to every partition call as ``mle``;
# the existing ddf is passed as ``meta`` to describe the output.
lazy_mle = dask.delayed(MLEDisambiguator.pretrained)()
mapped = ddf.map_partitions(map_lemmas, mle=lazy_mle, meta=ddf)

# Persist the result inside a performance report and block until the
# persisted collection has finished.
with distributed.performance_report():
    ddf = mapped.persist()
    distributed.wait(ddf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment