Skip to content

Instantly share code, notes, and snippets.

@avriiil
Created May 5, 2021 10:35
Show Gist options
  • Save avriiil/58f4f464980e6fea7c7eace33a296d84 to your computer and use it in GitHub Desktop.
Save avriiil/58f4f464980e6fea7c7eace33a296d84 to your computer and use it in GitHub Desktop.
import numpy as np
from camel_tools.disambig.mle import MLEDisambiguator
from once_per_worker import once_per_worker
# Wrap the model loader in once_per_worker instead of loading the model here:
# the zero-argument lambda defers MLEDisambiguator.pretrained() until the
# wrapper decides to call it. Per the once_per_worker docs the result is a
# dask.delayed-style object whose loader presumably runs at most once on each
# worker -- confirm against the library.
loaded_disambiguator = once_per_worker(lambda: MLEDisambiguator.pretrained())
# define mapping function with disambiguator as second argument
def map_lemmas(df, disambiguator):
    """Replace every value of ``df.tweet_text`` with the list of its lemmas.

    Args:
        df: DataFrame with a ``tweet_text`` column. Each value is handed
            unchanged to ``disambiguator.disambiguate`` (presumably a list of
            tokens -- confirm against the caller).
        disambiguator: Object exposing ``disambiguate(tokens)``; every item it
            returns must provide ``.analyses[0].analysis['lex']``.

    Returns:
        A new DataFrame equal to ``df`` except that ``tweet_text`` holds, per
        row, the list of ``'lex'`` values of the first analysis of each item,
        or NaN when that lemma cannot be extracted for some item in the row.
        The input frame is left untouched so the function is safe to use with
        ``map_partitions``, which expects functions not to mutate their input.

    Raises:
        Whatever ``disambiguator.disambiguate`` raises; only failures while
        reading the analyses are converted to NaN.
    """

    def get_lemmas_nested(tokenized_text):
        disambig = disambiguator.disambiguate(tokenized_text)
        try:
            return [d.analyses[0].analysis['lex'] for d in disambig]
        except (IndexError, KeyError, TypeError, AttributeError):
            # No analysis for a token, no 'lex' entry, or an unexpectedly
            # shaped result: keep the best-effort behaviour and mark the whole
            # row as missing, without swallowing KeyboardInterrupt/SystemExit
            # the way a bare ``except:`` would.
            return np.nan

    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(tweet_text=df.tweet_text.apply(get_lemmas_nested))
# Build the lazy result: map_lemmas is applied to each partition of ddf and
# receives loaded_disambiguator as its second positional argument; meta=ddf
# declares the output schema to be the same as the input's.
mapped = ddf.map_partitions(map_lemmas, loaded_disambiguator, meta=ddf)

# Persist the mapped collection and wait for it inside the performance-report
# context so the work happens while the report is active.
# NOTE(review): `ddf` and `distributed` are not defined in the visible snippet;
# they are assumed to come from earlier code -- confirm.
with distributed.performance_report():
    ddf = mapped.persist()
    distributed.wait(ddf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment