Created
May 5, 2021 10:35
-
-
Save avriiil/58f4f464980e6fea7c7eace33a296d84 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from camel_tools.disambig.mle import MLEDisambiguator
from once_per_worker import once_per_worker
# Create a dask.delayed object around the Disambiguator. The pretrained model
# is loaded through a zero-argument callable handed to once_per_worker rather
# than being instantiated here, so nothing is loaded at definition time.
loaded_disambiguator = once_per_worker(lambda: MLEDisambiguator.pretrained())
# define mapping function with disambiguator as second argument | |
def map_lemmas(df, disambiguator):
    """Replace each row's tokenized ``tweet_text`` with its list of lemmas.

    Written to be applied to one partition at a time (it is passed to
    ``map_partitions`` further down in this script).

    Args:
        df: DataFrame with a ``tweet_text`` column. Each value is handed
            unchanged to ``disambiguator.disambiguate`` (presumably a list of
            tokens -- confirm against the upstream tokenization step).
        disambiguator: Object exposing ``disambiguate(tokens)``; every item
            of its result must provide ``analyses[0].analysis['lex']``.

    Returns:
        A new DataFrame whose ``tweet_text`` column holds, per row, the list
        of ``'lex'`` values, or ``np.nan`` when a lemma could not be read for
        some token of that row. The input frame is not modified.
    """

    def get_lemmas_nested(tokenized_text):
        # Kept outside the ``try`` on purpose: errors raised by the
        # disambiguator itself propagate, exactly as before.
        disambig = disambiguator.disambiguate(tokenized_text)
        try:
            # 'lex' entry of the first analysis of every token.
            return [d.analyses[0].analysis['lex'] for d in disambig]
        except (AttributeError, LookupError, TypeError):
            # E.g. a token without any analysis (IndexError) or an analysis
            # without a 'lex' key (KeyError): mark the whole row as missing.
            return np.nan

    # ``assign`` builds a new frame instead of writing into the partition
    # that was passed in.
    return df.assign(tweet_text=df.tweet_text.apply(get_lemmas_nested))
# Define the mapped ddf by passing loaded_disambiguator as the second
# argument of map_lemmas. meta=ddf declares the output to have the same
# schema as the input frame.
mapped = ddf.map_partitions(
    map_lemmas,
    loaded_disambiguator,
    meta=ddf,
)

# Run with a performance report.
# NOTE(review): the original indentation was lost; both statements are placed
# inside the ``with`` block so that the report spans the whole computation --
# confirm this matches the intended script.
with distributed.performance_report():
    ddf = mapped.persist()
    # Block until the persisted partitions are finished before leaving the
    # report context.
    distributed.wait(ddf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment