Skip to content

Instantly share code, notes, and snippets.

@avriiil
avriiil / reduce_ortho_ambiguity.py
Created April 5, 2021 19:19
Function to reduce orthographic ambiguity of Arabic text
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
def ortho_normalize(text):
    """Return ``text`` after running it through three Arabic normalizers.

    The normalizers are applied in a fixed order, each one receiving the
    output of the previous one: alef maksura, then alef, then teh marbuta.
    """
    normalizers = (
        normalize_alef_maksura_ar,
        normalize_alef_ar,
        normalize_teh_marbuta_ar,
    )
    for normalize in normalizers:
        text = normalize(text)
    return text
@avriiil
avriiil / dediacritization.py
Created April 5, 2021 19:21
Remove diacritics from Arabic text
# import the dediacritization tool
from camel_tools.utils.dediac import dediac_ar
# run the dediacritizer over every entry of the tweet_text column
# and store the result back in the same column
df["tweet_text"] = df["tweet_text"].apply(dediac_ar)
@avriiil
avriiil / simple-word-tokenizer.py
Created April 5, 2021 19:24
Perform simple word tokenization on Arabic text
from camel_tools.tokenizers.word import simple_word_tokenize
# replace each entry of the tweet_text column with the output of
# simple_word_tokenize for that entry
df["tweet_text"] = df["tweet_text"].apply(simple_word_tokenize)
@avriiil
avriiil / morphological-analysis.py
Last active April 5, 2021 19:40
Run a morphological analysis on a single Arabic word
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
# build an analyzer on top of the database returned by
# MorphologyDB.builtin_db()
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# analyze a single word and print every analysis returned for it,
# each followed by an empty line
analyses = analyzer.analyze('وبعقدنا')
for analysis in analyses:
    print(analysis, '\n')
@avriiil
avriiil / morphological-disambiguator-sentence.py
Last active April 5, 2021 20:33
Disambiguate a single Arabic sentence
from camel_tools.disambig.mle import MLEDisambiguator
# imported here so the snippet runs on its own; it was previously used
# below without being imported in this snippet
from camel_tools.tokenizers.word import simple_word_tokenize

# instantiate the Maximum Likelihood Disambiguator
mle = MLEDisambiguator.pretrained()

# The disambiguator expects pre-tokenized text, so the sentence is split
# into word tokens before it is passed to disambiguate()
sentence = simple_word_tokenize('نجح بايدن في الانتخابات')
disambig = mle.disambiguate(sentence)
@avriiil
avriiil / get-lemmas.py
Created April 5, 2021 19:47
Get lemmas of Arabic string
def get_lemmas(tokenized_text):
    """Return one lemma per token of ``tokenized_text``.

    The tokens are passed to the module-level disambiguator ``mle``; for
    each token the ``'lex'`` entry of its first analysis is used as the
    lemma.

    A token for which the disambiguator returns no analyses is kept in
    its surface form instead of raising ``IndexError``, so the result
    always has the same length and order as the input.

    Args:
        tokenized_text: sequence of word tokens (the disambiguator
            expects pre-tokenized text).

    Returns:
        list with one entry per input token.
    """
    disambig = mle.disambiguate(tokenized_text)
    return [
        # fall back to the token itself when there is nothing to pick from
        d.analyses[0].analysis['lex'] if d.analyses else d.word
        for d in disambig
    ]
@avriiil
avriiil / morphological-tokenization.py
Created April 5, 2021 19:50
Perform morphological tokenization on Arabic text
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
# both schemes below are run on the first row of the tweet_text column
first_tweet = df.tweet_text.iloc[0]

# atbseg scheme
tokenizer = MorphologicalTokenizer(mle, scheme='atbseg')
tokens = tokenizer.tokenize(first_tweet)
print(tokens)

# atbtok scheme
tokenizer = MorphologicalTokenizer(mle, scheme='atbtok')
tokens = tokenizer.tokenize(first_tweet)
# define function to map across dask partitions
def map_lemmas(df, mle):
def get_lemmas_nested(tokenized_text):
    """Return the lemmas of ``tokenized_text``, or NaN if they cannot be extracted.

    ``mle`` is the disambiguator passed to the enclosing ``map_lemmas``.
    For each disambiguated token the ``'lex'`` entry of its first analysis
    is taken as the lemma.

    Returns:
        list of lemmas, or ``np.nan`` when extracting the lemma of any
        token fails (for example a token with no analyses).
    """
    disambig = mle.disambiguate(tokenized_text)
    try:
        return [d.analyses[0].analysis['lex'] for d in disambig]
    except Exception:
        # Best-effort: one problematic text should not abort the whole
        # mapping. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return np.nan
from once_per_worker import once_per_worker
from camel_tools.disambig.mle import MLEDisambiguator
# create dask.delayed object around Disambiguator
# A zero-argument factory (the lambda) is handed to once_per_worker instead of
# an already-loaded model, so MLEDisambiguator.pretrained() is not called on
# this line itself.
# NOTE(review): presumably once_per_worker then runs the factory at most once
# on each dask worker -- confirm against the once_per_worker documentation.
loaded_disambiguator = once_per_worker(lambda: MLEDisambiguator.pretrained())
# define mapping function with disambiguator as second argument
def map_lemmas(df, disambiguator):
def get_lemmas_nested(tokenized_text):
disambig = disambiguator.disambiguate(tokenized_text)
# importing libraries
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models
# convert the tweet_text column into a numpy array of documents
docs = df["tweet_text"].to_numpy()
# create dictionary of all words in all documents