-
-
Save sixtyfive/08a9eaa45f59298b8d0eb0dc19c20fc7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ordered list of [search, replacement] pairs, each intended to be handed to
# String#gsub. Later rules rely on earlier ones having run (e.g. the
# article-splitting rules expect hamza-carrying alifs to be folded already).
#
# NOTE(review): apart from the two rules written with \u escapes, the Arabic
# literals below are kept exactly as found. Confirm that they use the same
# code points (base letters vs. presentation forms) as the corpus they are
# applied to, otherwise the patterns cannot match.
@regex_replacements = [
  # Latin punctuation, Arabic (Western) numerals and control characters -> space
  [/[[:punct:][:digit:][:cntrl:]]/, ' '],
  # remove "sic"
  [/sic/, ''],
  # Arabic and non-standard punctuation characters -> space
  [/[؟،ـ–…]/, ' '],
  # Indian (Eastern Arabic) numerals -> space
  [/[٠١٢٣٤٥٦٧٨٩]/, ' '],
  # remove sukūn, šadda, fatḥa, fatḥatān, ḍamma, ḍammatān, kasra, kasratān
  [/[\u0651\u0652\u064e\u064b\u064f\u064c\u0650\u064d]/, ''],
  # Fold alif carrying hamza/madda/waṣla into plain alif (wise?).
  # This must be a Regexp: a String pattern is matched literally by gsub, so
  # a bracketed character list inside quotes would never match anything.
  [/[ﺃﺇﺁٱ]/, 'ﺍ'],
  # Replace the lām-alif ligature with discrete lām and alif. Written with
  # explicit code points because both sides can look identical in an editor.
  ["\uFEFB", "\u0644\u0627"],
  # Isolated-form bāʾ becomes the regular bāʾ (explicit code points, same reason).
  ["\uFE8F", "\u0628"],
  # Take care of stuttering (not real words, we don't want to count them).
  # The trailing space is only looked at, not consumed, so that several
  # fragments in a row are all removed instead of every other one.
  [/ (ﺖﺣ|ﺎﻠﻋ|ﺦﻣ|ﻞﻟ|ﺥ|ﺱ|ﻉ|ﻑ|ﻡ|ﻭ|ﻱ)(?= )/, ''],
  # Split the definite article "al" from whatever follows it while making sure
  # not to rip apart the following (all given without hamza, which has already
  # been removed by the alif-folding rule above):
  # - aḷḷāh, ilah
  # - ala
  # - allī, allaḏī/alaḏī, allatī/alatī, allaḏīna, allatīna, allawātī
  # - alān
  # - all words ending in "al + tāʾ marbūṭa"
  [/(^| )(ﺎﻟ)(?!(ﻞﻫ|ﻩ|ﺓ|ﻯ|ﻞﻳ|ﻝﺬﻳ|ﺬﻳ|ﻞﺘﻳ|ﺖﻳ|ﻝﺬﻴﻧ|ﻞﺘﻴﻧ|ﻝﻭﺎﺘﻳ|ﺎﻧ))/, ' \2 '],
  # Pull apart "particle bi/li/ka/fa/wa/sa + definite article al + word".
  # Groups: 1 = leading boundary, 2 = particle, 3 = article, 4 = word,
  # 5 = trailing boundary. The replacement emits particle, article and word;
  # the boundaries are re-created by the literal spaces.
  [/(^| )(ﺏ|ﻝ|ﻙ|ﻑ|ﻭ|ﺱ)(ﺎﻟ)(.*?)( |$)/, ' \2 \3 \4 ']
]
# For debugging only.
# Prints, for every [search, replacement] pair, what that single rule does to
# the sample line. gsub returns a new string and the result is only printed,
# so each rule is tried against the same untouched sample rather than being
# chained onto the previous rule's output.
# Strip the leading '#' from the =begin / =end markers to switch this off.
#=begin
corpus = "266,2049,ٱﻞﻠﻫ,-,-"
@regex_replacements.each do |pattern, substitute|
  outcome = corpus.gsub(pattern, substitute)
  puts "with (#{corpus}), gsub search = (#{pattern}), replace = (#{substitute}) results in (#{outcome})"
end
#=end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment