Skip to content

Instantly share code, notes, and snippets.

@TRex22
Last active April 22, 2019 16:09
Show Gist options
  • Save TRex22/e6b3ca1021fe5274ea448fe619604b47 to your computer and use it in GitHub Desktop.
Save TRex22/e6b3ca1021fe5274ea448fe619604b47 to your computer and use it in GitHub Desktop.
Useful Unicode Regex (Ruby)
##
# Cleans up strings using one of several named strategies.
#
# Use cases:
# 1. Creating reports in different formats with varying support for unicode.
# 2. Data inputs: remove whitespace and strange characters which are not "language".
# 3. External services which may not be able to handle different character sets.
# 4. Comparisons: when comparing two strings which have whitespace and weird unicode.
#
# Every strategy first removes spaces that sit directly before a comma, then
# applies its own filter, and finally collapses runs of whitespace into a
# single space and trims both ends.
#
# Examples:
#
#   str = "bob, g, Germany 🇩🇪"
#   StringSanitizer.call(str, strategy: :letters_accents_and_numbers_only)
#   # => "bob g Germany"
#
#   str = "Добро пожаловать в джунгли. "
#   StringSanitizer.call(str, strategy: :letters_accents_and_numbers_only)
#   # => "Добро пожаловать в джунгли"
#
#   str = "ABCDEпожаловатьF"
#   StringSanitizer.call(str, strategy: :ascii_only)
#   # => "ABCDEF"
#
#   StringSanitizer.call("hello , world")
#   # => "hello, world"
#
# List of useful REGEX:
# https://www.regular-expressions.info/refunicode.html
# https://stackoverflow.com/questions/24672834/how-do-i-remove-emoji-from-string
module StringSanitizer
  extend self

  # Code point ranges removed by :strip_special_unicode_classes_only.
  PICTURE_REGEX = /[\u{1f300}-\u{1f5ff}]/
  ENCLOSED_CHAR_REGEX = /[\u{2500}-\u{2BEF}]/
  EMOTICONS_REGEX = /[\u{1f600}-\u{1f64f}]/
  DINGBATS_REGEX = /[\u{2702}-\u{27b0}]/
  TRANSPORT_AND_MAP_REGEX = /[\u{1f680}-\u{1f6ff}]/
  REGIONAL_INDICATOR_SYMBOL_REGEX = /[\u{1f1e6}-\u{1f1ff}]/ # flags

  # All of the ranges above combined, so they can be removed in a single pass.
  SPECIAL_UNICODE_CLASSES_REGEX = Regexp.union(
    PICTURE_REGEX,
    ENCLOSED_CHAR_REGEX,
    EMOTICONS_REGEX,
    DINGBATS_REGEX,
    TRANSPORT_AND_MAP_REGEX,
    REGIONAL_INDICATOR_SYMBOL_REGEX
  )

  # Matches only the spaces in front of a comma. The lookahead keeps the comma
  # itself out of the match, so removing the match leaves the comma in place.
  SPACE_BEFORE_COMMA_REGEX = /[ ]+(?=,)/

  # Inverted regexes: they match everything that must be REMOVED.
  ASCII_ONLY_REGEX = /[^\u{0000}-\u{007f}]/
  # NOTE: no `+` after `\d` -- inside a character class `+` is a literal plus
  # sign, not a quantifier, and would let "+" characters survive.
  LETTERS_ACCENTS_AND_DIGITS_ONLY_REGEX = /[^\p{L}\p{M}\d\s]/
  # ASCII digits are already covered by the ASCII range.
  ASCII_AND_LANGUAGE_CHARACTERS_ONLY_REGEX = /[^\u{0000}-\u{007f}\p{L}\p{M}]/

  # Applies several strategies one after another, feeding the result of each
  # into the next.
  #
  # @param str [Object] value to sanitize; non-String values pass through unchanged
  # @param strategies [Array<Symbol>] strategy names accepted by {#call}, in order
  # @return [Object] the sanitized string (or +str+ itself when +strategies+ is empty)
  def call_multiple(str, strategies: [:strip_extra_whitespace_only])
    strategies.reduce(str) { |result, strategy| call(result, strategy: strategy) }
  end

  # Sanitizes +str+ with a single strategy.
  #
  # Supported strategies:
  # * +:ascii_and_language_symbols_only+    - keep ASCII plus letters and combining marks
  # * +:ascii_only+                         - keep ASCII only
  # * +:letters_accents_and_numbers_only+   - keep letters, combining marks, ASCII digits and whitespace
  # * +:strip_special_unicode_classes_only+ - remove the emoji/symbol ranges defined above
  # * +:strip_html_tags+                    - remove HTML tags (needs Rails' ActionController)
  # * anything else                         - only whitespace cleanup
  #
  # The argument is never modified: the work is done on a UTF-8 tagged copy
  # from which invalid byte sequences have been dropped.
  #
  # @param str [Object] value to sanitize
  # @param strategy [Symbol] one of the strategy names listed above
  # @return [Object] a new sanitized String, or +str+ unchanged when it is not a String
  def call(str, strategy: :strip_extra_whitespace_only)
    return str unless str.is_a?(String)

    # dup: force_encoding mutates its receiver and raises on frozen strings.
    # scrub: invalid bytes would otherwise make every gsub below raise.
    text = str.dup.force_encoding(Encoding::UTF_8).scrub('')

    cleaned =
      case strategy
      when :ascii_and_language_symbols_only then ascii_and_language_symbols_only(text)
      when :ascii_only then ascii_only(text)
      when :letters_accents_and_numbers_only then letters_accents_and_numbers_only(text)
      when :strip_special_unicode_classes_only then strip_special_unicode_classes_only(text)
      when :strip_html_tags then strip_html_tags(text)
      else strip_space_before_comma(text)
      end

    collapse_whitespace(cleaned)
  end

  private

  # Dangerous to just strip all characters which are not ASCII because then we
  # cannot support other locales like ru, de, etc ...
  def ascii_and_language_symbols_only(str)
    strip_space_before_comma(str).gsub(ASCII_AND_LANGUAGE_CHARACTERS_ONLY_REGEX, '')
  end

  # Removes every character outside U+0000..U+007F.
  def ascii_only(str)
    strip_space_before_comma(str).gsub(ASCII_ONLY_REGEX, '')
  end

  # Removes everything that is not a letter, combining mark, ASCII digit or
  # whitespace.
  def letters_accents_and_numbers_only(str)
    strip_space_before_comma(str).gsub(LETTERS_ACCENTS_AND_DIGITS_ONLY_REGEX, '')
  end

  # Removes the pictograph, symbol, emoticon, dingbat, transport and
  # regional-indicator ranges while leaving all other characters alone.
  def strip_special_unicode_classes_only(str)
    strip_space_before_comma(str).gsub(SPECIAL_UNICODE_CLASSES_REGEX, '')
  end

  # Turns "a , b" into "a, b": the spaces go, the comma stays.
  def strip_space_before_comma(str)
    str.gsub(SPACE_BEFORE_COMMA_REGEX, '')
  end

  # Delegates to the Rails view helper; only usable where ActionController is loaded.
  def strip_html_tags(str)
    ActionController::Base.helpers.strip_tags(str)
  end

  # Plain-Ruby equivalent of ActiveSupport's String#squish, so the module works
  # without Rails: collapse each whitespace run to one space, then trim the ends.
  def collapse_whitespace(str)
    str.gsub(/[[:space:]]+/, ' ').strip
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment