Created
September 24, 2020 23:06
-
-
Save seamustuohy/2e61b6ee39925c4012cf2ef6a3d9976e to your computer and use it in GitHub Desktop.
Simple script to build a PowerShell regex that checks for homoglyphs of a specific string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright © 2018 seamus tuohy, <code@seamustuohy.com> | |
# | |
# This program is free software: you can redistribute it and/or modify it | |
# under the terms of the GNU General Public License as published by the Free | |
# Software Foundation, either version 3 of the License, or (at your option) | |
# any later version. | |
# | |
# This program is distributed in the hope that it will be useful, but WITHOUT | |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. | |
# You will need this regex syntax reference to read the generated pattern:
# http://regexhero.net/reference/
# This script creates a PowerShell-compatible regular expression that you can use to check for homoglyphs of a specific string.
import argparse | |
import re | |
import logging | |
# Start at ERROR; set_logging() lowers the module logger's threshold when
# --verbose or --debug is given on the command line.
logging.basicConfig(level=logging.ERROR)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def main():
    """Command-line entry point: build the homoglyph regex and print it.

    Parses the arguments, configures logging, picks the homoglyph table
    (the IDNA round-tripped one when ``--puny_only`` is set, the full one
    otherwise), turns it into named-group definitions and prints the
    resulting pattern wrapped in double quotes.
    """
    args = parse_arguments()
    set_logging(args.verbose, args.debug)

    # Choose the table loader first, then call it exactly once.
    load_homoglyphs = (
        get_puny_homoglyphs if args.puny_only is True else get_homoglyphs
    )
    named_groups = make_homoglyph_named_matches(load_homoglyphs())

    pattern = create_nm_regex(args.string, named_groups)
    print('"{0}"'.format(pattern))
def make_homoglyph_named_matches(homoglyphs):
    """Turn a homoglyph table into named-group definitions, one per letter.

    Upper- and lower-case keys are merged under the lower-cased key, so
    ``"A"`` and ``"a"`` contribute to the same group.

    Args:
        homoglyphs: mapping of a character to an iterable of single
            characters (a list of characters or a plain string).

    Returns:
        dict mapping each lower-cased key to a pattern of the form
        ``(?<key>[glyphs])``. The glyphs inside the character class are
        sorted so the generated pattern is identical on every run (joining
        an unordered set directly made the output change between runs).
    """
    merged = {}
    for letter, glyphs in homoglyphs.items():
        merged.setdefault(letter.lower(), set()).update(glyphs)

    return {
        letter: '(?<{0}>[{1}])'.format(letter, ''.join(sorted(glyphs)))
        for letter, glyphs in merged.items()
    }
def get_regex_rejecting_name_match(string):
    r"""Build a negative lookahead that rejects the original string itself.

    The string is split on whitespace; each word becomes
    ``[Xx]rest`` (first character accepted in either case, remainder
    verbatim) and the words are re-joined with single spaces inside
    ``(?!...)``.

    Regex metacharacters in the input are backslash-escaped so they are
    matched literally; purely alphanumeric input produces the same output
    as before, e.g. ``"john smith"`` -> ``(?![Jj]ohn [Ss]mith)``.

    Args:
        string: the text the homoglyph regex is being built for.

    Returns:
        The negative-lookahead pattern as a string.
    """
    # Characters that are special either in a pattern or inside a
    # character class; all of them may be escaped with a backslash.
    special = frozenset('\\*+?|{}[]()^$.#-')

    def escape(text):
        return ''.join('\\' + ch if ch in special else ch for ch in text)

    words = [
        "[{0}{1}]{2}".format(
            escape(word[0].upper()),
            escape(word[0].lower()),
            escape(word[1:]),
        )
        for word in string.split()
    ]
    return "(?!{0})".format(" ".join(words))
def create_nm_regex(string, nm_homoglyphs):
    r"""Build the homoglyph-matching pattern for ``string`` using named groups.

    Layout of the result:

    * the negative lookahead from ``get_regex_rejecting_name_match``;
    * ``(?<WC>...)``, a group capturing any run of mark, separator,
      punctuation or "other" category characters;
    * for every character of ``string``: a ``\k<WC>`` backreference
      followed by
        - the character itself if it is not an ASCII letter (regex
          metacharacters are backslash-escaped),
        - the letter's named homoglyph group on its first use,
        - a ``\k<letter>`` backreference on later uses, so a repeated
          letter must be matched by the same glyph each time.

    Args:
        string: text to build the pattern for.
        nm_homoglyphs: mapping of lower-case letter to its named-group
            definition, as returned by ``make_homoglyph_named_matches``.

    Returns:
        The complete pattern as a string.
    """
    # Raw strings: \p and \k are regex escapes, not Python escapes.
    # \p{name} :: Matches any single character in the Unicode general
    #             category or named block specified by name.
    # https://www.regular-expressions.info/unicode.html
    whitespace_define = r'(?<WC>[\p{M}\p{Z}\p{P}\p{C}]*)'
    # \k<name> :: Named backreference. Matches the value of a named expression.
    whitespace_checks = r'\k<WC>'
    nm_hg_checks = r'\k<{0}>'
    regex_meta = frozenset('\\*+?|{}[]()^$.#-')

    defined_groups = set()
    pieces = []
    for char in string:
        key = char.lower()
        if not re.match("[a-zA-Z]", char):
            # Non-letters are matched literally; a space stays a bare
            # space so the clean-up replace below can find it.
            match_set = '\\' + char if char in regex_meta else char
        elif key in defined_groups:
            match_set = nm_hg_checks.format(key)
        else:
            match_set = nm_homoglyphs.get(key)
            if match_set is None:
                # No group exists for this letter: match it literally and
                # do NOT mark it as defined, otherwise a later occurrence
                # would emit a backreference to a group that was never
                # declared.
                match_set = char
            else:
                defined_groups.add(key)
        pieces.append(whitespace_checks + match_set)

    homo_regex = (
        get_regex_rejecting_name_match(string)
        + whitespace_define
        + ''.join(pieces)
    )
    # A literal space is already a separator: drop the \k<WC> that would
    # otherwise follow it.
    return homo_regex.replace(r'\k<WC> \k<WC>', r'\k<WC> ')
def create_regex(string, homoglyphs):
    r"""Build a homoglyph-matching pattern using plain character classes.

    The result starts with ``(?<WC>...)`` (a group capturing any run of
    mark, separator, punctuation or "other" category characters) and then,
    for every character of ``string``, a ``\k<WC>`` backreference followed
    by a character class of that character's homoglyphs. Entries for both
    the lower- and upper-case form of the character are combined.

    A character with no homoglyph entries is matched literally (escaped if
    it is a regex metacharacter). Previously such characters produced an
    empty ``[]`` class because the emptiness check compared the already
    formatted string against a list and so could never fail.

    Args:
        string: text to build the pattern for.
        homoglyphs: mapping of a character to an iterable of single
            characters (list or string).

    Returns:
        The complete pattern as a string.
    """
    # Raw strings: \p and \k are regex escapes, not Python escapes.
    whitespace_define = r'(?<WC>[\p{M}\p{Z}\p{P}\p{C}]*)'
    whitespace_checks = r'\k<WC>'
    regex_meta = frozenset('\\*+?|{}[]()^$.#-')

    pieces = []
    for char in string:
        variations = set(homoglyphs.get(char.lower(), ()))
        variations.update(homoglyphs.get(char.upper(), ()))
        if variations:
            # Sorted so the generated pattern is stable between runs.
            match_set = "[{0}]".format(''.join(sorted(variations)))
        elif char in regex_meta:
            match_set = '\\' + char
        else:
            match_set = char
        pieces.append(whitespace_checks + match_set)
    return whitespace_define + ''.join(pieces)
def get_puny_homoglyphs():
    """Return the homoglyph table restricted to IDNA round-trippable glyphs.

    Every glyph from ``get_homoglyphs()`` is encoded with Python's ``idna``
    codec and decoded again; the decoded results are de-duplicated and
    joined into one string per key.

    Glyphs the codec refuses to encode or decode are skipped (and logged
    at DEBUG level) rather than aborting the whole table. The characters of
    each value are sorted so the output is the same on every run.

    Returns:
        dict mapping each key of the homoglyph table to a string of the
        round-tripped glyphs.
    """
    logger = logging.getLogger(__name__)
    puny_homoglyphs = {}
    for letter, glyphs in get_homoglyphs().items():
        round_tripped = set()
        for glyph in glyphs:
            try:
                round_tripped.add(glyph.encode('idna').decode('idna'))
            except UnicodeError:
                logger.debug("Skipping %r for %r: not IDNA-encodable",
                             glyph, letter)
        puny_homoglyphs[letter] = ''.join(sorted(round_tripped))
    return puny_homoglyphs
def get_homoglyphs():
    """Return the built-in homoglyph table.

    Keys are the space character, the upper-case letters A-Z and the
    lower-case letters a-z except "l" and "o". Each value is a sorted list
    of the distinct characters found in that key's glyph string (sorted so
    the result is identical on every run).

    Note that the space entry is split per character like every other
    entry, so it yields a backslash and an "s" as two separate items.

    Returns:
        dict mapping a character to a list of single-character strings.
    """
    homoglyph_strings = {
        # Raw string: \s is a regex escape, not a Python escape.
        " ": r"\s",
        "A": "𝗔𝖠𝙰𝘈A𝘼𝜜ꭺᗅ𝒜ꓮ𝔸Ꭺ𝓐𝚨ÅÁ𝔄𝝖𝐴À𝞐𐊠ᴀÂ𝐀ÃА𖽀𝑨𝕬𝛢AÄΑ",
        "B": "𝛣𝞑ᗷß𝗕ꞴB𝖡𝜝𐊡Β𝑩𝔹𝓑𝔅в𝘉ᛒ𝐵𝙱𝝗ꓐВ𝐁ᏼ𝚩ℬBβ𐌁𐊂𝘽ʙ𝕭Ᏼ",
        "C": "Ⲥ𝑪𑣩🝌Cℭ𝙲𝒞ꓚ𝓒Ꮯ𐊢ℂ𝐶C𐔜𝗖𝐂Ⅽ𐌂𑣲С𝘾𝘊𝖢𝕮Ϲ𐐕𐐠",
        "D": "𝓓𝗗ᗞ𝔻𝘿Đᗪ𝐷𝙳𝖣𝒟ĎꓓDⅅⅮ𝕯𝔇ᴅ𝐃𝑫𝘋DᎠꭰ",
        "E": "𝙀ÈĚ𝔈Éᴇ𝘌𝔼Е𑢦𝜠Ēℰ⋿𝝚ĔΕË𝛦𝑬𝚬𝗘𝞔ꭼĖE𝕰EĘ𑢮𝖤𝙴𐊆𝓔𝐸ꓰÊ𝐄Ꭼⴹ",
        "F": "𝙁𝔽𝑭F𝙵ꓝ𝐹ᖴ𝟊𐊇𝐅𝈓Ꞙ𝕱𝔉𝓕𝖥ℱ𝗙𑢢F𑣂𐔥𐊥Ϝ𝘍",
        "G": "𝘎Gԍ𝗚ɢ𝐺𝔾𝙂𝑮𝕲Ꮐնꮐ𝒢ᏻꓖԌ𝓖G𝔊𝐆𝙶𝖦Ᏻ",
        "H": "𝞖𝐇𝝜𝗛ℍ𝛨𝘏Ⲏ𝖧𝜢𝙷ꓧһнᎻℋꮋ𝚮Hᕼ𝓗𝐻𝑯ʜ𝙃Η𐋏H𝕳Нℌ",
        "I": "ι𝝸Ⅰiᛁꭵاӏ𝚤Ι𰰠𝐢𝑖І𝕚𝚒lᎥ˛𐐠⍳𝜄I𝗂ιіꙇⅰ𝛊ɪ𝖎ī𝙞iͺ𝒾𝓲ɩℹ𝔦𝗶𑣃𝜾𝞲ⅈı𝘪I𝒊",
        "J": "𝔍𝓙Ꭻ𝕁ᴊJ𝒥𝙹𝑱Ϳ𝙅յ𝐽Jꭻ𝗝𝕵Јᒍ𝐉ꓙ𝖩Ʝ𝘑",
        "K": "𝙆𝛫𝑲𝐾𝕂𝒦𝞙𝓚𝖪𝘒К𝝟ᛕꓗ𝙺𐔘KK𝚱𝔎𝐊𝗞Ⲕ𝕶ᏦΚ𝜥K",
        "L": "ι𐑃LⳐ𝕃𝙇𑢣L𝐋𝓛𝔏𝕷𝈪l𐐛𝙻𝐿ⳑʟⅬ𝑳𐔦ꓡ𝘓Ꮮᒪℒ𝖫ꮮ𖼖ⅼ𝗟𑢲",
        "M": "𝜧Ꮇℳ𝛭𝝡Μ𝞛𝑀𝗠𝙼𝔐Ϻ𝚳𐊰𝐌𝙈Ⅿ𝘔ᗰМ𝖬𐌑𝓜MꓟᛖⲘ𝑴𝕸M𝕄",
        "N": "𐔓𝑵𝝢𝙽N𝚴𝒩𝞜𝙉𝕹ℕ𝐍Ⲛ𝛮𝘕𝔑𝖭𝜨Nɴ𝗡ꓠ𝑁𝓝Ν",
        "O": "οΟoՕО0𱠠OoOо𐐠",
        "P": "𝙋𝑷𝜬Ꮲ𝞠ꮲ𝚸ℙ𝘗𝙿РᑭΡꓑ𝐏𝝦ᴩ𝓟𐊕𝖯𝛲ⲢᴘPP𝔓𝑃𝒫𝗣𝕻",
        "Q": "Qℚ𝖰𝙌𝚀𝗤𝘘𝐐ႳႭ𝑄ⵕ𝕼𝑸𝒬𝔔𝓠Q",
        "R": "ꭱ𐒴𝘙R𝑅ℝꮢᖇℛᚱℜ𝚁𝐑ƦR𝈖ꓣ𝕽𝙍𖼵Ꭱ𝖱𝗥𝑹Ꮢʀ𝓡",
        "S": "Ꮥ𝚂𝗦𝖲ႽЅ𝐒𝑺𝕊S𝕾𝑆𖼺𐊖S𝙎𝒮𝓢ꓢss𝘚ᏚՏѕ𝔖𐐠",
        "T": "⟙𝞣𑢼Ꭲ𐊗𝞽Τтᴛ𝐓𝒯𝙏𝖳𝜏Ⲧτ𝕋ꭲ𝑻𝗧𝑇T𐊱𐌕𖼊T𝛕𝚻𝝉⊤Т𝔗ꓔ𝜯𝝩𝘛𝛵🝨𝞃𝕿𝓣𝚃",
        "U": "𖽂𝔘𝓤𐓎𝚄ՍUU𝗨𝑼Ա𝙐⋃u𑢸𝑈μ𝖀υ𝐔ሀ∪𝕌𝖴ꓴᑌ𝒰𝘜",
        "V": "𝙑𝑽ꓦᏙ𝚅ѴⅤ𝐕𝔙𖼈V𝕍ꛟ𝖁𝗩𝘝𝈍V۷٧𑢠𐔝ⴸ𝖵𝑉𝓥𝒱ᐯ",
        "W": "𝘞𝗪𝖂𝐖𑣦Ԝ𝖶W𝓦𝕎wꓪW𝙒wᏔ𝚆𝑊𝔚𑣯𝒲𝑾Ꮃ",
        "X": "x𝕏𐌗𐊴Ꭓ𝒳𝛸X𝖃𝜲𝔛ꓫ𑣬𝘟𝓧Ⅹ𝐗𝞦Χ𝚇𐔧𝑋╳𐊐𝚾𝗫ᚷ𝙓XⲬⵝ𝝬𝑿χ𐌢Х𝖷᙭",
        "Y": "Ꭹ𝝪𝒀ʏy𝖄Ү𝐘ϒ𝒴γ𖽃𝙔𝚼𝔜𑢤Ꮍ𝚈𝞤ꓬy𝓨у𝘠𝛶YY𝑌УⲨ𝗬𐊲𝜰Υ𝖸𝕐",
        "Z": "𝚉𝐙𝒵𝙕ℨℤ𝘡𝞕𝒁𝖹Ꮓ𝛧𝓩𑢩Ζ𝜡𐋵𝖅ꓜZ𝝛𝗭𝑍𑣥𝚭Z",
        "a": "𝒶ã⍺α𝜶𝛼ǎɑâ𝖆𝖺𝑎а𝐚𝛂𝗮aáạä𝓪àăåȧa𝒂𝞪𝕒𝔞𝚊𝝰ą𝙖𝘢",
        "b": "𝗯𝖇ЬḇƅᏏᖯḅ𝓫𝕓d𝑏ḃlɓ𝘣𝙗Ƅ𝐛b𝒃𝒷𝖻𝔟b𝚋ʙ",
        "c": "𐐽ᴄⲥ𝖼𝘤𝒸𝙘𝒄𝓬ꮯᏟϲ𝐜с𝕔𝗰Ⅽ𝔠c𝚌𝑐𝖈ⅽc𐐠",
        "d": "𝗱ꓒ𝙙𝕕ԁᏧ𝒹ɗ𝖽𝘥ḏďd𝒅dɖl𝚍ᑯⅾ𝓭𝐝ḓ𝑑ժḑḋ𝔡đcḍb𝖉ⅆ",
        "e": "ꬲ𝖊𝕖𝚎℮êė𝔢ⅇȩҽ𝖾ē𝒆ḛĕ𝑒ɇ𝓮ẹℯ𝙚ę𝘦ée𝐞ëèеěce𝗲",
        "f": "𝔣𝙛ꞙ𝒻𝚏ƒf𝑓𝗳ẝf𝕗𝒇𝟋𝓯ք𝘧ꬵſ𝖿𝐟𝖋ϝḟ",
        "g": "ɡᶃ𝗴𝔤ɢǧ𝐠g𝘨qģ𝕘gնցġℊ𝗀ĝ𝒈ǥ𝚐ƍ𝓰𝙜𝑔𝖌ğǵ",
        "h": "ħȟհᏂⱨ𝚑ẖһ𝔥𝒽lḥḩ𝖍ℎ𝕙𝘩𝗁𝐡ɦ𝒉𝗵hhĥḧ𝓱𝙝ḣḫ",
        "i": "ι𝝸Ⅰiᛁɨꭵاӏ𝚤𰰠𝐢𝑖𝕚𝚒1lȋᎥ˛𐐠⍳𝜄𝗂ιіꙇⅰ𝛊ɪ𝖎ỉīĭ𝙞iͺ𝒾𝓲íɩℹ𝔦𝗶𑣃𝜾𝞲ịǐïⅈı𝘪Iì𝒊",
        "j": "𝔧𝚓jϳ𝗷𝐣𝙟𝒋𝗃յ𝒿𝑗𝖏ɉ𝘫ʝјⅉ𝕛j𝓳",
        "k": "𝐤𝑘𝗄𝗸𝚔ḳḵ𝓴𝓀kκⱪ𝕜k𝔨𝖐𝒌ķ𝙠𝘬",
        "m": "ᴍmmṁⅿḿṃɱrn",
        "n": "𝓃n𝚗ñr𝐧𝒏ռ𝙣mꞑ𝗇𝘯ṅńņ𝓷𝗻ǹɴnṇň𝑛ṉո𝕟𝖓𝔫",
        "p": "ƥ𝗉ṗᏢ𝝆ṕ𝒑𝛒𝕡𝔭p𝚙ρ𝝔⍴𝜌𝞀𝓹ƿϱⲣ𝑝P𝖕𝞺p𝓅𝐩р𝙥𝞎𝘱𝛠𝗽𝟈𝜚",
        "q": "𝐪𝖖g𝑞գqʠq𝘲Ⴍ𝚚ԛ𝕢𝓆𝔮𝗾𝒒𝗊Ⴓ𝙦զ𝓺",
        "r": "𝓇𝐫ṛrᴦꭈ𝑟ɼṙṟ𝘳ꭇ𝗿ȑ𝗋Իгɾŕɍȓ𝔯ⲅŗr𝒓ř𝙧ʀɽ𝚛ꮁ𝖗𝕣𝓻",
        "s": "ꜱ𑣁𝘀ႽЅṣƽ𝓼ŝṡ𐑈Sʂ𝑠𐐠ś𝙨𝓈S𝖘ss𝕤Ꮪ𝐬𝔰𝗌𝚜ѕ𝒔ꮪ𝘴șšՏ",
        "t": "𝐭𝒕𝑡ṫᎢ𝖙ț𝘁𝓽ƫτ𝔱ţ𝙩t𝓉𝗍𝘵𝚝ṭt𝕥ŧ",
        "u": "𝕦𑣘ůūǔùUꭎuՍUųűư𝗎ꞟʉսûԱ𝖚𝐮𝞄𝘂𝘶𝛖𐓶𝜐ú⋃uũȗụ𝒖𝓊𝔲üυμ𝝊ʋ𝑢ŭȕ𝞾𝓾𝙪𝚞ᴜꭒ",
        "v": "𝒗⋁𝚟𑣀𝗏ѵѴ𝜈𝞶𝑣𝓋𝐯𝔳𝘃v𝝼vⱱνטⱴ𝖛ᴠ𝘷∨ⅴ𝙫𝕧ṽꮩ𝓿ṿᶌ𑜆𝛎𝝂",
        "w": "𑜊ẅ𝑤𑜎𝘄ẘ𝖜ɯ𝒘𝔀W𝗐𝘸vw𝚠ẇẁ𝐰ẉWwẃ𝔴ԝꮃ𝕨աⱳ𑜏𝙬Ꮃŵᴡ𝓌ѡ",
        "x": "x𝐱⤬𝘅𝙭𝓍𝔁ᕽⅩᕁ𝗑𝖝𝑥᙮х𝚡𝔵×⤫ⅹχ𝒙𝘹x⨯𝕩",
        "y": "𝛄𝕪𝓎ʏɣ𝗒y𝒚Үŷγƴ𝚢ỿ𝛾𑣜ℽ𝞬ɏꭚẏ𝔂𝔶ყ𝝲ỵ𝘆ү𝖞ȳyýÿу𝘺𝙮𝑦YYᶌΥ𝐲𝜸",
        "z": "𝖟𝕫𝙯ꮓź𝘻zᏃ𝗓𝔃𝘇ʐƶż𝐳ⱬẕ𝓏𝒛ᴢẓ𝑧𝚣𑣄𝔷z"
    }
    # De-duplicate each glyph string and sort it for a reproducible order.
    return {
        name: sorted(set(glyphs))
        for name, glyphs in homoglyph_strings.items()
    }
# Command Line Functions below this point | |
def set_logging(verbose=False, debug=False):
    """Adjust the module logger's level from the command-line flags.

    Args:
        verbose: when true, set the level to INFO.
        debug: when true, set the level to DEBUG; takes precedence over
            ``verbose``.

    With neither flag set the logger level is left untouched.
    """
    if debug:
        log.setLevel(logging.DEBUG)
    elif verbose:
        log.setLevel(logging.INFO)
def parse_arguments(argv=None):
    """Parse the command-line options.

    Args:
        argv: optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (the default when ``None``).

    Returns:
        argparse.Namespace with ``verbose``, ``debug``, ``string`` and
        ``puny_only`` attributes.
    """
    # The text is passed as ``description``; the first positional
    # parameter of ArgumentParser is ``prog``, which would replace the
    # program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Build a PowerShell-compatible regex that checks for "
                    "homoglyphs of a string")
    parser.add_argument("--verbose", "-v",
                        help="Turn verbosity on",
                        action='store_true')
    parser.add_argument("--debug", "-d",
                        help="Turn debugging on",
                        action='store_true')
    parser.add_argument("--string", "-s",
                        help="string to transform into regex",
                        required=True)
    parser.add_argument("--puny_only", "-p",
                        help="use only puny compliant chars",
                        action='store_true')
    return parser.parse_args(argv)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment