Skip to content

Instantly share code, notes, and snippets.

@seamustuohy
Created September 24, 2020 23:06
Show Gist options
  • Save seamustuohy/2e61b6ee39925c4012cf2ef6a3d9976e to your computer and use it in GitHub Desktop.
Save seamustuohy/2e61b6ee39925c4012cf2ef6a3d9976e to your computer and use it in GitHub Desktop.
Simple script to build a PowerShell regex to check for homoglyphs of a specific string
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2018 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
# You will need this .NET regular-expression syntax reference:
# http://regexhero.net/reference/
# This creates a powershell compatible regular expression that you can use to check for homoglyphs of a specific string.
import argparse
import re
import logging
# Configure the root logger at ERROR; set_logging() can lower the level of
# this module's logger when --verbose or --debug is given.
logging.basicConfig(level=logging.ERROR)
# Module-level logger named after this module.
log = logging.getLogger(__name__)
def main():
    """Parse the command line, build the homoglyph regex and print it.

    The regex is printed wrapped in double quotes.
    """
    options = parse_arguments()
    set_logging(options.verbose, options.debug)

    # Choose the glyph table: the IDNA round-tripped one or the full one.
    load_glyphs = get_puny_homoglyphs if options.puny_only is True else get_homoglyphs
    named_groups = make_homoglyph_named_matches(load_glyphs())

    pattern = create_nm_regex(options.string, named_groups)
    print('"{0}"'.format(pattern))
def make_homoglyph_named_matches(homoglyphs):
    """Build one named capture group per letter from a homoglyph table.

    Keys that differ only by case are folded together under the lower-case
    key, and their glyphs are merged and de-duplicated.

    Args:
        homoglyphs: mapping of a character to an iterable of look-alike
            strings (a list of glyphs or a plain string both work).

    Returns:
        dict mapping each lower-cased key to the regex text
        ``(?<key>[glyphs])``.
    """
    merged = {}
    for letter, glyphs in homoglyphs.items():
        merged.setdefault(letter.lower(), set()).update(glyphs)

    # Sort the glyphs so the same table always produces the same regex text;
    # plain set iteration order changes between interpreter runs.
    return {
        letter: '(?<{0}>[{1}])'.format(letter, ''.join(sorted(glyphs)))
        for letter, glyphs in merged.items()
    }
def get_regex_rejecting_name_match(string):
    """Return a negative lookahead that rejects the original string itself.

    The string is split on whitespace. Each word contributes a pattern that
    accepts its first character in upper or lower case followed by the rest
    of the word verbatim; words are joined with single spaces.

    Regex metacharacters in the input are escaped so the lookahead rejects
    only the literal text (an unescaped ``.`` would reject any character in
    that position, and an unescaped ``(`` would make the pattern invalid).

    Args:
        string: the plain text whose exact spelling must not match.

    Returns:
        A ``(?!...)`` lookahead as a string. A string containing no words
        yields ``(?!)``.
    """
    word_patterns = []
    for word in string.split():
        first, rest = word[0], word[1:]
        word_patterns.append(
            "[{0}{1}]{2}".format(
                re.escape(first.upper()),
                re.escape(first.lower()),
                re.escape(rest),
            )
        )
    return "(?!{0})".format(" ".join(word_patterns))
def create_nm_regex(string, nm_homoglyphs):
    """Build the homoglyph-matching regex for ``string`` using named groups.

    For each character of ``string``:
      * a character that is not an ASCII letter is inserted as-is;
      * the first occurrence of a letter inserts that letter's named group
        from ``nm_homoglyphs`` (or the literal character if there is none);
      * later occurrences of a letter whose group was defined insert a
        ``\\k<letter>`` backreference to it.
    Every character is preceded by a ``\\k<WC>`` backreference. The result is
    prefixed with the lookahead from get_regex_rejecting_name_match() and
    the definition of the ``WC`` group.

    Args:
        string: the plain text to protect.
        nm_homoglyphs: mapping of lower-case letter to its named-group regex
            text, as built by make_homoglyph_named_matches().

    Returns:
        The complete regex as a string.
    """
    # Raw strings: \p and \k are regex syntax, not Python escape sequences.
    # \p{name} :: Matches any single character in the Unicode general category or named block specified by name.
    # https://www.regular-expressions.info/unicode.html
    whitespace_define = r'(?<WC>[\p{M}\p{Z}\p{P}\p{C}]*)'
    # \k<name> :: Named backreference. Matches the value of a named expression.
    whitespace_checks = r"\k<WC>"
    nm_hg_checks = r"\k<{0}>"

    defined_letters = set()
    pieces = []
    for char in string:
        key = char.lower()
        if not re.match("[a-zA-Z]", char):
            # Only ASCII letters are replaced; everything else is literal.
            match_set = char
        elif key in defined_letters:
            match_set = nm_hg_checks.format(key)
        else:
            match_set = nm_homoglyphs.get(key)
            if match_set is None:
                # No group exists for this letter, so it must not be
                # recorded as defined: a later \k<letter> would reference
                # a group that is not in the pattern.
                match_set = char
            else:
                defined_letters.add(key)
        pieces.append(whitespace_checks + match_set)

    homo_regex = (
        get_regex_rejecting_name_match(string)
        + whitespace_define
        + "".join(pieces)
    )
    # Drop the \k<WC> check that would directly follow a literal space.
    return homo_regex.replace(r'\k<WC> \k<WC>', r'\k<WC> ')
def create_regex(string, homoglyphs):
    """Build a homoglyph-matching regex using inline character classes.

    Each character of ``string`` becomes a character class holding the
    homoglyphs listed for its lower- and upper-case forms. A character with
    no listed homoglyphs is inserted literally. Every character is preceded
    by a ``\\k<WC>`` backreference, and the pattern starts with the
    definition of the ``WC`` group.

    Args:
        string: the plain text to protect.
        homoglyphs: mapping of a character to an iterable of look-alike
            glyphs (a list or a plain string).

    Returns:
        The regex as a string.
    """
    # Raw strings: \p and \k are regex syntax, not Python escape sequences.
    whitespace_define = r'(?<WC>[\p{M}\p{Z}\p{P}\p{C}]*)'
    whitespace_checks = r"\k<WC>"

    pieces = [whitespace_define]
    for char in string:
        variations = set(homoglyphs.get(char.lower(), ()))
        variations.update(homoglyphs.get(char.upper(), ()))
        if variations:
            # Sorted so the output is identical from run to run.
            match_set = "[{0}]".format(''.join(sorted(variations)))
        else:
            # No known homoglyphs: match the character itself rather than
            # emitting an empty (invalid) character class.
            match_set = char
        pieces.append(whitespace_checks + match_set)
    return "".join(pieces)
def get_puny_homoglyphs():
    """Return the homoglyph table after an 'idna' codec round trip.

    Every glyph from get_homoglyphs() is encoded and then decoded with
    Python's ``idna`` codec; the distinct results for each key are joined
    into one string (in set iteration order).

    Returns:
        dict mapping each key of get_homoglyphs() to a string of glyphs.
    """
    return {
        letter: ''.join({glyph.encode('idna').decode('idna') for glyph in glyphs})
        for letter, glyphs in get_homoglyphs().items()
    }
def get_homoglyphs():
    """Return the built-in homoglyph table.

    Returns:
        dict mapping a character to a sorted list of its distinct look-alike
        glyphs, one glyph per list element. The entry for the space
        character is the single regex token ``\\s``.
    """
    homoglyph_strings = {
        " ": r"\s",
        "A": "𝗔𝖠𝙰𝘈A𝘼𝜜ꭺᗅ𝒜ꓮ𝔸Ꭺ𝓐𝚨ÅÁ𝔄𝝖𝐴À𝞐𐊠ᴀÂ𝐀ÃА𖽀𝑨𝕬𝛢AÄΑ",
        "B": "𝛣𝞑ᗷß𝗕ꞴB𝖡𝜝𐊡Β𝑩𝔹𝓑𝔅в𝘉ᛒ𝐵𝙱𝝗ꓐВ𝐁ᏼ𝚩ℬBβ𐌁𐊂𝘽ʙ𝕭Ᏼ",
        "C": "Ⲥ𝑪𑣩🝌Cℭ𝙲𝒞ꓚ𝓒Ꮯ𐊢ℂ𝐶C𐔜𝗖𝐂Ⅽ𐌂𑣲С𝘾𝘊𺀠𝖢𝕮Ϲ𐐕𐐠",
        "D": "𝓓𝗗ᗞ𝔻𝘿Đᗪ𝐷𝙳𝖣𝒟ĎꓓDⅅⅮ𝕯𝔇ᴅ𝐃𝑫𝘋DᎠꭰ",
        "E": "𝙀ÈĚ𝔈Éᴇ𝘌𝔼Е𑢦𝜠Ēℰ⋿𝝚ĔΕË𝛦𝑬𝚬𝗘𝞔ꭼĖE𝕰EĘ𑢮𝖤𝙴𐊆𝓔𝐸ꓰÊ𝐄Ꭼⴹ",
        "F": "𝙁𝔽𝑭F𝙵ꓝ𝐹ᖴ𝟊𐊇𝐅𝈓Ꞙ𝕱𝔉𝓕𝖥ℱ𝗙𑢢F𑣂𐔥𐊥Ϝ𝘍",
        "G": "𝘎Gԍ𝗚ɢ𝐺𝔾𝙂𝑮𝕲Ꮐնꮐ𝒢ᏻꓖԌ𝓖G𝔊𝐆𝙶𝖦Ᏻ",
        "H": "𝞖𝐇𝝜𝗛ℍ𝛨𝘏Ⲏ𝖧𝜢𝙷ꓧһнᎻℋꮋ𝚮Hᕼ𝓗𝐻𝑯ʜ𝙃Η𐋏H𝕳Нℌ",
        "I": "ι𝝸Ⅰiᛁꭵاӏ𝚤Ι𰰠𝐢𝑖І𝕚𝚒lᎥ˛𐐠⍳𝜄I𝗂ιіꙇⅰ𝛊ɪ𝖎ī𝙞iͺ𝒾𝓲ɩℹ𝔦𝗶𑣃𝜾𝞲ⅈı𝘪I𝒊",
        "J": "𝔍𝓙Ꭻ𝕁ᴊJ𝒥𝙹𝑱Ϳ𝙅յ𝐽Jꭻ𝗝𝕵Јᒍ𝐉ꓙ𝖩Ʝ𝘑",
        "K": "𝙆𝛫𝑲𝐾𝕂𝒦𝞙𝓚𝖪𝘒К𝝟ᛕꓗ𝙺𐔘KK𝚱𝔎𝐊𝗞Ⲕ𝕶ᏦΚ𝜥K",
        "L": "ι𐑃LⳐ𝕃𝙇𑢣L𝐋𝓛𝔏𝕷𝈪l𐐛𝙻𝐿ⳑʟⅬ𝑳𐔦ꓡ𝘓Ꮮᒪℒ𝖫ꮮ𖼖ⅼ𝗟𑢲",
        "M": "𝜧Ꮇℳ𝛭𝝡Μ𝞛𝑀𝗠𝙼𝔐Ϻ𝚳𐊰𝐌𝙈Ⅿ𝘔ᗰМ𝖬𐌑𝓜MꓟᛖⲘ𝑴𝕸M𝕄",
        "N": "𐔓𝑵𝝢𝙽N𝚴𝒩𝞜𝙉𝕹ℕ𝐍Ⲛ𝛮𝘕𝔑𝖭𝜨Nɴ𝗡ꓠ𝑁𝓝Ν",
        "O": "οΟoՕО0𱠠OoOо𐐠",
        "P": "𝙋𝑷𝜬Ꮲ𝞠ꮲ𝚸ℙ𝘗𝙿РᑭΡꓑ𝐏𝝦ᴩ𝓟𐊕𝖯𝛲ⲢᴘPP𝔓𝑃𝒫𝗣𝕻",
        "Q": "Qℚ𝖰𝙌𝚀𝗤𝘘𝐐ႳႭ𝑄ⵕ𝕼𝑸𝒬𝔔𝓠Q",
        "R": "ꭱ𐒴𝘙R𝑅ℝꮢᖇℛᚱℜ𝚁𝐑ƦR𝈖ꓣ𝕽𝙍𖼵Ꭱ𝖱𝗥𝑹Ꮢʀ𝓡",
        "S": "Ꮥ𵠠𝚂𝗦𝖲ႽЅ𝐒𝑺𝕊S𝕾𝑆𖼺𐊖S𝙎𝒮𝓢ꓢss𝘚ᏚՏѕ𝔖𐐠",
        "T": "⟙𝞣𑢼Ꭲ𐊗𝞽Τтᴛ𝐓𝒯𝙏𝖳𝜏Ⲧτ𝕋ꭲ𝑻𝗧𝑇T𐊱𐌕𖼊T𝛕𝚻𝝉⊤Т𝔗ꓔ𝜯𝝩𝘛𝛵🝨𝞃𝕿𝓣𝚃",
        "U": "𖽂𝔘𝓤𐓎𝚄ՍUU𝗨𝑼Ա𝙐⋃u𑢸𝑈μ𝖀υ𝐔ሀ∪𝕌𝖴ꓴᑌ𝒰𝘜",
        "V": "𝙑𝑽ꓦᏙ𝚅ѴⅤ𝐕𝔙𖼈V𝕍ꛟ𝖁𝗩𝘝𝈍V۷٧𑢠𐔝ⴸ𝖵𝑉𝓥𝒱ᐯ",
        "W": "𝘞𝗪𝖂𝐖𑣦Ԝ𝖶W𝓦𝕎wꓪW𝙒wᏔ𝚆𝑊𝔚𑣯𝒲𝑾Ꮃ",
        "X": "x𝕏𐌗𐊴Ꭓ𝒳𝛸X𝖃𝜲𝔛ꓫ𑣬𝘟𝓧Ⅹ𝐗𝞦Χ𝚇𐔧𝑋╳𐊐𝚾𝗫ᚷ𝙓XⲬⵝ𝝬𝑿χ𐌢Х𝖷᙭",
        "Y": "Ꭹ𝝪𝒀ʏy𝖄Ү𝐘ϒ𝒴γ𖽃𝙔𝚼𝔜𑢤Ꮍ𝚈𝞤ꓬy𝓨у𝘠𝛶YY𝑌УⲨ𝗬𐊲𝜰Υ𝖸𝕐",
        "Z": "𝚉𝐙𝒵𝙕ℨℤ𝘡𝞕𝒁𝖹Ꮓ𝛧𝓩𑢩Ζ𝜡𐋵𝖅ꓜZ𝝛𝗭𝑍𑣥𝚭Z",
        "a": "𝒶ã⍺α𝜶𝛼ǎɑâ𝖆𝖺𝑎а𝐚𝛂𝗮aáạä𝓪àăåȧa𝒂𝞪𝕒𝔞𝚊𝝰ą𝙖𝘢",
        "b": "𝗯𝖇ЬḇƅᏏᖯḅ𝓫𝕓d𝑏ḃlɓ𝘣𝙗Ƅ𝐛b𝒃𝒷𝖻𝔟b𝚋ʙ",
        "c": "𐐽ᴄⲥ𝖼𝘤𝒸𝙘𝒄𝓬ꮯᏟϲ𝐜с𝕔𝗰Ⅽ𝔠𺀠c𝚌𝑐𝖈ⅽc𐐠",
        "d": "𝗱ꓒ𝙙𝕕ԁᏧ𝒹ɗ𝖽𝘥ḏďd𝒅dɖl𝚍ᑯⅾ𝓭𝐝ḓ𝑑ժḑḋ𝔡đcḍb𝖉ⅆ",
        "e": "ꬲ𝖊𝕖𝚎℮êė𝔢ⅇȩҽ𝖾ē𝒆ḛĕ𝑒ɇ𝓮ẹℯ𝙚ę𝘦ée𝐞ëèеěce𝗲",
        "f": "𝔣𝙛ꞙ𝒻𝚏ƒf𝑓𝗳ẝf𝕗𝒇𝟋𝓯ք𝘧ꬵſ𝖿𝐟𝖋ϝḟ",
        "g": "ɡᶃ𝗴𝔤ɢǧ𝐠g𝘨qģ𝕘gնցġℊ𝗀ĝ𝒈ǥ𝚐ƍ𝓰𝙜𝑔𝖌ğǵ",
        "h": "ħȟհᏂⱨ𝚑ẖһ𝔥𝒽lḥḩ𝖍ℎ𝕙𝘩𝗁𝐡ɦ𝒉𝗵hhĥḧ𝓱𝙝ḣḫ",
        "i": "ι𝝸Ⅰiᛁɨꭵاӏ𝚤𰰠𝐢𝑖𝕚𝚒1lȋᎥ˛𐐠⍳𝜄𝗂ιіꙇⅰ𝛊ɪ𝖎ỉīĭ𝙞iͺ𝒾𝓲íɩℹ𝔦𝗶𑣃𝜾𝞲ịǐïⅈı𝘪Iì𝒊",
        "j": "𝔧𝚓jϳ𝗷𝐣𝙟𝒋𝗃յ𝒿𝑗𝖏ɉ𝘫ʝјⅉ𝕛j𝓳",
        "k": "𝐤𝑘𝗄𝗸𝚔ḳḵ𝓴𝓀kκⱪ𝕜k𝔨𝖐𝒌ķ𝙠𝘬",
        "m": "ᴍmmṁⅿḿṃɱrn",
        "n": "𝓃n𝚗ñr𝐧𝒏ռ𝙣mꞑ𝗇𝘯ṅńņ𝓷𝗻ǹɴnṇň𝑛ṉո𝕟𝖓𝔫",
        "p": "ƥ𝗉ṗᏢ𝝆ṕ𝒑𝛒𝕡𝔭p𝚙ρ𝝔⍴𝜌𝞀𝓹ƿϱⲣ𝑝P𝖕𝞺p𝓅𝐩р𝙥𝞎𝘱𝛠𝗽𝟈𝜚",
        "q": "𝐪𝖖g𝑞գqʠq𝘲Ⴍ𝚚ԛ𝕢𝓆𝔮𝗾𝒒𝗊Ⴓ𝙦զ𝓺",
        "r": "𝓇𝐫ṛrᴦꭈ𝑟ɼṙṟ𝘳ꭇ𝗿ȑ𝗋Իгɾŕɍȓ𝔯ⲅŗr𝒓ř𝙧ʀɽ𝚛ꮁ𝖗𝕣𝓻",
        "s": "𵠠ꜱ𑣁𝘀ႽЅṣƽ𝓼ŝṡ𐑈Sʂ𝑠𐐠ś𝙨𝓈S𝖘ss𝕤Ꮪ𝐬𝔰𝗌𝚜ѕ𝒔ꮪ𝘴șšՏ",
        "t": "𝐭𝒕𝑡ṫᎢ𝖙ț𝘁𝓽ƫτ𝔱ţ𝙩t𝓉𝗍𝘵𝚝ṭt𝕥ŧ",
        "u": "𝕦𑣘ůūǔùUꭎuՍUųűư𝗎ꞟʉսûԱ𝖚𝐮𝞄𝘂𝘶𝛖𐓶𝜐ú⋃uũȗụ𝒖𝓊𝔲üυμ𝝊ʋ𝑢ŭȕ𝞾𝓾𝙪𝚞ᴜꭒ",
        "v": "𝒗⋁𝚟𑣀𝗏ѵѴ𝜈𝞶𝑣𝓋𝐯𝔳𝘃v𝝼vⱱνטⱴ𝖛ᴠ𝘷∨ⅴ𝙫𝕧ṽꮩ𝓿ṿᶌ𑜆𝛎𝝂",
        "w": "𑜊ẅ𝑤𑜎𝘄ẘ𝖜ɯ𝒘𝔀W𝗐𝘸vw𝚠ẇẁ𝐰ẉWwẃ𝔴ԝꮃ𝕨աⱳ𑜏𝙬Ꮃŵᴡ𝓌ѡ",
        "x": "x𝐱⤬𝘅𝙭𝓍𝔁ᕽⅩᕁ𝗑𝖝𝑥᙮х𝚡𝔵×⤫ⅹχ𝒙𝘹x⨯𝕩",
        "y": "𝛄𝕪𝓎ʏɣ𝗒y𝒚Үŷγƴ𝚢ỿ𝛾𑣜ℽ𝞬ɏꭚẏ𝔂𝔶ყ𝝲ỵ𝘆ү𝖞ȳyýÿу𝘺𝙮𝑦YYᶌΥ𝐲𝜸",
        "z": "𝖟𝕫𝙯ꮓź𝘻zᏃ𝗓𝔃𝘇ʐƶż𝐳ⱬẕ𝓏𝒛ᴢẓ𝑧𝚣𑣄𝔷z"
    }
    homoglyph_set = {}
    for name, glyphs in homoglyph_strings.items():
        if name == " ":
            # "\s" is one regex token, not two glyphs; splitting it would
            # list a backslash and the letter "s" as look-alikes of a space.
            homoglyph_set[name] = [glyphs]
        else:
            # Sorted so the table (and any regex built from it) is the same
            # on every run.
            homoglyph_set[name] = sorted(set(glyphs))
    return homoglyph_set
# Command Line Functions below this point
def set_logging(verbose=False, debug=False):
    """Lower the module logger's level according to the command-line flags.

    Args:
        verbose: when truthy (and ``debug`` is not), log at INFO.
        debug: when truthy, log at DEBUG; takes precedence over ``verbose``.

    With neither flag set the logger's level is left untouched.
    """
    if debug:
        log.setLevel("DEBUG")
    elif verbose:
        log.setLevel("INFO")
def parse_arguments(argv=None):
    """Parse the command-line options.

    Args:
        argv: optional list of argument strings; when None, argparse reads
            them from ``sys.argv``.

    Returns:
        argparse.Namespace with ``verbose``, ``debug``, ``string`` and
        ``puny_only`` attributes.
    """
    # Pass the text as description=; the first positional argument of
    # ArgumentParser is the program name shown in the usage line.
    parser = argparse.ArgumentParser(
        description="Build a PowerShell-compatible regex that matches "
                    "homoglyphs of a string")
    parser.add_argument("--verbose", "-v",
                        help="Turn verbosity on",
                        action='store_true')
    parser.add_argument("--debug", "-d",
                        help="Turn debugging on",
                        action='store_true')
    parser.add_argument("--string", "-s",
                        help="string to transform into regex",
                        required=True)
    parser.add_argument("--puny_only", "-p",
                        help="use only puny compliant chars",
                        action='store_true')
    return parser.parse_args(argv)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment