Created
July 13, 2021 12:28
-
-
Save JeffreyMFarley/3084f8c6119806e7d646884c823e4815 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# U+00B7 MIDDLE DOT. Kept as an integer ordinal: str.translate accepts an
# ordinal, a string, or None as a replacement value.
BULLET = 0xB7


def buildPunctuationReplace():
    """Build a translation table that normalizes Unicode punctuation.

    The returned dict maps Unicode code points (ints) to their replacement:
    a string, an integer ordinal (``BULLET``), or ``None`` to delete the
    character.  It is in the format expected by ``str.translate``, e.g.
    ``text.translate(buildPunctuationReplace())``.

    Typographic spaces become a plain space, dashes become ``-``, smart
    quotes and primes become ``'`` or ``"``, the various bullet glyphs are
    unified to ``BULLET``, and zero-width characters are removed.

    Returns:
        dict: code point -> replacement (``str``, ``int`` or ``None``).
    """
    table = {
        0xa0: ' ',         # non-breaking space
        0xa6: '|',         # broken bar
        0xb4: '\'',        # acute accent
        0xb6: '*',         # pilcrow sign
        0xd7: 'x',         # multiplication sign
        0x2022: BULLET,    # bullet
        0x2023: BULLET,    # triangular bullet
        0x2024: '.',       # one dot leader
        0x2026: '...',     # ellipsis: expand instead of silently dropping it
        0x2027: '*',       # hyphenation point
        0x2028: ' ',       # line separator
        0x2029: ' ',       # paragraph separator
        0x202f: ' ',       # narrow no-break space
        0x2032: "'",       # prime
        0x2033: '"',       # double prime
        0x2035: "'",       # reversed prime
        0x2036: '"',       # reversed double prime
        0x2039: '<',       # single left-pointing angle quotation mark
        0x203a: '>',       # single right-pointing angle quotation mark
        0x2043: BULLET,    # hyphen bullet
        0x2044: '/',       # fraction slash
        0x204e: '*',       # low asterisk
        0x2053: '~',       # swung dash
        0x205F: ' ',       # medium mathematical space
        0x2060: None,      # word joiner: zero-width, so delete rather than add a space
        0x2219: BULLET,    # bullet operator
        0x25CB: BULLET,    # white circle
        0x25A1: BULLET,    # white square
        0x25CF: BULLET,    # black circle
        0x25E6: BULLET,    # white bullet
        0x2610: BULLET,    # ballot box
        0x2612: BULLET,    # ballot box with x
        0x3000: ' ',       # ideographic space
    }

    # NOTE: range() excludes its end value, so each end below is one past the
    # last code point of the family being mapped.
    # U+2000..U+200A: en/em/thin/hair and other fixed-width spaces.
    table.update({c: ' ' for c in range(0x2000, 0x200b)})
    # U+200B..U+200D: zero-width space, non-joiner and joiner are deleted.
    table.update({c: None for c in range(0x200b, 0x200e)})
    # U+2010..U+2015: hyphen, non-breaking hyphen, figure/en/em dash,
    # horizontal bar.
    table.update({c: '-' for c in range(0x2010, 0x2016)})
    # U+2018..U+201B: left, right, low-9 and high-reversed-9 single quotes.
    table.update({c: "'" for c in range(0x2018, 0x201c)})
    # U+201C..U+201F: left, right, low-9 and high-reversed-9 double quotes.
    table.update({c: '"' for c in range(0x201c, 0x2020)})
    return table
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If the translation table is character to character, use