Last active
March 6, 2018 04:01
-
-
Save daformat/a65f114f5c3ad4fe6ae6a16f447f67dc to your computer and use it in GitHub Desktop.
Testing which utf spaces matches against which regular expression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Spaces information was found on http://jkorpela.fi/chars/spaces.html
// and https://en.wikipedia.org/wiki/Tab_key#Unicode
{ | |
// Display booleans as ✅ / ❌ emojis when true; show the raw boolean otherwise.
const useEmojiForBooleans = true;

// Spaces are to be tested against the following regular expressions.
// The stringified regexp is used as the column header of the result table,
// so the pattern sources below must stay exactly as written.
const testRegexps = [
  /\s/, // White-space character class
  /[ \t]/, // Plain space or horizontal tab
  /\S/, // Negated white-space character class
  /[^ \t]/, // Anything except a plain space or horizontal tab
  // Explicit list: plain space, a hand-picked set of Unicode spaces, and tab
  /[ \u200A\u200B\u205F\u1680\u2000\u2001\u2002\u2009\u2008\t]/,
];
// We're testing against utf escaping sequences, the rest is informative.
//
// Each entry describes one space-like character:
//   name     - human readable name, shown in the table
//   utf      - the single character that is matched against the regexps
//   html     - optional list of HTML character references for the character;
//              when omitted, a numeric reference is derived from `utf`
//   breaking - optional, informative flag shown in the table
//   width    - informative description of the rendered width
const utfSpaces = [
  {
    name: 'Space',
    utf: '\u0020',
    breaking: true,
    width: 'Typically 1/4 em',
  },
  {
    name: 'Thin space',
    utf: '\u2009',
    breaking: true,
    width: '1/5 em, can be 1/6 em',
  },
  {
    name: 'Hair space',
    utf: '\u200A',
    breaking: true,
    width: 'Narrower than a thin space (less than 1/5 em or 1/6em)',
  },
  {
    name: 'Zero width space',
    utf: '\u200B',
    breaking: true,
    width: 'None (invisible character)',
  },
  {
    name: 'Medium mathematical space',
    utf: '\u205F',
    breaking: true,
    width: '4/18 em',
  },
  {
    name: 'Ogham space mark',
    utf: '\u1680',
    breaking: true,
    width: 'Usually represented by a 1em dash',
  },
  {
    name: 'Mongolian vowel separator',
    utf: '\u180E',
    breaking: false,
    width: 'None (invisible character)',
  },
  {
    name: 'EN quad',
    utf: '\u2000',
    breaking: true,
    width: '1 en (1/2 em)',
  },
  {
    name: 'EM quad',
    utf: '\u2001',
    breaking: true,
    width: '1 em',
  },
  {
    name: 'EN space',
    utf: '\u2002',
    breaking: true,
    width: '1 en (1/2 em)',
  },
  {
    name: 'EM space',
    utf: '\u2003',
    breaking: false,
    width: '1 em',
  },
  {
    name: 'Three-per-em space',
    utf: '\u2004',
    breaking: false,
    width: '1/3 em',
  },
  {
    name: 'Four-per-em space',
    utf: '\u2005',
    breaking: false,
    width: '1/4 em',
  },
  {
    name: 'Six-per-em space',
    utf: '\u2006',
    breaking: false,
    width: '1/6 em',
  },
  {
    name: 'Non breaking space',
    utf: '\u00A0',
    // NOTE(review): the references must be kept as literal entity text (not
    // the decoded character) so they stay readable in the table — confirm the
    // exact set and order against the author's intent.
    html: ['&nbsp;', '&NonBreakingSpace;', '&#160;', '&#xA0;'],
    breaking: false,
    width: 'Typically 1/4 em, same as a regular space but usually not adjusted with justification',
  },
  {
    name: 'Narrow no-break space',
    utf: '\u202F',
    // NOTE(review): `&nnbsp;` does not appear to be a standard HTML named
    // character reference — confirm before relying on it.
    html: ['&nnbsp;', '&#8239;'],
    breaking: false,
    width: 'Narrower than a non-breaking or breaking space',
  },
  {
    name: 'Figure space',
    utf: '\u2007',
    html: ['&numsp;'],
    breaking: false,
    width: 'The width of digits (tabular space)',
  },
  {
    name: 'Punctuation space',
    utf: '\u2008',
    breaking: true,
    width: 'Width of a period (.)',
  },
  {
    name: 'Word joiner',
    utf: '\u2060',
    // NOTE(review): `&wj;` does not appear to be a standard HTML named
    // character reference (HTML defines `&NoBreak;` for U+2060) — confirm.
    // The numeric reference needs the `#` to be well-formed.
    html: ['&wj;', '&#8288;'],
    breaking: false,
    width: 'None (invisible character)',
  },
  {
    name: 'Ideographic space',
    utf: '\u3000',
    breaking: false,
    width: 'The width of ideographic (CJK) characters',
  },
  {
    name: 'Zero width no-break space (BOM often intepreted as)',
    utf: '\uFEFF',
    breaking: false,
    width: 'None (invisible character)',
  },
  {
    name: 'Character tabulation',
    utf: '\u0009',
    html: ['&Tab;', '&#9;'],
    width: 'Up to the next tab stop',
  },
  {
    name: 'Line tabulation',
    utf: '\u000B',
    width: 'doesn’t apply (vertical)',
  },
];
// Emoji helper
/**
 * Formats a test result for display in the table.
 *
 * @param {boolean|undefined} bool - Result to format.
 * @returns {string|boolean|undefined} '✅' or '❌' when `useEmojiForBooleans`
 *   is enabled and `bool` is defined; otherwise `bool` is returned untouched
 *   (so a missing value stays `undefined` instead of being shown as ❌).
 */
const emojify = (bool) => {
  if (typeof bool === 'undefined' || !useEmojiForBooleans) {
    return bool;
  }
  return bool ? '✅' : '❌';
};
// Build one table row per space: its name, its \uXXXX escape, the raw
// character, its HTML reference(s), the informative `breaking` flag and one
// column per tested regular expression.
const testTable = utfSpaces.map((space) => {
  const codeUnit = space.utf.charCodeAt(0);
  // Upper-case hex, left-padded to four digits, e.g. 0x20 -> "0020".
  const paddedHex = `0000${codeUnit.toString(16).toUpperCase()}`.slice(-4);

  const testedSpace = {
    name: space.name,
    utf: `\\u${paddedHex}`,
    'utf unescaped': space.utf,
    // Fall back to a numeric character reference when none is listed.
    html: (space.html && space.html.join(' ')) || `&#${codeUnit};`,
    breaking: emojify(space.breaking),
  };

  // The stringified regexp (e.g. "/\s/") is the column header.
  for (const regexp of testRegexps) {
    testedSpace[`${regexp}`] = emojify(regexp.test(space.utf));
  }

  return testedSpace;
});

console.table(testTable);
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment