Skip to content

Instantly share code, notes, and snippets.

@daformat
Last active March 6, 2018 04:01
Show Gist options
  • Save daformat/a65f114f5c3ad4fe6ae6a16f447f67dc to your computer and use it in GitHub Desktop.
Save daformat/a65f114f5c3ad4fe6ae6a16f447f67dc to your computer and use it in GitHub Desktop.
Testing which Unicode space characters match which regular expressions
// Spaces information was found on http://jkorpela.fi/chars/spaces.html
// and https://en.wikipedia.org/wiki/Tab_key#Unicode
{
// Output toggle: when true, defined boolean results are rendered as the
// ✅ / ❌ emojis; when false, the raw boolean values are shown instead.
const useEmojiForBooleans = true;
// Every space character is tested against each of the following regular
// expressions. The stringified regexp is used as the column key in the
// results table, so editing a pattern also renames its column.
const testRegexps = [
/\s/, // White-space character class
/[ \t]/, // Plain space (U+0020) or character tabulation only
/\S/, // Non-white-space character class (complement of \s)
/[^ \t]/, // Anything except a plain space or character tabulation
// Explicit list: space, hair space, zero width space, medium mathematical
// space, Ogham space mark, EN quad, EM quad, EN space, thin space,
// punctuation space and character tabulation
/[ \u200A\u200B\u205F\u1680\u2000\u2001\u2002\u2009\u2008\t]/
];
// Catalogue of the space-like characters under test.
//
// Only `utf` (the character itself, written as an escape sequence) is matched
// against the regular expressions; the other fields are informative:
//   - name:     human-readable character name
//   - html:     optional list of HTML character references, kept as *text*
//               (e.g. '&nbsp;') so they stay readable in the printed table;
//               when absent a numeric reference is derived from `utf`
//   - breaking: whether a line break is allowed at the character; values are
//               intended to follow the Unicode line-breaking classes (UAX #14).
//               Omitted for the tabulation characters.
//   - width:    free-form description of the typical advance width
const utfSpaces = [
  {
    name: 'Space',
    utf: '\u0020',
    breaking: true,
    width: 'Typically 1/4 em'
  },
  {
    name: 'Thin space',
    utf: '\u2009',
    breaking: true,
    width: '1/5 em, can be 1/6 em'
  },
  {
    name: 'Hair space',
    utf: '\u200A',
    breaking: true,
    width: 'Narrower than a thin space (less than 1/5 em or 1/6em)'
  },
  {
    name: 'Zero width space',
    utf: '\u200B',
    breaking: true,
    width: 'None (invisible character)'
  },
  {
    name: 'Medium mathematical space',
    utf: '\u205F',
    breaking: true,
    width: '4/18 em'
  },
  {
    name: 'Ogham space mark',
    utf: '\u1680',
    breaking: true,
    width: 'Usually represented by a 1em dash'
  },
  {
    name: 'Mongolian vowel separator',
    utf: '\u180E',
    breaking: false,
    width: 'None (invisible character)'
  },
  {
    name: 'EN quad',
    utf: '\u2000',
    breaking: true,
    width: '1 en (1/2 em)'
  },
  {
    name: 'EM quad',
    utf: '\u2001',
    breaking: true,
    width: '1 em'
  },
  {
    name: 'EN space',
    utf: '\u2002',
    breaking: true,
    width: '1 en (1/2 em)'
  },
  {
    // U+2003..U+2006 are fixed-width *breaking* spaces, like U+2000..U+2002.
    name: 'EM space',
    utf: '\u2003',
    breaking: true,
    width: '1 em'
  },
  {
    name: 'Three-per-em space',
    utf: '\u2004',
    breaking: true,
    width: '1/3 em'
  },
  {
    name: 'Four-per-em space',
    utf: '\u2005',
    breaking: true,
    width: '1/4 em'
  },
  {
    name: 'Six-per-em space',
    utf: '\u2006',
    breaking: true,
    width: '1/6 em'
  },
  {
    name: 'Non breaking space',
    utf: '\u00A0',
    html: ['&nbsp;', '&NonBreakingSpace;', '&#160;', '&#xA0;'],
    breaking: false,
    width: 'Typically 1/4 em, same as a regular space but usually not adjusted with justification'
  },
  {
    name: 'Narrow no-break space',
    utf: '\u202F',
    // NOTE(review): '&nnbsp;' does not appear to be a standard HTML named
    // character reference - confirm before relying on it in markup.
    html: ['&nnbsp;', '&#8239;'],
    breaking: false,
    width: 'Narrower than a non-breaking or breaking space'
  },
  {
    name: 'Figure space',
    utf: '\u2007',
    html: ['&numsp;'],
    breaking: false,
    width: 'The width of digits (tabular space)'
  },
  {
    name: 'Punctuation space',
    utf: '\u2008',
    breaking: true,
    width: 'Width of a period (.)'
  },
  {
    name: 'Word joiner',
    utf: '\u2060',
    // NOTE(review): '&wj;' may not be a standard HTML named character
    // reference - confirm. The numeric form needs the leading '#'.
    html: ['&wj;', '&#8288;'],
    breaking: false,
    width: 'None (invisible character)'
  },
  {
    // Ideographic space is a breaking space as well.
    name: 'Ideographic space',
    utf: '\u3000',
    breaking: true,
    width: 'The width of ideographic (CJK) characters'
  },
  {
    name: 'Zero width no-break space (BOM often intepreted as)',
    utf: '\uFEFF',
    breaking: false,
    width: 'None (invisible character)'
  },
  {
    name: 'Character tabulation',
    utf: '\u0009',
    html: ['&Tab;', '&#9;'],
    width: 'Up to the next tab stop'
  },
  {
    name: 'Line tabulation',
    utf: '\u000B',
    width: 'doesn’t apply (vertical)'
  }
];
// Emoji helper
/**
 * Formats a test result for display in the results table.
 *
 * @param {boolean|undefined} bool - Value to format; `undefined` means the
 *   information is not available for that entry.
 * @returns {string|boolean|undefined} '✅' or '❌' when `useEmojiForBooleans`
 *   is truthy and `bool` is defined; otherwise `bool` is returned unchanged.
 */
const emojify = bool => {
  // Leave missing values and the "raw booleans" mode untouched.
  if (typeof bool === 'undefined' || !useEmojiForBooleans) {
    return bool;
  }
  return bool ? '✅' : '❌';
};
// Build one table row per space character: descriptive columns first, then
// one column per regular expression (keyed by the stringified regexp) telling
// whether that expression matches the character. The table is then printed.
const testTable = utfSpaces.map(space => {
  // All tested characters are single UTF-16 code units.
  const codeUnit = space.utf.charCodeAt(0);
  // Zero-padded, upper-case hexadecimal form, e.g. 0x20 -> '0020'.
  const hex = `0000${codeUnit.toString(16).toUpperCase()}`.slice(-4);
  const testedSpace = {
    name: space.name,
    // Visible '\uXXXX' escape sequence for the character.
    utf: `\\u${hex}`,
    'utf unescaped': space.utf,
    // Prefer the listed character references; otherwise (missing or empty
    // list) fall back to the decimal numeric reference.
    html: (space.html && space.html.join(' ')) || `&#${codeUnit};`,
    breaking: emojify(space.breaking)
  };
  for (const regexp of testRegexps) {
    testedSpace[`${regexp}`] = emojify(regexp.test(space.utf));
  }
  return testedSpace;
});
console.table(testTable);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment