Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
collapses alternation groups in a regex
const trie = require('trie-prefix-tree');
const tldList = require('tlds');
//const corpus = tldList;
const corpus = [ 'ca', 'cat', 'cot', 'car', 'catastrophe', 'catastrophic', 'dog' ];
//const corpus = [ 'cat' ];
const tlds = trie( corpus ).tree();
//
// { a: { $: 1 } } -> a
// { a: { b: { $: 1 } } } -> ab
// { a: { b: { $: 1 }, c: { $: 1 } } } -> a(?:b|c)
// { a: { b: { $: 1 }, c: { $: 1 }, $: 1 } } -> a(?:b|c)?
// { a: { b: { c: { $: 1 }, $: 1 }, c: { $: 1 }, $: 1 } } -> a(?:bc?|c)?
// { a: { b: { c: { d: { $: 1 } }, $: 1 }, c: { $: 1 }, $: 1 } } -> a(?:b(?:c|d)?|c)?
//
const ks = (a,b)=>a.length===b.length?a.localeCompare(b):b.length-a.length;
const kk = o => Object.keys(o).sort(ks);
const print = oo => {
let o = JSON.parse(JSON.stringify(oo));
const keys = kk(o);
const collapse = so => {
kk(so).forEach( k => {
collapse( so[ k ] );
const ck = kk( so[ k ] );
if ( ck.length === 1 && ck[ 0 ] === '$' ) {
so[ k ] = so[ k ][ ck[ 0 ] ];
}
const key = ck.length === 1 && ck[ 0 ] !== '$' ? `${ k }${ ck[ 0 ] }` : k;
if ( key !== k ) {
so[ key ] = so[ k ][ ck[ 0 ] ];
delete so[ k ];
}
} )
};
collapse( o );
console.log( JSON.stringify( o, null, 2 ) );
const toList = so => {
return kk(so).map( k => {
if ( so[ k ] === 1 ) {
return [ k, k.length ];
}
const c = toList( so[ k ] );
return [ k, c.reduce( ( sum, [k, n, s] ) => sum + n + k.length, 0 ), c ];
} );
}
const sort = l => {
if ( ! Array.isArray( l ) ) {
return;
}
l.forEach( sort );
l.sort( ( a, b ) => b[1] - a[1] );
};
const nullsToEnd = l => l.map( item => {
if ( ! Array.isArray( item ) ) {
return item;
}
const next = [ ...item.filter( a => a !== null ), ...item.filter( a => a === null ) ];
return nullsToEnd( next );
} );
const finish = l => l.map( ( [ a, b, c ] ) => {
if ( Array.isArray( c ) ) {
return [ a, finish( c ) ];
}
if ( c ) {
return [ a, c ];
}
if ( a === '$' ) {
return null;
}
return [ a ];
} );
const r = l => {
if ( ! Array.isArray( l ) ) {
return l;
}
if ( l.every( i => Array.isArray( i ) && i.length === 1 && 'string' === typeof i[ 0 ] ) ) {
return [ l.map( i => i[ 0 ] ).join( '|' ) ];
}
if ( l.length === 2 && 'string' === typeof l[ 0 ] ) {
const inner = l[ 1 ];
const maybe = inner[ inner.length - 1 ] === null;
const innerLength = maybe ? inner.length - 1 : inner.length;
if ( innerLength === 1 ) {
return maybe
? `(?:${ l[ 0 ] }(?:${ r( l[ 1 ][ 0 ] ) })?)`
: `(?:${ l[ 0 ] }${ r( l[ 1 ][ 0 ] ) })`
}
return maybe
? `(?:${ l[ 0 ] }(?:${ l[ 1 ].slice( 0, -1 ).map( r ).join( '|' ) })?)`
: `(?:${ l[ 0 ] }(?:${ l[ 1 ].map( r ).join( '|' ) }))`;
}
return l.map( r );
};
let output = o;
output = toList( o );
sort( output );
output = finish( output );
output = nullsToEnd( output );
output = r( output ).join('|');
return output;
}
console.log('^(?:' + print(tlds) + ')$' );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment