Skip to content

Instantly share code, notes, and snippets.

@gf3
Last active Dec 21, 2017
Embed
What would you like to do?
Validate and correct misspelled emails using a fuzzy matcher
/* eslint-env jasmine */
import { validate, suggest } from './email';
describe('email', () => {
  // Shared plumbing for the specs below. Each helper returns a promise that
  // fulfils when the expectation holds and rejects otherwise.

  // Fulfils only when `validate` rejects the address.
  const expectInvalid = (address) =>
    validate(address).then(() => { throw 'fail'; }, () => true);

  // Fulfils only when `suggest` offers nothing for the address.
  const expectNoSuggestion = (address) =>
    suggest(address).then((e) => { throw `fail: ${e}`; }, () => true);

  // Fulfils only when `suggest` offers exactly `expected` for the address.
  const expectSuggestion = (address, expected) =>
    suggest(address).then(
      (e) => expect(e).toEqual(expected),
      () => { throw `fail: ${address}`; }
    );

  // Completes the spec once every promise fulfils; fails it on the first rejection.
  const finish = (promises, done) => {
    Promise.all(promises).then(done, done.fail);
  };

  describe('validate', () => {
    it('should correctly validate correct email addresses', (done) => {
      const addresses = [
        'user@gmail.com',
        'user+dix@host.ca',
        'user+dix@sub.domain.tld',
        'gianni@butt.zone'
      ];
      finish(addresses.map((address) => validate(address)), done);
    });

    it('should correctly validate incorrect email addresses', (done) => {
      const addresses = [
        '@',
        'user name@domain.com',
        'user@domain lol.com',
        'user@domain.c om',
        'gmail.com',
        '@gmail.com',
        'user@gmail.',
        'user@.com'
      ];
      finish(addresses.map((address) => expectInvalid(address)), done);
    });

    it('should correctly validate an email address with unicode characters', (done) => {
      validate('snædis@💩.com').then(done, done.fail);
    });
  });

  describe('suggest', () => {
    it('should not suggest correct domains & tlds', (done) => {
      const addresses = [
        'user@gmail.com',
        'user@hotmail.co.uk'
      ];
      finish(addresses.map((address) => expectNoSuggestion(address)), done);
    });

    it('should not suggest correct but unknown domains & tlds', (done) => {
      const addresses = [
        'user@universe.com'
      ];
      finish(addresses.map((address) => expectNoSuggestion(address)), done);
    });

    it('should not suggest emails that are too far gone', (done) => {
      const addresses = [
        'user@gmaillolobuzz',
        'user@mailhotmail.co.uk.mailhotmail'
      ];
      finish(addresses.map((address) => expectNoSuggestion(address)), done);
    });

    it('should suggest misspelled domains', (done) => {
      const cases = [
        ['user@gnail.com', 'user@gmail.com'],
        ['user@hotmale.co.uk', 'user@hotmail.co.uk']
      ];
      finish(cases.map(([address, expected]) => expectSuggestion(address, expected)), done);
    });

    it('should suggest misspelled TLDs', (done) => {
      const cases = [
        ['user@gmail.cmo', 'user@gmail.com'],
        ['user@gmail.cm', 'user@gmail.com'],
        ['user@hotmail.ci.uk', 'user@hotmail.co.uk'],
        ['user@hotmail.couk', 'user@hotmail.co.uk']
      ];
      finish(cases.map(([address, expected]) => expectSuggestion(address, expected)), done);
    });

    it('should suggest misspelled TLDs for unknown domains', (done) => {
      const cases = [
        ['gianni@universe.cmo', 'gianni@universe.com'],
        ['gianni@universecom', 'gianni@universe.com']
      ];
      finish(cases.map(([address, expected]) => expectSuggestion(address, expected)), done);
    });

    it('should suggest misspelled domains and TLDs', (done) => {
      const cases = [
        ['user@gmailcom', 'user@gmail.com'],
        ['user@gnailcon', 'user@gmail.com'],
        ['user@hotmailco.uk', 'user@hotmail.co.uk']
      ];
      finish(cases.map(([address, expected]) => expectSuggestion(address, expected)), done);
    });
  });
});
/**
* @flow
*/
import { sift4 } from './sift';
//-----------------------------------------------------------------------------
// Misc.
//-----------------------------------------------------------------------------
/**
 * Deliberately loose email shape check. The whole string must consist of:
 * one or more non-whitespace characters, an `@`, one or more non-whitespace
 * characters, a literal `.`, and one or more non-whitespace characters.
 * The `u` flag makes the pattern operate on Unicode code points.
 */
export const EMAIL = /^\S+@\S+\.\S+$/u;
/**
 * Known full mail domains (host plus TLD). Used both for exact membership
 * checks and as the candidate list for fuzzy matching.
 *
 * Every entry appears exactly once. Order matters: when several candidates are
 * equally close to a misspelled domain, the earliest entry wins, so entries
 * must not be reordered casually. Domains that are popular in several regions
 * are listed only in the first section that mentions them.
 */
const DOMAINS = [
  /* Default domains included */
  'aol.com', 'att.net', 'comcast.net', 'facebook.com', 'gmail.com', 'gmx.com', 'googlemail.com',
  'google.com', 'hotmail.com', 'hotmail.co.uk', 'mac.com', 'me.com', 'mail.com', 'msn.com',
  'live.com', 'sbcglobal.net', 'verizon.net', 'yahoo.com', 'yahoo.co.uk',
  /* Other global domains */
  'email.com', 'games.com' /* AOL */, 'gmx.net', 'hush.com', 'hushmail.com', 'icloud.com', 'inbox.com',
  'lavabit.com', 'love.com' /* AOL */, 'outlook.com', 'pobox.com', 'rocketmail.com' /* Yahoo */,
  'safe-mail.net', 'wow.com' /* AOL */, 'ygm.com' /* AOL */, 'ymail.com' /* Yahoo */, 'zoho.com', 'fastmail.fm',
  'yandex.com',
  /* United States ISP domains (comcast.net is listed under the defaults) */
  'bellsouth.net', 'charter.net', 'cox.net', 'earthlink.net', 'juno.com',
  /* British ISP domains */
  'btinternet.com', 'virginmedia.com', 'blueyonder.co.uk', 'freeserve.co.uk', 'live.co.uk',
  'ntlworld.com', 'o2.co.uk', 'orange.net', 'sky.com', 'talktalk.co.uk', 'tiscali.co.uk',
  'virgin.net', 'wanadoo.co.uk', 'bt.com',
  /* Domains used in Asia */
  'sina.com', 'qq.com', 'naver.com', 'hanmail.net', 'daum.net', 'nate.com', 'yahoo.co.jp', 'yahoo.co.kr',
  'yahoo.co.id', 'yahoo.co.in', 'yahoo.com.sg', 'yahoo.com.ph',
  /* French ISP domains */
  'hotmail.fr', 'live.fr', 'laposte.net', 'yahoo.fr', 'wanadoo.fr', 'orange.fr', 'gmx.fr', 'sfr.fr',
  'neuf.fr', 'free.fr',
  /* German ISP domains */
  'gmx.de', 'hotmail.de', 'live.de', 'online.de', 't-online.de' /* T-Mobile */, 'web.de', 'yahoo.de',
  /* Russian ISP domains */
  'mail.ru', 'rambler.ru', 'yandex.ru', 'ya.ru', 'list.ru',
  /* Belgian ISP domains */
  'hotmail.be', 'live.be', 'skynet.be', 'voo.be', 'tvcablenet.be', 'telenet.be',
  /* Argentinian ISP domains */
  'hotmail.com.ar', 'live.com.ar', 'yahoo.com.ar', 'fibertel.com.ar', 'speedy.com.ar', 'arnet.com.ar',
  /* Domains used in Mexico (hotmail.com, gmail.com, yahoo.com, live.com and msn.com are listed under the defaults) */
  'yahoo.com.mx', 'live.com.mx', 'hotmail.es', 'hotmail.com.mx', 'prodigy.net.mx',
  /* Domains used in Brazil */
  'yahoo.com.br', 'hotmail.com.br', 'outlook.com.br', 'uol.com.br', 'bol.com.br', 'terra.com.br',
  'ig.com.br', 'itelefonica.com.br', 'r7.com', 'zipmail.com.br', 'globo.com', 'globomail.com', 'oi.com.br'
];
/**
 * Known mail host names without any top-level domain, used as fuzzy-match
 * candidates for the host part of an address. Kept in alphabetical order,
 * one initial letter per row.
 */
const HOSTS = [
  'aim', 'aol', 'att',
  'bellsouth', 'btinternet',
  'charter', 'comcast', 'cox',
  'earthlink',
  'gmail', 'google', 'googlemail',
  'icloud',
  'mac', 'me', 'msn',
  'optonline', 'optusnet',
  'qq',
  'rocketmail', 'rogers',
  'sbcglobal', 'shaw', 'sky', 'sympatico',
  'telus',
  'verizon',
  'web',
  'xtra',
  'ymail'
];
/**
 * Known top-level domains (written without a leading dot), used as fuzzy-match
 * candidates for the TLD part of an address.
 *
 * Every entry appears exactly once. Order matters: when several candidates are
 * equally close to a misspelled TLD, the earliest entry wins.
 */
const TLDS = [
  'com', 'com.au', 'com.tw', 'ca', 'co.nz', 'co.uk', 'de', 'fr', 'it', 'ru',
  'net', 'org', 'edu', 'gov', 'jp', 'nl', 'kr', 'se', 'eu', 'ie', 'co.il',
  'us', 'at', 'be', 'dk', 'hk', 'es', 'gr', 'ch', 'no', 'cz', 'in',
  'net.au', 'info', 'biz', 'mil', 'co.jp', 'sg', 'hu', 'uk'
];
//-----------------------------------------------------------------------------
// Validate
//-----------------------------------------------------------------------------
export function validate(email: string): Promise<void> {
if (!EMAIL.test(email)) {
return Promise.reject();
}
return Promise.resolve();
}
//-----------------------------------------------------------------------------
// Suggest
//-----------------------------------------------------------------------------
// A candidate string paired with its computed distance from a search term.
type Match = {
// Distance between the search term and `match`; smaller means more similar.
distance: number;
// The candidate string the distance was measured against.
match: string;
};
function findClosest(haystack: Array<string>, needle: string, threshold: number = 2): ?Match {
const closest: ?Match = haystack.reduce((prev: ?Match, d: string) => {
const distance = sift4(needle, d, 5, 13);
const current: Match = {
distance,
match: d
};
if (!prev) {
return current;
}
return (current.distance < prev.distance) ? current : prev;
}, undefined);
if (closest && closest.distance > threshold) {
return undefined;
}
return closest;
}
export function suggest(email: string): Promise<string> {
const match = /(\S+?@)(\S+?(\.\S{2,}|\S{3})?)$/u.exec(email);
if (!match) {
return Promise.reject();
}
const [, user, domain, tld] = match;
// Check full domain
if (DOMAINS.includes(domain)) {
return Promise.reject();
}
const closestDomain = findClosest(DOMAINS, domain);
if (closestDomain) {
return Promise.resolve(`${user}${closestDomain.match}`);
}
// Check host and top-level domains
if (tld) {
const host = domain.slice(0, (-1 * tld.length));
const closestHost = findClosest(HOSTS, host);
const strippedTld = (tld[0] === '.')
? tld.slice(1)
: tld;
const threshold = (domain.indexOf('.') >= 0) ? 2 : 1;
const closestTld = findClosest(TLDS, strippedTld, threshold);
if (!closestHost) {
if (!closestTld) {
return Promise.reject();
}
let suggest = `${user}${host}.${closestTld.match}`;
if (suggest === email) {
return Promise.reject();
}
return Promise.resolve(suggest);
}
else if (!closestTld || ((closestHost.distance === 0) && (closestTld.distance === 0))) {
return Promise.reject();
}
return Promise.resolve(`${user}${closestHost.match}.${closestTld.match}`);
}
return Promise.reject();
}
/* eslint-disable */
// Sift4 - common version
/**
 * Computes an approximate edit distance between two strings in a single
 * forward pass, counting unmatched characters plus detected transpositions.
 *
 * @param {?string} s1 - First string. A missing or empty value yields the
 *   length of `s2` (or 0 when `s2` is missing too).
 * @param {?string} s2 - Second string. A missing or empty value yields the
 *   length of `s1`.
 * @param {number} maxOffset - How many characters ahead to search for a
 *   matching letter after a mismatch.
 * @param {number} [maxDistance] - When truthy, the computation stops as soon
 *   as the distance accumulated so far reaches this value, and that partial
 *   distance is returned (the strings are too different anyway).
 * @returns {number} The rounded distance; 0 for identical strings.
 */
export function sift4(s1, s2, maxOffset, maxDistance) {
  if (!s1 || !s1.length) {
    if (!s2) {
      return 0;
    }
    return s2.length;
  }
  if (!s2 || !s2.length) {
    return s1.length;
  }
  const l1 = s1.length;
  const l2 = s2.length;
  let c1 = 0; // cursor for string 1
  let c2 = 0; // cursor for string 2
  let lcss = 0; // largest common subsequence
  let local_cs = 0; // local common substring (the run currently being matched)
  let trans = 0; // number of transpositions ('ab' vs 'ba')
  const offset_arr = []; // offset pair array, for computing the transpositions
  while ((c1 < l1) && (c2 < l2)) {
    if (s1.charAt(c1) === s2.charAt(c2)) {
      local_cs++;
      let isTrans = false;
      // see if current match is a transposition
      let i = 0;
      while (i < offset_arr.length) {
        const ofs = offset_arr[i];
        if (c1 <= ofs.c1 || c2 <= ofs.c2) {
          // when two matches cross, the one considered a transposition is the
          // one with the largest difference in offsets
          isTrans = Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
          if (isTrans) {
            trans++;
          } else {
            if (!ofs.trans) {
              ofs.trans = true;
              trans++;
            }
          }
          break;
        } else {
          if (c1 > ofs.c2 && c2 > ofs.c1) {
            offset_arr.splice(i, 1);
          } else {
            i++;
          }
        }
      }
      offset_arr.push({
        c1: c1,
        c2: c2,
        trans: isTrans
      });
    } else {
      lcss += local_cs;
      local_cs = 0;
      if (c1 !== c2) {
        c1 = c2 = Math.min(c1, c2); // using min allows the computation of transpositions
      }
      // if matching characters are found, remove 1 from both cursors (they get
      // incremented at the end of the loop) so that we can have only one code
      // block handling matches
      for (let i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
        if ((c1 + i < l1) && (s1.charAt(c1 + i) === s2.charAt(c2))) {
          c1 += i - 1;
          c2--;
          break;
        }
        if ((c2 + i < l2) && (s1.charAt(c1) === s2.charAt(c2 + i))) {
          c1--;
          c2 += i - 1;
          break;
        }
      }
    }
    c1++;
    c2++;
    if (maxDistance) {
      // Count the run still in progress (local_cs) as matched. Without it a
      // long shared prefix looks like pure distance and triggers a bogus early
      // exit, e.g. identical strings of maxDistance or more characters.
      const temporaryDistance = Math.max(c1, c2) - (lcss + local_cs) + trans;
      if (temporaryDistance >= maxDistance) return Math.round(temporaryDistance);
    }
    // this covers the case where the last match is on the last token in list,
    // so that it can compute transpositions correctly
    if ((c1 >= l1) || (c2 >= l2)) {
      lcss += local_cs;
      local_cs = 0;
      c1 = c2 = Math.min(c1, c2);
    }
  }
  lcss += local_cs;
  return Math.round(Math.max(l1, l2) - lcss + trans); // add the cost of transpositions to the final result
}
/* eslint-enable */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment