Last active
August 29, 2024 12:00
-
-
Save bacloud22/454cf8617d5749a38deb21c499d9eefe to your computer and use it in GitHub Desktop.
This is a very strong text sanitizer that accepts plain natural language or even HTML. The pipeline is efficient and probably contains everything you need to accept clean HTML. You can omit language detection if you don't need it, as it is a resource-intensive operation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// @flow | |
import { promisify } from 'util' | |
import { tidy } from 'htmltidy2' | |
import DOMPurify from 'isomorphic-dompurify' | |
import { createRequire } from 'module' | |
import { stringTransformer } from './strings.js' | |
const require = createRequire(import.meta.url) | |
const { LanguageDetectorBuilder } = require('../node_modules/@pemistahl/lingua/node/lingua') | |
// Languages the detector is built for. Keys are the names handed to the
// lingua builder; values are the short codes `safeText` reports back.
const langMap = {
  Arabic: 'ar',
  English: 'en',
  French: 'fr',
}

// Build the detector once at module load, restricted to the languages above
// and with their models preloaded.
const detectorBuilder = LanguageDetectorBuilder.fromLanguages(...Object.keys(langMap))
const detector = detectorBuilder.withPreloadedLanguageModels().build()
// Options passed to HTML Tidy on every call made by `safeText`.
const tidyOpts = {
  hideComments: false,
  'show-body-only': true,
  'vertical-space': 'auto',
}

// Promise-returning wrapper around the callback-style `tidy` export,
// so it can be awaited.
const tidyP = promisify(tidy)
/**
 * Sanitize user-submitted rich text (plain natural language or HTML).
 *
 * Steps, in order:
 *  1. collapse `<p><br></p>` and doubled `<br>` into a single `<br>`;
 *  2. build a tag-free copy of the text and detect its language from it;
 *  3. run the markup through the `stringTransformer` chain
 *     (sanitizeHTML -> linkify -> decancer);
 *  4. repair the markup with HTML Tidy (best effort: on failure the
 *     un-tidied text is kept and the error is logged);
 *  5. pass the result through DOMPurify.
 *
 * Side effect: `params.text` is overwritten with the result of step 1.
 *
 * @param params object carrying the raw input in `params.text`
 * @returns `{ clean, language, text }` where `clean` is the tag-free text,
 *   `language` is a code from `langMap` (or `'und'` when the detected
 *   language is not in the map) and `text` is the sanitized markup
 */
export const safeText = async (params) => {
  // Normalise the empty paragraphs / repeated breaks the editor produces.
  // (Stripping every <p> tag was tried and is intentionally left disabled:
  //  params.text = params.text.replaceAll(/<\/?p[^>]*>/g, ''))
  params.text = params.text.replaceAll(/<p><br><\/p>/g, '<br>')
  params.text = params.text.replaceAll(/<br><br>/g, '<br>')

  // Language detection works on the text with every tag replaced by a space.
  const clean = params.text.replace(/<[^>]*>?/gm, ' ').trim()
  const language = detector.detectLanguageOf(clean)

  let text = new stringTransformer(params.text).sanitizeHTML().linkify().decancer().valueOf()
  try {
    text = await tidyP(text, tidyOpts)
  } catch (error) {
    // Tidy is best effort. The original handler pushed onto an undeclared
    // `errors` array, which raised a ReferenceError and defeated the fallback;
    // log the failure and continue with the un-tidied text instead.
    const reason = error instanceof Error ? error.message : String(error)
    console.log(`tidy: ${reason}`)
  }
  text = DOMPurify.sanitize(text)

  return {
    clean,
    // language: 'en',
    language: langMap[language] ?? 'und',
    text,
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// @flow | |
import decancer from 'decancer' | |
import linkifyHtml from 'linkify-html' | |
import { createRequire } from 'module' | |
import sanitizeHtml from 'sanitize-html' | |
import slug from 'slug' | |
import { html, nonLatin, reb, rew } from '../../constraints/regex.js' | |
const require = createRequire(import.meta.url) | |
// const purifier = require('html5-purifier') | |
// Profanity filter shared by `stringTransformer.badWords()`.
// The Arabic, French and English lists from `naughty-words` are merged
// (in that order) and added to a `bad-words` filter instance.
const naughtyWords = require('naughty-words')
const Filter = require('bad-words')
const badWords = [...naughtyWords.ar, ...naughtyWords.fr, ...naughtyWords.en]
const filter = new Filter()
filter.addWords(...badWords)
///////////////////////////////////THESE ARE HELPERS, FUNCTIONS THAT I CALL INSIDE THE PIPELINE//////////// | |
function sanitize(str: string) { | |
str = str.replace(/<h1/g, '<h3').replace(/<h2/g, '<h4') | |
return sanitizeHtml(str, { | |
allowedAttributes: { | |
a: ['href', 'name', 'target'], | |
}, | |
allowedTags: html.allowedTags, | |
}) | |
} | |
/**
 * Mask sensitive substrings with `X` characters while leaving whitelisted
 * substrings untouched.
 *
 * 1. Every match of every whitelist pattern is cut out of the text and
 *    remembered together with the offset it was found at.
 * 2. Every match of every blacklist pattern in the remaining text is replaced
 *    by a run of `X` of the same length (so offsets stay valid).
 * 3. The whitelisted substrings are put back.
 *
 * @param blob text to clean
 * @param whitelist object whose values are the patterns to protect
 *   (defaults to `rew`)
 * @param blacklist object whose values are the patterns to mask
 *   (defaults to `reb`)
 * @returns the masked text
 */
function cleanSensitive(blob, whitelist = rew, blacklist = reb) {
  // One entry per whitelist pattern; each entry lists that pass's removals in
  // ascending offset order.
  const passes = []
  for (const pattern of Object.values(whitelist)) {
    const removed = []
    blob = blob.replace(pattern, (match, ...rest) => {
      // The replacer receives (match, ...captures, offset, input[, groups]).
      // Captures are strings or undefined, so the first number is the offset
      // even when the pattern has capture groups.
      removed.push({ i: rest.find((arg) => typeof arg === 'number'), m: match })
      return ''
    })
    passes.push(removed)
  }

  const maskStr = (match) => 'X'.repeat(match.length)
  for (const pattern of Object.values(blacklist)) {
    blob = blob.replace(pattern, maskStr)
  }

  // Offsets recorded in pass k are relative to the text as it was after passes
  // 1..k-1 had already removed their matches, so the passes must be undone in
  // reverse order. Within a pass, ascending order restores each offset exactly.
  for (let p = passes.length - 1; p >= 0; p--) {
    for (const w of passes[p]) {
      blob = blob.slice(0, w.i) + w.m + blob.slice(w.i)
    }
  }
  return blob
}
// Options used for every `decancer` call made by `stringTransformer`:
// the `retainArabic` and `disableBidi` flags are both switched on.
const decancerFlags = {
  retainArabic: true,
  disableBidi: true,
}
const settings = decancer.options(decancerFlags)
// Chain wrapper for Strings | |
// I believe operations these are fragile to arbitrary NLP strings. | |
// Even if tested, we'll try-catch for errors and favour the original text if it fails. | |
function stringTransformer(s: string) { | |
let internal = String(s) | |
this.decancer = function () { | |
try { | |
internal = decancer(internal, settings).toString() | |
} catch (error) { | |
return this | |
} | |
return this | |
} | |
this.badWords = function () { | |
try { | |
internal = filter.clean(internal) | |
} catch (error) { | |
return this | |
} | |
return this | |
} | |
this.sanitizeHTML = function () { | |
try { | |
internal = sanitize(internal) | |
} catch (error) { | |
return this | |
} | |
return this | |
} | |
this.cleanSensitive = function () { | |
try { | |
internal = cleanSensitive(internal) | |
} catch (error) { | |
return this | |
} | |
return this | |
} | |
this.linkify = function () { | |
try { | |
internal = linkifyHtml(internal) | |
} catch (error) { | |
return this | |
} | |
return this | |
} | |
this.valueOf = function () { | |
// remove extra spaces | |
// return internal.replace(/\s{2,}/g, ' ').trim() | |
return internal.trim() | |
} | |
} | |
// Collect the parts of `text` matched by the `nonLatin` pattern.
// Returns whatever `String.prototype.match` returns for that pattern,
// which is `null` when nothing matches.
// Credit
// Author: rjanjic
// Source: https://stackoverflow.com/a/22075070
let wordsInText = function (text) {
  return text.match(nonLatin)
}
// Turn a bad title to a good one | |
// "hello this is a-- nice @ tit buttyyyy it is very longgggggggggg" | |
// 'hello this is a nice tit hello' | |
function toTitle(longBadTitle: string, limit = 60) { | |
// Remove non latin | |
longBadTitle = longBadTitle.charAt(0).toUpperCase() + longBadTitle.slice(1) | |
longBadTitle = wordsInText(longBadTitle).join(' ') | |
if (longBadTitle < 10) throw Error('very bad title') | |
if (longBadTitle.length < limit) return longBadTitle | |
let type = '' | |
let title = longBadTitle.split(' ').reduce((acc, word) => { | |
if (!acc) return word | |
if (acc.length >= limit || acc.length > limit - 3) return acc | |
if (acc.length + word.length >= limit) { | |
if (word.length < 6) return acc + ' ' + word | |
return (acc + ' ' + word).slice(0, limit) | |
} else { | |
return acc + ' ' + word | |
} | |
}, type) | |
return title | |
} | |
/** | |
* Generate initials from an email string | |
* Like "sracer2024@yahoo.com" => "S2" | |
*/ | |
function initials(email_: string): string { | |
let email = | |
email_ | |
.split('@')[0] | |
.replace(/[0-9]/g, '') | |
.split(/[.\-_]/) || [] | |
if (email.length === 1) { | |
return email[0].slice(0, 2).toUpperCase() | |
} | |
email = ((email.shift()[0] || '') + (email.pop()[0] || '')).toUpperCase() | |
return email | |
} | |
/**
 * Shorten `str` to at most `maxLength` characters, marking the cut with an
 * ellipsis. Modelled on https://www.npmjs.com/package/text-ellipsis
 *
 *   truncate('a very long text', 10)                        // "a very ..."
 *   truncate('a very long text', 10, { side: 'start' })     // "...ng text"
 *   truncate('a very long text', 10, { ellipsis: ' END' })  // "a very END"
 *
 * @param str text to shorten
 * @param maxLength maximum length of the result, ellipsis included
 * @param options `side`: `'end'` (default) cuts the tail, `'start'` cuts the
 *   head; `ellipsis`: marker text (default `'...'`)
 * @returns `str` unchanged when it already fits, otherwise the shortened text
 */
function truncate(str, maxLength, { side = 'end', ellipsis = '...' } = {}) {
  if (str.length <= maxLength) return str

  // Number of characters of `str` that fit next to the ellipsis.
  const keep = maxLength - ellipsis.length
  if (keep <= 0) {
    // No room for any text: slicing with 0 or a negative count would return
    // most of the string, so emit as much of the ellipsis as fits instead.
    return ellipsis.slice(0, Math.max(0, maxLength))
  }

  switch (side) {
    case 'start':
      return ellipsis + str.slice(-keep)
    case 'end':
    default:
      return str.slice(0, keep) + ellipsis
  }
}
// Thin wrapper around `slug`: the options argument is forwarded only when a
// truthy one was supplied; otherwise `slug` is called with the string alone.
function slugify(str, opt) {
  if (opt) {
    return slug(str, opt)
  }
  return slug(str)
}
export { initials, stringTransformer, toTitle, truncate, slugify } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment