Skip to content

Instantly share code, notes, and snippets.

@bacloud22
Last active August 29, 2024 12:00
Show Gist options
  • Save bacloud22/454cf8617d5749a38deb21c499d9eefe to your computer and use it in GitHub Desktop.
Save bacloud22/454cf8617d5749a38deb21c499d9eefe to your computer and use it in GitHub Desktop.
This is a strict text sanitizer that accepts plain natural-language text as well as HTML. The pipeline is efficient and probably contains everything you need to accept clean HTML. You can omit language detection if you don't need it, as it is a resource-consuming operation.
// @flow
import { promisify } from 'util'
import { tidy } from 'htmltidy2'
import DOMPurify from 'isomorphic-dompurify'
import { createRequire } from 'module'
import { stringTransformer } from './strings.js'
const require = createRequire(import.meta.url)
const { LanguageDetectorBuilder } = require('../node_modules/@pemistahl/lingua/node/lingua')
// Maps the language names given to the detector builder to the short codes
// that safeText returns in its `language` field.
const langMap = {
Arabic: 'ar',
English: 'en',
French: 'fr',
}
// Language detector built from the keys of langMap only.
// NOTE(review): withPreloadedLanguageModels() presumably loads the models eagerly at import time — confirm against the lingua docs.
const detector = LanguageDetectorBuilder.fromLanguages(...Object.keys(langMap))
.withPreloadedLanguageModels()
.build()
// Promise-returning version of tidy (wrapped with util.promisify) so it can be awaited.
const tidyP = promisify(tidy)
// Options passed to tidyP on every call made by safeText.
const tidyOpts = {
hideComments: false,
'show-body-only': true,
'vertical-space': 'auto'
}
/**
 * Sanitizes user-submitted text (plain text or HTML) and detects its language.
 *
 * Pipeline:
 *  1. collapse `<p><br></p>` and doubled `<br>` into a single `<br>`
 *     (the original note attributes these to quilljs' default wrapping),
 *  2. build a tag-free `clean` string, used for language detection,
 *  3. run the markup through stringTransformer (sanitizeHTML, linkify, decancer),
 *  4. repair the markup with tidy (best effort: a failure is logged and the
 *     un-tidied text is kept),
 *  5. run the result through DOMPurify.
 *
 * The caller's `params` object is not modified.
 *
 * @param params object whose `text` property holds the raw input; a missing
 *   or nullish `text` is treated as an empty string
 * @returns a promise of `{ clean, language, text }`, where `language` is a
 *   code from langMap or 'und' when the detected language is not in langMap
 */
export const safeText = async (params) => {
  // Work on a local copy instead of writing back into params.text.
  // remove quilljs default <p>...</p> wrapping p tags (and all other ps / no problem)
  // .replaceAll(/<\/?p[^>]*>/g, '')
  const raw = String(params?.text ?? '')
    .replaceAll(/<p><br><\/p>/g, '<br>')
    .replaceAll(/<br><br>/g, '<br>')

  // Replace every tag with a space to get plain text for language detection.
  const clean = raw.replace(/<[^>]*>?/gm, ' ').trim()
  const language = detector.detectLanguageOf(clean)

  let text = new stringTransformer(raw).sanitizeHTML().linkify().decancer().valueOf()
  try {
    text = await tidyP(text, tidyOpts)
  } catch (error) {
    // Tidying is best effort. The previous code pushed onto an undeclared
    // `errors` array here, which threw a ReferenceError and rejected the
    // whole call; now the failure is only logged.
    const message = error instanceof Error ? error.message : String(error)
    console.log(`tidy: ${message}`)
  }
  text = DOMPurify.sanitize(text)

  return {
    clean,
    // language: 'en',
    language: langMap[language] ?? 'und',
    text,
  }
}
// @flow
import decancer from 'decancer'
import linkifyHtml from 'linkify-html'
import { createRequire } from 'module'
import sanitizeHtml from 'sanitize-html'
import slug from 'slug'
import { html, nonLatin, reb, rew } from '../../constraints/regex.js'
// Create a require() function for this ES module so the packages below can be loaded with it.
const require = createRequire(import.meta.url)
// const purifier = require('html5-purifier')
const naughtyWords = require('naughty-words')
// Single list made of the `ar`, `fr` and `en` word lists (presumably Arabic, French and English).
const badWords = naughtyWords.ar.concat(naughtyWords.fr).concat(naughtyWords.en)
// Filter instance used by stringTransformer's badWords() step.
const Filter = require('bad-words'),
filter = new Filter()
// Add the combined list to the words the filter already knows.
filter.addWords(...badWords)
///////////////////////////////////THESE ARE HELPERS, FUNCTIONS THAT I CALL INSIDE THE PIPELINE////////////
/**
 * Demotes top-level headings and strips disallowed markup.
 *
 * `h1` becomes `h3` and `h2` becomes `h4`, for opening AND closing tags, in
 * any letter case, so the rewritten markup stays balanced. The `\b` keeps the
 * patterns from touching tag names that merely start with "h1"/"h2".
 * The result is then passed to sanitize-html, which keeps only the tags in
 * `html.allowedTags` and, on `<a>`, only the `href`, `name` and `target`
 * attributes.
 *
 * @param str raw HTML
 * @returns the sanitized HTML
 */
function sanitize(str: string) {
  const demoted = str
    .replace(/<(\/?)h1\b/gi, '<$1h3')
    .replace(/<(\/?)h2\b/gi, '<$1h4')
  return sanitizeHtml(demoted, {
    allowedAttributes: {
      a: ['href', 'name', 'target'],
    },
    allowedTags: html.allowedTags,
  })
}
/**
 * Masks sensitive fragments of `blob` while leaving whitelisted fragments intact.
 *
 *  1. Every match of every whitelist pattern in `rew` is cut out and remembered
 *     together with the offset at which it was found.
 *  2. Every match of every blacklist pattern in `reb` is replaced by a run of
 *     'X' of the same length, so the overall length does not change.
 *  3. The whitelisted fragments are put back.
 *
 * Offsets recorded for one pattern refer to the string as it was when that
 * pattern ran, i.e. after all earlier patterns had already removed their
 * matches. The fragments are therefore restored pattern by pattern in reverse
 * order, and in ascending offset order within one pattern. Restoring them in
 * removal order puts text back at the wrong position as soon as two different
 * whitelist patterns match.
 *
 * @param blob text to process
 * @returns the text with blacklisted fragments masked
 */
function cleanSensitive(blob) {
  // One entry per whitelist pattern, in the order the patterns were applied.
  const batches = []
  for (const key of Object.keys(rew)) {
    const removed = []
    blob = blob.replace(rew[key], (match, ...rest) => {
      // The replacer receives (match, ...captureGroups, offset, string[, groups]);
      // capture groups are strings or undefined, so the first number is the offset.
      const offset = rest.find((arg) => typeof arg === 'number')
      removed.push({ i: offset, m: match })
      return ''
    })
    batches.push(removed)
  }

  const maskStr = (match) => 'X'.repeat(match.length)
  for (const key of Object.keys(reb)) {
    blob = blob.replace(reb[key], maskStr)
  }

  // Undo the removals: last pattern first, ascending offsets within a pattern.
  for (const removed of batches.reverse()) {
    for (const w of removed) {
      blob = blob.slice(0, w.i) + w.m + blob.slice(w.i)
    }
  }
  return blob
}
// Options object passed to decancer() by stringTransformer's decancer() step.
// NOTE(review): the flag names suggest Arabic characters are retained and bidi handling is disabled — confirm against the decancer docs.
const settings = decancer.options({
retainArabic: true,
disableBidi: true,
})
// Chainable wrapper around a string.
// Every step swaps the wrapped text for a transformed version and returns the
// wrapper, so steps can be chained; valueOf() ends the chain.
// The steps run on arbitrary natural-language input and are considered fragile:
// when a step throws, it is skipped and the text from before that step is kept.
function stringTransformer(s: string) {
  let current = String(s)

  // Applies one transformation to the wrapped text. A failure leaves the text
  // untouched. The receiver of the calling method is handed back for chaining.
  const attempt = (receiver, transform) => {
    try {
      current = transform(current)
    } catch (error) {
      // best effort: keep the previous text
    }
    return receiver
  }

  this.decancer = function () {
    return attempt(this, (text) => decancer(text, settings).toString())
  }

  this.badWords = function () {
    return attempt(this, (text) => filter.clean(text))
  }

  this.sanitizeHTML = function () {
    return attempt(this, (text) => sanitize(text))
  }

  this.cleanSensitive = function () {
    return attempt(this, (text) => cleanSensitive(text))
  }

  this.linkify = function () {
    return attempt(this, (text) => linkifyHtml(text))
  }

  // Returns the current text without leading/trailing whitespace.
  // (Collapsing inner runs of whitespace is intentionally not done.)
  this.valueOf = function () {
    return current.trim()
  }
}
// Returns whatever String.prototype.match yields for the `nonLatin` pattern
// on the given text (null when nothing matches).
// Credit
// Author: rjanjic
// Source: https://stackoverflow.com/a/22075070
let wordsInText = (input) => {
  return input.match(nonLatin)
}
/**
 * Turns a long, noisy title into a shorter, cleaner one.
 *
 * The first character is upper-cased, the words found by `wordsInText` are
 * joined with single spaces, and the result is cut down to roughly `limit`
 * characters on word boundaries:
 *  - words are appended while the title is at most `limit - 3` characters long,
 *  - a word of 6+ characters that would cross the limit is cut at the limit,
 *  - a shorter word that would cross the limit is appended whole, so the
 *    result may exceed `limit` by a few characters rather than chop a short word.
 *
 * Example input: "hello this is a-- nice @ tit buttyyyy it is very longgggggggggg"
 *
 * @param longBadTitle the raw title
 * @param limit target maximum length (default 60)
 * @returns the cleaned title
 * @throws Error('very bad title') when fewer than 10 characters remain after cleaning
 */
function toTitle(longBadTitle: string, limit = 60) {
  const capitalized = longBadTitle.charAt(0).toUpperCase() + longBadTitle.slice(1)
  // match() returns null when nothing matches; treat that as "no words".
  const cleaned = (wordsInText(capitalized) ?? []).join(' ')
  // Compare the LENGTH: comparing the string itself to 10 coerces it to a
  // number and practically never triggers.
  if (cleaned.length < 10) throw Error('very bad title')
  if (cleaned.length < limit) return cleaned

  return cleaned.split(' ').reduce((acc, word) => {
    // The first word must respect the limit as well.
    if (!acc) return word.slice(0, limit)
    // Close enough to the limit: stop appending.
    if (acc.length > limit - 3) return acc
    const next = acc + ' ' + word
    const crossesLimit = acc.length + word.length >= limit
    // Only long words are cut; short ones are kept whole.
    return crossesLimit && word.length >= 6 ? next.slice(0, limit) : next
  }, '')
}
/**
 * Generate initials from an email string.
 *
 * Only the part before the first "@" is used. Digits are removed, and the rest
 * is split on ".", "-" and "_". Empty segments (from leading, trailing or
 * doubled separators) are ignored.
 *  - one segment: its first two characters, e.g. "sracer2024@yahoo.com" => "SR"
 *  - several segments: first character of the first and of the last segment,
 *    e.g. "john.doe@example.com" => "JD"
 *  - nothing left (e.g. an all-digit local part): ""
 *
 * @param email_ the email address
 * @returns the upper-cased initials (0 to 2 characters)
 */
function initials(email_: string): string {
  const parts = email_
    .split('@')[0]
    .replace(/[0-9]/g, '')
    .split(/[.\-_]/)
    .filter((part) => part.length > 0)

  if (parts.length === 0) return ''
  if (parts.length === 1) return parts[0].slice(0, 2).toUpperCase()
  return (parts[0][0] + parts[parts.length - 1][0]).toUpperCase()
}
/**
 * Shortens `str` to at most `maxLength` characters, marking the cut with an ellipsis.
 * Modelled on https://www.npmjs.com/package/text-ellipsis
 *
 *   truncate('a very long text', 10)                       // "a very ..."
 *   truncate('a very long text', 10, { side: 'start' })    // "...ng text"
 *   truncate('a very long text', 10, { ellipsis: ' END' }) // "a very END"
 *
 * When `maxLength` leaves no room for any character of `str` next to the
 * ellipsis, only the ellipsis (cut to `maxLength`) is returned, so the result
 * is never longer than `maxLength`.
 *
 * @param str the text to shorten
 * @param maxLength maximum length of the returned string
 * @param options `side`: 'end' (default) cuts the end of the text, 'start' cuts
 *   the beginning; any other value behaves like 'end'. `ellipsis`: the marker
 *   to insert (default '...').
 * @returns `str` unchanged when it already fits, otherwise the shortened text
 */
function truncate(str: string, maxLength: number, { side = 'end', ellipsis = '...' } = {}): string {
  if (str.length <= maxLength) return str

  // Number of characters of `str` that fit next to the ellipsis.
  const keep = maxLength - ellipsis.length
  // Guard keep <= 0: slice(-0) would return the WHOLE string, and a negative
  // `keep` would flip the meaning of both slice calls below.
  if (keep <= 0) return ellipsis.slice(0, Math.max(0, maxLength))

  return side === 'start' ? ellipsis + str.slice(-keep) : str.slice(0, keep) + ellipsis
}
// Builds a slug from `str` with the `slug` package. `opt` is forwarded to
// slug() only when it is truthy; otherwise slug() is called with the string alone.
function slugify(str, opt) {
  if (opt) {
    return slug(str, opt)
  }
  return slug(str)
}
export { initials, stringTransformer, toTitle, truncate, slugify }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment