bacloud22/main.js

## main.js
// @flow

import { promisify } from 'util'
import { tidy } from 'htmltidy2'
import DOMPurify from 'isomorphic-dompurify'
import { createRequire } from 'module'
import { stringTransformer } from './strings.js'
const require = createRequire(import.meta.url)

const { LanguageDetectorBuilder } = require('../node_modules/@pemistahl/lingua/node/lingua')

// Language names handed to the lingua detector builder (keys) and the short
// code that safeText reports for each of them (values).
const langMap = {
    Arabic: 'ar',
    English: 'en',
    French: 'fr',
}

// Detector restricted to the languages listed in langMap; built once when the
// module is loaded and reused by every safeText call.
const detector = LanguageDetectorBuilder.fromLanguages(...Object.keys(langMap))
    .withPreloadedLanguageModels()
    .build()

// Promise-returning version of htmltidy2's `tidy`, produced with util.promisify.
const tidyP = promisify(tidy)
// Options passed to every tidyP call made by safeText.
const tidyOpts = {
    hideComments: false,
    'show-body-only': true,
    'vertical-space': 'auto'
}

/**
 * Normalise, sanitise and tidy a piece of user-submitted rich text.
 *
 * Steps, in order:
 *  1. collapse `<p><br></p>` and `<br><br>` into a single `<br>`
 *     (the result is written back to `params.text`),
 *  2. build `clean`: the markup with every tag replaced by a space, trimmed,
 *  3. detect the language of `clean`,
 *  4. run the markup through stringTransformer (sanitizeHTML, linkify, decancer),
 *  5. run htmltidy2 on it; a tidy failure is logged and recorded, not rethrown,
 *  6. run DOMPurify on the final markup.
 *
 * @param params object whose `text` property holds the raw HTML string;
 *        `params.text` is overwritten by step 1
 * @returns `{ clean, errors, language, text }` where `language` is a code from
 *          langMap or 'und' when the detected language is not in langMap, and
 *          `errors` lists the names of the steps that failed ('FixHTML' for tidy)
 */
export const safeText = async (params) => {
    // Collects the names of best-effort steps that failed. The original code
    // pushed to an undeclared `errors`, which turned a tolerated tidy failure
    // into a ReferenceError.
    const errors = []

    // quilljs wraps empty lines as <p><br></p>; reduce those to a plain <br>.
    params.text = params.text.replaceAll(/<p><br><\/p>/g, '<br>')
    params.text = params.text.replaceAll(/<br><br>/g, '<br>')
    const clean = params.text.replace(/<[^>]*>?/gm, ' ').trim()
    const language = detector.detectLanguageOf(clean)
    let text = new stringTransformer(params.text).sanitizeHTML().linkify().decancer().valueOf()
    try {
        text = await tidyP(text, tidyOpts)
    } catch (error) {
        // Tidying is best-effort: keep the untidied markup and carry on.
        errors.push('FixHTML')
        console.log(`tidy: ${error.message}`)
    }
    text = DOMPurify.sanitize(text)
    return {
        clean,
        errors,
        language: langMap[language] ?? 'und',
        text,
    }
}

## strings.js
// @flow

import decancer from 'decancer'
import linkifyHtml from 'linkify-html'
import { createRequire } from 'module'
import sanitizeHtml from 'sanitize-html'
import slug from 'slug'

import { html, nonLatin, reb, rew } from '../../constraints/regex.js'

const require = createRequire(import.meta.url)
// const purifier = require('html5-purifier')

// Word lists loaded from the naughty-words package.
const naughtyWords = require('naughty-words')

// Arabic, French and English lists merged into a single array.
const badWords = naughtyWords.ar.concat(naughtyWords.fr).concat(naughtyWords.en)
// bad-words filter instance, extended with the merged lists above; used by
// stringTransformer's badWords step.
const Filter = require('bad-words'),
    filter = new Filter()
filter.addWords(...badWords)

///////////////////////////////////THESE ARE HELPERS, FUNCTIONS THAT I CALL INSIDE THE PIPELINE////////////
function sanitize(str: string) {
    str = str.replace(/<h1/g, '<h3').replace(/<h2/g, '<h4')
    return sanitizeHtml(str, {
        allowedAttributes: {
            a: ['href', 'name', 'target'],
        },
        allowedTags: html.allowedTags,
    })
}

/**
 * Mask sensitive fragments of `blob` with 'X' characters while leaving
 * whitelisted fragments untouched.
 *
 *  1. every match of a pattern in `rew` is cut out of the text and remembered,
 *  2. every match of a pattern in `reb` is overwritten with as many 'X'
 *     characters as the match is long,
 *  3. the remembered fragments are spliced back where they were taken from.
 *
 * @param blob text to process
 * @returns the text with `reb` matches masked and `rew` matches restored
 */
function cleanSensitive(blob) {
    // One list of removals per whitelist pattern. Offsets are relative to the
    // text as it was when that pattern ran, so each pass is undone separately.
    const passes = []
    for (const regexW in rew) {
        if (Object.prototype.hasOwnProperty.call(rew, regexW)) {
            const removed = []
            blob = blob.replace(rew[regexW], (match, ...rest) => {
                // The replacer receives the capture groups before the offset, so
                // the offset is the first numeric argument, not always the second.
                const offset = rest.find((arg) => typeof arg === 'number')
                removed.push({ i: offset, m: match })
                return ''
            })
            passes.push(removed)
        }
    }
    const maskStr = (match) => 'X'.repeat(match.length)
    for (const regexB in reb) {
        if (Object.prototype.hasOwnProperty.call(reb, regexB)) {
            blob = blob.replace(reb[regexB], maskStr)
        }
    }
    // Masking keeps the length unchanged, so the recorded offsets still apply.
    // Undo the most recent pass first; within a pass restore left to right, which
    // rebuilds exactly the text that pass started from.
    for (const removed of passes.reverse()) {
        for (const { i, m } of removed) {
            blob = blob.slice(0, i) + m + blob.slice(i)
        }
    }
    return blob
}

// Options object built once with decancer.options() and passed to decancer()
// by stringTransformer's decancer step.
// NOTE(review): the flag names suggest Arabic text is kept and bidi handling is
// switched off — confirm against the decancer documentation.
const settings = decancer.options({
    retainArabic: true,
    disableBidi: true,
})

// Chain wrapper for Strings
// I believe operations these are fragile to arbitrary NLP strings.
// Even if tested, we'll try-catch for errors and favour the original text if it fails.
function stringTransformer(s: string) {
    let internal = String(s)
    this.decancer = function () {
        try {
            internal = decancer(internal, settings).toString()
        } catch (error) {
            return this
        }
        return this
    }
    this.badWords = function () {
        try {
            internal = filter.clean(internal)
        } catch (error) {
            return this
        }
        return this
    }
    this.sanitizeHTML = function () {
        try {
            internal = sanitize(internal)
        } catch (error) {
            return this
        }
        return this
    }
    this.cleanSensitive = function () {
        try {
            internal = cleanSensitive(internal)
        } catch (error) {
            return this
        }
        return this
    }
    this.linkify = function () {
        try {
            internal = linkifyHtml(internal)
        } catch (error) {
            return this
        }
        return this
    }
    this.valueOf = function () {
        // remove extra spaces
        // return internal.replace(/\s{2,}/g, ' ').trim()
        return internal.trim()
    }
}

// Return every chunk of `text` matched by the `nonLatin` pattern, or an empty
// array when nothing matches (String.prototype.match yields null in that case,
// which callers would otherwise have to guard against before calling .join).
// Credit
// Author: rjanjic
// Source: https://stackoverflow.com/a/22075070
const wordsInText = (text) => text.match(nonLatin) || []

// Turn a bad title to a good one
// "hello this is a-- nice @ tit buttyyyy it is very longgggggggggg"
// 'hello this is a nice tit hello'
function toTitle(longBadTitle: string, limit = 60) {
    // Remove non latin
    longBadTitle = longBadTitle.charAt(0).toUpperCase() + longBadTitle.slice(1)
    longBadTitle = wordsInText(longBadTitle).join(' ')
    if (longBadTitle < 10) throw Error('very bad title')
    if (longBadTitle.length < limit) return longBadTitle
    let type = ''
    let title = longBadTitle.split(' ').reduce((acc, word) => {
        if (!acc) return word
        if (acc.length >= limit || acc.length > limit - 3) return acc
        if (acc.length + word.length >= limit) {
            if (word.length < 6) return acc + ' ' + word
            return (acc + ' ' + word).slice(0, limit)
        } else {
            return acc + ' ' + word
        }
    }, type)
    return title
}

/**
 * Generate initials from an email string
 * Like "sracer2024@yahoo.com" => "S2"
 */
function initials(email_: string): string {
    let email =
        email_
            .split('@')[0]
            .replace(/[0-9]/g, '')
            .split(/[.\-_]/) || []
    if (email.length === 1) {
        return email[0].slice(0, 2).toUpperCase()
    }
    email = ((email.shift()[0] || '') + (email.pop()[0] || '')).toUpperCase()
    return email
}

/**
 * Shorten `str` to at most `maxLength` characters, marking the cut with an
 * ellipsis. Modelled on https://www.npmjs.com/package/text-ellipsis
 *
 *   truncate('a very long text', 10)                        // "a very ..."
 *   truncate('a very long text', 10, { side: 'start' })     // "...ng text"
 *   truncate('a very long text', 10, { ellipsis: ' END' })  // "a very END"
 *
 * @param str text to shorten
 * @param maxLength maximum length of the result, ellipsis included
 * @param options `side`: 'end' (default) cuts the tail, 'start' cuts the head;
 *        `ellipsis`: marker inserted at the cut, '...' by default
 * @returns `str` itself when it already fits, otherwise the shortened text
 */
function truncate(str, maxLength, { side = 'end', ellipsis = '...' } = {}) {
    if (str.length > maxLength) {
        // Number of original characters that fit next to the ellipsis.
        const keep = maxLength - ellipsis.length
        // No room for any original text: return as much of the ellipsis as fits.
        // (slice(-0) would return the whole string, and a negative `keep` would
        // make slice count from the wrong end.)
        if (keep <= 0) return ellipsis.slice(0, Math.max(0, maxLength))
        switch (side) {
            case 'start':
                return ellipsis + str.slice(-keep)
            case 'end':
            default:
                return str.slice(0, keep) + ellipsis
        }
    }
    return str
}

/**
 * Build a slug from `str` with the `slug` package.
 * `opt` is forwarded to `slug` only when it is truthy.
 */
function slugify(str, opt) {
    if (opt) {
        return slug(str, opt)
    }
    return slug(str)
}

export { initials, stringTransformer, toTitle, truncate, slugify }
	// @flow

	import { promisify } from 'util'
	import { tidy } from 'htmltidy2'
	import DOMPurify from 'isomorphic-dompurify'
	import { createRequire } from 'module'
	import { stringTransformer } from './strings.js'
	const require = createRequire(import.meta.url)

	const { LanguageDetectorBuilder } = require('../node_modules/@pemistahl/lingua/node/lingua')

	// Language names handed to the lingua detector builder (keys) and the short
	// code that safeText reports for each of them (values).
	const langMap = {
	Arabic: 'ar',
	English: 'en',
	French: 'fr',
	}

	// Detector restricted to the languages listed in langMap; built once when the
	// module is loaded and reused by every safeText call.
	const detector = LanguageDetectorBuilder.fromLanguages(...Object.keys(langMap))
	.withPreloadedLanguageModels()
	.build()

	// Promise-returning version of htmltidy2's `tidy`, produced with util.promisify.
	const tidyP = promisify(tidy)
	// Options passed to every tidyP call made by safeText.
	const tidyOpts = {
	hideComments: false,
	'show-body-only': true,
	'vertical-space': 'auto'
	}

	/**
	 * Normalise, sanitise and tidy a piece of user-submitted rich text.
	 *
	 * Steps, in order:
	 *  1. collapse `<p><br></p>` and `<br><br>` into a single `<br>`
	 *     (the result is written back to `params.text`),
	 *  2. build `clean`: the markup with every tag replaced by a space, trimmed,
	 *  3. detect the language of `clean`,
	 *  4. run the markup through stringTransformer (sanitizeHTML, linkify, decancer),
	 *  5. run htmltidy2 on it; a tidy failure is logged and recorded, not rethrown,
	 *  6. run DOMPurify on the final markup.
	 *
	 * @param params object whose `text` property holds the raw HTML string;
	 *        `params.text` is overwritten by step 1
	 * @returns `{ clean, errors, language, text }` where `language` is a code from
	 *          langMap or 'und' when the detected language is not in langMap, and
	 *          `errors` lists the names of the steps that failed ('FixHTML' for tidy)
	 */
	export const safeText = async (params) => {
	    // Collects the names of best-effort steps that failed. The original code
	    // pushed to an undeclared `errors`, which turned a tolerated tidy failure
	    // into a ReferenceError.
	    const errors = []

	    // quilljs wraps empty lines as <p><br></p>; reduce those to a plain <br>.
	    params.text = params.text.replaceAll(/<p><br><\/p>/g, '<br>')
	    params.text = params.text.replaceAll(/<br><br>/g, '<br>')
	    const clean = params.text.replace(/<[^>]*>?/gm, ' ').trim()
	    const language = detector.detectLanguageOf(clean)
	    let text = new stringTransformer(params.text).sanitizeHTML().linkify().decancer().valueOf()
	    try {
	        text = await tidyP(text, tidyOpts)
	    } catch (error) {
	        // Tidying is best-effort: keep the untidied markup and carry on.
	        errors.push('FixHTML')
	        console.log(`tidy: ${error.message}`)
	    }
	    text = DOMPurify.sanitize(text)
	    return {
	        clean,
	        errors,
	        language: langMap[language] ?? 'und',
	        text,
	    }
	}
	// @flow

	import decancer from 'decancer'
	import linkifyHtml from 'linkify-html'
	import { createRequire } from 'module'
	import sanitizeHtml from 'sanitize-html'
	import slug from 'slug'

	import { html, nonLatin, reb, rew } from '../../constraints/regex.js'

	const require = createRequire(import.meta.url)
	// const purifier = require('html5-purifier')

	// Word lists loaded from the naughty-words package.
	const naughtyWords = require('naughty-words')

	// Arabic, French and English lists merged into a single array.
	const badWords = naughtyWords.ar.concat(naughtyWords.fr).concat(naughtyWords.en)
	// bad-words filter instance, extended with the merged lists above; used by
	// stringTransformer's badWords step.
	const Filter = require('bad-words'),
	filter = new Filter()
	filter.addWords(...badWords)

	///////////////////////////////////THESE ARE HELPERS, FUNCTIONS THAT I CALL INSIDE THE PIPELINE////////////
	function sanitize(str: string) {
	str = str.replace(/<h1/g, '<h3').replace(/<h2/g, '<h4')
	return sanitizeHtml(str, {
	allowedAttributes: {
	a: ['href', 'name', 'target'],
	},
	allowedTags: html.allowedTags,
	})
	}

	/**
	 * Mask sensitive fragments of `blob` with 'X' characters while leaving
	 * whitelisted fragments untouched.
	 *
	 *  1. every match of a pattern in `rew` is cut out of the text and remembered,
	 *  2. every match of a pattern in `reb` is overwritten with as many 'X'
	 *     characters as the match is long,
	 *  3. the remembered fragments are spliced back where they were taken from.
	 *
	 * @param blob text to process
	 * @returns the text with `reb` matches masked and `rew` matches restored
	 */
	function cleanSensitive(blob) {
	    // One list of removals per whitelist pattern. Offsets are relative to the
	    // text as it was when that pattern ran, so each pass is undone separately.
	    const passes = []
	    for (const regexW in rew) {
	        if (Object.prototype.hasOwnProperty.call(rew, regexW)) {
	            const removed = []
	            blob = blob.replace(rew[regexW], (match, ...rest) => {
	                // The replacer receives the capture groups before the offset, so
	                // the offset is the first numeric argument, not always the second.
	                const offset = rest.find((arg) => typeof arg === 'number')
	                removed.push({ i: offset, m: match })
	                return ''
	            })
	            passes.push(removed)
	        }
	    }
	    const maskStr = (match) => 'X'.repeat(match.length)
	    for (const regexB in reb) {
	        if (Object.prototype.hasOwnProperty.call(reb, regexB)) {
	            blob = blob.replace(reb[regexB], maskStr)
	        }
	    }
	    // Masking keeps the length unchanged, so the recorded offsets still apply.
	    // Undo the most recent pass first; within a pass restore left to right, which
	    // rebuilds exactly the text that pass started from.
	    for (const removed of passes.reverse()) {
	        for (const { i, m } of removed) {
	            blob = blob.slice(0, i) + m + blob.slice(i)
	        }
	    }
	    return blob
	}

	// Options object built once with decancer.options() and passed to decancer()
	// by stringTransformer's decancer step.
	// NOTE(review): the flag names suggest Arabic text is kept and bidi handling is
	// switched off — confirm against the decancer documentation.
	const settings = decancer.options({
	retainArabic: true,
	disableBidi: true,
	})

	// Chain wrapper for Strings
	// I believe operations these are fragile to arbitrary NLP strings.
	// Even if tested, we'll try-catch for errors and favour the original text if it fails.
	function stringTransformer(s: string) {
	let internal = String(s)
	this.decancer = function () {
	try {
	internal = decancer(internal, settings).toString()
	} catch (error) {
	return this
	}
	return this
	}
	this.badWords = function () {
	try {
	internal = filter.clean(internal)
	} catch (error) {
	return this
	}
	return this
	}
	this.sanitizeHTML = function () {
	try {
	internal = sanitize(internal)
	} catch (error) {
	return this
	}
	return this
	}
	this.cleanSensitive = function () {
	try {
	internal = cleanSensitive(internal)
	} catch (error) {
	return this
	}
	return this
	}
	this.linkify = function () {
	try {
	internal = linkifyHtml(internal)
	} catch (error) {
	return this
	}
	return this
	}
	this.valueOf = function () {
	// remove extra spaces
	// return internal.replace(/\s{2,}/g, ' ').trim()
	return internal.trim()
	}
	}

	// Return every chunk of `text` matched by the `nonLatin` pattern, or an empty
	// array when nothing matches (String.prototype.match yields null in that case,
	// which callers would otherwise have to guard against before calling .join).
	// Credit
	// Author: rjanjic
	// Source: https://stackoverflow.com/a/22075070
	const wordsInText = (text) => text.match(nonLatin) || []

	// Turn a bad title to a good one
	// "hello this is a-- nice @ tit buttyyyy it is very longgggggggggg"
	// 'hello this is a nice tit hello'
	function toTitle(longBadTitle: string, limit = 60) {
	// Remove non latin
	longBadTitle = longBadTitle.charAt(0).toUpperCase() + longBadTitle.slice(1)
	longBadTitle = wordsInText(longBadTitle).join(' ')
	if (longBadTitle < 10) throw Error('very bad title')
	if (longBadTitle.length < limit) return longBadTitle
	let type = ''
	let title = longBadTitle.split(' ').reduce((acc, word) => {
	if (!acc) return word
	if (acc.length >= limit \|\| acc.length > limit - 3) return acc
	if (acc.length + word.length >= limit) {
	if (word.length < 6) return acc + ' ' + word
	return (acc + ' ' + word).slice(0, limit)
	} else {
	return acc + ' ' + word
	}
	}, type)
	return title
	}

	/**
	* Generate initials from an email string
	* Like "sracer2024@yahoo.com" => "S2"
	*/
	function initials(email_: string): string {
	let email =
	email_
	.split('@')[0]
	.replace(/[0-9]/g, '')
	.split(/[.\-_]/) \|\| []
	if (email.length === 1) {
	return email[0].slice(0, 2).toUpperCase()
	}
	email = ((email.shift()[0] \|\| '') + (email.pop()[0] \|\| '')).toUpperCase()
	return email
	}

	/**
	 * Shorten `str` to at most `maxLength` characters, marking the cut with an
	 * ellipsis. Modelled on https://www.npmjs.com/package/text-ellipsis
	 *
	 *   truncate('a very long text', 10)                        // "a very ..."
	 *   truncate('a very long text', 10, { side: 'start' })     // "...ng text"
	 *   truncate('a very long text', 10, { ellipsis: ' END' })  // "a very END"
	 *
	 * @param str text to shorten
	 * @param maxLength maximum length of the result, ellipsis included
	 * @param options `side`: 'end' (default) cuts the tail, 'start' cuts the head;
	 *        `ellipsis`: marker inserted at the cut, '...' by default
	 * @returns `str` itself when it already fits, otherwise the shortened text
	 */
	function truncate(str, maxLength, { side = 'end', ellipsis = '...' } = {}) {
	    if (str.length > maxLength) {
	        // Number of original characters that fit next to the ellipsis.
	        const keep = maxLength - ellipsis.length
	        // No room for any original text: return as much of the ellipsis as fits.
	        // (slice(-0) would return the whole string, and a negative `keep` would
	        // make slice count from the wrong end.)
	        if (keep <= 0) return ellipsis.slice(0, Math.max(0, maxLength))
	        switch (side) {
	            case 'start':
	                return ellipsis + str.slice(-keep)
	            case 'end':
	            default:
	                return str.slice(0, keep) + ellipsis
	        }
	    }
	    return str
	}

	/**
	 * Build a slug from `str` with the `slug` package.
	 * `opt` is forwarded to `slug` only when it is truthy.
	 */
	function slugify(str, opt) {
	    if (opt) {
	        return slug(str, opt)
	    }
	    return slug(str)
	}

	export { initials, stringTransformer, toTitle, truncate, slugify }