Last active
January 6, 2023 17:42
-
-
Save cshanejennings/2462a2c516be315400de04766b1ab7b0 to your computer and use it in GitHub Desktop.
Cut and paste string cleaning function with diacritics and allowed characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Build a chainable string sanitizer.
 *
 * The returned function wraps a string in an object exposing `string` (the
 * current value) and three chainable steps, each returning a new wrapper:
 *   - `diacritics()` replaces non-ASCII letters/digits/ligatures using the
 *     `diacritics` table (e.g. "é" -> "e", "ß" -> "ss").
 *   - `alt()` replaces typographic punctuation using the `alt` table
 *     (e.g. curly quotes -> `"`, em dash -> `-`).
 *   - `include()` removes every non-ASCII character that is neither in the
 *     allow-list nor matched by `[\w\s]`.
 *
 * All three steps only inspect characters outside U+0000-U+007E; ASCII input
 * passes through untouched.
 *
 * @param {Object<string, Object<string, string>>} [user_maps={}] Replacement
 *   tables keyed by name; a `diacritics` or `alt` key replaces the built-in
 *   table of that name wholesale.
 * @param {string} [user_allowed] Characters `include()` should keep. When
 *   empty or omitted the built-in allow-list is used.
 * @returns {(string: string) => {string: string, diacritics: Function, alt: Function, include: Function}}
 */
const get_string_sanitizer = (user_maps = {}, user_allowed) => {
  // Anything outside U+0000-U+007E. The `u` flag makes the match operate on
  // whole code points so astral characters are not split into surrogates.
  const NON_ASCII = /[^\u0000-\u007E]/gu;

  // Parse "keys|replacement,keys|replacement,..." into { char: replacement }.
  // Every character of `keys` maps to the same replacement; later entries
  // win. A literal "\u002C" in a replacement stands for a comma, since a real
  // comma would be taken as the entry separator.
  const build_map = (map_str) => {
    const map = {};
    for (const entry of map_str.split(',')) {
      const [keys, replacement] = entry.split('|');
      if (!keys || replacement === undefined) continue; // skip degenerate entries
      const value = replacement.replace('\\u002C', ',');
      for (const ch of keys) map[ch] = value;
    }
    return map;
  };

  // NOTE(review): digraph/ligature keys (U+0132, U+0133, U+0149,
  // U+01C4-U+01CC, U+01F1-U+01F3, U+FB00-U+FB06) are written as \u escapes so
  // each stays a single code point; spelled out as ASCII sequences they would
  // be split per character and remap unrelated letters (e.g. "Ž" -> "DZ").
  // The apostrophe, comma and backslash keys in `alt` are presumed to be the
  // fullwidth forms U+FF07, U+FF0C, U+FF3C - confirm against the upstream
  // table. ASCII keys that remain in the tables are never consulted because
  // NON_ASCII does not match them.
  const map_strings = {
    diacritics: '⁰₀⓪⓿0|0,¹₁①⓵❶➀➊1|1,²₂②⓶❷➁➋2|2,³₃③⓷❸➂➌3|3,⁴₄④⓸❹➃➍4|4,⁵₅⑤⓹❺➄➎5|5,⁶₆⑥⓺❻➅➏6|6,⁷₇⑦⓻❼➆➐7|7,⁸₈⑧⓼❽➇➑8|8,⁹₉⑨⓽❾➈➒9|9,⑩⓾❿➉➓|10,⑪⓫|11,⑫⓬|12,⑬⓭|13,⑭⓮|14,⑮⓯|15,⑯⓰|16,⑰⓱|17,⑱⓲|18,⑲⓳|19,⑳⓴|20,ΑÀÁÂÃÄÅĀĂĄƏǍǞǠǺȀȂȦȺᴀḀẠẢẤẦẨẪẬẮẰẲẴẶⒶA|A,ÆǢǼᴁ|AE,ΒƁƂɃʙᴃḂḄḆⒷB|B,ÇĆĈĊČƇȻʗᴄḈⒸC|C,ÈΕÉÊËĒĔĖĘĚƎƐȄȆȨɆᴇḔḖḘḚḜẸẺẼẾỀỂỄỆⒺⱻE|E,ÌÍÎÏĨĪĬĮİƖƗǏȈȊɪᵻḬḮỈỊⒾꟾIΙ|I,ÐĎĐƉƊƋᴅᴆḊḌḎḐḒⒹꝹD|D,ÑŃŅŇŊƝǸȠɴᴎṄṆṈṊⓃNΝ|N,ÒÓÔÕÖØŌŎŐƆƟƠǑǪǬǾȌȎȪȬȮȰᴏᴐṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢⓄꝊꝌOΟ|O,ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖɄᴜᵾṲṴṶṸṺỤỦỨỪỬỮỰⓊU|U,ÝŶŸƳȲɎʏẎỲỴỶỸỾⓎYΥ|Y,ÞꝦ|TH,ß|ss,àáâãäåāăąǎǟǡǻȁȃȧɐəɚᶏᶕḁẚạảấầẩẫậắằẳẵặₐₔⓐⱥⱯa|a,æǣǽᴂ|ae,çćĉċčƈȼɕḉↄⓒꜾꜿc|c,èéêëēĕėęěǝȅȇȩɇɘɛɜɝɞʚᴈᶒᶓᶔḕḗḙḛḝẹẻẽếềểễệₑⓔⱸe|e,ìíîïĩīĭįıǐȉȋɨᴉᵢᵼᶖḭḯỉịⁱⓘi|i,ðďđƌȡɖɗᵭᶁᶑḋḍḏḑḓⓓꝺd|d,ñńņň\u0149ŋƞǹȵɲɳᵰᶇṅṇṉṋⁿⓝn|n,òóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱɔɵᴖᴗᶗṍṏṑṓọỏốồổỗộớờởỡợₒⓞⱺꝋꝍo|o,ùúûüũūŭůűųưǔǖǘǚǜȕȗʉᵤᶙṳṵṷṹṻụủứừửữựⓤu|u,ýÿŷƴȳɏʎẏẙỳỵỷỹỿⓨy|y,þᵺꝧ|th,ĜĞĠĢƓǤǥǦǧǴɢʛḠⒼꝽꝾG|G,ĝğġģǵɠɡᵷᵹᶃḡⓖꝿg|g,ĤĦȞʜḢḤḦḨḪⒽⱧⱵHΗ|H,ĥħȟɥɦʮʯḣḥḧḩḫẖⓗⱨⱶh|h,\u0132|IJ,\u0133|ij,ĴɈᴊⒿJ|J,ĵǰȷɉɟʄʝⓙⱼj|j,ĶƘǨᴋḰḲḴⓀⱩꝀꝂꝄKΚ|K,ķƙǩʞᶄḱḳḵⓚⱪꝁꝃꝅk|k,ĸɋʠⓠꝗꝙq|q,ĹĻĽĿŁȽʟᴌḶḸḺḼⓁⱠⱢꝆꝈꞀL|L,ĺļľŀłƚȴɫɬɭᶅḷḹḻḽⓛⱡꝇꝉꞁl|l,Œɶ|OE,œᴔ|oe,ŔŖŘȐȒɌʀʁᴙᴚṘṚṜṞⓇⱤꝚꞂR|R,ŕŗřȑȓɍɼɽɾɿᵣᵲᵳᶉṙṛṝṟⓡꝛꞃr|r,ŚŜŞŠȘṠṢṤṦṨⓈꜱꞅS|S,śŝşšſșȿʂᵴᶊṡṣṥṧṩẜẝⓢꞄs|s,ŢŤŦƬƮȚȾᴛṪṬṮṰⓉꞆTΤ|T,ţťŧƫƭțȶʇʈᵵṫṭṯṱẗⓣⱦt|t,ŴǷᴡẀẂẄẆẈⓌⱲW|W,ŵƿʍẁẃẅẇẉẘⓦⱳw|w,ŹŻŽƵȜȤᴢẐẒẔⓏⱫꝢZΖ|Z,źżžƶȝȥɀʐʑᵶᶎẑẓẕⓩⱬꝣz|z,ƀƃɓᵬᶀḃḅḇⓑb|b,ƑḞⒻꜰꝻꟻF|F,ƒᵮᶂḟẛⓕꝼf|f,ƕ|hv,ƜᴍḾṀṂⓂⱮꟽꟿMΜ|M,ƤᴘṔṖⓅⱣꝐꝒꝔPΡ|P,ƥᵱᵽᶈṕṗⓟꝑꝓꝕꟼp|p,ƲɅᴠṼṾỼⓋꝞꝨV|V,\u01C4\u01F1|DZ,\u01C5\u01F2|Dz,\u01C6\u01F3ʣʥ|dz,\u01C7|LJ,\u01C8|Lj,\u01C9|lj,\u01CA|NJ,\u01CB|Nj,\u01CC|nj,Ƕ|HV,Ȣᴕ|OU,ȣ|ou,ȸ|db,ȹ|qp,ɊⓆꝖꝘQ|Q,ɯɰɱᵯᶆḿṁṃⓜm|m,ʋʌᵥᶌṽṿⓥⱱⱴꝟv|v,ʦ|ts,ʨ|tc,ʪ|ls,ʫ|lz,ᵫ|ue,ᶍẋẍₓⓧx|x,ẊẌⓍXΧ|X,ẞ|SS,Ỻ|LL,ỻ|ll,⑴|(1),⑵|(2),⑶|(3),⑷|(4),⑸|(5),⑹|(6),⑺|(7),⑻|(8),⑼|(9),⑽|(10),⑾|(11),⑿|(12),⒀|(13),⒁|(14),⒂|(15),⒃|(16),⒄|(17),⒅|(18),⒆|(19),⒇|(20),⒈|1.,⒉|2.,⒊|3.,⒋|4.,⒌|5.,⒍|6.,⒎|7.,⒏|8.,⒐|9.,⒑|10.,⒒|11.,⒓|12.,⒔|13.,⒕|14.,⒖|15.,⒗|16.,⒘|17.,⒙|18.,⒚|19.,⒛|20.,⒜|(a),⒝|(b),⒞|(c),⒟|(d),⒠|(e),⒡|(f),⒢|(g),⒣|(h),⒤|(i),⒥|(j),⒦|(k),⒧|(l),⒨|(m),⒩|(n),⒪|(o),⒫|(p),⒬|(q),⒭|(r),⒮|(s),⒯|(t),⒰|(u),⒱|(v),⒲|(w),⒳|(x),⒴|(y),⒵|(z),Ꜩ|TZ,ꜩ|tz,Ꜳ|AA,ꜳ|aa,Ꜵ|AO,ꜵ|ao,Ꜷ|AU,ꜷ|au,ꜸꜺ|AV,ꜹꜻ|av,Ꜽ|AY,ꜽ|ay,Ꝏ|OO,ꝏ|oo,Ꝡ|VY,ꝡ|vy,\uFB00|ff,\uFB01|fi,\uFB02|fl,\uFB03|ffi,\uFB04|ffl,\uFB06|st',
    alt: '«»“”„″‶❝❞❮❯"|",‐‑‒–—⁻₋-|-,‘’‚‛′‵‹›❛´❜\uFF07|\u0027,‸^|^,‼|!!,⁄/|/,⁅❲[|[,⁆❳]|],⁇|??,⁈|?!,⁉|!?,⁎*|*,⁏;|;,⁒%|%,⁓~˜|~,⁺₊+|+,⁼₌=|=,⁽₍❨❪(⟨|(,⁾₎❩❫)⟩|),❬❰<|<,❭❱>|>,❴{|{,❵}|},⸨|((,⸩|)),!|!,#|#,$|$,&|&,\uFF0C|\\u002C,¸.|.,:|:,?|?,@|@,\uFF3C|\u005C,_|_',
  };

  const allowed_strings = [
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', // qwerty
    '¢£¤¥¦§©¬®°±µ·º¼½¾×÷ΓΔΘΛΞΠΣΦΨΩαβθπφ†‡•…‰€™←↑→↓↔⇐⇒⇔−∗∝∼≅≈≠≡≤≥⋅◊', // other alt characters
  ];

  // Characters include() keeps: the caller's list when non-empty, otherwise
  // the built-in one.
  const allowed_chars = new Set(
    (user_allowed && user_allowed.length) ? [...user_allowed] : allowed_strings.join(''),
  );

  // Built-in tables first so same-named entries in user_maps override them.
  const maps = {
    ...Object.fromEntries(
      Object.entries(map_strings).map(([key, map_str]) => [key, build_map(map_str)]),
    ),
    ...user_maps,
  };

  const api = (string) => ({
    string,
    // Unmapped characters are left as they are.
    diacritics: () => api(string.replace(NON_ASCII, (ch) => maps.diacritics[ch] || ch)),
    alt: () => api(string.replace(NON_ASCII, (ch) => maps.alt[ch] || ch)),
    // Keep allow-listed characters and anything \w or \s matches (e.g.
    // non-breaking space); drop every other non-ASCII character.
    include: () => api(string.replace(NON_ASCII, (ch) => (
      allowed_chars.has(ch) || /[\w\s]/.test(ch) ? ch : ''
    ))),
  });

  return api;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In many circumstances this could be replaced and simplified using String.prototype.normalize, which accepts the following forms ("modes"). Each is demonstrated below with:
const junk = "ŔŖŘȐȒɌʀʁᴙᴚṘṚṜṞⓇⱤꝚꞂR";
NFC -- Canonical Decomposition, followed by Canonical Composition.
junk.normalize('NFC').replace(/[\u0300-\u036f]/g, ''); // ŔŖŘȐȒɌʀʁᴙᴚṘṚṜṞⓇⱤꝚꞂR none replaced
NFKC -- Compatibility Decomposition, followed by Canonical Composition.
junk.normalize('NFKC').replace(/[\u0300-\u036f]/g, ''); // ŔŖŘȐȒɌʀʁᴙᴚṘṚṜṞRⱤꝚꞂR 2 replaced, where ŔŖŘȐȒɌʀʁᴙᴚṘṚṜṞⱤꝚꞂ are unmatched
NFD -- Canonical Decomposition.
junk.normalize('NFD').replace(/[\u0300-\u036f]/g, ''); // RRRRRɌʀʁᴙᴚRRRRⓇⱤꝚꞂR 9 replaced, where ɌʀʁᴙᴚⓇⱤꝚꞂR are unmatched
NFKD -- Compatibility Decomposition.
junk.normalize('NFKD').replace(/[\u0300-\u036f]/g, ''); // RRRRRɌʀʁᴙᴚRRRRRⱤꝚꞂR with 8 replaced, where ɌʀʁᴙᴚⱤꝚꞂ are unmatched
There are many attempts at this in a Stack Overflow post, which may offer a better solution for the diacritics portion.