Skip to content

Instantly share code, notes, and snippets.

Created February 11, 2020 07:33
Show Gist options
  • Save zonesan/a7a079707e9992d52e011afb15f354e4 to your computer and use it in GitHub Desktop.
Save zonesan/a7a079707e9992d52e011afb15f354e4 to your computer and use it in GitHub Desktop.
<!DOCTYPE html>
/**
 * Check whether a string consists solely of 7-bit ASCII characters.
 *
 * @param {string} str - Text to inspect (a single character or a longer string).
 * @returns {boolean} `true` when every code unit is in the range 0x00-0x7F;
 *     the empty string also yields `true` because the pattern allows zero characters.
 */
function is_ascii(str) {
    return /^[\x00-\x7F]*$/.test(str);
}
/**
 * Tell whether a character is one of the separators that end a chunk of text.
 *
 * @param {string} c - The character to classify.
 * @returns {boolean} `true` when `c` is strictly equal to one of the listed separators.
 */
function is_seperator(c) {
    // NOTE(review): the previous list repeated ",", ";" and "?" several times;
    // the duplicates may have been full-width variants lost in transit — confirm
    // against the upstream file before adding more characters here.
    const separators = [" ", ",", "。", ";", "?", ".", "《", "》"];
    return separators.indexOf(c) > -1;
}
/**
 * Pick one element of an array at a uniformly random index.
 *
 * @param {Array} arr - Candidates to choose from.
 * @returns {*} A randomly selected element, or `undefined` when `arr` is empty
 *     (index 0 of an empty array).
 */
function random_choice(arr) {
    const index = Math.floor(arr.length * Math.random());
    return arr[index];
}
/**
 * Shuffle an array in place by repeatedly swapping the last unshuffled slot
 * with a randomly chosen slot at or before it.
 *
 * @param {Array} array - The array to shuffle; it is mutated.
 * @returns {Array} The same array instance, now in random order.
 */
function random_shuffle(array) {
    let remaining = array.length;
    // `> 0` (rather than `!== 0`) also terminates when `length` is not a number.
    while (remaining > 0) {
        // Pick one of the elements that has not been fixed yet...
        const random_index = Math.floor(Math.random() * remaining);
        remaining -= 1;
        // ...and swap it into the slot that is being fixed now.
        [array[remaining], array[random_index]] = [array[random_index], array[remaining]];
    }
    return array;
}
/**
 * Split text into tokens: each run of consecutive ASCII characters becomes one
 * token, and every non-ASCII character becomes a token of its own.
 * Joining the returned tokens with "" reproduces the input.
 *
 * @param {string} src_txt - Text to split; iterated by code point.
 * @returns {string[]} Tokens in their original order (empty for empty input).
 */
function tokenize(src_txt) {
    const token_list = [];
    let token = "";
    for (const c of src_txt) {
        if (is_ascii(c)) {
            // Keep growing the current ASCII run.
            token += c;
            continue;
        }
        // A non-ASCII character closes any pending ASCII run...
        if (token !== "") {
            token_list.push(token);
            token = "";
        }
        // ...and stands as its own token.
        token_list.push(c);
    }
    // Flush an ASCII run that reaches the end of the input.
    if (token !== "") {
        token_list.push(token);
    }
    return token_list;
}
/**
 * Scramble a token list locally: walk it in consecutive groups of 2 or 3
 * tokens (size chosen at random per group), shuffle each group, and
 * concatenate the groups in their original sequence.
 *
 * @param {string[]} token_list - Tokens to scramble; not modified.
 * @returns {string[]} A new array holding the same tokens, reordered within groups.
 */
function reorder(token_list) {
    const group_sizes = [2, 3];
    const token_list_reordered = [];
    let i = 0;
    while (i < token_list.length) {
        const size = random_choice(group_sizes);
        // Clamp so the final group simply takes whatever tokens are left.
        const j = Math.min(i + size, token_list.length);
        // slice() copies, so shuffling never touches the caller's array.
        const n_gram = random_shuffle(token_list.slice(i, j));
        token_list_reordered.push(...n_gram);
        i = j;
    }
    return token_list_reordered;
}
/**
 * Scramble a text chunk by chunk while keeping its delimiters in place.
 *
 * The input is cut at every delimiter; each chunk between delimiters is
 * tokenized and locally reordered, and the delimiter itself is copied to the
 * output unchanged.
 *
 * A character counts as a delimiter when `is_seperator` accepts it or when
 * unary plus converts it to a number rather than NaN. That numeric test
 * matches digits, and also whitespace characters, which convert to 0.
 *
 * @param {string} src_txt - Text to scramble; iterated by code point.
 * @returns {string} The scrambled text ("" for empty input).
 */
function sentencize(src_txt) {
    const reordered_txt = [];
    let sentence = "";

    // Emit the pending chunk (scrambled) and start a new one.
    const flush_sentence = () => {
        if (sentence !== "") {
            reordered_txt.push(...reorder(tokenize(sentence)));
            sentence = "";
        }
    };

    for (const c of src_txt) {
        if (is_seperator(c) || !Number.isNaN(+c)) {
            flush_sentence();
            // Delimiters keep their position in the output.
            reordered_txt.push(c);
        } else {
            sentence += c;
        }
    }
    // The text may end without a trailing delimiter.
    flush_sentence();
    return reordered_txt.join("");
}
<!-- Input area: the text typed here is what gets scrambled. -->
<textarea id="src" name="message" rows="10" cols="30"></textarea>
<!--
  Read the textarea through .value (innerHTML only holds the initial markup,
  not what the user typed) and write the result with textContent so the
  user's text is never parsed as HTML.
-->
<button type="button"
onclick="document.getElementById('demo').textContent = sentencize(document.getElementById('src').value)">Reorder</button>
<!-- Output area for the scrambled text. -->
<p id="demo"></p>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment