Skip to content

Instantly share code, notes, and snippets.

@zonesan
Created February 11, 2020 07:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zonesan/c1be72bbc368eec1f1ac2590044e588e to your computer and use it in GitHub Desktop.
Save zonesan/c1be72bbc368eec1f1ac2590044e588e to your computer and use it in GitHub Desktop.
<!DOCTYPE html>
<html>
<script>
/**
 * Report whether a string consists solely of 7-bit ASCII characters.
 *
 * @param {string} str - Text to inspect (coerced to a string by the regex test).
 * @returns {boolean} True when every code unit is in U+0000..U+007F; the
 *   empty string counts as ASCII.
 */
function is_ascii(str) {
  const asciiOnlyPattern = /^[\x00-\x7F]*$/;
  return asciiOnlyPattern.test(str);
}
/**
 * Punctuation that terminates a shuffle segment.
 *
 * Non-ASCII entries are written as \u escapes so that the fullwidth forms
 * cannot be silently folded into their ASCII look-alikes by tooling that
 * normalizes the file.
 */
const SEPARATOR_CHARS = new Set([
  " ",
  ",",
  ".",
  ";",
  "?",
  "\u3002", // ideographic full stop
  "\uFF0C", // fullwidth comma
  "\uFF1B", // fullwidth semicolon
  "\uFF1F", // fullwidth question mark
  "\u300A", // left double angle bracket
  "\u300B", // right double angle bracket
]);

/**
 * Report whether a single character is a separator, i.e. a character that
 * sentencize() keeps in place rather than shuffling.
 *
 * @param {string} c - A single character.
 * @returns {boolean} True when c is one of the separator characters.
 */
function is_seperator(c) {
  return SEPARATOR_CHARS.has(c);
}
/**
 * Pick one element of an array uniformly at random.
 *
 * @param {Array} arr - Candidates to choose from.
 * @returns {*} A randomly selected element, or undefined when arr is empty.
 */
function random_choice(arr) {
  const pickedIndex = Math.floor(arr.length * Math.random());
  return arr[pickedIndex];
}
/**
 * Shuffle an array in place by walking from the end and swapping each slot
 * with a randomly chosen slot at or before it.
 *
 * @param {Array} array - The array to shuffle; it is mutated.
 * @returns {Array} The same array instance, now shuffled.
 */
function random_shuffle(array) {
  for (let remaining = array.length; remaining > 0; remaining -= 1) {
    // Choose among the slots that have not been fixed yet.
    const pick = Math.floor(Math.random() * remaining);
    const last = remaining - 1;
    const held = array[last];
    array[last] = array[pick];
    array[pick] = held;
  }
  return array;
}
/**
 * Split text into tokens: each run of consecutive ASCII characters becomes
 * one token, and every non-ASCII character becomes a token of its own.
 *
 * @param {string} src_txt - Text to split.
 * @returns {string[]} Tokens in their original order.
 */
function tokenize(src_txt) {
  const tokens = [];
  let asciiRun = "";

  // Emit the ASCII run collected so far, if there is one.
  const flushRun = () => {
    if (asciiRun === "") {
      return;
    }
    tokens.push(asciiRun);
    asciiRun = "";
  };

  for (const ch of src_txt) {
    if (is_ascii(ch)) {
      asciiRun += ch;
      continue;
    }
    flushRun();
    tokens.push(ch);
  }
  flushRun();

  return tokens;
}
/**
 * Scramble a token list locally: the list is cut into consecutive groups of
 * two or three tokens (size chosen at random per group, the final group may
 * be shorter) and each group is shuffled independently.
 *
 * @param {string[]} token_list - Tokens to scramble; not modified.
 * @returns {string[]} A new array holding the same tokens, group-shuffled.
 */
function reorder(token_list) {
  const groupSizes = [2, 3];
  const scrambled = [];

  for (let start = 0, end = 0; start < token_list.length; start = end) {
    const size = random_choice(groupSizes);
    end = Math.min(start + size, token_list.length);
    const group = token_list.slice(start, end);
    random_shuffle(group);
    scrambled.push(...group);
  }

  return scrambled;
}
/**
 * Scramble text segment by segment. A segment ends at any character that
 * is_seperator() accepts or that converts to a number (digits, and also
 * whitespace, which converts to 0). Each segment is tokenized and reordered;
 * the breaking characters themselves stay where they were.
 *
 * @param {string} src_txt - Text to scramble.
 * @returns {string} The scrambled text.
 */
function sentencize(src_txt) {
  const pieces = [];
  let pending = "";

  // Scramble the segment collected so far and start a new one.
  const flushPending = () => {
    pieces.push(...reorder(tokenize(pending)));
    pending = "";
  };

  for (const ch of src_txt) {
    if (!is_seperator(ch) && Number.isNaN(Number(ch))) {
      pending += ch;
      continue;
    }
    flushPending();
    pieces.push(ch);
  }

  if (pending !== "") {
    flushPending();
  }

  return pieces.join("");
}
</script>
<body>
<h1>研表究明,汉字的序顺并不定一能影阅响读</h1>
<textarea id="src" name="message" rows="10" cols="30">
研究表明,汉字的顺序并不一定能影响阅读。
</textarea>
<!-- Read the textarea's current text via .value (innerHTML only returns the
     HTML-escaped initial content and ignores user edits) and write the result
     as plain text via .textContent so user input is never parsed as markup. -->
<button type="button"
onclick="document.getElementById('demo').textContent = sentencize(document.getElementById('src').value)">
变!</button>
<p id="demo"></p>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment