Skip to content

Instantly share code, notes, and snippets.

@nexpr
Last active October 26, 2023 14:52
Show Gist options
  • Save nexpr/88e72532ac51ae7dc227d9a1df73054e to your computer and use it in GitHub Desktop.
Save nexpr/88e72532ac51ae7dc227d9a1df73054e to your computer and use it in GitHub Desktop.
ワード抽出用

これは?

ライブドアブログのダンプデータからワードを抽出するためのページ

JavaScript とか Node.js とか Web Components とか C# とか .NET とか...
主にプログラミング言語やライブラリやツールや機能などの名前を取り出すのが目的

HTML データから不要なタグを除外した上で改行を維持して文字列として取得したいのでブラウザを使う

半角スペースや記号を名前に含むものがあるのでほとんどの記号を含む半角文字列を正規表現で取り出して除外パターンと一致しないのを残す
foo - bar とか foo.bar とか foo/bar とか foo, bar とかも入ってきてキリがないので ある程度除外したら出現回数でソート

重たくなったので画面には表示せずコンソールで使用
console.table で表示する関数と結果がグローバル変数になっているので 解析後にコンソールで次のようなコードを打って表示

printResult(result)
printResult(result.slice(100))
printResult(result.filter(x => x.norm.includes(".js")))
<!DOCTYPE html>
<script type="module">
// Runs the analysis on the file chosen in the file picker.
// `input` is the element with id "input" (the container <div>); the change event
// of the nested <input type="file"> bubbles up to it, so event.target is the picker.
input.onchange = async (event) => {
  const picker = event.target
  // Take the File before clearing the value below.
  const file = picker.files?.[0]
  // Clear the selection (presumably so that picking the same file again fires "change").
  picker.value = ""
  // No file available: nothing to analyse.
  if (!file) return
  const str = await file.text()
  // Awaited so that a failure inside process() rejects this handler's promise
  // instead of a detached one.
  await process(str)
}
// Extracts the body sections from the text of a blog export.
// A section starts after a line that is exactly "BODY:" or "EXTENDED BODY:" and ends
// at a line that is exactly "-----". Each section is yielded as one string in which
// every line is terminated by "\n". Lines outside of a section are ignored.
const parse = function* (text) {
  const section_headers = new Set(["BODY:", "EXTENDED BODY:"])
  let in_section = false
  let buf = ""
  // Accept CRLF as well as LF: with a plain "\n" split the trailing "\r" would keep
  // the marker lines from ever matching and nothing would be yielded.
  for (const line of text.split(/\r?\n/)) {
    if (section_headers.has(line)) {
      in_section = true
      continue
    }
    if (!in_section) continue
    if (line === "-----") {
      in_section = false
      yield buf
      buf = ""
      continue
    }
    buf += line + "\n"
  }
  // Input that ends inside a section (no closing "-----") still yields what was collected.
  if (in_section && buf) yield buf
}
// Counts candidate words over every body produced by parse(str) and stores the ranking
// in globalThis.result as an array of { norm, total, block, texts }:
//   norm  - the normalized word (see norm())
//   total - number of matches of that word
//   block - number of bodies in which the word appeared at least once
//   texts - Set of the spellings seen before normalization
// The array is sorted by block, then total, then number of spellings, all descending.
// Progress and elapsed time are reported through console.log.
const process = async (str) => {
  console.log("start", str.length)
  const started_at = Date.now()
  // A Map rather than a plain object: the keys come from arbitrary text, and on a plain
  // object a key such as "constructor" or "__proto__" resolves to an inherited member
  // instead of "not counted yet", which makes the counting below throw.
  const counts = new Map()
  // Runs of ASCII letters, digits, spaces and most ASCII symbols.
  const candidate_pattern = /[-_!#$%&@^~';:+*,./\\ A-Za-z0-9]+/g
  for (const body of parse(str)) {
    // Parse inside a <template> first so that images etc. are not loaded, and remove
    // the unwanted elements from its document fragment.
    const tpl = document.createElement("template")
    tpl.innerHTML = body
    for (const elem of tpl.content.querySelectorAll("div.code,code,script,style,img,video,iframe")) {
      // Replace with an ideographic space (U+3000), which candidate_pattern does not
      // match, so that the text before and after the removed element is not joined
      // into one word. An ASCII space would be matched and would join them.
      elem.replaceWith("\u3000")
    }
    // Move the cleaned content into the #tmp element and read it back as text
    // (presumably because innerText of a rendered element keeps the line breaks).
    tmp.replaceChildren(tpl.content)
    const text = tmp.innerText.replaceAll("\u00a0", " ")
    const seen_in_body = new Set()
    for (const matched of text.match(candidate_pattern) ?? []) {
      const word = matched.trim()
      if (!word || skip(word)) continue
      const norm_word = norm(word)
      let count = counts.get(norm_word)
      if (!count) {
        count = { total: 0, block: 0, set: new Set() }
        counts.set(norm_word, count)
      }
      count.total++
      count.set.add(word)
      // block is incremented once per body, on the first occurrence only.
      if (!seen_in_body.has(norm_word)) {
        count.block++
        seen_in_body.add(norm_word)
      }
    }
    // Wait on a 1 ms timer between bodies (presumably to keep the page responsive).
    await new Promise(r => setTimeout(r, 1))
  }
  console.log("end", counts.size)
  console.log("time", Date.now() - started_at)
  globalThis.result = [...counts]
    .map(([norm_word, count]) => ({ norm: norm_word, total: count.total, block: count.block, texts: count.set }))
    .sort((a, b) => b.block - a.block || b.total - a.total || b.texts.size - a.texts.size)
}
// Regexes that strip leading / trailing runs of the skip_with strings.
// They are built on first use (not at declaration time) because skip_with is declared
// further down in the file, and cached because they never change between calls.
let skip_trim_regexps = null
const get_skip_trim_regexps = () => {
  if (!skip_trim_regexps) {
    // Escape every regex metacharacter so any string in skip_with is taken literally.
    const alternatives = skip_with.map(sk => sk.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|")
    skip_trim_regexps = {
      leading: new RegExp(`^(${alternatives})+`),
      trailing: new RegExp(`(${alternatives})+$`),
    }
  }
  return skip_trim_regexps
}
// Decides whether a matched word is excluded from counting.
// Returns true when the word must be skipped, false when it is kept.
// The word is lowercased first; allow_words wins over every other rule. Otherwise the
// word is skipped when it is a single character, longer than 20 characters, made of more
// than 5 space-separated parts, listed in skip_words or matched by any skip_regexp.
// The skip_words / skip_regexp rules are then applied again after stripping leading and
// trailing skip_with strings.
const skip = (word) => {
  const lword = word.toLowerCase()
  if (allow_words.has(lword)) return false
  if (lword.length === 1 || lword.length > 20 || lword.split(" ").length > 5) return true
  const is_rejected = (candidate) =>
    skip_words.has(candidate) || skip_regexp.some(regexp => regexp.test(candidate))
  if (is_rejected(lword)) return true
  const { leading, trailing } = get_skip_trim_regexps()
  const trimmed = lword.replace(leading, "").replace(trailing, "")
  // Nothing, or a single character, is left once the surrounding symbols are removed.
  if (trimmed.length <= 1) return true
  return is_rejected(trimmed)
}
// Builds the key under which different spellings of a word are counted together.
// Only the case is folded. Symbols are kept, because removing them would merge noise
// such as foo-bar, foo__bar, foo.bar and foo/bar and inflate the counts. Full-width
// letters and digits never reach this function since the word pattern does not match them.
const norm = (word) => word.toLowerCase()
// Prints the first `limit` rows (100 by default) of a result array with console.table.
//   result - array of { norm, total, block, texts } as stored in globalThis.result
//   limit  - maximum number of rows to print
// Each printed row has the columns block, total, norm and text, where text is the
// spellings in `texts` joined with "、".
globalThis.printResult = (result, limit = 100) => {
  const rows = result.slice(0, limit).map(({ block, total, norm, texts }) => ({
    block,
    total,
    norm,
    text: [...texts].join("、"),
  }))
  console.table(rows)
}
// Lowercased words that skip() keeps before applying any other rule
// (e.g. single-letter names, which would otherwise be rejected by the length check).
const allow_words = new Set([".net", "c", "d", "r", "v8"])
// Single-character strings that skip() strips from both ends of a word before it
// re-checks the word against skip_words and skip_regexp.
const skip_with = [...".-_,;:!@/"]
// Words that skip() always rejects. skip() looks a word up here after lowercasing it,
// and once more after stripping leading / trailing skip_with strings, so every entry
// must be written in lowercase.
const skip_words = new Set([
'""',
"$_get",
"$_post",
"&&",
"''",
"**",
"***",
"*/",
"++",
"--",
".cs",
".css",
".gitignore",
".gitkeep",
".html",
".js",
".jsx",
".php",
".py",
".ts",
".tsx",
".vue",
"/*",
"/**/",
"//",
"///",
"1 + 1",
"1 + 2",
"1, 2",
"1, 2, 3",
"1.js",
"16 bit",
"16bit",
"1e1",
"2.js",
"2d",
"3, 4",
"3.js",
"32 bit",
"32bit",
"3d",
"64 bit",
"64bit",
":nth-child",
":nth-of-type",
"[:]",
"[]",
"\\n",
"\\r",
"\\s",
"\\t",
"__",
"``",
"a, b",
"a, b, c",
"a.js",
"a.php",
"a.prototype",
"a.py",
"a1",
"a2",
"a3",
"a4",
"a5",
"aa",
"aaa",
"abc",
"abcd",
"abort",
"about",
"absolute",
"abstract",
"access",
"action",
"active",
"add",
"addeventlistener",
"addon",
"adduser",
"admin",
"adopt",
"after",
"ai",
"ajax",
"alarm",
"alert",
"alias",
"align",
"align-items",
"align-self",
"all",
"alpha",
"already",
"alt",
"alter",
"amd",
"ameria",
"an",
"analytics",
"and",
"animate",
"animation",
"anonymous",
"answer",
"any",
"anywhere",
"api",
"app",
"app.config",
"app1",
"app2",
"app3",
"append",
"appendchild",
"application",
"application/json",
"apply",
"are",
"arg",
"args",
"arguments",
"arial",
"arm",
"arr",
"array",
"array.from",
"array.of",
"array.prototype",
"arraybuffer",
"arrays",
"arror function",
"arrow",
"article",
"as",
"ascii",
"aside",
"assert",
"asset",
"assign",
"assignment",
"async",
"async/await",
"asyncfunction",
"at",
"atob",
"attach",
"attached",
"attribute",
"author",
"auto",
"autocomplete",
"available",
"await",
"b1",
"b2",
"b3",
"back",
"backend",
"background",
"background-color",
"background-image",
"background-size",
"background.js",
"backspace",
"backward",
"bar",
"bar.html",
"bar.js",
"bar.php",
"bar.py",
"base",
"basic",
"baz",
"be",
"been",
"before",
"before/after",
"beforeunload",
"beta",
"bgm",
"bigint",
"bin",
"bind",
"binding",
"bindings",
"bit",
"black",
"blank",
"blob",
"block",
"blocking",
"blog",
"blue",
"blur",
"body",
"bom",
"book",
"bool",
"boolean",
"border",
"border-box",
"border-radius",
"born",
"bot",
"both",
"bottom",
"box",
"box-shadow",
"box-sizing",
"br",
"break",
"bridge",
"browser",
"btoa",
"buf",
"buffer",
"build",
"builtin",
"builtins",
"bundle",
"bundle.js",
"but",
"button",
"button1",
"button2",
"button3",
"by",
"byte",
"bytes",
"c1",
"c2",
"c3",
"cache",
"calc",
"call",
"callable",
"callback",
"can",
"canary",
"cancel",
"cannot",
"canvas",
"cascade",
"case",
"case1",
"case2",
"case3",
"cast",
"cat",
"catch",
"cb",
"cd",
"cdn",
"celeron",
"center",
"change",
"changed",
"changelog",
"char",
"charset",
"check",
"checkbox",
"checked",
"child",
"childnodes",
"children",
"chmod",
"chown",
"cjs",
"class",
"classlist",
"clean",
"clear",
"click",
"client",
"clone",
"close",
"closed",
"cluster",
"co",
"coalesce",
"code",
"col",
"col1",
"col2",
"col3",
"collection",
"color",
"colspan",
"column",
"columns",
"command",
"command1",
"command2",
"command3",
"comment",
"comments",
"commit",
"commiter",
"committed",
"common",
"community",
"complex",
"component",
"components",
"compress",
"computed",
"concat",
"config",
"configs",
"configurable",
"configure",
"confine",
"confirm",
"conflict",
"connect",
"connected",
"consolas",
"console",
"console.log",
"console.writeline",
"const",
"constructor",
"container",
"contains",
"content",
"content script",
"content scripts",
"content-length",
"content-script",
"content-scripts",
"content-type",
"content_script",
"content_scripts",
"contenteditable",
"contents",
"context",
"continue",
"control",
"controller",
"controls",
"convert",
"converter",
"cookie",
"cookies",
"core",
"correct",
"count",
"counter",
"cp",
"cpu",
"cr",
"create",
"createelement",
"creator",
"creators",
"crlf",
"crypto",
"ctrl",
"ctx",
"cui",
"curl",
"current",
"currenttarget",
"custom",
"customevent",
"cwd",
"d&d",
"daemon",
"daily",
"data",
"database",
"datalist",
"dataset",
"datauri",
"dataurl",
"date",
"datepicker",
"datetime",
"datetime-local",
"day",
"days",
"db",
"dd",
"debug",
"debugger",
"decimal",
"declared",
"decode",
"decodeuricomponent",
"decrypt",
"deep",
"def",
"default",
"defer",
"define",
"definition",
"definitions",
"delay",
"delegate",
"delegatetarget",
"delete",
"demo",
"dependencies",
"deploy",
"deprecated",
"deps",
"deref",
"design",
"detach",
"detached",
"dev",
"devdependencies",
"developer",
"df",
"dialog",
"dict",
"dir",
"directive",
"directories",
"directory",
"directoryinfo",
"dirent",
"dirname",
"dirty",
"disable",
"disabled",
"dispatch",
"dispatchevent",
"display",
"displayname",
"disposable",
"dispose",
"dist",
"div",
"divide",
"divider",
"dll",
"do",
"doc",
"docs",
"docstring",
"doctype",
"document",
"document.body",
"document.head",
"document.write",
"documentfragment",
"documents",
"dom",
"done",
"double",
"down",
"download",
"dpi",
"drag",
"draggable",
"dragstart",
"drain",
"drop",
"dry",
"du",
"dummy",
"dump",
"dynamic",
"early",
"ease",
"ease-in",
"ease-out",
"easing",
"edit",
"edition",
"editor",
"effect",
"elem",
"element",
"elements",
"elif",
"else",
"else if",
"elseif",
"elsif",
"em",
"emit",
"emitter",
"empty",
"emulate",
"emulator",
"en",
"enable",
"enabled",
"encode",
"encodeuricomponent",
"encoding",
"encrypt",
"end",
"endif",
"endswith",
"energy",
"enter",
"enterprise",
"entries",
"entry",
"enum",
"enumerate",
"env",
"eof",
"eol",
"eos",
"eq",
"equals",
"err",
"error",
"errors",
"esc",
"escape",
"esm",
"etc",
"eval",
"evaluate",
"event",
"eventemitter",
"eventemitter2",
"eventemitter3",
"eventtarget",
"every",
"ex",
"example",
"examples",
"except",
"exception",
"exe",
"exec",
"execute",
"exit",
"experiment",
"experimental",
"explorer",
"explorer.exe",
"export",
"exports",
"extend",
"extends",
"extension",
"external",
"extra",
"f1",
"f10",
"f11",
"f12",
"f2",
"f3",
"f4",
"f5",
"f6",
"f7",
"f8",
"f9",
"fail",
"failed",
"failure",
"false",
"faq",
"fatal",
"fieldset",
"file",
"fileinfo",
"filename",
"files",
"fill",
"filter",
"finally",
"find",
"findlast",
"finish",
"finished",
"first",
"first-child",
"firstchild",
"fit",
"fit-content",
"fix",
"fixed",
"flag",
"flags",
"flat",
"flex",
"flex container",
"flex item",
"flex-basis",
"flex-container",
"flex-item",
"flex-start",
"flexbox",
"float",
"flow",
"fn",
"fn1",
"fn2",
"fn3",
"fns",
"focus",
"focused",
"focusin",
"focusout",
"follow",
"font-family",
"font-size",
"foo",
"foo-bar",
"foo.html",
"foo.js",
"foo.php",
"foo.py",
"foo_bar",
"foobar",
"footer",
"for",
"for await of",
"for in",
"for of",
"for-await-of",
"for-in",
"for-of",
"force",
"foreach",
"form",
"form1",
"form2",
"form3",
"form4",
"format",
"formdata",
"forward",
"found",
"fragment",
"frame",
"free",
"freeze",
"from",
"front",
"frontend",
"fs",
"fs.promises",
"fs.readdir",
"fs.readfile",
"fs.rmdir",
"fs.writefile",
"full",
"fullscreen",
"fun",
"func",
"function",
"function.prototype",
"gap",
"gb",
"gc",
"generator",
"generatorfunction",
"generics",
"get",
"get-childitem",
"get-command",
"get-content",
"get-help",
"get/post",
"getattribute",
"getelementbyid",
"getelementsbyclassname",
"getelementsbytagname",
"getter",
"getter / setter",
"getter/setter",
"getvalue",
"gif",
"global",
"globalthis",
"god",
"gothic",
"gpu",
"grant",
"green",
"grep",
"grid",
"group",
"grow",
"gt",
"guest",
"gui",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"handler",
"has",
"have",
"having",
"hdd",
"head",
"header",
"heading",
"height",
"hello",
"help",
"helper",
"helpers",
"here",
"hidden",
"hide",
"history",
"history.push",
"home",
"hook",
"host",
"hosts",
"hour",
"hover",
"hr",
"href",
"hsl",
"hue",
"i3",
"i5",
"i7",
"i9",
"icon",
"id",
"ide",
"if",
"iframe",
"ignore",
"iife",
"illegal",
"image",
"ime",
"img",
"immutable",
"import",
"import.meta",
"important",
"importmap",
"in",
"include",
"includes",
"incorrect",
"index",
"index.html",
"index.js",
"index.php",
"indexof",
"inert",
"inf",
"infinity",
"info",
"information",
"inherit",
"init",
"initialization",
"inline",
"inline-block",
"inner",
"innerhtml",
"innertext",
"input",
"input/output",
"insert",
"inspect",
"install",
"installer",
"instanceof",
"int",
"int8array",
"integer",
"integrate",
"intel",
"interface",
"internal",
"intersect",
"intersection",
"intersectionobserver",
"invalid",
"invocation",
"invoke",
"invoke-webrequest",
"io",
"ip",
"is",
"isolate",
"isolation",
"isostring",
"isset",
"issue",
"issues",
"it",
"item",
"items",
"iterable",
"iterator",
"iwr",
"ja",
"job",
"join",
"joiner",
"jpg",
"json_decode",
"json_encode",
"justify-content",
"kb",
"key",
"key-value",
"keycode",
"keydown",
"keypress",
"keys",
"keyup",
"keyvalue",
"keyword",
"kill",
"known",
"kv",
"kyiv",
"label",
"lang",
"language",
"large",
"last",
"lastindexof",
"latest",
"layer",
"layers",
"learn",
"lecture",
"left",
"len",
"length",
"less",
"let",
"let/const",
"level",
"lf",
"li",
"lib",
"library",
"license",
"light",
"limit",
"line",
"line-height",
"linear",
"lines",
"link",
"list",
"listen",
"listener",
"listeners",
"live",
"ll",
"lnk",
"load",
"loaded",
"loading",
"local",
"locale",
"localecompare",
"localname",
"localstorage",
"location",
"location.hash",
"location.href",
"lock",
"log",
"logger",
"logic",
"logic1",
"logic2",
"logic3",
"long",
"lower",
"ls",
"lt",
"lts",
"machine",
"main",
"main.html",
"main.js",
"maintenance",
"mainwindow",
"manage",
"manager",
"manifest",
"manual",
"map",
"margin",
"margin-bottom",
"margin-left",
"margin-right",
"margin-top",
"margin/padding",
"mark",
"marked",
"marker",
"markers",
"marquee",
"master",
"match",
"matchall",
"matches",
"math",
"math.ceil",
"math.floor",
"math.pow",
"math.round",
"max",
"maxlength",
"may",
"mb",
"mdn",
"measure-command",
"media",
"meiryo",
"meiryoui",
"memo",
"memory",
"menu",
"merge",
"message",
"messagebox",
"meta",
"method",
"methods",
"microtask",
"middle",
"middleware",
"mime",
"min",
"minify",
"minute",
"mkdir",
"mode",
"model",
"module",
"modules",
"monorepo",
"month",
"more",
"most",
"mount",
"mounted",
"ms",
"msg",
"multipart/form-data",
"multiple",
"must",
"mutable",
"mv",
"name",
"namespace",
"nan",
"nat",
"native",
"nav",
"navigate",
"navigation",
"navigator",
"necessary",
"net",
"network",
"new",
"new class",
"new date",
"new promise",
"new regexp",
"next",
"nightly",
"nil",
"no",
"no-save",
"nodetype",
"nomodule",
"none",
"noopener",
"normal",
"not",
"notice",
"now",
"nowrap",
"nul",
"null",
"nullable",
"number",
"obj",
"obj1",
"obj2",
"obj3",
"object",
"object.create",
"object.entries",
"object.keys",
"object.prototype",
"object.values",
"observe",
"observer",
"of",
"off",
"office",
"offset",
"ok",
"omit",
"on",
"on*",
"on/off",
"once",
"onchange",
"onclick",
"one",
"onerror",
"oninput",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"only",
"onrequest",
"opacity",
"open",
"open/close",
"opener",
"operator",
"option",
"optional",
"options",
"or",
"orange",
"order",
"os",
"osaka",
"oss",
"out",
"outer",
"outerhtml",
"outlet",
"output",
"over",
"overflow",
"overload",
"override",
"pack",
"package",
"packages",
"padding",
"page",
"page.html",
"page1",
"page1.js",
"page2",
"pagehide",
"pages",
"pageshow",
"paragraph",
"param",
"parameter",
"parameters",
"parent",
"parse",
"parsefloat",
"parseint",
"parser",
"part",
"partition",
"pass",
"password",
"patch",
"path",
"pathname",
"pause",
"pc",
"performance",
"permission",
"permissions",
"phase",
"php -a",
"php.ini",
"php://input",
"pick",
"pid",
"pipe",
"pkill",
"platform",
"plugin",
"plugins",
"png",
"polyfill",
"ponyfill",
"pop",
"popover",
"popover1",
"popover2",
"popstate",
"popup",
"popup.html",
"popup.js",
"popup1",
"port",
"position",
"post",
"postmessage",
"pr",
"pre",
"prefix",
"preflight",
"preload",
"prepend",
"pretty",
"preview",
"print",
"private",
"pro",
"process",
"process.env",
"process.exit",
"profile",
"profiles",
"program",
"program files",
"programming",
"proj",
"project",
"projects",
"promise",
"promise.all",
"promise.reject",
"promise.resolve",
"promises",
"prompt",
"prop",
"properties",
"property",
"proposal",
"props",
"protected",
"proto",
"protocol",
"prototype",
"provider",
"proxy",
"prune",
"public",
"publish",
"pull",
"push",
"pushstate",
"put",
"pwd",
"px",
"qa",
"query",
"queryselector",
"queryselectorall",
"question",
"queue",
"queuemicrotask",
"qux",
"radio",
"raise",
"range",
"rank",
"rate",
"raw",
"rc",
"re",
"read",
"readable",
"readdir",
"reader",
"readifile",
"readline",
"readme",
"readonly",
"ready",
"real",
"rebase",
"reboot",
"recommend",
"rect",
"rectangle",
"red",
"redirect",
"reduce",
"ref",
"referer",
"referrer",
"reflection",
"regex",
"regexp",
"reject",
"relative",
"release",
"releases",
"remote",
"remove",
"removeeventlistener",
"render",
"renderer",
"rendering",
"repeat",
"repeatable",
"repl",
"replace",
"replacechildren",
"replacer",
"replacestate",
"replacewith",
"reply",
"repo",
"repository",
"req",
"request",
"requestupdate",
"require",
"required",
"requires",
"rerender",
"res",
"reset",
"resolve",
"resource",
"response",
"rest",
"restart",
"restrict",
"result",
"resume",
"return",
"returning",
"reverse",
"rewrite",
"rfc",
"rgb",
"right",
"rm",
"rmdir",
"role",
"root",
"rotate",
"route",
"router",
"routes",
"row",
"row1",
"row2",
"row3",
"rows",
"rowspan",
"rule",
"run",
"runtime",
"safe",
"sample",
"sandbox",
"save",
"save-exact",
"say",
"sbin",
"scale",
"schema",
"scheme",
"scope",
"screen",
"script",
"scripts",
"scroll",
"sd",
"sdk",
"sealed",
"search",
"sec",
"second",
"secret",
"section",
"secure",
"security",
"see",
"select",
"select-object",
"select-string",
"selected",
"selectmany",
"selector",
"selectors",
"self",
"send",
"sendmessage",
"separate",
"separator",
"seq",
"serializable",
"serve",
"server",
"service",
"session",
"sessionstorage",
"set",
"set-cookie",
"setattribute",
"setimmediate",
"setinterval",
"setstate",
"setter",
"settimeout",
"setup",
"setvalue",
"shadow",
"shadowdom",
"shadowroot",
"shallow",
"sharedarraybuffer",
"sheet",
"shift",
"ship",
"shipped",
"short",
"should",
"show",
"shrink",
"shutdown",
"sign",
"signal",
"simple",
"simulate",
"since",
"size",
"skip",
"sleep",
"slice",
"slot",
"small",
"sns",
"socket",
"solution",
"some",
"something",
"sort",
"sorted",
"source",
"sources",
"space",
"span",
"spawn",
"spec",
"specific",
"splice",
"split",
"splitter",
"spread",
"square",
"src",
"srgb",
"ssd",
"stable",
"stack",
"stage",
"stage1",
"stage2",
"stage3",
"stage4",
"standalone",
"standard",
"star",
"start",
"started",
"startswith",
"startup",
"state",
"state1",
"state2",
"statement",
"static",
"status",
"std",
"stderr",
"stdin",
"stdio",
"stdout",
"stdout/stderr",
"step",
"sticky",
"stop",
"stopped",
"storage",
"store",
"str",
"stream",
"stretch",
"strict",
"strict mode",
"strictmode",
"string",
"string.fromcharcode",
"string.raw",
"strings",
"strong",
"style",
"stylemap",
"styles",
"stylesheet",
"sub",
"subarray",
"substr",
"succeed",
"success",
"suffix",
"sum",
"sup",
"super",
"support",
"switch",
"symbol",
"symbol.iterator",
"sync",
"syntax",
"syntax error",
"syntaxerror",
"sys.path",
"syspath",
"system",
"system.object",
"tab",
"tab-size",
"tabindex",
"table",
"tag",
"tagname",
"take",
"target",
"task",
"taskhost",
"tb",
"tbody",
"td",
"temp",
"template",
"terminal",
"test",
"test.js",
"test1",
"test2",
"test3",
"testing",
"tests",
"text",
"text-indent",
"text-shadow",
"text/css",
"text/html",
"text/plain",
"textarea",
"textbox",
"textcontent",
"textdecoder",
"textencoder",
"textnode",
"tfoot",
"th",
"that",
"the",
"thead",
"their",
"then",
"there",
"think",
"third",
"this",
"this.value",
"thread",
"threshold",
"throw",
"time",
"timeout",
"timestamp",
"title",
"tmp",
"to",
"toarray",
"today",
"todo",
"tofixed",
"toggle",
"token",
"tokyo",
"tolist",
"tolocalestring",
"tolowercase",
"tool",
"tools",
"top",
"tostring",
"total",
"touch",
"touppercase",
"tr",
"transform",
"transition",
"translate",
"transparent",
"tree",
"triangle",
"trigger",
"trim",
"true",
"true/false",
"try",
"try-catch",
"try-finally",
"two",
"type",
"typedarray",
"typeerror",
"typeof",
"ua",
"ui",
"uint8array",
"uk",
"ukraine",
"ul",
"uncaught",
"undefined",
"unexpected",
"unhandled",
"uninstall",
"union",
"uniq",
"unique",
"unknown",
"unload",
"unpack",
"unref",
"unsafe",
"unshift",
"unship",
"unshipped",
"unstable",
"untitled",
"up",
"update",
"upgrade",
"upload",
"upper",
"uri",
"url",
"url.createobjecturl",
"urlsearchparams",
"us",
"usa",
"usage",
"usb",
"use",
"use strict",
"usecallback",
"usecontext",
"useeffect",
"useform",
"usememo",
"user",
"user-agent",
"user1",
"user2",
"user3",
"user:pass",
"useradd",
"useragent",
"usercontrol",
"usercontrol1",
"usercontrol2",
"usercontrol3",
"username",
"users",
"using",
"util",
"utility",
"utils",
"val",
"valid",
"validator",
"value",
"value1",
"value2",
"value3",
"valueof",
"values",
"var",
"var_dump",
"vector",
"ver",
"verdana",
"version",
"vh",
"video",
"view",
"viewport",
"virtual",
"visibility",
"visible",
"vm",
"void",
"volume",
"vw",
"wait",
"warn",
"warning",
"was",
"watch",
"wc",
"we",
"weak",
"weakmap",
"weakref",
"weakset",
"web",
"web.config",
"webstorage",
"weight",
"were",
"wget",
"wheel",
"where",
"where-object",
"while",
"white",
"width",
"width/height",
"wiki",
"wikipedia",
"wildcard",
"will",
"win",
"window",
"window.close",
"window.onload",
"window.open",
"wire",
"with",
"work",
"worker",
"workspace",
"world",
"wrap",
"writable",
"write",
"writer",
"ws",
"x-www-urlencoded",
"x.js",
"x.y.z",
"x64",
"x86",
"xhr",
"xor",
"xx",
"xxx",
"xy",
"xyz",
"year",
"years",
"yellow",
"yes",
"yes/no",
"yield",
"you",
"your",
"yy",
"z-index",
"zero",
"zoom",
"zz",
"zzz",
"||",
])
// Patterns that make skip() reject a word. Each one is tested against the lowercased
// word, and again against the word with leading / trailing skip_with strings stripped.
const skip_regexp = [
/^(https?|file|data):\/\//, // URLs
/^-?[\d., ]+$/, // numbers
/\.(jpg|png|gif)$/, // image files; language-like extensions are deliberately not rejected so that names such as node.js are kept
/^(ctrl|shift|win|alt)[-+]/, // key combinations such as ctrl-...
/^(\d{4}[-\/])?\d{1,2}[-\/]\d{1,2}$/, // dates
/^\d{1,2}:\d{1,2}$/, // times
/-?[\d.]+ ?(byte|k|ki?b|m|mi?b|g|gi?b|t|ti?b|ms|s|sec|min|px|%)/, // units. NOTE(review): not anchored, so it also matches inside longer words (e.g. "k8s") - confirm this is intended
/localhost/,
/^-[a-z0-9]$/, // command-line options
/^c:\\/,
/^\$\d+$/, // $1, $2 and so on
/^(\.)+\//, // anything starting with ./ is a path
/^[,\/^~\\]/, // a word starting with a symbol such as "," is unlikely to be a name
/[,:;\/]$/, // most trailing symbols are not part of the name; the symbol-free form usually exists separately, so drop these
/^[^A-Za-z0-9]+$/, // symbols only: unlikely to be a name
/[^A-Za-z0-9]{4,}/, // a long run of symbols: unlikely to be a name
/x{3,}/, // placeholders such as xxxx are common
/^0[xbo]/, // base-N literals such as 0x or 0b
/^0\./, // decimals starting with "0." are mostly random values
/^v[\d.]+$/, // v1, v2 and so on; v8 is handled through the allow list
/^html.*element$/,
/^u\+[a-f0-9]{4,5}/, // code points such as U+00A0
/^[a-z]+: *[a-z]+$/, // CSS declarations and object entries such as overflow:hidden or writable: false; nothing needed appeared to match when checked by eye
]
</script>
<div id="input"><input type="file"></div>
<div id="result"></div>
<div id="tmp" style="position:fixed;top:100vh"></div>
index block total norm text
1 984 2399 javascript JavaScript、Javascript、javascript
2 578 1178 chrome Chrome、chrome
3 472 1225 html HTML、html
4 344 764 node.js Node.js、node.js
5 331 742 windows Windows、windows
6 326 672 css CSS、css
7 246 686 php PHP、php
8 241 442 firefox Firefox、firefox
9 184 398 devtools devtools、DEVTOOLS、DevTools、Devtools
10 170 307 c# C#、c#
11 169 404 python Python、python
12 158 387 ie IE
13 154 350 json json、JSON
14 145 306 linux Linux、linux
15 142 416 react React、react
16 139 280 c C、c
17 129 216 google Google、google
18 118 245 npm npm
19 117 220 fetch fetch、FETCH
20 108 169 github github、Github
21 101 178 fedora fedora、Fedora
22 93 200 webcomponents WebComponents、webcomponents
23 90 256 lit-html lit-html
24 80 256 jquery jQuery
25 79 175 http http、HTTP
26 77 167 vscode VSCode、vscode
27 69 129 edge Edge
28 68 175 webpack webpack、Webpack
29 65 217 wsl WSL、wsl
30 65 159 package.json package.json
31 65 98 es6 ES6、es6
32 64 162 wpf WPF、wpf
33 61 144 node_modules node_modules
34 60 172 hyperhtml hyperHTML、hyperhtml
35 55 97 visualstudio VisualStudio
36 53 80 node Node、node
37 52 166 https https、HTTPS
38 52 98 d D、d
39 51 138 electron electron、Electron
40 50 87 gist gist、Gist
41 49 111 sql SQL、sql
42 48 89 .net .NET、.net
43 48 89 ie11 IE11
44 47 220 typescript TypeScript、Typescript、typescript
45 47 146 spa SPA
46 47 87 xml XML、xml
47 47 83 dnf dnf
48 47 70 c++ C++、c++
49 46 149 powershell PowerShell、powershell
50 46 123 babel Babel、babel
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment