Skip to content

Instantly share code, notes, and snippets.

@shuiRong
Created October 26, 2025 14:07
Show Gist options
  • Select an option

  • Save shuiRong/ec206728023d4e783d7c192e967119c7 to your computer and use it in GitHub Desktop.

Select an option

Save shuiRong/ec206728023d4e783d7c192e967119c7 to your computer and use it in GitHub Desktop.
How many words are there in Aozora Bunko?
Mix.install([
{:floki, "~> 0.38.0"},
{:mecab, "~> 1.0"},
{:iconv, "~> 1.0"},
{:charset_detect, "~> 0.1.3"}
])
# Start a Task.Supervisor registered as MyApp.TaskSupervisor; the concurrent
# tokenising stage later in this script runs its tasks under that name.
children = [
  {Task.Supervisor, name: MyApp.TaskSupervisor}
]

# Match on {:ok, _} so a failed start aborts the script right here instead of
# surfacing later as an exit when tasks are started under a missing supervisor.
{:ok, _supervisor} = Supervisor.start_link(children, strategy: :one_for_one)
# Root directory of the local corpus checkout. Set AOZORA_ROOT to point the
# script at another location; the default keeps the original hard-coded path.
aozora_root = System.get_env("AOZORA_ROOT", "/Users/xxxxx/aozorabunko")

# Every .html file found anywhere below the root directory.
html_files = Path.wildcard(Path.join(aozora_root, "**/*.html"))
# Runs Mecab.parse/1 on one article and returns the distinct lexical forms of
# the tokens that are kept.
#
# A token is discarded when any of the following holds:
#   * its lexical form is "*"
#   * its surface form is "EOS"
#   * its part of speech is 助詞 (particle) or 記号 (symbol)
parse_article = fn article ->
  # Every token must carry all three keys; a token without them raises.
  discard? = fn %{
                  "lexical_form" => lemma,
                  "part_of_speech" => pos,
                  "surface_form" => surface
                } ->
    lemma == "*" or surface == "EOS" or pos == "助詞" or pos == "記号"
  end

  tokens = Mecab.parse(article)
  kept = Enum.reject(tokens, discard?)

  kept
  |> Enum.map(fn %{"lexical_form" => lemma} -> lemma end)
  |> Enum.uniq()
end
# Plain text of the <body> element of every file in `html_files`, in the same
# order as `html_files`.
articles =
  for path <- html_files do
    raw = File.read!(path)

    # Pick the UTF-8 representation of the file based on the charset that
    # CharsetDetect reports; anything that is not already UTF-8 goes through iconv.
    utf8 =
      case CharsetDetect.guess!(raw) do
        "UTF-8" ->
          IO.puts("valid data")
          raw

        "Shift_JIS" ->
          IO.puts("invalid data, file: #{path}")
          :iconv.convert("SHIFT_JIS", "UTF-8", raw)

        other_encoding ->
          IO.puts("unknown encoding, file: #{path}")
          :iconv.convert(other_encoding, "UTF-8", raw)
      end

    body = utf8 |> Floki.parse_document!() |> Floki.find("body")
    Floki.text(body)
  end
# Tokenise every article concurrently under MyApp.TaskSupervisor, merge the
# per-article word lists into one de-duplicated list, and write that list to
# ./words.txt as comma-separated values.
Task.Supervisor.async_stream_nolink(MyApp.TaskSupervisor, articles, parse_article,
  max_concurrency: 200,
  ordered: false,
  timeout: 300_000,
  on_timeout: :kill_task
)
|> Enum.flat_map(fn
  {:ok, words} ->
    words

  # With async_stream_nolink and on_timeout: :kill_task, a crashed or timed-out
  # task is reported as {:exit, reason} rather than taking the caller down.
  # Skip that article and keep going instead of failing on an unmatched result.
  {:exit, reason} ->
    IO.puts(:stderr, "skipping article, task exited: #{inspect(reason)}")
    []
end)
|> Enum.uniq()
|> tap(fn words ->
  # Raise on a failed write: the clean-up stage reads this file back, so a
  # silently ignored error would make it fail or process stale data.
  File.write!("./words.txt", Enum.join(words, ","))
  IO.puts("done")
end)
# A word matching ANY of these patterns is dropped from the final list.
noise_patterns = [
  # Contains an accented/extended Latin letter (treated as garbled text)
  ~r/[\x{00C0}-\x{024F}]/u,
  # Exactly one full-width digit
  ~r/^[\x{FF10}-\x{FF19}]$/u,
  # Exactly one full-width English letter
  ~r/^[\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}]$/u,
  # Exactly one half-width English letter or digit
  ~r/^[0-9A-Za-z]$/,
  # Exactly one hiragana or katakana character (full-width or half-width)
  ~r/^[\x{3040}-\x{309F}\x{30A0}-\x{30FF}\x{31F0}-\x{31FF}\x{FF66}-\x{FF9D}]$/u
]

# Read the comma-separated word list back, filter out the noise entries and
# write the survivors to ./clean_words.txt in the same comma-separated format.
"./words.txt"
|> File.read!()
|> String.split(",")
|> Enum.reject(fn word ->
  Enum.any?(noise_patterns, fn pattern -> String.match?(word, pattern) end)
end)
|> tap(fn kept ->
  File.write!("./clean_words.txt", Enum.join(kept, ","))
end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment