Skip to content

Instantly share code, notes, and snippets.

@litaocheng
Created May 22, 2012 12:04
Show Gist options
  • Save litaocheng/2768621 to your computer and use it in GitHub Desktop.
Save litaocheng/2768621 to your computer and use it in GitHub Desktop.
从文件中提取中文
%%% Extract Chinese text from a file. Written as an answer to: http://erlangqa.com/382
%%% Assume src.txt contains:
%%% hello自由 !
%%% 2012年5月22日
%%% cheng
%%%
%%% Run:
%%% erlc match_zh.erl && erl -noinput -eval "match_zh:run(\"src.txt\"), init:stop()" && cat zh.txt
%%% L1 自由
%%% L2 日
%%% L2 月
%%% L2 年
-module(match_zh).
-compile([export_all]).
%% @doc Extract the Chinese text found in the file `Src' and write the result
%% to "zh.txt" (see do_match/1).
%%
%% `Src' is opened for reading in raw binary mode. Crashes with a `badmatch'
%% if the file cannot be opened. Returns whatever do_match/1 returns.
run(Src) ->
    %% {read_ahead, Size} is recommended for raw files that are consumed with
    %% file:read_line/1; without it every line read is unbuffered.
    {ok, Fd} = file:open(Src, [read, raw, binary, {read_ahead, 65536}]),
    try
        do_match(Fd)
    after
        %% Always release the descriptor, even if matching or writing crashes.
        file:close(Fd)
    end.
%% @doc Scan the open file `Fd' starting at line number 1, then write every
%% collected entry to the file "zh.txt".
%%
%% do_match/3 hands back its accumulator with the most recently read line
%% first, so the list is reversed before it is written.
%% Returns the result of file:write_file/2.
do_match(Fd) ->
    Collected = do_match(Fd, 1, []),
    Output = lists:reverse(Collected),
    file:write_file("zh.txt", Output).
%% @doc Read `Fd' line by line from its current position and collect every run
%% of characters in the range U+4E00..U+9FFF found on each line.
%%
%% `LineNo' is the number assigned to the next line read; `Acc' is the list of
%% entries collected so far. Each entry is the iolist
%% ["L", LineNumber, " ", MatchedBytes, "\n"].
%%
%% The entries of a line are prepended to `Acc' as a group, so the returned
%% list holds later lines first while the matches of a single line keep their
%% left-to-right order.
%%
%% On a read error a message is printed and the entries collected up to that
%% point are returned.
do_match(Fd, LineNo, Acc) ->
    %% Compile the pattern once instead of once per line. The `unicode' option
    %% must be given here: it cannot be passed to re:run/3 together with an
    %% already compiled pattern.
    {ok, Pattern} = re:compile("[\x{4e00}-\x{9fff}]+", [unicode]),
    do_match(Fd, LineNo, Acc, Pattern).

%% Reading loop of do_match/3, carrying the compiled pattern along.
do_match(Fd, LineNo, Acc, Pattern) ->
    case file:read_line(Fd) of
        eof ->
            Acc;
        {ok, Line} ->
            Entries = match_entries(Line, LineNo, Pattern),
            do_match(Fd, LineNo + 1, Entries ++ Acc, Pattern);
        {error, Reason} ->
            io:format("read line error:~w", [Reason]),
            Acc
    end.

%% Build one output entry per match of `Pattern' in `Line'; [] when the line
%% has no match.
match_entries(Line, LineNo, Pattern) ->
    case re:run(Line, Pattern, [global]) of
        nomatch ->
            [];
        {match, Matches} ->
            %% With `global' and the default capture, every match is a
            %% one-element list holding the {Offset, Length} of the whole match.
            [["L", erlang:integer_to_list(LineNo), " ",
              binary:part(Line, Pos, Len), "\n"]
             || [{Pos, Len}] <- Matches]
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment