Skip to content

Instantly share code, notes, and snippets.

@mworrell
Created July 19, 2024 09:56
Show Gist options
  • Save mworrell/e0b3e6ff16fd0f8e3f4d285d51525ec1 to your computer and use it in GitHub Desktop.
Save mworrell/e0b3e6ff16fd0f8e3f4d285d51525ec1 to your computer and use it in GitHub Desktop.
Rudimentary XLSX parser in Erlang
-module(parse_xlsx).
-export([
parse_file/1,
fetch_worksheet/1
]).
parse_file(File) ->
case fetch_worksheet(File) of
{ok, #{ worksheet := notfound }} ->
{error, noworksheet};
{ok, #{ worksheet := WorksheetBin } = Parsed} ->
Shared = parse_shared(Parsed),
WorksheetBin1 = remove_empty_rows(WorksheetBin),
WorksheetXML = mochiweb_html:parse(WorksheetBin1),
% io:format("~p~n~n", [WorksheetXML]),
{MaxRows, MaxCols, Idx} = make_rowlist( parse_worksheet(WorksheetXML, Shared) ),
RowData = lists:map(
fun(NR) ->
lists:map(
fun(NC) ->
maps:get({NR, NC}, Idx, <<>>)
end,
lists:seq(0,MaxCols))
end,
lists:seq(1, MaxRows)),
{ok, RowData};
{error, _} = Error ->
Error
end.
remove_empty_rows(Bin) ->
iolist_to_binary(re:replace(Bin, <<"<row[^>]*/>">>, <<>>, [ global ])).
make_rowlist(Rows) ->
lists:foldl(
fun
(Cells, Acc) ->
lists:foldl(
fun({Loc, Data}, {MaxRow, MaxCol, Idx}) ->
{NR, NC} = Lnum = loc_to_num(Loc),
MaxRow1 = max(MaxRow, NR),
MaxCol1 = max(MaxCol, NC),
Idx1 = Idx#{ Lnum => Data },
{MaxRow1, MaxCol1, Idx1}
end,
Acc,
Cells)
end,
{0, 0, #{}},
Rows).
loc_to_num(Loc) ->
{Letters, Numbers} = lists:partition(
fun(C) -> C >= $A andalso C =< $Z end,
binary_to_list(Loc)),
{list_to_integer(Numbers), letters_to_num(Letters)}.
letters_to_num(Letters) ->
lists:foldl(
fun(C, Acc) ->
Acc * 26 + (C-$A)
end,
0,
Letters).
parse_shared(#{ shared := SharedBin }) ->
case mochiweb_html:parse(SharedBin) of
{<<"sst">>, _, Elts} ->
% io:format("~p~n~n", [ Elts ]),
{_, Shared} = lists:foldl(
fun
({<<"si">>, _, Ts}, {N, Acc}) ->
V = case lists:keyfind(<<"t">>, 1, Ts) of
{<<"t">>, _, T} -> text(T);
false -> <<>>
end,
{N+1, Acc#{ N => V}};
(_, NAcc) ->
NAcc
end,
{0, #{}},
Elts),
Shared;
_ ->
#{}
end;
parse_shared(_) ->
#{}.
parse_worksheet({<<"worksheet">>, _Args, Elts}, Shared) ->
[ Data | _ ] = lists:filtermap(
fun
({<<"sheetData">>, _, SheetRows}) ->
{true, extract_rows(SheetRows, Shared)};
({<<"sheetdata">>, _, SheetRows}) ->
{true, extract_rows(SheetRows, Shared)};
(_) ->
false
end,
Elts),
Data.
extract_rows(Rs, Shared) ->
lists:filtermap(
fun
({<<"row">>, _, _} = R) -> {true, extract_row(R, Shared)};
(_) -> false
end,
Rs).
extract_row({<<"row">>, _RowArgs, Cells}, Shared) ->
lists:filtermap(
fun
({<<"c">>, Args, CellData}) ->
Data = map_cell(proplists:get_value(<<"t">>, Args), Args, CellData, Shared),
{true, {proplists:get_value(<<"r">>, Args), Data}};
(_) ->
false
end,
Cells).
map_cell(<<"inlineStr">>, _CellArgs, CellData, _Shared) ->
case lists:keyfind(<<"is">>, 1, CellData) of
{<<"is">>, _, IsData} ->
text(IsData);
false ->
<<>>
end;
map_cell(<<"s">>, _CellArgs, CellData, Shared) ->
case lists:keyfind(<<"v">>, 1, CellData) of
{<<"v">>, _, VData} ->
maps:get(binary_to_integer(text(VData)), Shared, <<>>);
false ->
<<>>
end;
map_cell(undefined, _CellArgs, CellData, _Shared) ->
case lists:keyfind(<<"v">>, 1, CellData) of
{<<"v">>, _, VData} -> text(VData);
false -> <<>>
end.
text(Rs) ->
text(Rs, <<>>).
text([], Acc) ->
Acc;
text([{_, _, Rs} | Rest], Acc) ->
Acc1 = text(Rs, Acc),
text(Rest, Acc1);
text([ B | Rest ], Acc) when is_binary(B) ->
text(Rest, <<Acc/binary, B/binary>>).
fetch_worksheet(File) ->
zip:foldl(
fun
("xl/worksheets/sheet1.xml", _GetInfo, GetBin, Acc) ->
Acc#{ worksheet => GetBin() };
("xl/sharedStrings.xml", _GetInfo, GetBin, Acc) ->
Acc#{ shared => GetBin() };
(_FileInArchive, _GetInfo, _GetBin, AccIn) ->
AccIn
end,
#{},
unicode:characters_to_list(File)).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment