Skip to content

Instantly share code, notes, and snippets.

@leeduckgo
Last active December 11, 2023 03:01
Show Gist options
  • Save leeduckgo/42288d9bab88dfbf620b0ba625553d1c to your computer and use it in GitHub Desktop.
Save leeduckgo/42288d9bab88dfbf620b0ba625553d1c to your computer and use it in GitHub Desktop.
CodesOnChain.DataPipeExample
defmodule CodesOnChain.DataPipeExample.ByLine do
  @moduledoc """
  just split the markdown file by line.
  """

  # The @moduledoc text above is read back at runtime by get_module_doc/0,
  # so its wording is part of the module's observable behaviour.

  @doc """
  Returns the value of this module's `@moduledoc` attribute.
  """
  def get_module_doc(), do: @moduledoc

  @doc """
  Splits `raw_data` on every newline character and returns the pieces as a list.

  A trailing newline in the input produces a trailing empty string in the result.
  """
  def handle(raw_data), do: String.split(raw_data, "\n")

  # +------+
  # | TEST |
  # +------+

  @doc """
  Runs `handle/1` on a small built-in markdown sample.

  output:

  ```elixir
  ["# h1", "## h2", "### h3", "#### h4", "##### h5",
  "###### h6", ""]
  ```
  """
  def test_handle() do
    """
    # h1
    ## h2
    ### h3
    #### h4
    ##### h5
    ###### h6
    """
    |> handle()
  end
end
defmodule CodesOnChain.DataPipeExample.ByStruct do
  @moduledoc """
  split the markdown file by the documentation struct
  would write the h1-h6 as the metadata of items.
  """

  # The @moduledoc text above is read back at runtime by get_module_doc/0,
  # so its wording is part of the module's observable behaviour.
  #
  # Throughout this module a "node" is an element of the list returned by
  # `Earmark.as_ast!/1`; the patterns below expect 4-tuples of the form
  # `{tag, attrs, children, meta}`. A "section" is a map
  # `%{head: heading_node, body: [node | section]}`.

  @doc """
  Returns the value of this module's `@moduledoc` attribute.
  """
  def get_module_doc(), do: @moduledoc

  @doc """
  read and split the md file automatically by the struct.

  Parses `raw_data` with `Earmark.as_ast!/1`, nests the nodes by heading level
  (see `generate_formatted_tree/1`) and returns a flat list of
  `%{content: content, metadata: metadata}` maps, one per paragraph:

    * paragraphs directly under an `h1` get `metadata: %{h1: title}`;
    * paragraphs directly under an `h2` get `metadata: %{h1: title, h2: subtitle}`.

  Paragraphs nested below deeper headings are not emitted (TODO: accept more
  levels), and anything before the first `h1` is dropped. Returns `[]` when the
  document has no `h1`.

  Raises `MatchError` when a paragraph has attributes or does not have exactly
  one child, and `FunctionClauseError` when a heading does not match
  `{tag, [], [title], meta}`.
  """
  def handle(raw_data) do
    raw_data
    # Parse the markdown into a list of nodes.
    |> Earmark.as_ast!()
    # Nest the flat node list into sections by heading level.
    |> generate_formatted_tree()
    |> Enum.flat_map(&section_items/1)

    # |> combine_data_with_same_h2_h3()
  end

  # Items for one h1 section: its own paragraphs first, then the paragraphs of
  # each directly nested (h2) section, in document order.
  defp section_items(%{head: {_h1_tag, [], [h1_content], %{}}, body: body}) do
    own_items = paragraph_items(body, %{h1: h1_content})

    nested_items =
      body
      |> Enum.filter(&is_map/1)
      |> Enum.flat_map(fn %{head: {_h2_tag, [], [h2_content], %{}}, body: sub_body} ->
        paragraph_items(sub_body, %{h1: h1_content, h2: h2_content})
      end)

    own_items ++ nested_items
  end

  # Turns every "p" node of `body` into an item tagged with `metadata`.
  defp paragraph_items(body, metadata) do
    body
    |> Enum.filter(&tagged?(&1, "p"))
    |> Enum.map(fn paragraph ->
      # Deliberately strict: a paragraph with attributes or inline markup
      # (more than one child) raises instead of being silently dropped.
      {"p", [], [content], %{}} = paragraph
      %{content: content, metadata: metadata}
    end)
  end

  # True when `node` is a non-empty tuple whose first element is `tag`.
  defp tagged?(node, tag) do
    is_tuple(node) and tuple_size(node) > 0 and elem(node, 0) == tag
  end

  # +-------------------------+
  # | generate formatted tree |
  # +-------------------------+

  @doc """
  Nests a flat list of nodes into sections, starting at `h1`.

  Nodes that appear before the first `h1` are discarded. Returns `[]` when the
  tree contains no `h1` at all.
  """
  def generate_formatted_tree(ori_tree) do
    case format_with_level(ori_tree, "h1") do
      # No h1 heading: there is nothing to nest.
      :error -> []
      {_pre, sections} -> generate_formatted_tree(sections, 2)
    end
  end

  @doc """
  Nests the bodies of `formatted_tree` (a list of sections) by `h<level>`
  headings, recursing into the sections it creates with `level + 1`.

  Recursion stops when `level` reaches 6, so `h6` headings are left as plain
  nodes in their parent's body. When a section has no heading of the current
  level its body is left untouched and deeper levels are not examined.
  """
  def generate_formatted_tree(formatted_tree, 6), do: formatted_tree

  def generate_formatted_tree(formatted_tree, level) do
    Enum.map(formatted_tree, fn %{head: head, body: body} ->
      nested_body =
        case format_with_level(body, "h#{level}") do
          :error -> body
          {pre, sections} -> pre ++ generate_formatted_tree(sections, level + 1)
        end

      %{head: head, body: nested_body}
    end)
  end

  @doc """
  Splits `tree` at every node whose tag equals `level` (e.g. `"h2"`).

  Returns `{pre, sections}` where `pre` holds the nodes before the first such
  heading and `sections` holds one `%{head: heading, body: nodes}` map per
  heading, each body running up to (not including) the next heading of the same
  level. Returns `:error` when `tree` has no heading of that level.

  Splitting is done by position, so two identical heading nodes are kept as two
  separate sections.
  """
  def format_with_level(tree, level) do
    case Enum.split_while(tree, &(not tagged?(&1, level))) do
      {_all, []} -> :error
      {pre, from_first_heading} -> {pre, split_sections(from_first_heading, level)}
    end
  end

  # `nodes` starts with a heading of `level`; cut it into one section per heading.
  defp split_sections([], _level), do: []

  defp split_sections([heading | rest], level) do
    {body, remaining} = Enum.split_while(rest, &(not tagged?(&1, level)))
    [format_content([heading | body]) | split_sections(remaining, level)]
  end

  @doc """
  Merges items that share the same `metadata`, joining their `content` with
  newlines.

  Grouping goes through a map (`Enum.group_by/2`), so the order of the returned
  list follows map iteration order rather than document order.
  """
  def combine_data_with_same_h2_h3(data) do
    data
    |> Enum.group_by(fn elem -> elem.metadata end)
    |> Enum.map(fn {metadata, items} ->
      %{content: Enum.map_join(items, "\n", & &1.content), metadata: metadata}
    end)
  end

  @doc """
  Returns the nodes of `tree` from the first occurrence of `h2` up to, but not
  including, the next occurrence of `h2_after` that follows it.

  When `h2_after` does not occur after `h2`, everything from `h2` to the end of
  `tree` is returned. Raises when `h2` is not in `tree`.
  """
  def get_content(tree, h2, h2_after) do
    # Search for the end marker relative to `h2`, not relative to the whole
    # tree; using the absolute index on the truncated list overshoots.
    [head | following] = get_content(tree, h2)
    [head | Enum.take_while(following, &(&1 != h2_after))]
  end

  @doc """
  Returns the nodes of `tree` from the first occurrence of `h2` to the end.

  Raises when `h2` is not in `tree`.
  """
  def get_content(tree, h2) do
    h2_index = Enum.find_index(tree, &(&1 == h2))
    {_before, from_h2} = Enum.split(tree, h2_index)
    from_h2
  end

  @doc """
  Turns a non-empty node list into `%{head: first_node, body: remaining_nodes}`.

  Raises `MatchError` on an empty list.
  """
  def format_content(content) do
    [head | body] = content
    %{head: head, body: body}
  end

  # +------+
  # | TEST |
  # +------+

  @doc """
  Runs `handle/1` on a small built-in markdown sample.
  """
  def test_handle() do
    raw_data =
      """
      # abc
      abcdefg
      ## ddd
      test
      ### abcdefg
      aha
      #### habc
      abcdefgggg
      """

    handle(raw_data)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment