Last active
December 11, 2023 03:01
-
-
Save leeduckgo/42288d9bab88dfbf620b0ba625553d1c to your computer and use it in GitHub Desktop.
CodesOnChain.DataPipeExample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule CodesOnChain.DataPipeExample.ByLine do
  @moduledoc """
  just split the markdown file by line.
  """

  # NOTE: the @moduledoc text above is returned verbatim by get_module_doc/0,
  # so it is part of the module's runtime behaviour; edit it with care.

  # Both Windows ("\r\n") and Unix ("\n") line terminators are accepted so
  # that no stray "\r" is left at the end of a line.
  @line_breaks ["\r\n", "\n"]

  @doc """
  Returns the text of this module's `@moduledoc`.
  """
  @spec get_module_doc() :: String.t()
  def get_module_doc(), do: @moduledoc

  @doc """
  Splits `raw_data` into a list of lines.

  Lines are separated on `"\\n"` or `"\\r\\n"`. Empty segments are kept, so
  input that ends with a line break yields a trailing `""` element.

  ## Examples

      iex> CodesOnChain.DataPipeExample.ByLine.handle("a\\nb\\n")
      ["a", "b", ""]

  """
  @spec handle(String.t()) :: [String.t()]
  def handle(raw_data) do
    String.split(raw_data, @line_breaks)
  end

  # +------+
  # | TEST |
  # +------+

  @doc """
  Runs `handle/1` on a small sample document (one heading per level).

  output:

  ```elixir
  [" # h1", " ## h2", " ### h3", " #### h4", " ##### h5",
  " ###### h6", ""]
  ```
  """
  @spec test_handle() :: [String.t()]
  def test_handle() do
    # The sample lines sit one column deeper than the closing delimiter, so
    # each keeps one leading space, as shown in the documented output.
    raw_data = """
     # h1
     ## h2
     ### h3
     #### h4
     ##### h5
     ###### h6
    """

    handle(raw_data)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule CodesOnChain.DataPipeExample.ByStruct do
  @moduledoc """
  split the markdown file by the documentation struct
  would write the h1-h6 as the metadata of items.
  """

  # NOTE: the @moduledoc text above is returned verbatim by get_module_doc/0,
  # so it is part of the module's runtime behaviour; edit it with care.

  @typedoc "A node of the markdown AST; the code only relies on element 0 being the tag."
  @type ast_node :: tuple()

  @typedoc "A heading node (`:head`) together with everything that follows it (`:body`)."
  @type section :: %{head: ast_node(), body: [ast_node() | section()]}

  @typedoc "One extracted paragraph plus the headings it was found under."
  @type item :: %{content: term(), metadata: map()}

  @doc """
  Returns the text of this module's `@moduledoc`.
  """
  @spec get_module_doc() :: String.t()
  def get_module_doc(), do: @moduledoc

  @doc """
  Parses markdown and returns one item per paragraph, tagged with its headings.

  The text is parsed with `Earmark.as_ast!/1`, nested into sections with
  `generate_formatted_tree/1`, and then flattened:

    * paragraphs directly under an h1 become
      `%{content: text, metadata: %{h1: title}}`
    * paragraphs directly under an h2 become
      `%{content: text, metadata: %{h1: title, h2: subtitle}}`

  Limitations (unchanged from the original implementation):

    * only the first two heading levels are emitted; paragraphs under
      deeper headings are dropped (TODO: accept more levels)
    * headings and paragraphs must have the exact shape
      `{tag, [], [text], %{}}`; anything else (for example a paragraph
      with inline markup) raises
    * a document without any h1 yields `[]`
  """
  @spec handle(String.t()) :: [item()]
  def handle(raw_data) do
    raw_data
    |> Earmark.as_ast!()
    |> generate_formatted_tree()
    |> Enum.flat_map(&section_items/1)

    # Optional post-processing step, currently disabled:
    # |> combine_data_with_same_h2_h3()
  end

  # Items for one h1 section: its own paragraphs first, then the paragraphs
  # of every direct sub-section (which show up in the body as maps).
  defp section_items(%{head: {_tag, [], [h1_content], %{}}, body: body}) do
    own_items = paragraph_items(body, %{h1: h1_content})

    nested_items =
      body
      |> Enum.filter(&is_map/1)
      |> Enum.flat_map(fn %{head: {_sub_tag, [], [h2_content], %{}}, body: sub_body} ->
        paragraph_items(sub_body, %{h1: h1_content, h2: h2_content})
      end)

    own_items ++ nested_items
  end

  # Turns every `p` node found directly in `body` into an item carrying
  # `metadata`. The strict pattern is deliberate: unexpected paragraph
  # shapes crash instead of being silently skipped.
  defp paragraph_items(body, metadata) do
    body
    |> Enum.filter(&tagged?(&1, "p"))
    |> Enum.map(fn {"p", [], [content], %{}} ->
      %{content: content, metadata: metadata}
    end)
  end

  # True when `node` is a non-empty tuple whose first element equals `tag`.
  defp tagged?(node, tag) do
    is_tuple(node) and tuple_size(node) > 0 and elem(node, 0) == tag
  end

  # +-------------------------+
  # | generate formatted tree |
  # +-------------------------+

  @doc """
  Nests a flat AST into a tree of sections, starting at the h1 level.

  Nodes that appear before the first h1 are discarded. Returns `[]` when the
  AST contains no h1 at all.
  """
  @spec generate_formatted_tree([ast_node()]) :: [section()]
  def generate_formatted_tree(ori_tree) do
    case format_with_level(ori_tree, "h1") do
      :error -> []
      {_pre, after_tree} -> generate_formatted_tree(after_tree, 2)
    end
  end

  @doc """
  Splits the body of every section in `formatted_tree` by the `h<level>` tag.

  Recursion ends when `level` reaches 6, so h6 headings stay in the body as
  plain nodes. A section without any heading of the requested level keeps its
  body untouched, and deeper levels are not searched in it.
  """
  @spec generate_formatted_tree([section()], pos_integer()) :: [section()]
  def generate_formatted_tree(formatted_tree, 6) do
    formatted_tree
  end

  def generate_formatted_tree(formatted_tree, level) do
    Enum.map(formatted_tree, fn %{head: head, body: body} ->
      nested_body =
        case format_with_level(body, "h#{level}") do
          :error ->
            body

          {pre, sub_sections} ->
            # `pre` holds the nodes that precede the first sub-heading.
            pre ++ generate_formatted_tree(sub_sections, level + 1)
        end

      %{head: head, body: nested_body}
    end)
  end

  @doc """
  Splits `tree` into sections that each start at a node tagged `level`.

  Returns `{pre, sections}`, where `pre` is the list of nodes before the first
  matching heading and each section is `%{head: heading, body: nodes}` running
  up to (but excluding) the next matching heading. Returns `:error` when
  `tree` has no node tagged `level`.

  Sections are cut by position rather than by comparing node values, so
  repeated identical headings each get their own section.
  """
  @spec format_with_level([ast_node()], String.t()) :: {[ast_node()], [section()]} | :error
  def format_with_level(tree, level) do
    {pre, from_first_heading} = Enum.split_while(tree, &(not tagged?(&1, level)))

    case from_first_heading do
      [] -> :error
      nodes -> {pre, build_sections(nodes, level)}
    end
  end

  # `nodes` must start with a heading of `level`. Each heading opens a new
  # section; every other node is appended to the most recent one. Bodies are
  # built in reverse (cheap prepend) and put back in order at the end.
  defp build_sections(nodes, level) do
    nodes
    |> Enum.reduce([], fn node, sections ->
      if tagged?(node, level) do
        [%{head: node, body: []} | sections]
      else
        [current | finished] = sections
        [%{current | body: [node | current.body]} | finished]
      end
    end)
    |> Enum.map(fn section -> %{section | body: Enum.reverse(section.body)} end)
    |> Enum.reverse()
  end

  @doc """
  Merges items that share the same metadata.

  The contents of each group are joined with `"\\n"`. Groups come out of
  `Enum.group_by/2`, i.e. a map, so the order of the result is not the order
  of the document.
  """
  @spec combine_data_with_same_h2_h3([item()]) :: [item()]
  def combine_data_with_same_h2_h3(data) do
    data
    |> Enum.group_by(fn elem -> elem.metadata end)
    |> Enum.map(fn {metadata, items} ->
      %{content: Enum.map_join(items, "\n", & &1.content), metadata: metadata}
    end)
  end

  @doc """
  Returns the nodes of `tree` from `h2` up to, but excluding, `h2_after`.

  Both nodes are located by value (first occurrence) and `h2_after` must come
  after `h2`. Raises when either node is not in `tree`.
  """
  @spec get_content([ast_node()], ast_node(), ast_node()) :: [ast_node()]
  def get_content(tree, h2, h2_after) do
    h2_index = Enum.find_index(tree, &(&1 == h2))
    h2_after_index = Enum.find_index(tree, &(&1 == h2_after))

    # The length is relative to h2's own position. Using the absolute index of
    # h2_after here made a section swallow the sections that follow it.
    Enum.slice(tree, h2_index, h2_after_index - h2_index)
  end

  @doc """
  Returns the nodes of `tree` from `h2` (first occurrence) to the end.

  Raises when `h2` is not in `tree`.
  """
  @spec get_content([ast_node()], ast_node()) :: [ast_node()]
  def get_content(tree, h2) do
    h2_index = Enum.find_index(tree, &(&1 == h2))
    Enum.drop(tree, h2_index)
  end

  @doc """
  Turns a non-empty node list into `%{head: first_node, body: remaining_nodes}`.

  Raises `MatchError` on an empty list.
  """
  @spec format_content([ast_node(), ...]) :: section()
  def format_content(content) do
    [head | body] = content
    %{head: head, body: body}
  end

  # +------+
  # | TEST |
  # +------+

  @doc """
  Runs `handle/1` on a small sample document with four heading levels.
  """
  @spec test_handle() :: [item()]
  def test_handle() do
    raw_data = """
    # abc
    abcdefg
    ## ddd
    test
    ### abcdefg
    aha
    #### habc
    abcdefgggg
    """

    handle(raw_data)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment