Last active
March 5, 2018 14:17
-
-
Save vheathen/51bf964662ed222296f885e8a312c96f to your computer and use it in GitHub Desktop.
Just a (poor) example of a parser written in the Elixir language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule IasipApi.Parsers.NioktrParser do | |
require Logger | |
# TODO: redesign component and split functional and supportive parts into modules | |
# Blank organisation record: every field defaults to an empty string.
# Not referenced by any function in the code shown here.
@empty_org %{okopf: "", f_name: "", s_name: "", founder: "", ogrn: ""}
# Names of the six funding sources, in the order used by both
# `f_sources` and `f_allocations` below.
@funding_source_names [
  "Средства федерального бюджета",
  "Средства бюджетов субъектов Российской Федерации",
  "Средства местных бюджетов",
  "Средства сторонних организаций",
  "Средства фондов поддержки научной и (или) научно-технической деятельности",
  "Собственные средства организаций"
]

# Default NIOKTR record. `parse_tree/1` starts from this map and the
# `put_data/3` clauses overwrite individual fields as they are found.
@nioktr_map %{
  # Rosrid url
  rosrid_url: nil,
  # NIOKTR registration number
  rosrid_ref_number: nil,
  # Registration date (YYYY-MM-DD)
  oistls_ref_date: nil,
  # NIOKTR title
  name: nil,
  # Priority direction of science development
  spdf: [],
  # Critical technology
  cr_tech: [],
  # NIOKTR type
  nioktr_type: nil,
  # Annotation
  annotation: "",
  # Thematic rubric codes
  grnti: [],
  # International classification codes
  fos: [],
  # Keywords
  descriptors: [],
  # Funding sources information, 6 entries!
  f_sources: Enum.map(@funding_source_names, fn source -> %{name: source} end),
  f_allocations:
    Enum.map(@funding_source_names, fn source ->
      %{name: source, amount: 0.0, kbk: ""}
    end),
  # Work start date
  date_begin: nil,
  doc_date: nil,
  # Work end date
  date_end: nil,
  # Contract number
  doc_number: "",
  # Basis for carrying out the NIOKTR
  doc_type: "",
  # Number of reports
  report_count: 0,
  # UDC index
  udk: "",
  # Name of the target programme
  gsp: nil,
  # Customer information
  customer: nil,
  # Contractor information
  contractor: nil,
  # Co-contractors information
  co_contractors: [],
  # Approval
  approval: nil,
  # Linked cards
  rids: [],
  rightholders: [],
  cc_work_descriptions: [],
  ikrbs: []
}
# b2 |> Floki.find("#content div div div") |> Enum.at(0) |> Floki.find("span") |> Floki.text | |
# Parses a page into a NIOKTR map and records the page `url` under
# `:rosrid_url`. Only the first node matching "#content > div > div"
# is parsed.
def parse(html, url) do
  root =
    html
    |> Floki.find("#content > div > div")
    |> Enum.at(0)

  parsed = parse_tree(root)
  Map.put(parsed, :rosrid_url, url)
end
# Builds a NIOKTR map from a single node tuple: starts from the
# `@nioktr_map` defaults, fills label/value fields first, then the
# table-backed fields.
def parse_tree(html_tree) when is_tuple(html_tree) do
  with_fields = parse_divs(@nioktr_map, html_tree)
  parse_tables(with_fields, html_tree)
end
# Fills the table-backed fields of `map`.
#
# Collects `{label, table}` pairs from `html_tree`, normalises each
# label with `stripe_label/1` and hands the pair to `put_data/3`.
#
# The collector can leave a trailing `{label, nil}` pair when the last
# label/legend in the tree is not followed by a table. Such pairs are
# skipped: passing `nil` on would overwrite an already parsed field
# with `nil`, or crash in clauses that expect a string or a table.
def parse_tables(map, html_tree) do
  []
  |> accum_label_table_pairs(html_tree)
  |> Enum.reject(fn {_label, table} -> is_nil(table) end)
  |> Enum.reduce(map, fn {label, table}, acc ->
    put_data(acc, stripe_label(label), table)
  end)
end
# Fills the plain label/value fields of `map`.
#
# Every "div > div" node is treated as one field: its <label> text,
# normalised by `stripe_label/1`, selects the `put_data/3` clause. The
# value is the trimmed text of the node's <span> elements, or of its
# nested "div > div" nodes when it has no <span>.
defp parse_divs(map, html_tree) do
  rows = Floki.find(html_tree, "div > div")

  Enum.reduce(rows, map, fn row, acc ->
    caption =
      row
      |> Floki.find("label")
      |> Floki.text()
      |> stripe_label()

    value_nodes =
      case Floki.find(row, "span") do
        [] -> Floki.find(row, "div > div")
        spans -> spans
      end

    value = value_nodes |> Floki.text() |> String.trim()
    put_data(acc, caption, value)
  end)
end
# Placeholder values carry no information and leave the map untouched:
# "Не указана" (not specified), "Нет данных" (no data) and the empty
# string. This clause must stay ahead of the field-specific clauses.
defp put_data(map, _label, data) when data in ["Не указана", "Нет данных", ""], do: map
# NIOKTR registration number; also logged so a parse run can be traced.
defp put_data(map, "Регистрационный номер НИОКТР", data) do
  Logger.info("Parsing document with ref #{data}")
  Map.put(map, :rosrid_ref_number, data)
end

# Registration date, converted with `parse_date/1`.
defp put_data(map, "Дата регистрации", data),
  do: Map.put(map, :oistls_ref_date, parse_date(data))

# NIOKTR title.
defp put_data(map, "Наименование НИОКТР", data),
  do: Map.put(map, :name, data)

# Priority direction of science development: stored as a one-element list.
defp put_data(map, "Приоритетное направление развития науки", data),
  do: Map.put(map, :spdf, [%{name: data}])

# Critical technology: stored as a one-element list.
defp put_data(map, "Критическая технология", data),
  do: Map.put(map, :cr_tech, [%{name: data}])

# NIOKTR type: stored as a single `%{name: ...}` map.
defp put_data(map, "Вид НИОКТР", data),
  do: Map.put(map, :nioktr_type, %{name: data})

# Annotation text.
defp put_data(map, "Аннотация", data),
  do: Map.put(map, :annotation, data)
# Thematic rubric codes: ";"-separated, converted to integers; entries
# that are not purely numeric come back as nil and are dropped.
defp put_data(map, "Коды тематических рубрик", data) do
  codes =
    for code <- parse_semi_separated_string(data, true), not is_nil(code) do
      %{id: code}
    end

  Map.put(map, :grnti, codes)
end

# International classification codes: ";"-separated, kept as strings.
defp put_data(map, "Коды международной классификации", data) do
  codes = for code <- parse_semi_separated_string(data), do: %{id: code}
  Map.put(map, :fos, codes)
end

# Keywords: separated by commas, dots or semicolons.
defp put_data(map, "Ключевые слова", data) do
  keywords = for word <- parse_unk_separated_string(data), do: %{name: word}
  Map.put(map, :descriptors, keywords)
end
# Work start date; the same date is also stored as the document date.
defp put_data(map, "Дата начала работы", data) do
  started_on = parse_date(data)
  Map.merge(map, %{date_begin: started_on, doc_date: started_on})
end

# Work end date.
defp put_data(map, "Дата окончания работы", data),
  do: Map.put(map, :date_end, parse_date(data))

# Contract number.
defp put_data(map, "Номер контракта", data),
  do: Map.put(map, :doc_number, data)

# Basis for carrying out the NIOKTR.
defp put_data(map, "Основание проведения НИОКТР", data),
  do: Map.put(map, :doc_type, data)

# Number of reports; raises if the text is not an integer.
defp put_data(map, "Количество отчётов", data),
  do: Map.put(map, :report_count, String.to_integer(data))

# UDC index.
defp put_data(map, "Индекс УДК", data),
  do: Map.put(map, :udk, data)

# Name of the target programme: stored as a `%{name: ...}` map.
defp put_data(map, "Наименование целевой программы", data),
  do: Map.put(map, :gsp, %{name: data})
############################################# | |
# table trees | |
############################################# | |
# Customer: a single-row table converted to one map.
defp put_data(map, "Заказчик", table),
  do: Map.put(map, :customer, get_table_row_data(table))

# Contractor: a single-row table converted to one map.
defp put_data(map, "Исполнитель", table),
  do: Map.put(map, :contractor, get_table_row_data(table))

# Co-contractors: a multi-row table converted to a list of maps.
defp put_data(map, "Сведения о соисполнителях", table),
  do: Map.put(map, :co_contractors, get_table_data(table))
# Funding sources table: one map per table row, stored in `:f_allocations`.
#
# For every row:
#   * `:amount` text is converted to a float. Everything except digits,
#     "." and "," is stripped, then "," is treated as the decimal
#     separator. Text with no parseable number (including "") becomes
#     0.0; a non-binary amount (e.g. a missing column) is kept as is.
#     NOTE(review): assumes "," is never a thousands separator; confirm
#     against real pages.
#   * `:name` has line breaks and tabs removed and remaining whitespace
#     runs collapsed to one space.
#
# The previous implementation passed comma-separated text to
# `String.to_float/1`, which only accepts "." and therefore raised for
# every whole-number amount (it appended ",0") and for "123,45".
defp put_data(map, "Сведения об источниках финансирования", table) do
  to_amount = fn
    raw when is_binary(raw) ->
      cleaned =
        raw
        |> String.replace(~r/[^\d.,]/u, "")
        |> String.replace(",", ".")

      case Float.parse(cleaned) do
        {value, _rest} -> value
        :error -> 0.0
      end

    other ->
      other
  end

  allocations =
    table
    |> get_table_data()
    |> Enum.map(fn row ->
      name =
        row
        |> Map.get(:name)
        |> String.replace(~r/[\n\t\r]*/ui, "")
        |> String.replace(~r/\s+/ui, " ")

      row
      |> Map.put(:amount, to_amount.(Map.get(row, :amount)))
      |> Map.put(:name, name)
    end)

  Map.put(map, :f_allocations, allocations)
end
# Approval table ("Сведения о руководителях").
#
# Walks the <td> cells in order and fills, one after another,
# :org_name, :org_position, :research_name and :research_position.
# The number of keys already collected decides which one comes next.
# Cells matching the heading captions are skipped where a name is
# expected. A cell arriving after all four keys are set raises
# CaseClauseError.
defp put_data(map, "Сведения о руководителях", table) do
  put_unless_heading = fn acc, heading, key, text ->
    if Regex.match?(heading, text), do: acc, else: Map.put(acc, key, text)
  end

  approval =
    table
    |> Floki.find("td")
    |> Enum.reduce(%{}, fn {_tag, _attrs, children}, acc ->
      # Only the first child node of the cell is read.
      text = children |> Enum.at(0) |> Floki.text() |> String.trim()

      case map_size(acc) do
        0 -> put_unless_heading.(acc, ~r/Руководитель организации/ui, :org_name, text)
        1 -> Map.put(acc, :org_position, text)
        2 -> put_unless_heading.(acc, ~r/Руководитель работы/ui, :research_name, text)
        3 -> Map.put(acc, :research_position, text)
      end
    end)

  Map.put(map, :approval, approval)
end
# Linked cards: rows linking to "/rid..." go to :rids, rows linking to
# "/ikrbs..." go to :ikrbs.
defp put_data(map, "Связанные карты", table) do
  with_rids = put_linked_data(map, table, :rids, ~r/^\/rid/ui)
  put_linked_data(with_rids, table, :ikrbs, ~r/^\/ikrbs/ui)
end

# Any other label is ignored.
# Logger.debug "Other: #{chapter} | #{inspect data}"
defp put_data(map, _chapter, _data), do: map
# Stores under `field` one entry per <tr> of `table` whose link hrefs
# (concatenated into one string) match `regex`. Rows are kept in
# document order.
#
# Each entry is built by walking the row's <td> cells with a 1-based
# position: a position divisible by 3 starts a fresh entry holding the
# cell text as :rosrid_ref_number, and a position divisible by 4 adds
# the first link href of the cell as :rosrid_url.
# NOTE(review): for rows of up to five cells this means "3rd cell =
# reference number, 4th cell = url". The 6th cell would reset the
# entry, so confirm the tables never have that many columns.
defp put_linked_data(map, table, field, regex) do
  collect_cell = fn cell, {position, entry} ->
    cond do
      rem(position, 3) == 0 ->
        ref_number = cell |> Floki.text() |> String.trim()
        {position + 1, %{rosrid_ref_number: ref_number}}

      rem(position, 4) == 0 ->
        href = cell |> Floki.find("a") |> Floki.attribute("href") |> Enum.at(0)
        {position + 1, Map.put(entry, :rosrid_url, href)}

      true ->
        {position + 1, entry}
    end
  end

  links_match? = fn row ->
    hrefs = row |> Floki.find("a") |> Floki.attribute("href") |> Floki.text()
    Regex.match?(regex, hrefs)
  end

  entries =
    for row <- Floki.find(table, "tr"), links_match?.(row) do
      {_, entry} = row |> Floki.find("td") |> Enum.reduce({1, %{}}, collect_cell)
      entry
    end

  Map.put(map, field, entries)
end
# Splits `string` on ";" via `parse_separated_string/3`; `to_int`
# (default false) asks for integer conversion of each item.
defp parse_semi_separated_string(string, to_int \\ false),
  do: parse_separated_string(string, ";", to_int)
# defp parse_comma_separated_string(string, to_int \\ false) do | |
# parse_separated_string(string, ",", to_int) | |
# end | |
# Splits `string` on `separator` into cleaned items.
#
# Returns [] for the empty string and for a string without any
# character outside digits, word characters and whitespace (as matched
# by the regex below). Otherwise each piece is trimmed and has its dots
# removed; empty pieces are dropped. With `to_int` set, a piece made
# only of digits becomes an integer and any other non-empty piece
# becomes nil (nil items are left in the result).
defp parse_separated_string(string, separator, to_int \\ false)

defp parse_separated_string("", _separator, _to_int), do: []

defp parse_separated_string(string, separator, to_int) do
  case Regex.match?(~r/[^\d\w\s]+/, string) do
    false ->
      []

    true ->
      string
      |> String.split(separator)
      |> Enum.map(fn piece ->
        item = piece |> String.trim() |> String.replace(".", "")

        cond do
          to_int and item != "" ->
            if Regex.match?(~r/^\d+$/, item), do: String.to_integer(item), else: nil

          true ->
            item
        end
      end)
      |> Enum.reject(fn item -> item == "" end)
  end
end
# Splits a list whose separator is not known in advance: every run of
# commas and dots is first rewritten to ";", then the string is split
# with `parse_semi_separated_string/2`.
defp parse_unk_separated_string(string, to_int \\ false) do
  normalized = String.replace(string, ~r/[,\.]+/, ";")
  parse_semi_separated_string(normalized, to_int)
end
# Converts a "DD/MM/YYYY" string into a `Date`.
#
# Each part is trimmed, and day and month are left-padded with "0" to
# two digits, so "5/3/2018" is accepted as well as "05/03/2018". The
# unpadded form used to raise because it does not form a valid ISO 8601
# date.
#
# Raises MatchError when the string does not split into exactly three
# "/"-separated parts, and ArgumentError when the parts do not form a
# valid date.
defp parse_date(date_str) do
  [day, month, year] =
    date_str
    |> String.split("/")
    |> Enum.map(&String.trim/1)

  padded_month = String.pad_leading(month, 2, "0")
  padded_day = String.pad_leading(day, 2, "0")

  Date.from_iso8601!("#{year}-#{padded_month}-#{padded_day}")
end
###################################### | |
# Tables \/ | |
###################################### | |
# Walks the HTML tree depth-first and collects label/table and
# legend/table pairs as a list of `{label_text, table_subtree}` tuples,
# appended to `acc` in document order.
#
# While a label is still waiting for its table the pair is stored as
# `{label_text, nil}`. A newer label replaces a waiting one, so only
# the last pair of the result can still have a nil table.

# A list of nodes: visit each node in order, threading the accumulator.
defp accum_label_table_pairs(acc, nodes) when is_list(nodes) do
  Enum.reduce(nodes, acc, fn node, pairs -> accum_label_table_pairs(pairs, node) end)
end

# <label> and <legend> are handled the same way: drop a previous label
# that never received a table, then register this one as waiting.
defp accum_label_table_pairs(acc, {tag, _attrs, children}) when tag in ["label", "legend"] do
  # Logger.error "Got label: #{subtree |> Floki.text}"
  settled =
    case List.last(acc) do
      {_, nil} -> List.delete_at(acc, -1)
      _ -> acc
    end

  settled ++ [{Floki.text(children), nil}]
end

# <table>: attach it to the waiting label, if any. The table is stored
# with its attributes dropped.
defp accum_label_table_pairs(acc, {"table", _attrs, children}) do
  # Logger.error "Got table"
  case List.last(acc) do
    {label, nil} ->
      List.replace_at(acc, -1, {label, {"table", [], children}})

    _ ->
      Logger.debug("We've got <table> without <label>/<legend> first! Is it ok?")
      acc
  end
end

# Any other element: descend into its children.
defp accum_label_table_pairs(acc, {_tag, _attrs, children}),
  do: accum_label_table_pairs(acc, children)

# Anything else (text and other non-element nodes) adds nothing.
defp accum_label_table_pairs(acc, _other), do: acc
# Converts a multi-row table into a list of maps, one per data row.
#
# The first <tr> is the header row: its <td> tags are rewritten to <th>
# so that `get_table_row_data/1` can pair it with each remaining row.
defp get_table_data(table) do
  rows = Floki.find(table, "tr")

  td_to_th = fn
    {"td", attrs} -> {"th", attrs}
    {tag, attrs} -> {tag, attrs}
  end

  header_row = rows |> Enum.at(0) |> Floki.transform(td_to_th)

  for row <- Enum.drop(rows, 1), do: get_table_row_data([header_row, row])
end
# Converts a header row plus one data row into a map.
#
# `table` is any Floki tree containing <th> header cells inside <tr>
# rows and the matching <td> data cells. The n-th <th> names the field
# for the n-th <td>.
#
# Differences from the previous implementation, which derived the
# column position from the number of keys already in the result:
#   * the position is now the header's own index, so two headers that
#     map to the same key no longer shift every following column;
#   * a header without a matching <td> yields "" instead of raising;
#   * an unrecognised header is logged and its column skipped instead
#     of raising CaseClauseError (same policy as the catch-all
#     `put_data/3` clause for unknown labels).
def get_table_row_data(table) do
  cells = Floki.find(table, "td")

  table
  |> Floki.find("tr")
  |> Floki.find("th")
  |> Enum.with_index()
  |> Enum.reduce(%{}, fn {{_tag, _attrs, children}, column}, row ->
    case table_header_field(get_table_header(children)) do
      {:ok, field} -> Map.put(row, field, table_cell_text(cells, column))
      :error -> row
    end
  end)
end

# Maps a normalised header caption to the key used in the row map.
# Returns `{:ok, key}` or `:error` for an unrecognised caption.
defp table_header_field(nil) do
  Logger.debug "Got nil table header: is that ok?"
  # The :nill spelling is kept as is; it is the key produced before.
  {:ok, :nill}
end

defp table_header_field(header) do
  known = %{
    "ОКОПФ" => :okopf,
    "Полное наименование" => :f_name,
    "Наименование организации" => :f_name,
    "Сокращённое наименование" => :s_name,
    "Сокращённое наименование организации" => :s_name,
    "Учредитель (ведомственная принадлежность)" => :founder,
    "ОГРН" => :ogrn,
    "ИНН" => :inn,
    "Описание работ" => :work_description,
    "№ п/п" => :index,
    "Источник финансирования" => :name,
    "Объём финансирования, тыс руб" => :amount,
    "Коды бюджетной классификации" => :kbk
  }

  case Map.fetch(known, header) do
    {:ok, field} ->
      {:ok, field}

    :error ->
      Logger.debug "Unknown table header #{inspect(header)}: column skipped"
      :error
  end
end

# Text of the first child node of the cell at `index`, with «» quotes
# replaced by a double quote and surrounding whitespace trimmed.
# Returns "" when the cell is missing or has no children.
defp table_cell_text(cells, index) do
  case Enum.at(cells, index) do
    {_tag, _attrs, [first | _rest]} ->
      first
      |> Floki.text()
      |> String.replace(~r/[«»]+/u, "\"")
      |> String.trim()

    _ ->
      ""
  end
end
# Header caption of a <th>: the first child node, normalised with
# `stripe_label/1` (nil when the cell has no children).
defp get_table_header(el), do: stripe_label(Enum.at(el, 0))
###################################### | |
# Tables /\ | |
###################################### | |
# Normalises a label so it can be matched against known captions:
# removes colons, dots, digits, tabs and line breaks, collapses
# whitespace runs to a single space and trims the ends.
# nil is passed through unchanged.
defp stripe_label(nil), do: nil

defp stripe_label(label) do
  without_noise = String.replace(label, ~r/[:\r\n\t\.\d]*/, "")
  single_spaced = String.replace(without_noise, ~r/\s+/, " ")
  String.trim(single_spaced)
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment