Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Preliminary Neotomex PDF object grammar. Still has some bugs around object name edge cases.
defmodule PdfObjectGrammar do
  @moduledoc """
  Preliminary Neotomex PEG grammar for PDF objects.

  Each rule is declared with `define`; the optional `do` block transforms the
  raw match into an Elixir term. `:object` is flagged with `@root true`.

  Values produced by the transforms in this module:

    * booleans          -> `true` / `false`
    * numbers           -> integers or floats (with optional sign)
    * names             -> strings that keep their leading `/`
    * literal strings   -> the text between the outermost parentheses
    * hex strings       -> decoded bytes, converted to UTF-8 (see `decode_string/1`)
    * arrays            -> lists
    * dictionaries      -> maps keyed by name
    * object references -> `%{"object_id" => id, "generation_id" => gen}`

  Known limitations: `#xx` escapes in names are not decoded, backslash
  escapes inside literal strings are not interpreted, and whitespace inside
  hex strings is not accepted.
  """

  use Neotomex.ExGrammar

  # Debugging aid: prints a label followed by the inspected term. Not called by
  # any rule in this module; insert a call inside a transform while debugging.
  defp describe_object(o, prefix) do
    IO.puts "#{prefix} received"
    IO.inspect o
  end

  @root true
  define :object, "bool / numeric_object / string_object / name / array / dictionary"

  define :dictionary, "<'<<'> <space*> (name_object <space*>)* <space*> <'>>'> <space*>" do
    # Every entry arrives as a single-pair map (possibly nested in lists);
    # flatten and fold them into one map. Later keys win on duplicates.
    entries ->
      entries
      |> List.flatten()
      |> Enum.reduce(%{}, fn entry, acc -> Map.merge(acc, entry) end)
  end

  # The character class also admits `.`, `,`, `:` and `-`, which are regular
  # characters in PDF names (e.g. /Helvetica-Bold, /F1.0).
  define :name, "'/' [a-zA-Z0-9#_+*.,:\\-]*" do
    # NOTE: '/' is a delimiter in PDF, so adjacent names without whitespace
    # (e.g. /Type/Page) are valid input.
    # TODO: the original author reported that this case is not handled
    # properly - confirm with a test.
    # TODO: `#xx` hex escapes inside names are kept verbatim, not decoded.
    parts -> Enum.join(parts)
  end

  define :name_object, "name <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array ) <space*>" do
    [key, value] -> %{key => value}
  end

  define :array, "<'['> <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array / <space>)+ <']'>" do
    # Pruned whitespace between elements shows up as nil; drop it.
    [items] -> Enum.reject(items, &is_nil/1)
  end

  define :object_reference, "integer <space> integer <space> <'R'>" do
    [obj_id, gen_id] ->
      %{
        "object_id" => obj_id,
        "generation_id" => gen_id
      }
  end

  define :string_object, "hex_string_object / regular_string_object"

  define :regular_string_object, "paren_wrapped_string" do
    # `paren_wrapped_string` always yields "(" <> body <> ")". Drop exactly the
    # outermost pair so nested parentheses and embedded newlines survive
    # (a `.`-based regex would stop at line breaks).
    string -> binary_part(string, 1, byte_size(string) - 2)
  end

  define :paren_wrapped_string, "'(' ([^\(\)] / paren_wrapped_string)* ')'" do
    # Recursing on the rule itself is what allows balanced nested parentheses
    # inside a parenthesis-delimited string.
    chars -> Enum.join(chars)
  end

  define :hex_string_object, "<'<'> hex_string <'>'>" do
    [decoded] -> decoded
  end

  define :bool, "'true' / 'false'" do
    "false" -> false
    "true" -> true
  end

  define :hex_string, "[a-fA-F0-9]+" do
    chars -> chars |> Enum.join() |> hex_to_binary()
  end

  define :numeric_object, "('+' / '-')? (float / integer)" do
    ["-", number] -> -number
    [_, number] -> number
    n -> n
  end

  define :float, "digits <'.'> digits" do
    # The fraction must stay textual: converting it to an integer first would
    # drop leading zeros and turn "1.05" into 1.5.
    [whole, fraction] -> String.to_float("#{String.to_integer(whole)}.#{fraction}")
  end

  define :integer, "digits" do
    digits -> String.to_integer(digits)
  end

  # A run of decimal digits, returned as a string so callers decide how to
  # interpret it.
  define :digits, "[0-9]+" do
    chars -> Enum.join(chars)
  end

  define :space, "[ \\r\\n\\s\\t]"

  # String utilities

  # Decodes the hex digits of a PDF hex string into bytes, then converts those
  # bytes to UTF-8 via decode_string/1. An odd number of digits is padded with
  # a trailing "0", as the PDF specification prescribes.
  defp hex_to_binary(hex_string) do
    hex_string
    |> pad_to_even_length()
    |> Base.decode16!(case: :mixed)
    |> decode_string()
  end

  # Appends "0" when the digit count is odd so every byte has two digits.
  defp pad_to_even_length(hex) when rem(byte_size(hex), 2) == 1, do: hex <> "0"
  defp pad_to_even_length(hex), do: hex

  # Converts raw string bytes to UTF-8 based on the byte-order mark, if any.
  # The BOM is stripped. Without a BOM, :unicode.bom_to_encoding/1 reports
  # :latin1 and the bytes are converted from Latin-1. If the conversion fails
  # or is incomplete, the BOM-stripped bytes are returned unchanged instead of
  # an error tuple.
  defp decode_string(str) when is_binary(str) do
    {encoding, bom_length} = :unicode.bom_to_encoding(str)
    <<_bom::binary-size(bom_length), payload::binary>> = str

    converted =
      case encoding do
        {:utf16, _endianness} ->
          :unicode.characters_to_binary(payload, encoding, :utf8)

        :latin1 ->
          :unicode.characters_to_binary(payload, encoding, :utf8)

        {:utf32, endianness} ->
          # NOTE(review): presumably a UTF-16 BOM followed by a zero code unit
          # was detected as a 4-byte UTF-32 BOM; the two zero bytes are put
          # back and the data is decoded as UTF-16 - confirm.
          :unicode.characters_to_binary(<<0, 0>> <> payload, {:utf16, endianness}, :utf8)

        _ ->
          payload
      end

    case converted do
      utf8 when is_binary(utf8) -> utf8
      _error_or_incomplete -> payload
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment