Skip to content

Instantly share code, notes, and snippets.

@DoggettCK
Created May 11, 2016 15:57
Show Gist options
  • Save DoggettCK/787a067d3ac6ad6222e332010ed53f3f to your computer and use it in GitHub Desktop.
Save DoggettCK/787a067d3ac6ad6222e332010ed53f3f to your computer and use it in GitHub Desktop.
Preliminary Neotomex PDF object grammar. Still has some bugs around object name edge cases.
defmodule PdfObjectGrammar do
  @moduledoc """
  Preliminary PEG grammar for PDF objects, built on `Neotomex.ExGrammar`.

  The root rule is `:object`. Transforms attached to the rules turn the raw
  matches into Elixir terms:

    * booleans        -> `true` / `false`
    * numbers         -> integers or floats (with optional sign)
    * names           -> strings including the leading `/`
    * literal strings -> the text between the outermost parentheses
    * hex strings     -> decoded binaries (converted to UTF-8 when a BOM says so)
    * arrays          -> lists
    * dictionaries    -> maps keyed by name
    * references      -> maps with `"object_id"` and `"generation_id"` keys

  Known limitations: the name rule only accepts a small character set, and
  backslash escapes inside literal strings are not interpreted.
  """

  use Neotomex.ExGrammar

  # Debugging aid: prints a prefix line and then inspects the given term.
  # Not referenced by any rule; call it from a transform while debugging.
  defp describe_object(o, prefix) do
    IO.puts "#{prefix} received"
    IO.inspect o
  end

  @root true
  define :object, "bool / numeric_object / string_object / name / array / dictionary"

  # `<< /Key value ... >>`: every name_object yields a single-entry map; they are
  # merged left to right, so a repeated key keeps its last value.
  define :dictionary, "<'<<'> <space*> (name_object <space*>)* <space*> <'>>'> <space*>" do
    dicts -> dicts |> List.flatten |> Enum.reduce(%{}, &Map.merge(&2, &1))
  end

  define :name, "'/' [a-zA-Z0-9#_+*]*" do
    # TODO: reported not to handle names written without whitespace between them
    # (e.g. /Type/Page). Such input does occur in real files ("/" is itself a
    # delimiter in PDF syntax), so this still needs fixing.
    s -> Enum.join(s)
  end

  # A dictionary entry: a name followed by its value, returned as a one-entry map.
  define :name_object, "name <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array ) <space*>" do
    [key, value] -> %{key => value}
  end

  # `[ ... ]`: whitespace between elements is pruned and shows up as nil, hence the reject.
  define :array, "<'['> <space*> (dictionary / name / string_object / bool / object_reference / numeric_object / array / <space>)+ <']'>" do
    [arr] -> arr |> Enum.reject(&is_nil/1)
  end

  # Indirect reference such as `12 0 R`.
  define :object_reference, "integer <space> integer <space> <'R'>" do
    [obj_id, gen_id] -> %{
      "object_id" => obj_id,
      "generation_id" => gen_id
    }
  end

  define :string_object, "hex_string_object / regular_string_object"

  define :regular_string_object, "paren_wrapped_string" do
    # paren_wrapped_string always returns "(" <> content <> ")". Drop exactly the
    # outermost pair and keep nested parentheses and embedded newlines intact
    # (a `.`-based regex would stop at line breaks).
    string -> binary_part(string, 1, byte_size(string) - 2)
  end

  define :paren_wrapped_string, "'(' ([^\(\)] / paren_wrapped_string)* ')'" do
    # Recursion is what lets balanced, nested parentheses appear inside a string
    # that is itself delimited by parentheses. The match is flattened back into
    # one string, delimiters included.
    chars -> chars |> Enum.join
  end

  define :hex_string_object, "<'<'> hex_string <'>'>" do
    [hso] -> hso
  end

  define :bool, "'true' / 'false'" do
    "false" -> false
    "true" -> true
  end

  define :hex_string, "[a-fA-F0-9]+" do
    chars -> chars |> Enum.join |> hex_to_binary
  end

  # Optional sign followed by a number; only "-" changes the value.
  define :numeric_object, "('+' / '-')? (float / integer)" do
    ["-", number] -> -number
    [_, number] -> number
    n -> n
  end

  # Both halves are kept as raw digit strings: converting the fractional part to
  # an integer first would drop its leading zeros (3.05 would become 3.5).
  define :float, "digits <'.'> digits" do
    [i, m] -> String.to_float("#{i}.#{m}")
  end

  define :integer, "[0-9]+" do
    i -> i |> Enum.join |> String.to_integer
  end

  # One or more decimal digits, returned unparsed as a string.
  define :digits, "[0-9]+" do
    ds -> Enum.join(ds)
  end

  define :space, "[ \\r\\n\\s\\t]"

  # String utilities

  # Decodes a string of hex digits (either case) into a binary and hands it to
  # decode_string/1. An odd number of digits is padded with a trailing "0",
  # i.e. the missing final digit is treated as zero.
  defp hex_to_binary(hex_string) do
    padded =
      case rem(byte_size(hex_string), 2) do
        0 -> hex_string
        1 -> hex_string <> "0"
      end

    padded
    |> Base.decode16!(case: :mixed)
    |> decode_string
  end

  # Strips a leading byte-order mark, if any, and converts the remaining bytes to
  # UTF-8 according to the encoding :unicode.bom_to_encoding/1 reports.
  # Without a BOM the bytes are treated as Latin-1; a UTF-8 BOM is just removed.
  # :unicode.characters_to_binary/3 returns an error/incomplete tuple rather than
  # a binary when the bytes are not valid in the detected encoding.
  defp decode_string(str) when is_binary(str) do
    {encoding, bom_length} = :unicode.bom_to_encoding(str)
    <<_bom::binary-size(bom_length), payload::binary>> = str

    case encoding do
      {:utf16, _endianness} ->
        :unicode.characters_to_binary(payload, encoding, :utf8)

      :latin1 ->
        :unicode.characters_to_binary(payload, encoding, :utf8)

      {:utf32, endianness} ->
        # NOTE(review): a detected UTF-32 BOM is deliberately decoded as UTF-16
        # after putting two zero bytes back in front. Presumably this covers a
        # UTF-16 BOM followed by U+0000 being reported as a UTF-32 BOM - confirm.
        :unicode.characters_to_binary(<<0, 0>> <> payload, {:utf16, endianness}, :utf8)

      _ ->
        payload
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment