Skip to content

Instantly share code, notes, and snippets.

@danlucraft
Last active August 26, 2022 17:20
Show Gist options
  • Save danlucraft/5277732 to your computer and use it in GitHub Desktop.
Save danlucraft/5277732 to your computer and use it in GitHub Desktop.
Extract annotations from PDFs with pdf-reader gem
require 'pdf-reader'
require './markup_receiver'
# Open the PDF named by the first command-line argument and keep its object
# table in a global, which the lookup helper in this script reads.
pdf_path = ARGV.first
doc = PDF::Reader.new(pdf_path)
$objects = doc.objects
# Tells whether a PDF object is a note-style annotation.
#
# @param object [#[]] dictionary-like object read via its :Type and :Subtype keys
# @return [Boolean] true when :Type is :Annot and :Subtype is :Text or :FreeText
def is_note?(object)
  return false unless object[:Type] == :Annot

  %i[Text FreeText].include?(object[:Subtype])
end
# Tells whether a PDF object is a text-markup annotation.
#
# @param object [#[]] dictionary-like object read via its :Type and :Subtype keys
# @return [Boolean] true when :Type is :Annot and :Subtype is :Highlight or :Underline
def is_markup?(object)
  return false unless object[:Type] == :Annot

  case object[:Subtype]
  when :Highlight, :Underline then true
  else false
  end
end
# Resolves every entry of the page's :Annots attribute and returns the
# results as one flat array.
#
# @param page [#attributes] page whose attributes hash may hold :Annots
# @return [Array] resolved annotation objects; empty when :Annots is absent
def annots_on_page(page)
  annot_refs = page.attributes[:Annots]
  annot_refs = [] unless annot_refs
  lookup_all(annot_refs).flatten
end
# Resolves each of the given references with #lookup.
#
# @param refs [Array, Object, nil] a list of references, a single reference,
#   or nil (splatted into an array, so nil yields no entries)
# @return [Array] one lookup result per reference, in order
def lookup_all(refs)
  [*refs].map { |reference| lookup(reference) }
end
# Fetches the object stored under +ref+ in the global $objects table. When the
# stored object is itself an Array, its elements are resolved in turn.
#
# @param ref [Object] key into $objects
# @return [Object, Array] the stored object, or the resolved elements of a stored Array
def lookup(ref)
  resolved = $objects[ref]
  resolved.is_a?(Array) ? lookup_all(resolved) : resolved
end
# Returns the page's annotations for which #is_note? holds.
#
# @param page [Object] page passed on to #annots_on_page
# @return [Array] the note annotations, in page order
def notes_on_page(page)
  annots_on_page(page).select { |annot| is_note?(annot) }
end
# Builds a Markup for every markup annotation on the page and, when there is
# at least one, walks the page with a MarkupReceiver so each Markup gets its
# text assigned.
#
# @param page [#walk] page passed to #annots_on_page and walked with the receiver
# @return [Array<Markup>] the page's markups; empty when the page has none
def markups_on_page(page)
  markups = annots_on_page(page)
            .select { |annot| is_markup?(annot) }
            .map { |annot| Markup.new(annot) }
  # Nothing to collect text for, so skip walking the page.
  return markups if markups.empty?

  receiver = MarkupReceiver.new(markups)
  page.walk(receiver)
  receiver.set_markup_texts
  markups
end
# Wraps a markup annotation dictionary and exposes its geometry, its color,
# and (once assigned by a caller) the text it covers.
class Markup
  attr_reader :attributes
  attr_accessor :text

  # @param attributes [Hash] annotation dictionary; this class reads its
  #   :QuadPoints and :C entries
  def initialize(attributes)
    @attributes = attributes
  end

  # One quadrilateral of a markup, built from four [x, y] corner points.
  class Rectangle
    attr_reader :quad_points

    # Sorting the [x, y] pairs orders them by x and then by y, so for an
    # axis-aligned quad the corners end up in the order bottom-left,
    # top-left, bottom-right, top-right.
    #
    # @param points [Array<Array<Numeric>>] four [x, y] pairs
    def initialize(points)
      @quad_points = points.sort
    end

    def bottom_left
      quad_points[0]
    end

    def top_left
      quad_points[1]
    end

    def bottom_right
      quad_points[2]
    end

    def top_right
      quad_points[3]
    end

    # @param coords [Array<Numeric>] an [x, y] pair
    # @return [Boolean] whether the point lies inside the box spanned by the
    #   bottom-left and top-right corners (edges included)
    def contains?(coords)
      x, y = *coords
      x.between?(bottom_left.first, top_right.first) &&
        y.between?(bottom_left.last, top_right.last)
    end

    # @return [Boolean] whether the bottom-left corner's y lies in [bottom, top]
    def within?(bottom, top)
      bottom_left[1].between?(bottom, top)
    end
  end

  # Splits :QuadPoints into groups of eight numbers (four [x, y] corners) and
  # wraps each group in a Rectangle.
  #
  # @return [Array<Rectangle>]
  def rectangles
    attributes[:QuadPoints].each_slice(8).map do |numbers|
      Rectangle.new(numbers.each_slice(2).to_a)
    end
  end

  # @return [String, nil] the :C entry as a hex color string, or nil when the
  #   annotation has no :C entry
  def color
    components = attributes[:C]
    components && rgb_to_hex(components)
  end

  # @return [Boolean] whether any rectangle of this markup contains (x, y)
  def contains?(x, y)
    rectangles.any? { |rect| rect.contains?([x, y]) }
  end

  # @return [Boolean] whether any rectangle's bottom-left y lies in [bottom, top]
  def within?(bottom, top)
    rectangles.any? { |rect| rect.within?(bottom, top) }
  end

  # Converts color components in the 0..1 range to an uppercase hex string.
  # Each component is rounded to the nearest integer after scaling to 255, so
  # a value such as 0.98039 (250/255 written with limited precision) maps to
  # FA rather than being truncated to F9.
  #
  # NOTE(review): every component is emitted as one hex pair, so arrays that
  # are not three components long are not treated specially.
  #
  # @param rgb [Array<Numeric>] components in the 0..1 range
  # @return [String] "#" followed by two hex digits per component
  def rgb_to_hex(rgb)
    "#" + rgb.map { |component| format("%02X", (component * 255).round) }.join
  end
end
# Print, page by page, the contents of every note annotation and the text of
# every markup. Pages with neither are skipped.
doc.pages.each do |page|
  notes = notes_on_page(page)
  markups = markups_on_page(page)
  next unless notes.any? || markups.any?

  puts "# Page #{page.number}"
  # Interpolate rather than concatenate so a note without :Contents prints an
  # empty bullet instead of raising TypeError on String + nil.
  notes.each { |note| puts " * #{note[:Contents]}" }
  # A markup whose text was never assigned interpolates as an empty string.
  markups.each { |markup| puts " - #{markup.text}" }
  puts
  puts
end
require 'pdf/reader/page_layout'
# Builds a UTF-8 string of all the text on a single page within the given markups
# by processing all the operators in a content stream.
class MarkupReceiver
  extend Forwardable

  SPACE = " "

  attr_reader :state, :content, :options

  # @param markups [Array] objects responding to #contains?(x, y) and #text=
  def initialize(markups)
    @markups = markups
    # Start with an empty store so #set_markup_texts is a harmless no-op when
    # it is called before any page has been walked.
    @characters = new_character_store
  end

  ########## BEGIN FORWARDERS ##########
  # Graphics State Operators
  def_delegators :@state, :save_graphics_state, :restore_graphics_state

  # Matrix Operators
  def_delegators :@state, :concatenate_matrix

  # Text Object Operators
  def_delegators :@state, :begin_text_object, :end_text_object

  # Text State Operators
  def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
  def_delegators :@state, :set_text_font_and_size, :font_size
  def_delegators :@state, :set_text_leading, :set_text_rendering_mode
  def_delegators :@state, :set_text_rise, :set_word_spacing

  # Text Positioning Operators
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
  ########## END FORWARDERS ##########

  # Starting a new page: reset the page state, the collected characters and
  # remember the page's :MediaBox for the later layout step.
  def page=(page)
    @state = PDF::Reader::PageState.new(page)
    @content = []
    @characters = new_character_store
    @mediabox = page.attributes[:MediaBox]
  end

  # Lays out the text runs collected for each markup and stores the resulting
  # string on the markup. Markups that collected no characters are left as is.
  def set_markup_texts
    @characters.each do |markup, text_runs|
      markup.text = PDF::Reader::PageLayout.new(text_runs, @mediabox).to_s
    end
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # record text that is drawn on the page
  def show_text(string) # Tj (AWAY)
    internal_show_text(string)
  end

  # Strings in +params+ are shown; every other entry is passed to the page
  # state as a glyph displacement.
  def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
    params.each do |arg|
      if arg.is_a?(String)
        internal_show_text(arg)
      else
        @state.process_glyph_displacement(0, arg, false)
      end
    end
  end

  def move_to_next_line_and_show_text(str) # '
    @state.move_to_start_of_next_line
    show_text(str)
  end

  def set_spacing_next_line_show_text(aw, ac, string) # "
    @state.set_word_spacing(aw)
    @state.set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # Walks form XObjects with this receiver; other XObject kinds are ignored.
  def invoke_xobject(label)
    @state.invoke_xobject(label) do |xobj|
      xobj.walk(self) if xobj.is_a?(PDF::Reader::FormXObject)
    end
  end

  private

  # Per-markup list of text runs; a missing key starts out as an empty list.
  def new_character_store
    Hash.new { |store, markup| store[markup] = [] }
  end

  # Records every non-space glyph of +string+ against the markups containing
  # it, advancing the page state after each glyph.
  def internal_show_text(string)
    font = @state.current_font
    raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

    font.unpack(string).each do |glyph_code|
      # paint the current glyph; the position has to be read before the
      # displacement below moves the state on to the next glyph
      newx, newy = @state.trm_transform(0, 0)
      utf8_chars = font.to_utf8(glyph_code)

      # apply the glyph displacement for the current glyph so the next
      # glyph will appear in the correct position
      glyph_width = font.glyph_width(glyph_code) / 1000.0
      is_space = utf8_chars == SPACE
      record_glyph(newx, newy, glyph_width * @state.font_size, utf8_chars) unless is_space
      @state.process_glyph_displacement(glyph_width, 0, is_space)
    end
  end

  # Adds a text run for the glyph to every markup that contains (x, y).
  def record_glyph(x, y, width, utf8_chars)
    @markups.each do |markup|
      next unless markup.contains?(x, y)

      @characters[markup] << PDF::Reader::TextRun.new(x, y, width, @state.font_size, utf8_chars)
    end
  end
end
@lecabel
Copy link

lecabel commented May 22, 2016

Hi, is it possible to extract other information from an annotation, such as the user who annotated it or the date?
thanks
Fabio

@diegommarino
Copy link

Thank you danlucraft, your script helped me a lot to extract hyperlinks from the PDF page.

require 'pdf-reader'

puts 'Running...'
# Take the PDF path from the first command-line argument when one is given;
# otherwise fall back to the path that used to be hard-coded here.
file = ARGV.first || "/Users/diego/Downloads/DiegoMarinoProfile.pdf"
puts "File: #{file}"

# Keep the document's object table in a global; the lookup helper reads it.
doc = PDF::Reader.new(file)
$objects = doc.objects

# Tells whether a PDF object is a link annotation.
#
# @param object [#[]] dictionary-like object read via its :Type and :Subtype keys
# @return [Boolean] true when :Type is :Annot and :Subtype is :Link
def is_link?(object)
  return false unless object[:Type] == :Annot

  object[:Subtype] == :Link
end

# Tells whether a PDF object is a note annotation, i.e. an :Annot whose
# :Subtype is :Text or :FreeText.
#
# @param object [#[]] dictionary-like object read via its :Type and :Subtype keys
# @return [Boolean]
def is_note?(object)
  return false unless object[:Type] == :Annot

  case object[:Subtype]
  when :Text, :FreeText then true
  else false
  end
end

# Resolves the entries of the page's :Annots attribute into one flat array.
#
# @param page [#attributes] page whose attributes hash may hold :Annots
# @return [Array] resolved annotation objects; empty when :Annots is absent
def annots_on_page(page)
  listed = page.attributes[:Annots]
  resolved = lookup_all(listed ? listed : [])
  resolved.flatten
end

# Resolves every given reference with #lookup.
#
# @param refs [Array, Object, nil] a list of references, a single reference,
#   or nil (splatted into an array, so nil yields no entries)
# @return [Array] one lookup result per reference, in order
def lookup_all(refs)
  reference_list = [*refs]
  reference_list.map { |entry| lookup(entry) }
end

# Reads the object stored under +ref+ in the global $objects table; a stored
# Array has each of its elements resolved as well.
#
# @param ref [Object] key into $objects
# @return [Object, Array] the stored object, or the resolved elements of a stored Array
def lookup(ref)
  found = $objects[ref]
  if found.is_a?(Array)
    lookup_all(found)
  else
    found
  end
end

# Returns the annotations on the page that #is_note? accepts.
#
# @param page [Object] page passed on to #annots_on_page
# @return [Array] the note annotations, in page order
def notes_on_page(page)
  annots_on_page(page).select { |candidate| is_note?(candidate) }
end

# Returns the annotations on the page that #is_link? accepts.
#
# @param page [Object] page passed on to #annots_on_page
# @return [Array] the link annotations, in page order
def links_on_page(page)
  annots_on_page(page).select { |candidate| is_link?(candidate) }
end

# Print, page by page, the URI of every link annotation and the contents of
# every note annotation. Pages with neither are skipped.
doc.pages.each do |page|
  # Resolve each link's :A entry through the object table. deref returns a
  # non-reference value untouched, so an inline action dictionary works too.
  # Links with no action, or whose action has no :URI, are dropped instead of
  # raising on nil.
  uris = links_on_page(page).map do |link|
    action = $objects.deref(link[:A])
    action[:URI] if action.is_a?(Hash)
  end.compact
  notes = notes_on_page(page)

  next unless notes.any? || uris.any?

  puts "# Page #{page.number}"
  uris.each { |uri| puts "  * #{uri}" }

  # Interpolate so a note without :Contents prints an empty bullet rather
  # than raising TypeError on String + nil.
  notes.each { |note| puts "  * #{note[:Contents]}" }
  puts
  puts
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment