Skip to content

Instantly share code, notes, and snippets.

@glenbray
Last active February 10, 2020 08:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save glenbray/8eb52d56789b589341518683ecb8fc6b to your computer and use it in GitHub Desktop.
Save glenbray/8eb52d56789b589341518683ecb8fc6b to your computer and use it in GitHub Desktop.
doccano to google automl annotations
# A labelled span of text: a label id plus the start and end offsets of the
# span within its snippet's text.
Annotation = Struct.new(:label, :start_offset, :end_offset)

# Maps the label ids carried by Annotation#label to the display names written
# to the output. Frozen so the lookup table cannot be mutated by accident.
LABELS = { 1 => 'address', 3 => 'po_box' }.freeze
# A piece of text together with the annotations attached to it.
class Snippet
  attr_reader :text, :annotations

  # @param text [String] the snippet text; defaults to the empty string
  # @param annotations [Array] the annotations for this text (required)
  # Any further keyword arguments are accepted and discarded, so a hash with
  # extra keys can be splatted straight into the constructor.
  def initialize(text: '', annotations:, **_ignored)
    @text, @annotations = text, annotations
  end

  # @return [Integer] the size of the snippet text
  def size
    @text.size
  end
end
# Hash#to_json below comes from the json library, which must be loaded.
require 'json'

# Merges an ordered list of snippets into a single snippet and renders the
# result as a JSON string.
class SnippetCollection
  # Separator placed between the texts of consecutive snippets. Annotation
  # offsets are shifted by its size, so the two can never drift apart.
  JOINER = '. '

  # @param snippets [Array<Snippet>] snippets to merge, in order
  def initialize(snippets)
    @snippets = snippets
  end

  # Renders the merged snippet as a JSON document with an `annotations` array
  # (one entry per annotation: its offsets and the display name looked up in
  # LABELS) and a `text_snippet` object holding the merged text.
  #
  # @return [String] the JSON document
  def to_google_annotation
    snippet = merge
    annotations = snippet.annotations.map do |ann|
      {
        text_extraction: {
          text_segment: {
            start_offset: ann.start_offset,
            end_offset: ann.end_offset
          }
        },
        display_name: LABELS[ann.label]
      }
    end

    {
      annotations: annotations,
      text_snippet: { content: snippet.text }
    }.to_json
  end

  private

  # Concatenates the snippet texts with JOINER and re-bases every annotation
  # onto the combined text.
  #
  # @return [Snippet] a snippet holding the merged text and annotations
  def merge
    text = ''
    annotations = []

    @snippets.each do |snippet|
      if text.empty?
        # Nothing precedes this text, so its offsets are already correct.
        text = snippet.text
        annotations.concat(snippet.annotations)
      else
        # Offsets move right by everything already merged plus the separator.
        shift = text.size + JOINER.size
        snippet.annotations.each do |an|
          annotations << Annotation.new(an.label, an.start_offset + shift, an.end_offset + shift)
        end
        text = [text, snippet.text].join(JOINER)
      end
    end

    Snippet.new(text: text, annotations: annotations)
  end
end
require 'json'

# Read file.json, which holds one JSON document per line, and build a Snippet
# from each document.
snippets = []
# File.foreach closes the file once iteration finishes.
File.foreach('file.json') do |json|
  # Blank lines carry no document; JSON.parse would raise on them.
  next if json.strip.empty?

  line = JSON.parse(json, symbolize_names: true)
  line[:annotations] = line[:annotations].map do |a|
    Annotation.new(a[:label], a[:start_offset], a[:end_offset])
  end
  # Snippet#initialize takes keyword arguments only, so the hash has to be
  # splatted; passing it positionally raises ArgumentError on Ruby 3+.
  snippets.push(Snippet.new(**line))
end
# Pack consecutive snippets into groups whose combined text size stays within
# 1000 characters. A snippet that does not fit into the current group starts
# a new one; a snippet that is over the limit on its own gets a group to
# itself. No empty group is ever produced.
grouped_snippets = snippets.each_with_object([]) do |snippet, groups|
  current = groups.last
  if current && current.sum(&:size) + snippet.size <= 1000
    current << snippet
  else
    groups << [snippet]
  end
end
# Wrap every group in a SnippetCollection, render each one to its JSON string,
# and write the strings to google_annotations.jsonl, one per line.
snippet_collections = grouped_snippets.map do |group|
  SnippetCollection.new(group)
end
google_annotations = snippet_collections.map { |collection| collection.to_google_annotation }
File.open('google_annotations.jsonl', 'w') do |out|
  google_annotations.each do |json_line|
    out.puts(json_line)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment