#!/usr/bin/env ruby
# Gist johncarney/7332f7b2075b86ea52177a4a82453806 (last active June 8, 2022):
# A Psych-based YAML parser that captures line numbers.
#
# Custom Psych parser that captures line number information from a YAML file.
#
# For a project I'm working on I need to be able to determine which line(s) in a YAML
# file a particular value comes from. There are a few bits of advice on the internet
# about this; the best of them that I've found involves monkey-patching, which is a
# fairly low bar for "best" in my opinion. I found it on Stack Overflow:
#
# https://stackoverflow.com/questions/29462856/loading-yaml-with-line-number-for-each-key
#
# Here's my take without monkey-patching. It deals with values spanning multiple lines
# and handles YAML's << (merge key) operator, borrowing liberally from Psych's source code
# to do so.
require "psych"
require "pp"

# Pairs a deserialized YAML value (+value+) with the range of source lines
# (+lines+) it was read from.
ValueWithLineNumbers = Struct.new(:value, :lines)
# A scalar node that additionally remembers a line number supplied by whoever
# builds it.
class Psych::Nodes::ScalarWithLineNumber < Psych::Nodes::Scalar
  # The line number handed to the constructor, stored unchanged.
  attr_reader :line_number

  # All arguments except the last are forwarded to Psych::Nodes::Scalar#initialize;
  # the final positional argument is kept as the node's line number.
  def initialize(*args, line_number)
    super(*args)
    @line_number = line_number
  end
end
# Tree builder that emits ScalarWithLineNumber nodes instead of plain scalars.
#
# The builder has to know the parser that drives it so it can read the
# parser's current mark; assign it through #parser= before parsing.
class Psych::TreeWithLineNumbersBuilder < Psych::TreeBuilder
  attr_accessor :parser

  # Handles a scalar event: builds the node from the event arguments plus the
  # line of the parser's current mark, appends it to the node currently being
  # built and returns it.
  def scalar(*args)
    node = Psych::Nodes::ScalarWithLineNumber.new(*args, parser.mark.line)
    # Keep the start/end location bookkeeping of the stock builder when the
    # installed Psych provides it, so the node is not missing that data.
    set_location(node) if respond_to?(:set_location, true)
    @last.children << node
    node
  end
end
# ToRuby visitor that wraps scalar mapping values in ValueWithLineNumbers, so
# each such value carries the range of source lines it was read from.
#
# Only values of mappings are wrapped; sequence entries and mapping keys are
# converted exactly as the stock visitor converts them.
class Psych::Visitors::ToRubyWithLineNumbers < Psych::Visitors::ToRuby
  # YAML merge key. Kept locally so this class does not depend on a constant
  # of the superclass being present.
  MERGE_KEY = "<<"
  # A key explicitly tagged as a string is an ordinary key, never a merge key.
  STRING_TAG = "tag:yaml.org,2002:str"

  # Line-numbered scalars convert to Ruby exactly like ordinary scalars.
  def visit_Psych_Nodes_ScalarWithLineNumber(node)
    visit_Psych_Nodes_Scalar(node)
  end

  private

  # Fills +hash+ from the key/value child pairs of the mapping +node+ and
  # returns it. +tagged+ is optional so the method can be called with either
  # two or three arguments.
  def revive_hash(hash, node, tagged = false)
    node.children.each_slice(2) do |k, v|
      key = accept(k)
      val = attach_line_numbers(accept(v), k, v)

      if key == MERGE_KEY && k.tag != STRING_TAG
        apply_merge_key(hash, key, val, v)
      else
        # Honour the visitor's symbolize_names option for untagged mappings.
        key = key.to_sym if !tagged && @symbolize_names && key.is_a?(String)
        hash[key] = val
      end
    end
    hash
  end

  # Wraps +val+ in a ValueWithLineNumbers when its node is a line-numbered
  # scalar; any other value is returned untouched. The range starts at the
  # key's line when the key is a line-numbered scalar too, otherwise at the
  # value's own line. Recorded line numbers are shifted by one, presumably
  # because the parser mark counts lines from zero.
  def attach_line_numbers(val, key_node, value_node)
    return val unless value_node.is_a?(Psych::Nodes::ScalarWithLineNumber)

    last_line = value_node.line_number + 1
    first_line =
      if key_node.is_a?(Psych::Nodes::ScalarWithLineNumber)
        key_node.line_number + 1
      else
        last_line
      end
    ValueWithLineNumbers.new(val, first_line..last_line)
  end

  # Applies a "<<" entry to +hash+. A mapping or alias is merged directly. A
  # sequence is folded back to front into one hash first, so earlier entries
  # override later ones. Anything that cannot be merged is stored under the
  # literal key instead.
  def apply_merge_key(hash, key, val, value_node)
    case value_node
    when Psych::Nodes::Alias, Psych::Nodes::Mapping
      hash.merge!(val)
    when Psych::Nodes::Sequence
      combined = val.reverse_each.each_with_object({}) { |entry, acc| acc.merge!(entry) }
      hash.merge!(combined)
    else
      hash[key] = val
    end
  rescue TypeError
    # The merge source was not hash-like; keep it as a plain entry.
    hash[key] = val
  end
end
# Sample document for the demonstration below. It is nested so that it
# contains a multi-line block scalar, a sequence and two uses of the merge key.
yaml = <<~YAML
  en:
    errors:
      # A comment
      format: "%{attribute} %{message}"
      # Another comment
      messages:
        "1": "Message 1"
        "2": "Message 2"
      long_message: |
        This message
        is split over
        multiple lines
      a_sequence:
        - Currently
        - we
        - don't
        - handle
        - sequences
      comman: &common
        a: 1
        b: 2
      foo:
        <<: *common
        c: 3
      bar:
        <<: *common
        b: 3
    date:
      format: "YYYY-MM-DD"
YAML

# The builder needs a reference to the parser that feeds it, so the two are
# wired together before parsing.
handler = Psych::TreeWithLineNumbersBuilder.new
handler.parser = Psych::Parser.new(handler)
handler.parser.parse(yaml)

# Convert the parsed tree to Ruby objects; the result is enumerable and is
# mapped over further down.
ruby_with_line_numbers = Psych::Visitors::ToRubyWithLineNumbers.create.accept(handler.root)
pp ruby_with_line_numbers
puts
# The project I'm working on is identifying which locale strings have been changed (added,
# modified, deleted) in a git commit. From the git diff I can get changed lines, so I need a
# way to map that to a key path.
#
# Walks +hash+ recursively and returns an array of [line_range, key_path]
# pairs, one for every ValueWithLineNumbers found, in traversal order.
# +parent_path+ holds the keys leading to +hash+ and is prepended to each path.
# Values that are neither a ValueWithLineNumbers nor a Hash (arrays, for
# instance) contribute nothing, and a non-Hash +hash+ yields an empty array.
def key_paths_by_line(hash, *parent_path)
  return [] unless hash.is_a?(Hash)

  hash.flat_map do |key, value|
    path = [*parent_path, key]
    case value
    when ValueWithLineNumbers then [[value.lines, path]]
    when Hash then key_paths_by_line(value, *path)
    else []
    end
  end
end
# Build the index for the first converted document only.
key_path_index = ruby_with_line_numbers.map { |document| key_paths_by_line(document) }.first
puts "Key paths indexed by line range:"
pp key_path_index
puts

# Given a line number, we can identify the key path as follows
line = 12
# The first range that covers the line wins; no match leaves key_path nil.
key_path = key_path_index.find { |lines, _path| lines.cover?(line) }&.last
puts "The key path for the value at line ##{line} is #{key_path.inspect}"
puts
# If you want to know where in the source a given value is located, you can use the above
# to build an index of line ranges by key path.
#
# Returns a Hash mapping each key path (an array of keys) found by
# key_paths_by_line to its line range. If a path occurs more than once, the
# last occurrence wins.
def lines_by_key_path(hash)
  key_paths_by_line(hash).each_with_object({}) do |(lines, path), index|
    index[path] = lines
  end
end
puts "Line ranges indexed by key path:"
# One index per converted document.
pp ruby_with_line_numbers.map { |document| lines_by_key_path(document) }
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment")