Skip to content

Instantly share code, notes, and snippets.

@yaauie
Last active January 23, 2024 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yaauie/55b04f3cce86d11ef7e9ff3a14a86b44 to your computer and use it in GitHub Desktop.
Save yaauie/55b04f3cce86d11ef7e9ff3a14a86b44 to your computer and use it in GitHub Desktop.
###############################################################################
# utf8-coerce.logstash-filter-ruby.rb
# ---------------------------------
# A script for a Logstash Ruby Filter that forcefully coerces a string-valued
# field to valid UTF-8, preferring a _representational_ transcode operation and
# falling back to the UTF-8 replacement character when encountering byte
# sequences that cannot be represented in Unicode, optionally stashing a
# base64-encoded copy of the original when such lossy replacements are made.
###############################################################################
#
# Copyright 2024 Rye Biesemeyer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Validates the script parameters and stores them for use by `filter`.
#
# Recognised parameters:
#   field - (required) field-reference to the string that should be coerced,
#           e.g. "[event][original]"
#   stash - (optional) field-reference that receives a base64-encoded copy of
#           the original bytes if-and-only-if the coercion was lossy
#
# A missing `field` or any unrecognised key is reported through
# `report_configuration_error`.
#
# @param original_params [Hash] script parameters keyed by String; not mutated
def register(original_params)
  # Work on a copy so that consuming keys does not alter the caller's hash.
  remaining = original_params.dup

  field = remaining.delete('field')
  report_configuration_error("missing required script parameter `field`") unless field
  @field = field

  stash = remaining.delete('stash')
  @stash = stash ? stash : nil

  # Every known key has been consumed; whatever is left is unsupported.
  report_configuration_error("unknown script parameter(s): #{remaining.keys}.") unless remaining.empty?

  # Base64 is only needed when a stash destination has been configured.
  require 'base64' if @stash
end
# Signals an invalid script configuration.
#
# @param message [String] human-readable description of the problem
# @raise [LogStash::ConfigurationError] always
def report_configuration_error(message)
  raise(LogStash::ConfigurationError, message)
end
# Coerces the String stored at `@field` into valid UTF-8.
#
# The value is first transcoded from its flagged encoding to UTF-8 and then
# scrubbed. Doing both lets a not-quite-UTF-8 string keep its valid multibyte
# characters while only the offending bytes are replaced. Each step substitutes
# U+FFFD (the replacement character) where needed and records that the result
# is lossy.
#
# The field is overwritten only when the coerced value differs from the
# original. If the conversion was lossy and a `stash` field-reference is
# configured, a strict base64 encoding of the original bytes is written there.
#
# Values that are not Strings (including a missing field) are left untouched.
#
# Failures deriving from StandardError are swallowed so that the event always
# continues through the pipeline. Other exceptions (e.g. Interrupt, SystemExit)
# are allowed to propagate instead of being discarded.
#
# @param event [#get, #set] the event being processed
# @return [Array] a single-element array containing the same event
def filter(event)
  value = event.get(@field)
  return [event] unless value.kind_of?(String)

  lossy_conversion = false
  # Shared by `encode` (as fallback) and `scrub` (as block): flags the loss and
  # yields the replacement character.
  replace_and_flag = ->(_) { lossy_conversion = true; "\uFFFD" }

  encoded = value.encode(Encoding::UTF_8, fallback: replace_and_flag)
                 .scrub(&replace_and_flag)

  if encoded != value
    event.set(@field, encoded)
    event.set(@stash, Base64.strict_encode64(value)) if lossy_conversion && @stash
  end

  [event]
rescue StandardError
  # Best-effort: never drop the event because coercion failed.
  [event]
end
# Scenario: the field already holds a valid UTF-8 string, so the value, its
# bytes and its encoding flag are all expected to be unchanged and nothing is
# expected at the stash location.
test 'handling valid utf8 input' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  reference = "Thïs is a known-valid unicode string 💖".freeze

  # A dup is handed to the event so the frozen reference stays pristine.
  in_event { { "event" => { "original" => reference.dup } } }

  expect("performs no transformation") do |events|
    actual = events.first.get('[event][original]')
    actual == reference
  end

  expect("performs no transformation to the bytes") do |events|
    actual = events.first.get('[event][original]')
    actual.b == reference.b
  end

  expect("performs no transformation to the encoding flag") do |events|
    actual = events.first.get('[event][original]')
    actual&.encoding == reference.encoding
  end

  expect('does not stash base64-encoded snapshot') do |events|
    !events.first.include?('[@metadata][stash]')
  end
end
# Scenario: the field holds bytes correctly flagged as Windows-1252. Every
# character has a Unicode equivalent, so the expected result is the same text
# in UTF-8 with nothing stashed.
test 'handling coercible WIN1252 input ' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  win1252_source = "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze
  expected_utf8 = "Th\u00EFs \u00CCs W\u00CFnd\u00D8w\u0160"

  in_event { { "event" => { "original" => win1252_source.dup } } }

  expect('transcodes to valid utf8 representing the same characters') do |events|
    actual = events.first.get('[event][original]')
    actual == expected_utf8
  end

  expect('does not stash base64-encoded snapshot') do |events|
    !events.first.include?('[@metadata][stash]')
  end
end
# Scenario: Windows-1252 bytes that are mislabelled as UTF-8. The high bytes
# are invalid in UTF-8, so each is expected to become U+FFFD and the original
# bytes are expected to be recoverable from the stash.
test 'handling WIN1252 input that is flagged as UTF-8' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  win1252_source = "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze

  in_event do
    mislabelled = win1252_source.dup.force_encoding(Encoding::UTF_8)
    { "event" => { "original" => mislabelled } }
  end

  expect('transcodes to valid utf8 using the replacement character') do |events|
    actual = events.first.get('[event][original]')
    actual == "Th\ufffds \ufffds W\ufffdnd\ufffdw\ufffd"
  end

  expect('stashes base64-encoded snapshot') do |events|
    stashed = events.first.get('[@metadata][stash]')
    Base64.decode64(stashed).bytes == win1252_source.bytes
  end
end
# Scenario: arbitrary binary-flagged bytes. Bytes without a UTF-8
# representation are expected to become U+FFFD and the original bytes are
# expected to be recoverable from the stash.
test 'handling binary input' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  binary_source = "\x06\x02\x1A\x08\x74\x68\x61\x74\x0E\x61\x6E\x6F\x74\x68\x65\x72\x71\x82\x37\xA0\x10\x26\x11\xD7\x0E\xFF\x35\x42\xFF\x3E\xAD\x06".b.freeze

  in_event { { "event" => { "original" => binary_source.dup } } }

  expect('translates, using utf8 replacement character') do |events|
    actual = events.first.get('[event][original]')
    actual == "\u0006\u0002\u001A\bthat\u000Eanotherq�7�\u0010&\u0011�\u000E�5B�>�\u0006"
  end

  expect('stashes base64 version') do |events|
    stashed = events.first.get('[@metadata][stash]')
    Base64.decode64(stashed).bytes == binary_source.bytes
  end
end
# Scenario: the same arbitrary bytes as the binary case, but flagged as UTF-8
# when placed in the event. The expected output and stash contents are the
# same as for the binary-flagged input.
test 'handling binary input that is flagged as UTF-8' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  binary_source = "\x06\x02\x1A\x08\x74\x68\x61\x74\x0E\x61\x6E\x6F\x74\x68\x65\x72\x71\x82\x37\xA0\x10\x26\x11\xD7\x0E\xFF\x35\x42\xFF\x3E\xAD\x06".b.freeze

  in_event do
    mislabelled = binary_source.dup.force_encoding(Encoding::UTF_8)
    { "event" => { "original" => mislabelled } }
  end

  expect('translates, using utf8 replacement character') do |events|
    actual = events.first.get('[event][original]')
    actual == "\u0006\u0002\u001A\bthat\u000Eanotherq�7�\u0010&\u0011�\u000E�5B�>�\u0006"
  end

  expect('stashes base64 version') do |events|
    stashed = events.first.get('[@metadata][stash]')
    Base64.decode64(stashed).bytes == binary_source.bytes
  end
end
# Scenario: a UTF-8 flagged string that is valid apart from one stray byte
# (\xCE). Only that byte is expected to become U+FFFD; the valid multibyte
# characters are expected to survive, and the original bytes are expected to be
# recoverable from the stash.
test 'handling mixed input that is flagged as UTF-8' do
  parameters do
    { "field" => "[event][original]", "stash" => "[@metadata][stash]" }
  end

  mostly_valid_utf8 = "Thïs is a not-quite-v\xCEalid unicode string 💖".b.force_encoding(Encoding::UTF_8).freeze

  in_event do
    payload = mostly_valid_utf8.dup.force_encoding(Encoding::UTF_8)
    { "event" => { "original" => payload } }
  end

  expect('transcodes to valid utf8 using the replacement character without munging the valid multibyte sequences') do |events|
    actual = events.first.get('[event][original]')
    actual == "Thïs is a not-quite-v\uFFFDalid unicode string 💖"
  end

  expect('stashes base64-encoded snapshot') do |events|
    stashed = events.first.get('[@metadata][stash]')
    Base64.decode64(stashed).bytes == mostly_valid_utf8.bytes
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment