Last active
December 21, 2015 01:09
-
-
Save olexpono/6225898 to your computer and use it in GitHub Desktop.
Ruby Regex Examples / Scrubbing stuff out of WordPress export files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'

# Interactive scrubber for a WordPress XML export file.
#
# Prompts for an input filename, applies a series of regex clean-ups to the
# raw XML text (lead-gen module removal, image alignment classes, oversized
# width attributes, an extra 'homepage' tag on every post, alt attributes),
# parses the result with Nokogiri to report well-formedness problems, then
# prompts for an output filename and writes the scrubbed text there.

print "WP XML filename: "
filename = gets.chomp
# File.read opens, reads and closes the file in one call (no leaked handle).
contents = File.read(filename)

# This is purely for __client__001, probably not useful
# for a lot of exports
leadgen_regex = /\r?\n?\r?\n?<a[^>]*cr=LG.*\s*\r?\n?\s*\]\]/
leadgen_modules = contents.scan(leadgen_regex)
puts "Removing Lead-gen Modules from the bottom of posts. Their sizes should be ~170 - 390:"
puts leadgen_modules.map(&:size).inspect
# The regex swallows the closing "]]" of the CDATA section, so put it back.
contents.gsub!(leadgen_regex, "]]")

puts "Removing alignments from images"
align_regex = /(alignright[ ]?|alignleft[ ]?|alignnone[ ]?)/
contents.gsub!(align_regex, "")

# Width attributes whose numeric value exceeds this many pixels are removed;
# smaller ones are kept on purpose.
max_kept_width = 400
puts "Removing specified widths above #{max_kept_width}, this takes a while."
# Capture 1: tag text up to the width attribute, capture 2: the whole
# attribute, capture 3: its value. [^"] keeps the value from running past
# its closing quote into a following attribute.
specified_width_regex = /(<\w* [^>]*)(width="([^"]{2,7})")/
widths_removed = 0
# Single pass: each oversized attribute is removed exactly where it was
# matched, instead of deleting the first equal-looking string in the file.
contents.gsub!(specified_width_regex) do
  found = Regexp.last_match
  tag_prefix = found[1]
  width_attr = found[2]
  if found[3].to_i > max_kept_width
    puts width_attr
    widths_removed += 1
    tag_prefix
  else
    tag_prefix + width_attr
  end
end
puts "Number of removed width attributes: #{ widths_removed.to_s }"

puts "Adding tag 'homepage' to every post"
# This is purely for __client__001, probably not useful
# for a lot of exports
homepage_tag_string = "\n\t\t<category domain=\"post_tag\" nicename=\"homepage\"><![CDATA[homepage]]></category>\n"
# Insert the new <category> line between the is_sticky line and the first
# existing category/post_tag line that follows it.
contents.gsub!(/(is_sticky.*)(\n)(.*<category.*("post_tag"|"category"))/) do
  found = Regexp.last_match
  puts "$1 = #{found[1].inspect} || $3 = #{found[3].inspect}"
  puts "$~.to_a = #{found.to_a}"
  found[1] + homepage_tag_string + found[3]
end

puts "Removing alt tags"
contents.gsub!(/alt="[^"]*"/, "")

print "Testing XML integrity..."
# Nokogiri.XML returns a document even for malformed input; parse problems
# are collected on the document's error list, so that is what gets checked.
doctest = Nokogiri.XML(contents)
if doctest.errors.empty?
  puts " ... OK! probably!"
else
  puts " BAD!"
  doctest.errors.first(10).each { |parse_error| puts "  #{parse_error}" }
end

print "Pick a filename to save the new XML: "
new_filename = gets.chomp
File.write(new_filename, contents)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment