Last active
May 28, 2020 19:33
-
-
Save tinabel/ddd5cc9b0dd762986918520a132800d2 to your computer and use it in GitHub Desktop.
Ensure that MS Word docx file headers are not corrupted by RubyZip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'zip' | |
require 'zip/zipfilesystem' | |
# Recursively zips a directory into a single archive using rubyzip.
#
# Word documents (.docx) found along the way have their main document part
# re-serialized through Nokogiri before they are added to the archive.
#
# NOTE: the re-serialization happens in place. The .docx is opened with
# Zip::File.open and its "word/document.xml" part is overwritten, so the
# source file on disk is modified (rubyzip commits pending changes when the
# block passed to Zip::File.open returns).
class ZipService
  # Archive-internal path of the main document part inside a .docx package.
  DOCX_DOCUMENT_PART = 'word/document.xml'

  # Extension (lower-case, including the dot) identifying Word documents.
  DOCX_EXTENSION = '.docx'

  # Initialize with the directory to zip and the location of the output archive.
  #
  # @param input_dir [String] directory whose contents are archived
  # @param output_file [String] path of the zip archive to create
  def initialize(input_dir, output_file)
    @input_dir = input_dir
    @output_file = output_file
  end

  # Zip the input directory.
  #
  # The top-level listing is taken before the archive is opened, matching the
  # original ordering of operations.
  def write
    entries = children_of(@input_dir)

    ::Zip::File.open(@output_file, ::Zip::File::CREATE) do |zipfile|
      write_entries(entries, '', zipfile)
    end
  end

  private

  # Names inside +dir+, excluding the "." and ".." pseudo-entries.
  def children_of(dir)
    Dir.entries(dir) - %w[. ..]
  end

  # A helper method to make the recursion work.
  #
  # @param entries [Array<String>] names found directly under +path+
  # @param path [String] archive-relative directory ('' for the root)
  # @param zipfile [Zip::File] archive being written
  def write_entries(entries, path, zipfile)
    entries.each do |e|
      zipfile_path = path.empty? ? e : File.join(path, e)
      disk_file_path = File.join(@input_dir, zipfile_path)

      if File.directory?(disk_file_path)
        recursively_deflate_directory(disk_file_path, zipfile, zipfile_path)
      else
        put_into_archive(disk_file_path, zipfile, zipfile_path, e)
      end
    end
  end

  # Creates the directory entry in the archive, then descends into it.
  def recursively_deflate_directory(disk_file_path, zipfile, zipfile_path)
    zipfile.mkdir(zipfile_path)
    write_entries(children_of(disk_file_path), zipfile_path, zipfile)
  end

  # Adds a single file to the archive, normalizing it first when it is a
  # Word document.
  #
  # +entry+ is not used; it is kept (now optional) so existing four-argument
  # calls keep working.
  def put_into_archive(disk_file_path, zipfile, zipfile_path, entry = nil)
    normalize_docx(disk_file_path) if docx?(zipfile_path)
    zipfile.add(zipfile_path, disk_file_path)
  end

  # True when +path+ has a .docx extension. The comparison ignores case so
  # that files such as "REPORT.DOCX" are normalized as well.
  def docx?(path)
    File.extname(path).downcase == DOCX_EXTENSION
  end

  # Parses the main document part of the .docx at +disk_file_path+ with
  # Nokogiri and writes the re-serialized XML back into the same package.
  def normalize_docx(disk_file_path)
    ::Zip::File.open(disk_file_path) do |docx|
      xml = Nokogiri::XML.parse(docx.read(DOCX_DOCUMENT_PART))
      docx.get_output_stream(DOCX_DOCUMENT_PART) { |f| f.write(xml.to_s) }
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist zips files recursively, looking for docx files and re-parsing them using Nokogiri. This solution does not use write_buffer.