Last active
October 5, 2018 08:06
-
-
Save deadlyfingers/69447f695a0c7b7193eb4bb9a0df3662 to your computer and use it in GitHub Desktop.
Downloads Wordpress images and media files from exported posts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Downloads WordPress images and media files referenced in exported posts
# usage:
# ruby ./wordpress-download-media.rb
# ruby ./wordpress-download-media.rb "_posts" "assets/media" "svg,pdf"
require "fileutils"
require "open-uri"
# Configuration: each value has a built-in default that a positional
# command-line argument overrides.
#   ARGV[0] - directory that is scanned for posts
#   ARGV[1] - directory the media files are saved into
#   ARGV[2] - comma-separated list of file extensions to keep (empty = all)
source      = ARGV.fetch(0, "_posts")
destination = ARGV.fetch(1, "assets/media")
filetypes   = ARGV.length > 2 ? ARGV[2].split(",") : []
# define functions
# Reads a file from disk and returns its whole content.
#
# File.read is used instead of Kernel#open because Kernel#open treats a name
# starting with "|" as a shell command, and because File.read always closes
# the handle, even when reading raises.
#
# file - String path of the file to read.
#
# Returns the file content as a String.
# Raises SystemCallError (e.g. Errno::ENOENT) when the file cannot be read.
def open_file_content(file)
  File.read(file)
end
# Returns the lower-cased extension of a path or URL, without the leading dot.
#
# A trailing "?query" or "#fragment" is ignored, so
# "photo.JPG?w=300" yields "jpg" rather than "jpg?w=300".
#
# file - String path or URL.
#
# Returns the extension String, or nil when there is no extension.
def file_ext(file)
  path = file.sub(/[?#].*\z/m, "")
  File.extname(path).strip.downcase[1..-1]
end
# Collects the distinct wp-content/uploads URLs found in one post file.
#
# file  - String path of the post to scan.
# types - Array of extension Strings (case-insensitive). When empty, every
#         matched URL is kept; otherwise only URLs whose extension is listed.
#
# Returns an Array of URL Strings in order of first appearance.
def process_wp_post(file, types=[])
  wanted = types.map(&:downcase)
  text = open_file_content(file)
  pattern = /https?:\/\/w{3}?[a-z0-9\-_.+*#?&=%@!',;:~\$\/]+\/wp-content\/uploads\/[0-9]{4}\/[0-9]{2}\/[a-z0-9\-_.+*#?&=%@!',;:~\$\/]+/mi
  # The pattern has no capture groups, so scan yields the matched strings.
  urls = text.scan(pattern).uniq
  return urls if wanted.empty?

  urls.select { |url| wanted.include?(file_ext(url)) }
end
# Downloads every distinct link into +dest+.
#
# A link that fails to download is reported on stderr and skipped, so one
# broken URL does not abort the rest of the batch.
#
# links - Array of URL Strings (duplicates are downloaded once).
# dest  - String directory handed on to #download.
#
# Returns the Integer number of links that were downloaded without error.
def download_links(links, dest)
  unique_links = links.uniq
  puts "Start downloading #{unique_links.length} from #{links.length} links."
  unique_links.count do |src|
    begin
      download(src, dest)
      true
    rescue StandardError => e
      warn "Failed to download #{src}: #{e.class}: #{e.message}"
      false
    end
  end
end
# Downloads one file and stores it in +dest+ under the URL's base name.
#
# url  - String address of the file. It is opened with URI.open; callers
#        should only pass http(s) URLs, because open-uri hands any string
#        that is not a URL on to Kernel#open.
# dest - String directory to save into; created with its parents if missing.
#
# Returns the String path of the file that was written.
# Raises whatever URI.open raises when the fetch fails
# (e.g. OpenURI::HTTPError, SocketError).
def download(url, dest)
  # Drop "?query" / "#fragment" so it does not end up in the local file name.
  file_name = File.basename(url.sub(/[?#].*\z/m, ""))
  dest_file = File.join(dest, file_name)
  puts "Download file from: #{url} to: #{dest_file}"
  FileUtils.mkdir_p(dest)
  # open-uri yields a StringIO for small bodies and a Tempfile for large ones;
  # copying the stream handles both and writes the bytes unchanged.
  URI.open(url) do |io|
    File.open(dest_file, "wb") { |out| IO.copy_stream(io, out) }
  end
  dest_file
end
# script start
puts filetypes.empty? ? "Download .* media types" : "Download #{filetypes.length} media types: #{filetypes}"

# media download list
downloads = []

# Process every regular file below the source directory, at any depth.
# File.join puts the separator before "**", so the pattern recurses and
# cannot match sibling directories that merely share the source's prefix.
i = 0
Dir.glob(File.join(source, "**", "*")) do |post|
  # The glob also yields directories; they can be neither read nor counted as posts.
  next unless File.file?(post)

  downloads.concat(process_wp_post(post, filetypes))
  i += 1
end

count = download_links(downloads, destination)
puts "#{i} posts processed and #{count} files downloaded into #{destination} dir"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment