# Coro365/reuters_video.rb

## reuters_video.rb
require 'open-uri'
require 'fileutils'

# Mixin that adds "natural" ordering to a collection of strings, so that
# embedded numbers compare by numeric value ("seg2" sorts before "seg10").
#
# The receiver must respond to +sort_by+ and yield String elements
# (e.g. an Array of file names).
module Natural_sort
  # Returns a new Array holding the receiver's elements in natural order.
  #
  # Every run of digits is zero-padded to the width of the longest digit run
  # found anywhere in the collection, and the padded strings are compared
  # lexicographically. Duplicate elements are all kept.
  #
  # @return [Array<String>] the elements in natural order
  def natural_sort
    width = max_num_length_of(self)
    sort_by do |name|
      # width is nil only when no element contains a digit, in which case
      # this gsub block is never invoked.
      name.gsub(/\d+/) { |digits| format("%0#{width}d", digits.to_i) }
    end
  end

  # Length of the longest run of digits found in any of +names+.
  #
  # @param names [Enumerable<String>] strings to inspect
  # @return [Integer, nil] the longest digit-run length, or nil when no name
  #   contains a digit
  def max_num_length_of(names)
    names.flat_map { |name| name.scan(/\d+/) }.map(&:length).max
  end
end

# Mix Natural_sort into Object (a top-level `include` targets Object), so the
# Array returned by Dir.glob in join_ts responds to #natural_sort.
include Natural_sort

# Concatenates every file in temp_ts/ (in natural order) into a single MP4
# named "<id>-<title>.mp4" inside mp4/, using ffmpeg's concat protocol with
# stream copy (no re-encoding). Both directories sit next to this script.
#
# @param m3u8 [Hash] must provide :id and :title, used to build the file name
# @return [Boolean, nil] the result of Kernel#system: true on exit status 0,
#   false on a non-zero status, nil if ffmpeg could not be executed
def join_ts(m3u8)
  segment_glob = File.join(__dir__, 'temp_ts', '*')
  concat_input = 'concat:' + Dir.glob(segment_glob).natural_sort.join('|')
  output_name = [m3u8[:id], m3u8[:title]].join('-') + '.mp4'
  output_path = File.join(__dir__, 'mp4', output_name)
  # -y overwrites an existing output file without prompting.
  system('/usr/local/bin/ffmpeg', '-y', '-i', concat_input, '-c', 'copy', output_path)
end

# Removes the temp_ts directory next to this script together with everything
# inside it. A missing directory is not an error (rm_rf ignores it).
def delete_temp_dir
  segment_dir = File.join(__dir__, 'temp_ts')
  FileUtils.rm_rf(segment_dir)
end

# Lists the ids of the videos already saved in the mp4/ directory next to
# this script. The id is the part of each file name before the first "-".
#
# @return [Array<String>] one id per entry found in mp4/ (empty when the
#   directory does not exist or is empty)
def check_downloaded
  saved_files = Dir.glob(File.join(__dir__, 'mp4', '*'))
  saved_files.map do |path|
    File.basename(path).split('-').first
  end
end

# Appends one line of the form "<current time>, <recode>" to downlod.log in
# the directory of this script.
#
# @param recode [String] text to record (the callers pass a video id)
# @return [Integer] number of bytes written
def add_log(recode)
  entry = "#{Time.now}, " + (recode + "\n")
  File.open(File.join(__dir__, 'downlod.log'), 'a') { |log| log.write(entry) }
end

# Fetches the Reuters Japan video listing page and extracts every embedded
# playlist reference from it.
#
# @return [Array<Array(String, String)>] unique [m3u8_url, title] pairs in
#   the order they appear in the page
def scrap_quality_m3u8_urls
  puts('Scrap quality m3u8')
  listing_page = URI.open('https://jp.reuters.com/video/')

  # Capture 1: playlist URL, capture 2: the share title that follows it.
  entry_pattern = %r["file":"(https://ajo.prod.reuters.tv/rest/v2/playlist/assets.*?\.m3u8)","share":{"title":"(.*?)","url"]
  listing_page.read.scan(entry_pattern).uniq
end

# Extracts the video id from a playlist URL: the first "/"-separated segment
# that contains six consecutive digits.
#
# @param url [String] URL to inspect
# @return [String, nil] the matching segment, or nil when there is none
def to_id(url)
  url.split('/').find { |segment| segment.match?(/\d{6}/) }
end

# Downloads +url+ and stores the response body on disk.
#
# Only http, https and ftp URLs are accepted. URI.open hands any other string
# to Kernel#open, which would read a local file or (for a leading "|", on
# Rubies that still support pipe-open) spawn a subprocess, and the URLs
# passed in here are read from a remote playlist.
#
# @param file_path [String, Array<String>] destination path, or path
#   components that are combined with File.join
# @param url [String] absolute URL of the resource to fetch
# @return [Integer] number of bytes written
# @raise [ArgumentError] if +url+ is not an http(s)/ftp URL
def downlod(file_path, url)
  unless url.to_s.match?(%r{\A(?:https?|ftp)://}i)
    raise ArgumentError, "refusing to download non-remote URL: #{url.inspect}"
  end

  # Block form closes the underlying IO (a Tempfile for large bodies) as soon
  # as it has been read instead of leaving it to the garbage collector.
  body = URI.open(url, &:read)
  # binwrite: the payload is binary and must not be newline-translated.
  File.binwrite(File.join(file_path), body)
end

# Returns the entries of an m3u8 playlist: every line that is neither blank
# nor a "#" tag/comment, with surrounding whitespace (including a trailing
# "\r" from CRLF line endings) removed.
def playlist_entries(playlist_text)
  playlist_text.each_line.map(&:strip).reject { |line| line.empty? || line.start_with?('#') }
end

# ---------------------------------------------------------------------------
# Script body: find the videos that still need downloading, fetch their
# segments and convert each video to MP4.
# ---------------------------------------------------------------------------

saved_video_ids = check_downloaded
# Skip videos that are already in mp4/ and videos whose title does not
# contain '字幕' (subtitles).
quality_m3u8_urls = scrap_quality_m3u8_urls.reject do |m3u8_url, title|
  saved_video_ids.include?(to_id(m3u8_url)) || !title.include?('字幕')
end

puts('Download quality m3u8')
x1080_m3u8s = quality_m3u8_urls.map do |m3u8_url, title|
  # Drop the full-width parenthesised "（…日）" part and map full-width
  # characters to ASCII so the title is usable as a file name.
  title = title.gsub(/（.*?日）/, '').tr('０-９ａ-ｚＡ-Ｚ＝！？「」　、', '0-9a-zA-Z=!?    ')
  # The last entry of the master playlist is the one that gets downloaded
  # (presumably the 1080p rendition, going by the variable name — confirm).
  variant_url = playlist_entries(URI.open(m3u8_url, &:read)).last
  { url: variant_url, id: to_id(m3u8_url), title: title }
end

puts("Download #{x1080_m3u8s.size}video")
x1080_m3u8s.each do |x1080_m3u8|
  add_log(x1080_m3u8[:id])

  # Start from an empty segment directory: segments left behind by an
  # aborted run would otherwise be concatenated into this video.
  delete_temp_dir
  FileUtils.mkdir_p(File.join(__dir__, 'temp_ts'))
  FileUtils.mkdir_p(File.join(__dir__, 'mp4'))

  puts("Download ts file (#{x1080_m3u8[:id]})")
  playlist_entries(URI.open(x1080_m3u8[:url], &:read)).each do |ts_url|
    downlod([__dir__, 'temp_ts', File.basename(ts_url)], ts_url)
  end

  puts('Convert mp4')
  # join_ts returns the result of Kernel#system; report a failed conversion
  # instead of silently moving on.
  warn("ffmpeg failed for #{x1080_m3u8[:id]}") unless join_ts(x1080_m3u8)
  sleep 2
  delete_temp_dir
end
	require 'open-uri'
	require 'fileutils'

	# Mixin that adds "natural" ordering to a collection of strings, so that
	# embedded numbers compare by numeric value ("seg2" sorts before "seg10").
	#
	# The receiver must respond to +sort_by+ and yield String elements
	# (e.g. an Array of file names).
	module Natural_sort
	  # Returns a new Array holding the receiver's elements in natural order.
	  #
	  # Every run of digits is zero-padded to the width of the longest digit run
	  # found anywhere in the collection, and the padded strings are compared
	  # lexicographically. Duplicate elements are all kept.
	  #
	  # @return [Array<String>] the elements in natural order
	  def natural_sort
	    width = max_num_length_of(self)
	    sort_by do |name|
	      # width is nil only when no element contains a digit, in which case
	      # this gsub block is never invoked.
	      name.gsub(/\d+/) { |digits| format("%0#{width}d", digits.to_i) }
	    end
	  end

	  # Length of the longest run of digits found in any of +names+.
	  #
	  # @param names [Enumerable<String>] strings to inspect
	  # @return [Integer, nil] the longest digit-run length, or nil when no name
	  #   contains a digit
	  def max_num_length_of(names)
	    names.flat_map { |name| name.scan(/\d+/) }.map(&:length).max
	  end
	end

	# Mix Natural_sort into Object (a top-level `include` targets Object), so the
	# Array returned by Dir.glob in join_ts responds to #natural_sort.
	include Natural_sort

	# Concatenates every file in temp_ts/ (in natural order) into a single MP4
	# named "<id>-<title>.mp4" inside mp4/, using ffmpeg's concat protocol with
	# stream copy (no re-encoding). Both directories sit next to this script.
	#
	# @param m3u8 [Hash] must provide :id and :title, used to build the file name
	# @return [Boolean, nil] the result of Kernel#system: true on exit status 0,
	#   false on a non-zero status, nil if ffmpeg could not be executed
	def join_ts(m3u8)
	  segment_glob = File.join(__dir__, 'temp_ts', '*')
	  # The concat protocol separates inputs with a bare "|".
	  concat_input = 'concat:' + Dir.glob(segment_glob).natural_sort.join('|')
	  output_name = [m3u8[:id], m3u8[:title]].join('-') + '.mp4'
	  output_path = File.join(__dir__, 'mp4', output_name)
	  # -y overwrites an existing output file without prompting.
	  system('/usr/local/bin/ffmpeg', '-y', '-i', concat_input, '-c', 'copy', output_path)
	end

	# Removes the temp_ts directory next to this script together with everything
	# inside it. A missing directory is not an error (rm_rf ignores it).
	def delete_temp_dir
	  segment_dir = File.join(__dir__, 'temp_ts')
	  FileUtils.rm_rf(segment_dir)
	end

	# Lists the ids of the videos already saved in the mp4/ directory next to
	# this script. The id is the part of each file name before the first "-".
	#
	# @return [Array<String>] one id per entry found in mp4/ (empty when the
	#   directory does not exist or is empty)
	def check_downloaded
	  saved_files = Dir.glob(File.join(__dir__, 'mp4', '*'))
	  saved_files.map do |path|
	    File.basename(path).split('-').first
	  end
	end

	# Appends one line of the form "<current time>, <recode>" to downlod.log in
	# the directory of this script.
	#
	# @param recode [String] text to record (the callers pass a video id)
	# @return [Integer] number of bytes written
	def add_log(recode)
	  entry = "#{Time.now}, " + (recode + "\n")
	  File.open(File.join(__dir__, 'downlod.log'), 'a') { |log| log.write(entry) }
	end

	# Fetches the Reuters Japan video listing page and extracts every embedded
	# playlist reference from it.
	#
	# @return [Array<Array(String, String)>] unique [m3u8_url, title] pairs in
	#   the order they appear in the page
	def scrap_quality_m3u8_urls
	  puts('Scrap quality m3u8')
	  video_html = URI.open('https://jp.reuters.com/video/', &:read)

	  # Capture 1: playlist URL, capture 2: the share title that follows it.
	  # Both wildcards are lazy (".*?") so each match stops at the first
	  # terminator instead of spanning several entries.
	  reg = %r["file":"(https://ajo.prod.reuters.tv/rest/v2/playlist/assets.*?\.m3u8)","share":{"title":"(.*?)","url"]
	  video_html.scan(reg).uniq
	end

	# Extracts the video id from a playlist URL: the first "/"-separated segment
	# that contains six consecutive digits.
	#
	# @param url [String] URL to inspect
	# @return [String, nil] the matching segment, or nil when there is none
	def to_id(url)
	  url.split('/').find { |segment| segment.match?(/\d{6}/) }
	end

	# Downloads +url+ and stores the response body on disk.
	#
	# Only http, https and ftp URLs are accepted. URI.open hands any other string
	# to Kernel#open, which would read a local file or (for a leading "|", on
	# Rubies that still support pipe-open) spawn a subprocess, and the URLs
	# passed in here are read from a remote playlist.
	#
	# @param file_path [String, Array<String>] destination path, or path
	#   components that are combined with File.join
	# @param url [String] absolute URL of the resource to fetch
	# @return [Integer] number of bytes written
	# @raise [ArgumentError] if +url+ is not an http(s)/ftp URL
	def downlod(file_path, url)
	  unless url.to_s.match?(%r{\A(?:https?|ftp)://}i)
	    raise ArgumentError, "refusing to download non-remote URL: #{url.inspect}"
	  end

	  # Block form closes the underlying IO (a Tempfile for large bodies) as soon
	  # as it has been read instead of leaving it to the garbage collector.
	  body = URI.open(url, &:read)
	  # binwrite: the payload is binary and must not be newline-translated.
	  File.binwrite(File.join(file_path), body)
	end

	# Returns the entries of an m3u8 playlist: every line that is neither blank
	# nor a "#" tag/comment, with surrounding whitespace (including a trailing
	# "\r" from CRLF line endings) removed.
	def playlist_entries(playlist_text)
	  playlist_text.each_line.map(&:strip).reject { |line| line.empty? || line.start_with?('#') }
	end

	# ---------------------------------------------------------------------------
	# Script body: find the videos that still need downloading, fetch their
	# segments and convert each video to MP4.
	# ---------------------------------------------------------------------------

	saved_video_ids = check_downloaded
	# Skip videos that are already in mp4/ and videos whose title does not
	# contain '字幕' (subtitles).
	quality_m3u8_urls = scrap_quality_m3u8_urls.reject do |m3u8_url, title|
	  saved_video_ids.include?(to_id(m3u8_url)) || !title.include?('字幕')
	end

	puts('Download quality m3u8')
	x1080_m3u8s = quality_m3u8_urls.map do |m3u8_url, title|
	  # Drop the full-width parenthesised "（…日）" part and map full-width
	  # characters to ASCII so the title is usable as a file name. String#tr pads
	  # the shorter replacement set with its last character (a space).
	  title = title.gsub(/（.*?日）/, '').tr('０-９ａ-ｚＡ-Ｚ＝！？「」　、', '0-9a-zA-Z=!? ')
	  # The last entry of the master playlist is the one that gets downloaded
	  # (presumably the 1080p rendition, going by the variable name — confirm).
	  variant_url = playlist_entries(URI.open(m3u8_url, &:read)).last
	  { url: variant_url, id: to_id(m3u8_url), title: title }
	end

	puts("Download #{x1080_m3u8s.size}video")
	x1080_m3u8s.each do |x1080_m3u8|
	  add_log(x1080_m3u8[:id])

	  # Start from an empty segment directory: segments left behind by an
	  # aborted run would otherwise be concatenated into this video.
	  delete_temp_dir
	  FileUtils.mkdir_p(File.join(__dir__, 'temp_ts'))
	  FileUtils.mkdir_p(File.join(__dir__, 'mp4'))

	  puts("Download ts file (#{x1080_m3u8[:id]})")
	  playlist_entries(URI.open(x1080_m3u8[:url], &:read)).each do |ts_url|
	    downlod([__dir__, 'temp_ts', File.basename(ts_url)], ts_url)
	  end

	  puts('Convert mp4')
	  # join_ts returns the result of Kernel#system; report a failed conversion
	  # instead of silently moving on.
	  warn("ffmpeg failed for #{x1080_m3u8[:id]}") unless join_ts(x1080_m3u8)
	  sleep 2
	  delete_temp_dir
	end