# Seleucia/download.py

## download.py
# ------------------------------------------------------------------------------
# Adapted from https://github.com/activitynet/ActivityNet/
# Original licence: Copyright (c) Microsoft, under the MIT License.
# ------------------------------------------------------------------------------
import argparse
import glob
import json
import os
import shutil
import ssl
import subprocess
import uuid
from collections import OrderedDict
import pandas as pd
from joblib import Parallel, delayed
from multiprocessing.dummy import Pool as ThreadPool  ### this uses threads


ssl._create_default_https_context = ssl._create_unverified_context


def create_video_folders(dataset, output_dir, tmp_dir):
    """Create the output and scratch directories used by the downloader.

    arguments:
    ---------
    dataset: DataFrame
        Parsed annotations. When it has a 'label-name' column, one
        sub-directory of ``output_dir`` is created per unique label.
    output_dir: str
        Root directory where trimmed clips will be stored.
    tmp_dir: str
        Scratch directory for untrimmed downloads.

    returns:
    -------
    str or dict
        Without a 'label-name' column: the path of the single
        '<output_dir>/test' directory. Otherwise a dict mapping each label
        name to its directory.
    """
    # Create the scratch directory on every path: the unlabeled branch used
    # to return before it existed. exist_ok avoids the check-then-create race.
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    if 'label-name' not in dataset.columns:
        test_dir = os.path.join(output_dir, 'test')
        os.makedirs(test_dir, exist_ok=True)
        return test_dir

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        label_dir = os.path.join(output_dir, label_name)
        os.makedirs(label_dir, exist_ok=True)
        label_to_dir[label_name] = label_dir
    return label_to_dir


def construct_video_filename(row, label_to_dir, trim_format='%06d'):
    """Build the destination path of the trimmed clip described by ``row``.

    The file is named '<video-id>_<start>_<end>.mp4', with both times
    rendered through ``trim_format``. ``label_to_dir`` is either a dict keyed
    by label name or a single directory that is used as-is for every row.
    """
    video_id = row['video-id']
    start_tag = trim_format % row['start-time']
    end_tag = trim_format % row['end-time']
    clip_name = '%s_%s_%s.mp4' % (video_id, start_tag, end_tag)

    if isinstance(label_to_dir, dict):
        target_dir = label_to_dir[row['label-name']]
    else:
        target_dir = label_to_dir
    return os.path.join(target_dir, clip_name)


def _decode_tool_output(output):
    """Return captured subprocess output as text (empty string for None)."""
    if output is None:
        return ''
    if isinstance(output, bytes):
        return output.decode('utf-8', errors='replace')
    return str(output)


def _remove_tmp_files(tmp_stem):
    """Delete every scratch file named '<tmp_stem>.<anything>'."""
    for path in glob.glob(glob.escape(tmp_stem) + '.*'):
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a leftover scratch file must not turn a
            # finished download into a failure.
            pass


def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/kinetics',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from youtube if exists and is not blocked.
    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters)
    output_filename: str
        File path where the video will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    tmp_dir: str
        Directory that receives the untrimmed download.
    num_attempts: int
        Maximum number of youtube-dl invocations (at least one is made).
    url_base: str
        Prefix that is concatenated with ``video_identifier``.
    returns:
    -------
    (status, log): (bool, str)
        ``status`` tells whether ``output_filename`` exists afterwards.
        ``log`` is 'Downloaded' on the normal path, otherwise the text
        output of the tool that failed.
    """
    # Defensive argument checking (kept as asserts so callers see the same
    # exception type as before).
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    if os.path.exists(output_filename):
        return True, 'Downloaded'

    # The uuid stem contains no dot, so '<stem>.*' matches exactly the files
    # youtube-dl writes for this call even when tmp_dir itself has dots.
    tmp_stem = os.path.join(tmp_dir, str(uuid.uuid4()))
    download_cmd = [
        'youtube-dl', '--quiet', '--no-warnings',
        '--no-check-certificate', '-f', 'mp4', '-o',
        tmp_stem + '.%(ext)s',
        url_base + video_identifier,
    ]
    print(' '.join(download_cmd))

    # Argument lists (no shell) keep ids and paths from being interpreted by
    # a shell.
    log = ''
    for _ in range(max(1, num_attempts)):
        try:
            subprocess.check_output(download_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            log = _decode_tool_output(err.output)
        except OSError as err:
            # The executable could not be started; retrying cannot help.
            return False, str(err)
        else:
            break
    else:
        _remove_tmp_files(tmp_stem)
        return False, log

    try:
        candidates = glob.glob(glob.escape(tmp_stem) + '.*')
        if not candidates:
            return False, 'youtube-dl produced no file for %s' % video_identifier
        # Construct command to trim the videos (ffmpeg required).
        trim_cmd = [
            'ffmpeg', '-i', candidates[0],
            '-ss', str(start_time),
            '-t', str(end_time - start_time),
            '-c:v', 'libx264', '-c:a', 'copy',
            '-threads', '1', '-loglevel', 'panic',
            output_filename,
        ]
        try:
            subprocess.check_output(trim_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            return False, _decode_tool_output(err.output)
        except OSError as err:
            return False, str(err)
    finally:
        # Remove the untrimmed download on success and on failure alike.
        _remove_tmp_files(tmp_stem)

    # Check if the video was successfully saved.
    return os.path.exists(output_filename), 'Downloaded'


def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
    """Download the clip described by ``row`` and report the outcome.

    Returns a (clip_id, succeeded, log) tuple, where clip_id is the part of
    the output file name that precedes '.mp4'. An output file that is already
    present is reported as (clip_id, True, 'Exists') without downloading.
    """
    target = construct_video_filename(row, label_to_dir, trim_format)
    clip_id = os.path.basename(target).split('.mp4')[0]

    if not os.path.exists(target):
        succeeded, message = download_clip(
            row['video-id'],
            target,
            row['start-time'],
            row['end-time'],
            tmp_dir=tmp_dir)
        return (clip_id, succeeded, message)
    return (clip_id, True, 'Exists')


def download_clip_wrapper_pool(row):
    """Single-argument form of download_clip_wrapper for use with pool.map.

    The label directories, trim format and scratch directory are read from
    the module-level ``label_to_dir``, ``trim_format`` and ``tmp_dir``.
    """
    return download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)


def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
    """Load an annotation CSV and normalise its column names.

    arguments:
    ---------
    input_csv: str
        Path to the CSV file. When it has a 'youtube_id' column, the columns
        'youtube_id', 'time_start', 'time_end' and 'label' are renamed.
    ignore_is_cc: bool
        When True (and the file has a 'youtube_id' column), the last column
        of the frame is dropped.
    returns:
    -------
    dataset: DataFrame
        Frame using the names 'video-id', 'start-time', 'end-time' and
        'label-name' for the renamed columns; files without 'youtube_id'
        are returned exactly as read.
    """
    annotations = pd.read_csv(input_csv)
    if 'youtube_id' not in annotations.columns:
        return annotations

    renamed = annotations.rename(columns={
        'youtube_id': 'video-id',
        'time_start': 'start-time',
        'time_end': 'end-time',
        'label': 'label-name',
    })
    if ignore_is_cc:
        kept = list(renamed.columns)[:-1]
        renamed = renamed.loc[:, kept]
    return renamed


# Module-level settings read by main() and download_clip_wrapper_pool().
# label_to_dir and tmp_dir start as empty placeholders; the __main__ block
# assigns their real values before calling main().
label_to_dir = ''
tmp_dir = ''
trim_format = '%06d'
def main(num_jobs):
    """Download every clip listed in the module-level ``dataset``.

    Reads the module-level ``dataset``, ``label_to_dir``, ``trim_format`` and
    ``tmp_dir``. Removes ``tmp_dir`` when finished and, if any clip was
    processed, writes the per-clip statuses to 'download_report.json' in the
    current working directory.

    arguments:
    ---------
    num_jobs: int
        1 downloads sequentially; any other value uses a thread pool with
        that many workers.
    """
    rows = [row for _, row in dataset.iterrows()]

    # Download all clips.
    if num_jobs == 1:
        status_list = [
            download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)
            for row in rows
        ]
    else:
        pool = ThreadPool(num_jobs)
        try:
            status_list = pool.map(download_clip_wrapper_pool, rows)
        finally:
            # Release the worker threads even when a download raises.
            pool.close()
            pool.join()

    # Clean tmp dir; it may be missing, which is not an error here.
    shutil.rmtree(tmp_dir, ignore_errors=True)

    # Failure logs can be raw subprocess bytes, which json cannot serialise.
    status_list = [
        (clip_id,
         ok,
         log.decode('utf-8', errors='replace') if isinstance(log, bytes) else log)
        for clip_id, ok, log in status_list
    ]

    # Save download report.
    if status_list:
        with open('download_report.json', 'w') as fobj:
            json.dump(status_list, fobj)

    banner = '*************************************************************************************************************************************'
    completed = sum(1 for status in status_list if status[1])
    print(banner)
    print('Completed Number videos: {0}; Total videos: {1}'.format(
        completed, len(dataset)))
    print(banner)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, publish them as the
    # module-level settings that main() reads, then start the download.
    description = 'Helper script for downloading and trimming kinetics videos.'
    p = argparse.ArgumentParser(description=description)
    # Positional arguments are always required, so a default would never be
    # used and is not declared.
    p.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'YouTube Identifier,Start time,End time,Class label'))
    p.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    p.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        # argparse %-formats help strings, so literal '%' must be doubled.
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4'))
    p.add_argument('-n', '--num-jobs', type=int, default=25)
    p.add_argument('-t', '--tmp-dir', type=str, default='/mnt/3tb/ds/kinetics/kinetics400/tmp')
    args = p.parse_args()
    input_csv = args.input_csv
    output_dir = args.output_dir
    tmp_dir = args.tmp_dir
    num_jobs = args.num_jobs
    # Apply --trim-format; it used to be parsed and then ignored.
    trim_format = args.trim_format

    dataset = parse_kinetics_annotations(input_csv)

    # Creates folders where videos will be saved later.
    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)
    main(num_jobs=num_jobs)
	# ------------------------------------------------------------------------------
	# Adapted from https://github.com/activitynet/ActivityNet/
	# Original licence: Copyright (c) Microsoft, under the MIT License.
	# ------------------------------------------------------------------------------
	import argparse
	import glob
	import json
	import os
	import shutil
	import ssl
	import subprocess
	import uuid
	from collections import OrderedDict
	import pandas as pd
	from joblib import Parallel, delayed
	from multiprocessing.dummy import Pool as ThreadPool ### this uses threads


	ssl._create_default_https_context = ssl._create_unverified_context


	def create_video_folders(dataset, output_dir, tmp_dir):
	    """Create the output and scratch directories used by the downloader.

	    arguments:
	    ---------
	    dataset: DataFrame
	        Parsed annotations. When it has a 'label-name' column, one
	        sub-directory of ``output_dir`` is created per unique label.
	    output_dir: str
	        Root directory where trimmed clips will be stored.
	    tmp_dir: str
	        Scratch directory for untrimmed downloads.

	    returns:
	    -------
	    str or dict
	        Without a 'label-name' column: the path of the single
	        '<output_dir>/test' directory. Otherwise a dict mapping each label
	        name to its directory.
	    """
	    # Create the scratch directory on every path, including the unlabeled
	    # one. exist_ok avoids the check-then-create race.
	    os.makedirs(tmp_dir, exist_ok=True)
	    os.makedirs(output_dir, exist_ok=True)

	    if 'label-name' not in dataset.columns:
	        test_dir = os.path.join(output_dir, 'test')
	        os.makedirs(test_dir, exist_ok=True)
	        return test_dir

	    label_to_dir = {}
	    for label_name in dataset['label-name'].unique():
	        label_dir = os.path.join(output_dir, label_name)
	        os.makedirs(label_dir, exist_ok=True)
	        label_to_dir[label_name] = label_dir
	    return label_to_dir


	def construct_video_filename(row, label_to_dir, trim_format='%06d'):
	    """Build the destination path of the trimmed clip described by ``row``.

	    The file is named '<video-id>_<start>_<end>.mp4', with both times
	    rendered through ``trim_format``. ``label_to_dir`` is either a dict
	    keyed by label name or a single directory used as-is for every row.
	    """
	    video_id = row['video-id']
	    start_tag = trim_format % row['start-time']
	    end_tag = trim_format % row['end-time']
	    clip_name = '%s_%s_%s.mp4' % (video_id, start_tag, end_tag)

	    if isinstance(label_to_dir, dict):
	        target_dir = label_to_dir[row['label-name']]
	    else:
	        target_dir = label_to_dir
	    return os.path.join(target_dir, clip_name)


	def _decode_tool_output(output):
	    """Return captured subprocess output as text (empty string for None)."""
	    if output is None:
	        return ''
	    if isinstance(output, bytes):
	        return output.decode('utf-8', errors='replace')
	    return str(output)


	def _remove_tmp_files(tmp_stem):
	    """Delete every scratch file named '<tmp_stem>.<anything>'."""
	    for path in glob.glob(glob.escape(tmp_stem) + '.*'):
	        try:
	            os.remove(path)
	        except OSError:
	            # Best-effort cleanup: a leftover scratch file must not turn
	            # a finished download into a failure.
	            pass


	def download_clip(video_identifier,
	                  output_filename,
	                  start_time,
	                  end_time,
	                  tmp_dir='/tmp/kinetics',
	                  num_attempts=5,
	                  url_base='https://www.youtube.com/watch?v='):
	    """Download a video from youtube if exists and is not blocked.
	    arguments:
	    ---------
	    video_identifier: str
	        Unique YouTube video identifier (11 characters)
	    output_filename: str
	        File path where the video will be stored.
	    start_time: float
	        Indicates the beginning time in seconds from where the video
	        will be trimmed.
	    end_time: float
	        Indicates the ending time in seconds of the trimmed video.
	    tmp_dir: str
	        Directory that receives the untrimmed download.
	    num_attempts: int
	        Maximum number of youtube-dl invocations (at least one is made).
	    url_base: str
	        Prefix that is concatenated with ``video_identifier``.
	    returns:
	    -------
	    (status, log): (bool, str)
	        ``status`` tells whether ``output_filename`` exists afterwards.
	        ``log`` is 'Downloaded' on the normal path, otherwise the text
	        output of the tool that failed.
	    """
	    # Defensive argument checking.
	    assert isinstance(video_identifier, str), 'video_identifier must be string'
	    assert isinstance(output_filename, str), 'output_filename must be string'
	    assert len(video_identifier) == 11, 'video_identifier must have length 11'

	    if os.path.exists(output_filename):
	        return True, 'Downloaded'

	    # The uuid stem contains no dot, so '<stem>.*' matches exactly the
	    # files youtube-dl writes for this call.
	    tmp_stem = os.path.join(tmp_dir, str(uuid.uuid4()))
	    download_cmd = [
	        'youtube-dl', '--quiet', '--no-warnings',
	        '--no-check-certificate', '-f', 'mp4', '-o',
	        tmp_stem + '.%(ext)s',
	        url_base + video_identifier,
	    ]
	    print(' '.join(download_cmd))

	    # Argument lists (no shell) keep ids and paths from being interpreted
	    # by a shell.
	    log = ''
	    for _ in range(max(1, num_attempts)):
	        try:
	            subprocess.check_output(download_cmd, stderr=subprocess.STDOUT)
	        except subprocess.CalledProcessError as err:
	            log = _decode_tool_output(err.output)
	        except OSError as err:
	            # The executable could not be started; retrying cannot help.
	            return False, str(err)
	        else:
	            break
	    else:
	        _remove_tmp_files(tmp_stem)
	        return False, log

	    try:
	        candidates = glob.glob(glob.escape(tmp_stem) + '.*')
	        if not candidates:
	            return False, 'youtube-dl produced no file for %s' % video_identifier
	        # Construct command to trim the videos (ffmpeg required).
	        trim_cmd = [
	            'ffmpeg', '-i', candidates[0],
	            '-ss', str(start_time),
	            '-t', str(end_time - start_time),
	            '-c:v', 'libx264', '-c:a', 'copy',
	            '-threads', '1', '-loglevel', 'panic',
	            output_filename,
	        ]
	        try:
	            subprocess.check_output(trim_cmd, stderr=subprocess.STDOUT)
	        except subprocess.CalledProcessError as err:
	            return False, _decode_tool_output(err.output)
	        except OSError as err:
	            return False, str(err)
	    finally:
	        # Remove the untrimmed download on success and on failure alike.
	        _remove_tmp_files(tmp_stem)

	    # Check if the video was successfully saved.
	    return os.path.exists(output_filename), 'Downloaded'


	def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
	    """Download the clip described by ``row`` and report the outcome.

	    Returns a (clip_id, succeeded, log) tuple, where clip_id is the part
	    of the output file name that precedes '.mp4'. An output file that is
	    already present is reported as (clip_id, True, 'Exists') without
	    downloading.
	    """
	    target = construct_video_filename(row, label_to_dir, trim_format)
	    clip_id = os.path.basename(target).split('.mp4')[0]

	    if os.path.exists(target):
	        return (clip_id, True, 'Exists')

	    succeeded, message = download_clip(
	        row['video-id'],
	        target,
	        row['start-time'],
	        row['end-time'],
	        tmp_dir=tmp_dir)
	    return (clip_id, succeeded, message)


	def download_clip_wrapper_pool(row):
	    """Single-argument form of download_clip_wrapper for use with pool.map.

	    The label directories, trim format and scratch directory are read
	    from the module-level ``label_to_dir``, ``trim_format`` and
	    ``tmp_dir``.
	    """
	    return download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)


	def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
	    """Load an annotation CSV and normalise its column names.

	    arguments:
	    ---------
	    input_csv: str
	        Path to the CSV file. When it has a 'youtube_id' column, the
	        columns 'youtube_id', 'time_start', 'time_end' and 'label' are
	        renamed.
	    ignore_is_cc: bool
	        When True (and the file has a 'youtube_id' column), the last
	        column of the frame is dropped.
	    returns:
	    -------
	    dataset: DataFrame
	        Frame using the names 'video-id', 'start-time', 'end-time' and
	        'label-name' for the renamed columns; files without 'youtube_id'
	        are returned exactly as read.
	    """
	    annotations = pd.read_csv(input_csv)
	    if 'youtube_id' not in annotations.columns:
	        return annotations

	    renamed = annotations.rename(columns={
	        'youtube_id': 'video-id',
	        'time_start': 'start-time',
	        'time_end': 'end-time',
	        'label': 'label-name',
	    })
	    if ignore_is_cc:
	        kept = list(renamed.columns)[:-1]
	        renamed = renamed.loc[:, kept]
	    return renamed



	# Module-level settings read by main() and download_clip_wrapper_pool().
	# label_to_dir and tmp_dir start as empty placeholders; the __main__ block
	# assigns their real values before calling main().
	label_to_dir = ''
	tmp_dir = ''
	trim_format = '%06d'
	def main(num_jobs):
	    """Download every clip listed in the module-level ``dataset``.

	    Reads the module-level ``dataset``, ``label_to_dir``, ``trim_format``
	    and ``tmp_dir``. Removes ``tmp_dir`` when finished and, if any clip
	    was processed, writes the per-clip statuses to 'download_report.json'
	    in the current working directory.

	    arguments:
	    ---------
	    num_jobs: int
	        1 downloads sequentially; any other value uses a thread pool with
	        that many workers.
	    """
	    rows = [row for _, row in dataset.iterrows()]

	    # Download all clips.
	    if num_jobs == 1:
	        status_list = [
	            download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)
	            for row in rows
	        ]
	    else:
	        pool = ThreadPool(num_jobs)
	        try:
	            status_list = pool.map(download_clip_wrapper_pool, rows)
	        finally:
	            # Release the worker threads even when a download raises.
	            pool.close()
	            pool.join()

	    # Clean tmp dir; it may be missing, which is not an error here.
	    shutil.rmtree(tmp_dir, ignore_errors=True)

	    # Failure logs can be raw subprocess bytes, which json cannot
	    # serialise.
	    status_list = [
	        (clip_id,
	         ok,
	         log.decode('utf-8', errors='replace') if isinstance(log, bytes) else log)
	        for clip_id, ok, log in status_list
	    ]

	    # Save download report.
	    if status_list:
	        with open('download_report.json', 'w') as fobj:
	            json.dump(status_list, fobj)

	    banner = '*************************************************************************************************************************************'
	    completed = sum(1 for status in status_list if status[1])
	    print(banner)
	    print('Completed Number videos: {0}; Total videos: {1}'.format(
	        completed, len(dataset)))
	    print(banner)
	if __name__ == '__main__':
	    # Command-line entry point: parse the arguments, publish them as the
	    # module-level settings that main() reads, then start the download.
	    description = 'Helper script for downloading and trimming kinetics videos.'
	    p = argparse.ArgumentParser(description=description)
	    # Positional arguments are always required, so a default would never
	    # be used and is not declared.
	    p.add_argument(
	        'input_csv',
	        type=str,
	        help=('CSV file containing the following format: '
	              'YouTube Identifier,Start time,End time,Class label'))
	    p.add_argument(
	        'output_dir',
	        type=str,
	        help='Output directory where videos will be saved.')
	    p.add_argument(
	        '-f',
	        '--trim-format',
	        type=str,
	        default='%06d',
	        # argparse %-formats help strings, so literal '%' must be doubled.
	        help=('This will be the format for the '
	              'filename of trimmed videos: '
	              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4'))
	    p.add_argument('-n', '--num-jobs', type=int, default=25)
	    p.add_argument('-t', '--tmp-dir', type=str, default='/mnt/3tb/ds/kinetics/kinetics400/tmp')
	    args = p.parse_args()
	    input_csv = args.input_csv
	    output_dir = args.output_dir
	    tmp_dir = args.tmp_dir
	    num_jobs = args.num_jobs
	    # Apply --trim-format; it used to be parsed and then ignored.
	    trim_format = args.trim_format

	    dataset = parse_kinetics_annotations(input_csv)

	    # Creates folders where videos will be saved later.
	    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)
	    main(num_jobs=num_jobs)