Created
May 2, 2018 10:14
-
-
Save superfawkes/625ac72b17b859182f15b7bce4d93d9c to your computer and use it in GitHub Desktop.
Utility script to download BBC's non-commercial sound effects library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
""" BBC Sound Effects scraper script: Copyright 2018 Shrinivas Ramani

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

WARNING:
* Content licensed by BBC under https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
  PLEASE PLEASE READ THIS to understand what exactly is allowed with the contents obtained via the script.
  BBC, Acropolis and other parties continue to own copyrights to the website, its underlying technology and hosted content.
* The author is not responsible for any malicious use of the contents below (as-is or adapted) - the script is simply meant to aid judicious users in doing their work better.
* This script needs substantial network and disk resources. You and you alone are responsible for repercussions including (and not limited to) network fees, disk crashes, power failures, distress from listening to Ed Sheeran samples and other material, physical or mental distress.
"""
# Once again, please PLEASE read this before executing the script!
# https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
#
# v0.1: initial setup
# v0.2: acropolis doesn't like urllib2 or wget; settling for curl, single-threaded.
# v0.3: add incremental downloads support (some downloads failed)
# v0.4: support for additional path scans (when downloading to a different path) + fix wav file header size (44 bytes) from v0.3
# v0.5: SIGINT handler + proper presence checking
# v0.6: cleanup + os.path.join for slashes
import argparse
import os
import os.path
import signal
import subprocess
import sys
import urllib2
def file_exists_and_isvalid(p):
    """Return True if path `p` is an existing file large enough to contain
    at least a WAV header (44 bytes), so zero-byte or truncated downloads
    are treated as missing and get re-fetched.

    BUGFIX: the original compared the whole os.stat() result object to 44
    (`os.stat(p) >= 44`), which in Python 2 is always True for any mixed
    type comparison; the size lives in the st_size field.
    """
    return os.path.isfile(p) and os.stat(p).st_size >= 44  # wav file header size
def download_file_from_url(url, to_path, verbose):
    """Download the file at `url` into directory `to_path` via curl.

    urllib2 is too finicky and fails for larger files (whether written
    one-shot or buffered); the single-threaded curl approach proved more
    stable and scalable than wget against this host.

    `verbose` is accepted for interface symmetry with the rest of the
    script but is currently unused here.

    Returns True when the downloaded file exists and looks valid
    (per file_exists_and_isvalid).
    """
    file_name = url.split('/')[-1]
    dest = os.path.join(to_path, file_name)
    # Argument-list form avoids handing the URL/path to a shell, fixing the
    # quoting/injection hazard of the old os.system() string concatenation.
    subprocess.call(["curl", "-s", "-o", dest, url])
    return file_exists_and_isvalid(dest)
def main(arguments):
    """Parse CLI arguments, fetch the BBC sound-effects CSV index, then
    download every file named in its first column.

    Unless --force is given, files already present (and at least WAV-header
    sized) in target_path — or in the optional --add_path directory — are
    skipped, which makes interrupted runs resumable.

    Exits with status 1 if any download failed, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description='BBC sound effects multi-download script')
    parser.add_argument('target_path', help='Where the downloaded files go')
    parser.add_argument('-f', '--force', action='store_true', default=False, help='Always download and overwrite existing files (default: Skip download if file already present)')
    parser.add_argument('--add_path', default="", help='Additional path to check file presence for (ignored if --force is used)')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help="Verbose - useful for tracking status")
    args = parser.parse_args(arguments)
    base_url = 'http://bbcsfx.acropolis.org.uk/assets/'
    # Fetch the master index first; the loop below is driven by it.
    file_list = 'BBCSoundEffects.csv'
    download_file_from_url(base_url + file_list, args.target_path, args.verbose)
    failures = 0
    num = 0
    skipped = 0
    # BUGFIX (consistency): build the CSV path with os.path.join like the
    # rest of the script, instead of manual '/' concatenation.
    with open(os.path.join(args.target_path, file_list), 'rb') as csvfile:
        for line in csvfile.readlines():
            # Column 1 of the CSV holds the file name; strip any quoting.
            array = line.split(',')
            file_name = array[0].replace('"', '').replace("'", '')
            # Helpful during incremental downloads... note: the validity
            # check also catches zero-byte download failures.
            if not args.force:
                file_path = os.path.join(args.target_path, file_name)
                if args.verbose:
                    print("Checking " + file_path)
                if file_exists_and_isvalid(file_path):
                    skipped += 1
                    if args.verbose:
                        print("Skipping " + file_path)
                    continue
                # Optionally honour a second directory of earlier downloads.
                if args.add_path:
                    file_path = os.path.join(args.add_path, file_name)
                    if args.verbose:
                        print("Checking " + file_path)
                    if file_exists_and_isvalid(file_path):
                        skipped += 1
                        if args.verbose:
                            print("Skipping " + file_path)
                        continue
            num += 1
            if args.verbose:
                print("Downloading " + file_name)
            if not download_file_from_url(base_url + file_name, args.target_path, args.verbose):
                print("Failed to download " + file_name)
                failures += 1
            else:
                if args.verbose:
                    print("Downloaded #" + str(num) + ":\t" + file_name)
            if not args.verbose:
                # Rolling one-line status. NOTE(review): print appends '\n',
                # so the trailing '\r' does not actually overwrite the line.
                print("Downloaded #" + str(num) + " with " + str(failures) + ". Skipped: " + str(skipped) + "\r")
    if failures:
        print("Failed to download " + str(failures) + " files out of " + str(num) + " (Skipped " + str(skipped) + " existing)")
        sys.exit(1)
    else:
        print("Downloaded " + str(num) + " successfully (Skipped " + str(skipped) + " existing)")
        sys.exit(0)
def handle_sigint(signum, frame):
    """SIGINT handler: acknowledge Ctrl-C and exit cleanly with status 0.

    FIX: the first parameter was named `signal`, shadowing the imported
    `signal` module inside the handler; renamed to the conventional
    `signum`. Handlers are invoked positionally, so this is
    backward-compatible.
    """
    print("Got SIGINT from OS. Wrapping up...")
    sys.exit(0)
# Script entry point: register the Ctrl-C handler before kicking off the
# (potentially very long) download run, then hand control to main() with
# the command-line arguments.
if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_sigint)
    sys.exit(main(sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment