Created
May 2, 2018 10:14
-
-
Save superfawkes/625ac72b17b859182f15b7bce4d93d9c to your computer and use it in GitHub Desktop.
Utility script to download BBC's non-commercial sound effects library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
""" BBC Sound Effects scraper script: Copyright 2018 Shrinivas Ramani

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

WARNING:
* Content licensed by BBC under https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
  PLEASE PLEASE READ THIS to understand what exactly is allowed with the contents obtained via the script.
  BBC, Acropolis and other parties continue to own copyrights to the website, its underlying technology and hosted content.
* The author is not responsible for any malicious use of the contents below (as-is or adapted) - the script is simply meant to aid judicious users in doing their work better.
* This script needs substantial network and disk resources. You and you alone are responsible for repercussions including (and not limited to) network fees, disk crashes, power failures, distress from listening to Ed Sheeran samples and other material, physical or mental distress.
"""
# Once again, please PLEASE read this before executing the script!
# https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
#
# v0.1: initial setup
# v0.2: acropolis doesn't like urllib2 or wget; settling for curl, single-threaded.
# v0.3: add incremental downloads support (some downloads failed)
# v0.4: support for additional path scans (when downloading to a different path) + fix wav file header size (44 bytes) from v0.3
# v0.5: SIGINT handler + proper presence checking
# v0.6: cleanup + os.path.join for slashes
import argparse
import os
import os.path
import signal
import subprocess
import sys
import urllib2
def file_exists_and_isvalid(p):
    """Return True if path `p` is an existing file large enough to contain
    at least a WAV header (44 bytes), so zero-byte or truncated downloads
    are treated as missing and get re-fetched.

    BUGFIX: the original compared the whole os.stat() result object to 44
    (`os.stat(p) >= 44`), which in Python 2 is always True for any mixed
    type comparison; the size lives in the st_size field.
    """
    return os.path.isfile(p) and os.stat(p).st_size >= 44  # wav file header size
def download_file_from_url(url, to_path, verbose):
    """Download the file at `url` into directory `to_path` via curl.

    urllib2 is too finicky and fails for larger files (whether written
    one-shot or buffered); the single-threaded curl approach proved more
    stable and scalable than wget against this host.

    `verbose` is accepted for interface symmetry with the rest of the
    script but is currently unused here.

    Returns True when the downloaded file exists and looks valid
    (per file_exists_and_isvalid).
    """
    file_name = url.split('/')[-1]
    dest = os.path.join(to_path, file_name)
    # Argument-list form avoids handing the URL/path to a shell, fixing the
    # quoting/injection hazard of the old os.system() string concatenation.
    subprocess.call(["curl", "-s", "-o", dest, url])
    return file_exists_and_isvalid(dest)
def main(arguments):
    """Parse CLI arguments, fetch the BBC sound-effects CSV index, then
    download every file named in its first column.

    Unless --force is given, files already present (and at least WAV-header
    sized) in target_path — or in the optional --add_path directory — are
    skipped, which makes interrupted runs resumable.

    Exits with status 1 if any download failed, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description='BBC sound effects multi-download script')
    parser.add_argument('target_path', help='Where the downloaded files go')
    parser.add_argument('-f', '--force', action='store_true', default=False, help='Always download and overwrite existing files (default: Skip download if file already present)')
    parser.add_argument('--add_path', default="", help='Additional path to check file presence for (ignored if --force is used)')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help="Verbose - useful for tracking status")
    args = parser.parse_args(arguments)
    base_url = 'http://bbcsfx.acropolis.org.uk/assets/'
    # Fetch the master index first; the loop below is driven by it.
    file_list = 'BBCSoundEffects.csv'
    download_file_from_url(base_url + file_list, args.target_path, args.verbose)
    failures = 0
    num = 0
    skipped = 0
    # BUGFIX (consistency): build the CSV path with os.path.join like the
    # rest of the script, instead of manual '/' concatenation.
    with open(os.path.join(args.target_path, file_list), 'rb') as csvfile:
        for line in csvfile.readlines():
            # Column 1 of the CSV holds the file name; strip any quoting.
            array = line.split(',')
            file_name = array[0].replace('"', '').replace("'", '')
            # Helpful during incremental downloads... note: the validity
            # check also catches zero-byte download failures.
            if not args.force:
                file_path = os.path.join(args.target_path, file_name)
                if args.verbose:
                    print("Checking " + file_path)
                if file_exists_and_isvalid(file_path):
                    skipped += 1
                    if args.verbose:
                        print("Skipping " + file_path)
                    continue
                # Optionally honour a second directory of earlier downloads.
                if args.add_path:
                    file_path = os.path.join(args.add_path, file_name)
                    if args.verbose:
                        print("Checking " + file_path)
                    if file_exists_and_isvalid(file_path):
                        skipped += 1
                        if args.verbose:
                            print("Skipping " + file_path)
                        continue
            num += 1
            if args.verbose:
                print("Downloading " + file_name)
            if not download_file_from_url(base_url + file_name, args.target_path, args.verbose):
                print("Failed to download " + file_name)
                failures += 1
            else:
                if args.verbose:
                    print("Downloaded #" + str(num) + ":\t" + file_name)
            if not args.verbose:
                # Rolling one-line status. NOTE(review): print appends '\n',
                # so the trailing '\r' does not actually overwrite the line.
                print("Downloaded #" + str(num) + " with " + str(failures) + ". Skipped: " + str(skipped) + "\r")
    if failures:
        print("Failed to download " + str(failures) + " files out of " + str(num) + " (Skipped " + str(skipped) + " existing)")
        sys.exit(1)
    else:
        print("Downloaded " + str(num) + " successfully (Skipped " + str(skipped) + " existing)")
        sys.exit(0)
def handle_sigint(signum, frame):
    """SIGINT handler: acknowledge Ctrl-C and exit cleanly with status 0.

    FIX: the first parameter was named `signal`, shadowing the imported
    `signal` module inside the handler; renamed to the conventional
    `signum`. Handlers are invoked positionally, so this is
    backward-compatible.
    """
    print("Got SIGINT from OS. Wrapping up...")
    sys.exit(0)
# Script entry point: register the Ctrl-C handler before kicking off the
# (potentially very long) download run, then hand control to main() with
# the command-line arguments.
if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_sigint)
    sys.exit(main(sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment