Skip to content

Instantly share code, notes, and snippets.

@superfawkes
Created May 2, 2018 10:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save superfawkes/625ac72b17b859182f15b7bce4d93d9c to your computer and use it in GitHub Desktop.
Save superfawkes/625ac72b17b859182f15b7bce4d93d9c to your computer and use it in GitHub Desktop.
Utility script to download BBC's non-commercial sound effects library
#!/usr/bin/env python
""" BBC Sound Effects scraper script: Copyright 2018 Shrinivas Ramani
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
WARNING:
* Content licensed by BBC under https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
PLEASE PLEASE READ THIS to understand what exactly is allowed with the contents obtained via the script.
BBC, Acropolis and other parties continue to own copyrights to the website, its underlying technology and hosted content.
* Author is not responsible for any malicious use of the contents below (as-is or adapted) - the script is simply meant to aid judicious users do their work better.
* This script needs substantial network and disk resources. You and you alone are responsible for repercussions including (and not limited to) network fees, disk crashes, power failures, distress from listening to Ed Sheeran samples and other material, physical or mental distress.
"""
# Once again, please PLEASE read this before executing script!
# https://github.com/bbcarchdev/Remarc/blob/master/doc/2016.09.27_RemArc_Content%20licence_Terms%20of%20Use_final.pdf
#
# v0.1: initial setup
# v0.2: acropolis doesn't like urllib2 or wget. Settling for curl, single threaded.
# v0.3: add incremental downloads support (some downloads failed)
# v0.4: Support for additional path scans (when downloading to different path) + fix wav file header size (44 bytes) from v0.3
# v0.5: SIGINT handler + proper presence checking
# v0.6: cleanup + os.path.join for slashes
import urllib2, sys, argparse, os, os.path
import signal
# Size in bytes of a canonical wav file header; anything smaller cannot hold audio data.
_WAV_HEADER_SIZE = 44


def file_exists_and_isvalid(p):
    """Return True if ``p`` is a regular file at least as large as a wav header.

    Used to tell real downloads apart from missing, zero-byte or truncated
    files left behind by a failed transfer.

    Args:
        p: Path of the file to check.

    Returns:
        True when ``p`` is an existing regular file whose size is at least
        44 bytes, False otherwise (including when the file cannot be stat'ed).
    """
    try:
        # Compare the file *size*, not the whole stat result, with the header size.
        return os.path.isfile(p) and os.path.getsize(p) >= _WAV_HEADER_SIZE
    except OSError:
        # The file vanished or became unreadable between the two checks.
        return False
def download_file_from_url(url, to_path, verbose):
    """Download ``url`` into the directory ``to_path`` using the ``curl`` binary.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to fetch.
        to_path: Directory the downloaded file is written into.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        True if curl exited successfully and the resulting file passes
        ``file_exists_and_isvalid``; False otherwise, including when curl
        could not be started at all.
    """
    import subprocess  # local import so this block does not depend on the file header

    file_name = url.split('/')[-1]
    destination = os.path.join(to_path, file_name)
    # urllib2 is too finicky and fails for larger files (whether written one-shot or buffered).
    # Prefer the conservative approach: single threaded curl seems more stable and scalable.
    # The command is passed as an argument list (no shell involved), so quotes,
    # spaces or shell metacharacters in the URL or path cannot break or hijack it.
    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # server's error page under the target file name.
    command = ['curl', '-s', '--fail', '-o', destination, url]
    try:
        status = subprocess.call(command)
    except OSError:
        # curl is not installed or could not be executed.
        return False
    return status == 0 and file_exists_and_isvalid(destination)
def _find_existing_copy(file_name, search_dirs, verbose):
    """Return the path of the first valid copy of ``file_name`` in ``search_dirs``, or None."""
    for directory in search_dirs:
        file_path = os.path.join(directory, file_name)
        if verbose:
            print("Checking " + file_path)
        if file_exists_and_isvalid(file_path):
            return file_path
    return None


def main(arguments):
    """Download every file named in the first column of the BBC sound effects CSV.

    The CSV index is fetched into the target directory first; each row's first
    column is then downloaded from the same base URL. Unless ``--force`` is
    given, files already present (and valid) in the target directory or in
    ``--add_path`` are skipped.

    Args:
        arguments: Command-line arguments without the program name.

    Raises:
        SystemExit: Always. Status 0 when every attempted download succeeded,
            status 1 when the CSV index or at least one file failed to download.
    """
    import csv  # local import so this block does not depend on the file header

    parser = argparse.ArgumentParser(description='BBC sound effects multi-download script')
    parser.add_argument('target_path', help='Where the downloaded files go')
    parser.add_argument('-f', '--force', action='store_true', default=False, help='Always download and overwrite existing files (default: Skip download if file already present)')
    parser.add_argument('--add_path', default="", help='Additional path to check file presence for (ignored if --force is used)')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help="Verbose - useful for tracking status")
    args = parser.parse_args(arguments)

    base_url = 'http://bbcsfx.acropolis.org.uk/assets/'
    file_list = 'BBCSoundEffects.csv'

    # Download the file list; without it there is nothing to iterate over.
    if not download_file_from_url(base_url + file_list, args.target_path, args.verbose):
        print("Failed to download " + file_list)
        sys.exit(1)

    # Directories searched for an existing copy; empty when --force is given,
    # which disables skipping altogether.
    search_dirs = []
    if not args.force:
        search_dirs.append(args.target_path)
        if args.add_path:
            search_dirs.append(args.add_path)

    failures = 0
    num = 0
    skipped = 0
    # NOTE(review): if the CSV starts with a header row, its first cell is
    # treated like any other file name and will be counted as a failed
    # download - confirm against the actual file.
    with open(os.path.join(args.target_path, file_list), 'r') as csvfile:
        # csv.reader copes with quoted fields that contain commas, which a
        # plain split(',') does not.
        for row in csv.reader(csvfile):
            if not row:
                continue
            # Column 1 holds the file name; drop stray quotes and whitespace.
            file_name = row[0].replace('"', '').replace("'", '').strip()
            if not file_name:
                continue

            # Helpful during incremental downloads: skip files already on disk
            # (zero-byte leftovers of failed downloads do not count as present).
            existing = _find_existing_copy(file_name, search_dirs, args.verbose)
            if existing is not None:
                skipped += 1
                if args.verbose:
                    print("Skipping " + existing)
                continue

            num += 1
            if args.verbose:
                print("Downloading " + file_name)
            if not download_file_from_url(base_url + file_name, args.target_path, args.verbose):
                print("Failed to download " + file_name)
                failures += 1
            elif args.verbose:
                print("Downloaded #" + str(num) + ":\t" + file_name)
            if not args.verbose:
                # Compact running status for non-verbose runs.
                print("Downloaded #" + str(num) + " with " + str(failures) + " failures. Skipped: " + str(skipped) + "\r")

    if failures:
        print("Failed to download " + str(failures) + " files out of " + str(num) + " (Skipped " + str(skipped) + " existing)")
        sys.exit(1)
    else:
        print("Downloaded " + str(num) + " successfully (Skipped " + str(skipped) + " existing)")
        sys.exit(0)
def handle_sigint(signum, frame):
    """Signal handler: announce the interruption and terminate the script.

    The first parameter is named ``signum`` so it no longer shadows the
    ``signal`` module; the interpreter passes handler arguments positionally.

    Args:
        signum: Number of the signal that was delivered.
        frame: Stack frame that was executing when the signal arrived (unused).

    Raises:
        SystemExit: Always, with status ``128 + signum`` (130 for SIGINT), so
            an interrupted, incomplete run is not reported as a success.
    """
    print("Got SIGINT from OS. Wrapping up...")
    sys.exit(128 + signum)
if __name__ == '__main__':
    # Register the SIGINT handler before any work starts, then run the
    # downloader with the command-line arguments (program name stripped) and
    # hand whatever main() returns to sys.exit().
    signal.signal(signal.SIGINT, handle_sigint)
    cli_arguments = sys.argv[1:]
    exit_status = main(cli_arguments)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment