Skip to content

Instantly share code, notes, and snippets.

@napolitano
Last active January 29, 2024 17:06
Show Gist options
  • Save napolitano/2192f290bb11a55e3b20a9e038aaa1af to your computer and use it in GitHub Desktop.
Save napolitano/2192f290bb11a55e3b20a9e038aaa1af to your computer and use it in GitHub Desktop.
Python script to download the full sound loop library from Soundraw.ai for the purpose of research related to a blog article
# Proof of concept script to retrieve content
# files from soundraw for private research only
#
# Provided "as is" for free use without any
# warranty
#
# NOTICE:
# Soundraw's library is very large: it contains
# more than 550,000 files totalling more than
# 190 GB. You need a fast internet connection,
# plenty of time and enough free storage space.
# There is no exception handling in this script!
#
# WARNING:
# Be aware that these files are copyrighted
# and are not free to use or redistribute.
# If you do so anyway, you may face legal
# action.
#
# If in doubt contact soundraw.io and ask
# ----------------------------------------------
import requests
import os
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
# Base URL of the S3 bucket; listing requests and file downloads are both
# built by appending "/" plus a query string or object key to it.
sourceurl = "https://soundraw-web.s3-ap-northeast-1.amazonaws.com"
# Filename prefix of the saved bucket-listing pages; a running number and
# ".xml" are appended (e.g. "soundraw-contents0.xml").
xmlfile = "soundraw-contents"
# Local root directory under which the downloaded files are sorted.
subfolder = "soundraw-loops"
def recursivelyLoadFileList(sourceurl, getParameter, targetfile, numberOfFiles):
    """Download every page of the bucket listing and save each one as XML.

    Each page is written to ``targetfile + str(numberOfFiles) + ".xml"``,
    with ``numberOfFiles`` incremented once per page. Paging follows the
    ``NextContinuationToken`` element of each response until a page without
    one is reached.

    The name is kept for backward compatibility; the pages are fetched in a
    loop rather than by recursion, so the page count is not limited by the
    interpreter's recursion depth.

    Args:
        sourceurl: Base URL of the bucket (no trailing slash).
        getParameter: Query string for the first request,
            e.g. ``"?list-type=2"``.
        targetfile: Filename prefix for the saved listing pages.
        numberOfFiles: Index used for the first saved page.

    Returns:
        The index of the last listing file that was written.

    Raises:
        requests.HTTPError: If the server answers a listing request with an
            error status.
    """
    while True:
        fullXmlFileName = targetfile + str(numberOfFiles) + ".xml"
        response = requests.get(sourceurl + "/" + getParameter)
        # Fail loudly instead of saving an error document as a listing page,
        # which would silently end the paging early.
        response.raise_for_status()
        with open(fullXmlFileName, "wb") as file:
            file.write(response.content)

        root = ET.parse(fullXmlFileName).getroot()
        nextPage = ""
        # Iterate the element directly: Element.getchildren() was removed in
        # Python 3.9. Tags carry an XML namespace prefix, hence endswith().
        for child in root:
            if child.tag.endswith("NextContinuationToken") and child.text:
                nextPage = urllib.parse.quote(child.text)
                print("token: " + nextPage)

        if not nextPage:
            # Last page reached: report the index of the final file so the
            # caller knows how many listing files exist.
            return numberOfFiles

        numberOfFiles += 1
        getParameter = "?list-type=2&continuation-token=" + nextPage
# Short part codes found at position 4 of an object key, mapped to the
# folder name used locally. Unknown codes are used as folder names as-is.
_PART_FOLDER_NAMES = {
    "dr": "Drums",
    "ff": "Fill Front",
    "fe": "Fill End",
    "me": "Melody",
    "bs": "Bass",
    "bc": "Background Chords",
    "bc1": "Background Chords Extra 1",
    "bc2": "Background Chords Extra 2",
    "se": "Ad-Lib",
}


def _localDirectoryForKey(subfolder, key):
    """Return the local directory for an object key, or None if it does not fit the naming scheme."""
    if not key:
        return None
    parts = key.split("_")
    # Positions 0, 3, 4 and 6 are all needed below; shorter keys cannot be
    # sorted into the folder layout.
    if len(parts) < 7:
        return None

    partFolder = _PART_FOLDER_NAMES.get(parts[4], parts[4])
    if len(parts) == 7:
        # Seven-part keys have no eighth field and are filed under "vocals".
        # NOTE(review): the original indentation was lost; this assumes the
        # "vocals" branch belonged to the length check - confirm.
        variantFolder = "vocals"
    elif parts[7].endswith("m4a"):
        # The eighth field is the file-name tail, so use the field before it.
        variantFolder = parts[6]
    else:
        variantFolder = parts[7]

    return "/".join([subfolder, parts[0], partFolder, variantFolder, parts[3]])


def loadContent(sourceurl, xmlfile, subfolder, xmlFileCount, maxXmlFileCount):
    """Download every object listed in the saved bucket-listing XML files.

    Listing files ``xmlfile + str(i) + ".xml"`` are processed for ``i`` from
    ``xmlFileCount`` through ``maxXmlFileCount`` (at least one file is always
    processed). For every ``Key`` inside a ``Contents`` element the object is
    downloaded to

        subfolder/<field 0>/<part name>/<variant>/<field 3>/<key>

    where the fields come from splitting the key on ``"_"``. Files that
    already exist locally are not downloaded again, so an interrupted run
    can be resumed. Keys that do not have enough fields are reported and
    skipped.

    Args:
        sourceurl: Base URL of the bucket (no trailing slash).
        xmlfile: Filename prefix of the saved listing pages.
        subfolder: Local root directory for the downloads.
        xmlFileCount: Index of the first listing file to process.
        maxXmlFileCount: Index of the last listing file to process.

    Returns:
        None.
    """
    # Throttle settings: after every pauseAfterSteps downloads, sleep for
    # pauseValue seconds (0 means no real pause).
    pauseAfterSteps = 300
    pauseValue = 0

    # Loop over the listing files instead of recursing once per file.
    while True:
        root = ET.parse(xmlfile + str(xmlFileCount) + ".xml").getroot()
        steps = 0
        os.makedirs(subfolder, exist_ok=True)

        print("Processing S3-Bucket-List " + str(xmlFileCount) + " of " + str(maxXmlFileCount))
        print("======================================")
        print(" ")

        # Iterate elements directly: Element.getchildren() was removed in
        # Python 3.9. Tags carry an XML namespace prefix, hence endswith().
        for child in root:
            if not child.tag.endswith("Contents"):
                continue
            for subchild in child:
                if not subchild.tag.endswith("Key"):
                    continue

                key = subchild.text
                targetDirectory = _localDirectoryForKey(subfolder, key)
                if targetDirectory is None:
                    print("Skipping: " + str(key))
                    continue

                os.makedirs(targetDirectory, exist_ok=True)
                targetPath = targetDirectory + "/" + key
                if os.path.exists(targetPath):
                    # Already downloaded in an earlier run.
                    continue

                steps += 1
                if steps > pauseAfterSteps:
                    steps = 0
                    time.sleep(pauseValue)

                print("Loading: " + sourceurl + '/' + key)
                print("Writing: " + targetPath)
                urllib.request.urlretrieve(sourceurl + "/" + key, targetPath)

        if xmlFileCount >= maxXmlFileCount:
            return
        xmlFileCount += 1
# Uncomment the next line to fetch the bucket listing first. As written, the
# listing XML files are expected to exist on disk already, because
# loadContent parses them by name.
#maxXmlFileCount = recursivelyLoadFileList(sourceurl, "?list-type=2", xmlfile, 0)
# Hard-coded index of the last listing file to process
# (presumably taken from an earlier listing run - confirm).
maxXmlFileCount = 553
# Start at listing file 500 rather than 0 (presumably to resume an
# interrupted run - confirm); processing continues through maxXmlFileCount.
loadContent(sourceurl, xmlfile, subfolder, 500, maxXmlFileCount)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment