# napolitano/soundraw-content-helper.py

## soundraw-content-helper.py
# Proof of concept script to retrieve content
# files from soundraw for private research only
#
# Provided "as is" for free use without any
# warranty
#
# NOTICE:
# Soundraw's library is very large: more than
# 550,000 files totalling more than 190 GB.
# You need a fast internet connection, time and
# enough free storage space. There's no
# exception handling in this script!
#
# WARNING:
# Be aware that these files are copyrighted
# and are not free to use or redistribute.
# Using or redistributing them may expose
# you to legal action.
#
# If in doubt contact soundraw.io and ask
# ----------------------------------------------

import requests
import os
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# Base URL of the S3 bucket; listing queries and object keys are appended to it.
sourceurl = "https://soundraw-web.s3-ap-northeast-1.amazonaws.com"
# File-name prefix of the saved listing pages: <xmlfile><page number>.xml
xmlfile = "soundraw-contents"
# Local root directory below which the downloaded files are sorted.
subfolder = "soundraw-loops"

def recursivelyLoadFileList(sourceurl, getParameter, targetfile, numberOfFiles):
    """Fetch every page of the bucket listing and save each page as an XML file.

    Starting with the query in ``getParameter``, each listing page is
    requested from ``sourceurl`` and written to
    ``<targetfile><numberOfFiles>.xml``. While a page carries a
    ``NextContinuationToken`` element, the next page is requested with that
    token and the page number is increased by one.

    Despite its name the function iterates rather than recursing, so the
    number of pages is not limited by the interpreter's recursion depth.

    Args:
        sourceurl: Base URL of the bucket (no trailing slash).
        getParameter: Query string of the first request, e.g. "?list-type=2".
        targetfile: File-name prefix for the saved pages.
        numberOfFiles: Number used for the first saved page.

    Returns:
        The number of the last page that was written.

    Raises:
        requests.HTTPError: If the server answers a request with an error
            status.
    """
    while True:
        fullXmlFileName = targetfile + str(numberOfFiles) + ".xml"
        response = requests.get(sourceurl + "/" + getParameter)
        # Fail fast: an error document would otherwise be saved and then
        # mistaken for the final page because it has no continuation token.
        response.raise_for_status()
        with open(fullXmlFileName, "wb") as file:
            file.write(response.content)

        nextPage = ""
        # Tags are matched by suffix so that a namespace prefix added by the
        # XML parser does not matter.
        for child in ET.fromstring(response.content):
            if child.tag.endswith("NextContinuationToken") and child.text:
                nextPage = urllib.parse.quote(child.text)

        print("token: " + nextPage)

        if nextPage == "":
            return numberOfFiles

        numberOfFiles += 1
        getParameter = "?list-type=2&continuation-token=" + nextPage

# Folder names used for the short codes found in field 4 of an object key.
# Codes that are not listed here are used as folder names unchanged.
_STEM_FOLDER_NAMES = {
    "dr": "Drums",
    "ff": "Fill Front",
    "fe": "Fill End",
    "me": "Melody",
    "bs": "Bass",
    "bc": "Background Chords",
    "bc1": "Background Chords Extra 1",
    "bc2": "Background Chords Extra 2",
    "se": "Ad-Lib",
}


def _targetDirectoryForKey(subfolder, key):
    """Return the local directory for ``key``, or None if it has fewer than seven "_" fields."""
    parts = key.split("_")
    if len(parts) < 7:
        return None

    stemFolder = _STEM_FOLDER_NAMES.get(parts[4], parts[4])
    if len(parts) > 7:
        # When field 7 is the file-name tail (ends with "m4a"), field 6 names
        # the folder instead.
        groupFolder = parts[6] if parts[7].endswith("m4a") else parts[7]
    else:
        # Keys with exactly seven fields are collected under "vocals".
        groupFolder = "vocals"

    return "/".join((subfolder, parts[0], stemFolder, groupFolder, parts[3]))


def _downloadListing(sourceurl, listingFile, subfolder):
    """Download every key named in one saved listing page that is not on disk yet."""
    pauseAfterSteps = 300
    pauseValue = 0  # seconds to wait after each batch of downloads; 0 = no wait
    steps = 0

    for contents in ET.parse(listingFile).getroot():
        if not contents.tag.endswith("Contents"):
            continue
        for field in contents:
            if not field.tag.endswith("Key") or not field.text:
                continue

            key = field.text
            targetDir = _targetDirectoryForKey(subfolder, key)
            if targetDir is None:
                print("Skipping (unexpected key layout): " + key)
                continue

            targetPath = targetDir + "/" + key
            # dirname() rather than targetDir so that a key containing "/"
            # also gets its parent directory created.
            os.makedirs(os.path.dirname(targetPath), exist_ok=True)
            if os.path.exists(targetPath):
                continue

            steps += 1
            if steps > pauseAfterSteps:
                steps = 0
                time.sleep(pauseValue)

            url = sourceurl + "/" + urllib.parse.quote(key)
            print("Loading: " + url)
            print("Writing: " + targetPath)

            # Download to a temporary name and rename on success, so that an
            # interrupted transfer is not taken for a finished file next run.
            partPath = targetPath + ".part"
            urllib.request.urlretrieve(url, partPath)
            os.replace(partPath, targetPath)


def loadContent(sourceurl, xmlfile, subfolder, xmlFileCount, maxXmlFileCount):
    """Download the files named in the saved listing pages.

    The pages ``<xmlfile><n>.xml`` for ``n`` from ``xmlFileCount`` up to and
    including ``maxXmlFileCount`` are processed in order. Every ``Key``
    found below a ``Contents`` element is fetched from ``sourceurl/<key>``
    and stored under
    ``subfolder/<field 0>/<stem folder>/<group folder>/<field 3>/<key>``,
    where the fields are the "_"-separated parts of the key. Files that
    already exist are skipped, so an interrupted run can simply be started
    again. Keys with fewer than seven fields are reported and skipped.

    Args:
        sourceurl: Base URL of the bucket (no trailing slash).
        xmlfile: File-name prefix of the saved listing pages.
        subfolder: Local root directory for the downloads; created if missing.
        xmlFileCount: Number of the first listing page to process.
        maxXmlFileCount: Number of the last listing page to process.

    Returns:
        None.
    """
    os.makedirs(subfolder, exist_ok=True)

    # A plain loop over the pages: it stops after the last page and does not
    # depend on the interpreter's recursion limit.
    for pageNumber in range(xmlFileCount, maxXmlFileCount + 1):
        print("Processing S3-Bucket-List " + str(pageNumber) + " of " + str(maxXmlFileCount))
        print("======================================")
        print(" ")
        _downloadListing(sourceurl, xmlfile + str(pageNumber) + ".xml", subfolder)


# Script entry: download the files named in the saved listing pages.
# The listing pages are expected to be on disk already; to fetch them again,
# enable the following line and drop the fixed value below it.
#maxXmlFileCount = recursivelyLoadFileList(sourceurl, "?list-type=2", xmlfile, 0)
maxXmlFileCount = 553  # presumably the last page number of an earlier listing run - confirm
loadContent(
    sourceurl=sourceurl,
    xmlfile=xmlfile,
    subfolder=subfolder,
    xmlFileCount=500,  # first page to process; presumably resumes an earlier run - confirm
    maxXmlFileCount=maxXmlFileCount,
)
	# Proof of concept script to retrieve content
	# files from soundraw for private research only
	#
	# Provided "as is" for free use without any
	# warranty
	#
	# NOTICE:
	# Soundraw's library is very large: more than
	# 550,000 files totalling more than 190 GB.
	# You need a fast internet connection, time and
	# enough free storage space. There's no
	# exception handling in this script!
	#
	# WARNING:
	# Be aware that these files are copyrighted
	# and are not free to use or redistribute.
	# Using or redistributing them may expose
	# you to legal action.
	#
	# If in doubt contact soundraw.io and ask
	# ----------------------------------------------

	import requests
	import os
	import time
	import urllib.parse
	import urllib.request
	import xml.etree.ElementTree as ET

	# Base URL of the S3 bucket; listing queries and object keys are appended to it.
	sourceurl = "https://soundraw-web.s3-ap-northeast-1.amazonaws.com"
	# File-name prefix of the saved listing pages: <xmlfile><page number>.xml
	xmlfile = "soundraw-contents"
	# Local root directory below which the downloaded files are sorted.
	subfolder = "soundraw-loops"

	def recursivelyLoadFileList(sourceurl, getParameter, targetfile, numberOfFiles):
		"""Fetch every page of the bucket listing and save each page as an XML file.

		Starting with the query in ``getParameter``, each listing page is
		requested from ``sourceurl`` and written to
		``<targetfile><numberOfFiles>.xml``. While a page carries a
		``NextContinuationToken`` element, the next page is requested with that
		token and the page number is increased by one.

		Despite its name the function iterates rather than recursing, so the
		number of pages is not limited by the interpreter's recursion depth.

		Args:
			sourceurl: Base URL of the bucket (no trailing slash).
			getParameter: Query string of the first request, e.g. "?list-type=2".
			targetfile: File-name prefix for the saved pages.
			numberOfFiles: Number used for the first saved page.

		Returns:
			The number of the last page that was written.

		Raises:
			requests.HTTPError: If the server answers a request with an error
				status.
		"""
		while True:
			fullXmlFileName = targetfile + str(numberOfFiles) + ".xml"
			response = requests.get(sourceurl + "/" + getParameter)
			# Fail fast: an error document would otherwise be saved and then
			# mistaken for the final page because it has no continuation token.
			response.raise_for_status()
			with open(fullXmlFileName, "wb") as file:
				file.write(response.content)

			nextPage = ""
			# Tags are matched by suffix so that a namespace prefix added by the
			# XML parser does not matter.
			for child in ET.fromstring(response.content):
				if child.tag.endswith("NextContinuationToken") and child.text:
					nextPage = urllib.parse.quote(child.text)

			print("token: " + nextPage)

			if nextPage == "":
				return numberOfFiles

			numberOfFiles += 1
			getParameter = "?list-type=2&continuation-token=" + nextPage

	# Folder names used for the short codes found in field 4 of an object key.
	# Codes that are not listed here are used as folder names unchanged.
	_STEM_FOLDER_NAMES = {
		"dr": "Drums",
		"ff": "Fill Front",
		"fe": "Fill End",
		"me": "Melody",
		"bs": "Bass",
		"bc": "Background Chords",
		"bc1": "Background Chords Extra 1",
		"bc2": "Background Chords Extra 2",
		"se": "Ad-Lib",
	}


	def _targetDirectoryForKey(subfolder, key):
		"""Return the local directory for ``key``, or None if it has fewer than seven "_" fields."""
		parts = key.split("_")
		if len(parts) < 7:
			return None

		stemFolder = _STEM_FOLDER_NAMES.get(parts[4], parts[4])
		if len(parts) > 7:
			# When field 7 is the file-name tail (ends with "m4a"), field 6 names
			# the folder instead.
			groupFolder = parts[6] if parts[7].endswith("m4a") else parts[7]
		else:
			# Keys with exactly seven fields are collected under "vocals".
			groupFolder = "vocals"

		return "/".join((subfolder, parts[0], stemFolder, groupFolder, parts[3]))


	def _downloadListing(sourceurl, listingFile, subfolder):
		"""Download every key named in one saved listing page that is not on disk yet."""
		pauseAfterSteps = 300
		pauseValue = 0  # seconds to wait after each batch of downloads; 0 = no wait
		steps = 0

		for contents in ET.parse(listingFile).getroot():
			if not contents.tag.endswith("Contents"):
				continue
			for field in contents:
				if not field.tag.endswith("Key") or not field.text:
					continue

				key = field.text
				targetDir = _targetDirectoryForKey(subfolder, key)
				if targetDir is None:
					print("Skipping (unexpected key layout): " + key)
					continue

				targetPath = targetDir + "/" + key
				# dirname() rather than targetDir so that a key containing "/"
				# also gets its parent directory created.
				os.makedirs(os.path.dirname(targetPath), exist_ok=True)
				if os.path.exists(targetPath):
					continue

				steps += 1
				if steps > pauseAfterSteps:
					steps = 0
					time.sleep(pauseValue)

				url = sourceurl + "/" + urllib.parse.quote(key)
				print("Loading: " + url)
				print("Writing: " + targetPath)

				# Download to a temporary name and rename on success, so that an
				# interrupted transfer is not taken for a finished file next run.
				partPath = targetPath + ".part"
				urllib.request.urlretrieve(url, partPath)
				os.replace(partPath, targetPath)


	def loadContent(sourceurl, xmlfile, subfolder, xmlFileCount, maxXmlFileCount):
		"""Download the files named in the saved listing pages.

		The pages ``<xmlfile><n>.xml`` for ``n`` from ``xmlFileCount`` up to and
		including ``maxXmlFileCount`` are processed in order. Every ``Key``
		found below a ``Contents`` element is fetched from ``sourceurl/<key>``
		and stored under
		``subfolder/<field 0>/<stem folder>/<group folder>/<field 3>/<key>``,
		where the fields are the "_"-separated parts of the key. Files that
		already exist are skipped, so an interrupted run can simply be started
		again. Keys with fewer than seven fields are reported and skipped.

		Args:
			sourceurl: Base URL of the bucket (no trailing slash).
			xmlfile: File-name prefix of the saved listing pages.
			subfolder: Local root directory for the downloads; created if missing.
			xmlFileCount: Number of the first listing page to process.
			maxXmlFileCount: Number of the last listing page to process.

		Returns:
			None.
		"""
		os.makedirs(subfolder, exist_ok=True)

		# A plain loop over the pages: it stops after the last page and does not
		# depend on the interpreter's recursion limit.
		for pageNumber in range(xmlFileCount, maxXmlFileCount + 1):
			print("Processing S3-Bucket-List " + str(pageNumber) + " of " + str(maxXmlFileCount))
			print("======================================")
			print(" ")
			_downloadListing(sourceurl, xmlfile + str(pageNumber) + ".xml", subfolder)


	# Script entry: download the files named in the saved listing pages.
	# The listing pages are expected to be on disk already; to fetch them again,
	# enable the following line and drop the fixed value below it.
	#maxXmlFileCount = recursivelyLoadFileList(sourceurl, "?list-type=2", xmlfile, 0)
	maxXmlFileCount = 553  # presumably the last page number of an earlier listing run - confirm
	loadContent(
		sourceurl=sourceurl,
		xmlfile=xmlfile,
		subfolder=subfolder,
		xmlFileCount=500,  # first page to process; presumably resumes an earlier run - confirm
		maxXmlFileCount=maxXmlFileCount,
	)