Skip to content

Instantly share code, notes, and snippets.

@kylemcdonald
Last active July 3, 2019 19:44
Show Gist options
  • Save kylemcdonald/283ffd7ebcda2ae9e90d to your computer and use it in GitHub Desktop.
Save kylemcdonald/283ffd7ebcda2ae9e90d to your computer and use it in GitHub Desktop.
Download all samples between 0 and 1 second long (the default duration range) from Freesound.
#!/usr/bin/env python
import errno
import os
from os import path
import argparse
import time
from multiprocessing.dummy import Pool
from urllib3 import HTTPConnectionPool
import ujson as json
from tqdm import tqdm
import freesound # pip install --user git+https://github.com/MTG/freesound-python
# Command-line interface. Every option has a default, so the script can be
# started without any arguments.
parser = argparse.ArgumentParser(
    description='Download a set of sounds from Freesound.',
)
# Size of the worker pool that processes each page of search results.
parser.add_argument(
    '-n', '--n_connections',
    type=int,
    default=10,
    help='Number of parallel connections.',
)
# Root folder; 'mp3' and 'json' sub-folders are created beneath it.
parser.add_argument(
    '-b', '--base_dir',
    type=str,
    default='.',
    help='Base output directory.',
)
# Lower and upper bounds substituted into the 'duration:[min TO max]' filter.
parser.add_argument(
    '-min', '--min_duration',
    type=float,
    default=0,
    help='Minimum duration.',
)
parser.add_argument(
    '-max', '--max_duration',
    type=float,
    default=1,
    help='Maximum duration.',
)
# Used with time.time()/time.sleep() in the paging loop, i.e. a number of
# seconds.
parser.add_argument(
    '-r', '--rate_limit',
    type=float,
    default=1.1,
    help='Rate limit for requests.',
)
args = parser.parse_args()
# Credential handed to client.set_token() further down. It is read from the
# FREESOUND_ACCESS_TOKEN environment variable so the secret does not have to be
# pasted into the script; when the variable is unset the value is '' exactly
# as before.
# Original note: this needs to be an oauth2 access token as described
# https://www.freesound.org/docs/api/authentication.html
# NOTE(review): set_token() is called with auth type 'token' while the note
# above asks for an OAuth2 token -- confirm which credential kind is expected.
access_token = os.environ.get('FREESOUND_ACCESS_TOKEN', '')
# Sound field names the author classified as not worth keeping.
# NOTE(review): nothing in the visible script reads this list, and 'download'
# and 'previews' also appear in `useful` -- confirm whether it can be removed.
useless = [
    'analysis', 'analysis_frames', 'analysis_stats',
    'bookmark', 'client',
    'comment', 'comments',
    'download', 'images', 'pack', 'previews',
    'rate', 'similar_sounds', 'url',
]
# Sound field names this script cares about. The list is joined with commas to
# form the `fields` argument of the search request, and it is also the
# whitelist of keys copied into each sound's JSON metadata file. Order is kept
# as-is because it determines the order of the joined `fields` string.
useful = [
    # identity and descriptive text
    'id', 'name', 'tags', 'description', 'geotag', 'created',
    # audio file properties
    'type', 'filesize', 'bitrate', 'bitdepth', 'duration', 'samplerate',
    # uploader and retrieval links
    'username', 'download', 'previews',
    # popularity counters and licensing
    'num_downloads', 'avg_rating', 'num_ratings', 'num_comments', 'license',
]
def mkdir_p(path):
    """Create directory *path* together with any missing parents.

    A directory that already exists is not treated as an error. Every other
    ``OSError`` -- including *path* already existing as something that is not
    a directory -- is re-raised unchanged.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only "it is already there, and it is a directory" may be ignored.
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
# --- Connect, run the search, and prepare the output locations -------------
# print(...) with a single argument behaves the same on Python 2 and 3.
print('Connecting to Freesound...')
client = freesound.FreesoundClient()
client.set_token(access_token, 'token')

# Ask the API only for the whitelisted fields, restricted to the requested
# duration range.
fields = ','.join(useful)
filterstr = 'duration:[{} TO {}]'.format(args.min_duration, args.max_duration)
print('Searching for results...')
results = client.text_search(query='', sort='created_asc', filter=filterstr, fields=fields, page_size=150)

print('Processing...')
mkdir_p(path.join(args.base_dir, 'mp3'))
mkdir_p(path.join(args.base_dir, 'json'))
# The pool is shared by all worker threads. urllib3 keeps only `maxsize`
# connections for reuse (default 1), so size it to the worker count; otherwise
# the extra connections are opened and thrown away on every request.
http_pool = HTTPConnectionPool('www.freesound.org', maxsize=max(1, args.n_connections))
def download(url, fn):
    """Fetch *url* through the shared ``http_pool`` and store the body in *fn*.

    Nothing is requested when *fn* already exists. The file is created only
    for a 200 response, so a failed request no longer leaves behind an empty
    file that would make later runs believe the download had succeeded.
    A 404 is skipped silently; any other status is reported on stdout.

    Args:
        url: Address to fetch; an ``https`` scheme is rewritten to ``http``.
        fn: Destination file path.

    Returns:
        None.
    """
    if os.path.isfile(fn):
        return
    if url.startswith('https'):
        # http_pool is a plain-HTTP pool. Rewrite the scheme only (count=1),
        # not any later 'https' that happens to occur inside the URL.
        url = url.replace('https', 'http', 1)
    r = http_pool.urlopen('GET', url)
    if r.status == 200:
        with open(fn, 'wb') as f:
            f.write(r.data)
    elif r.status != 404:
        print('Error: {} saving {} to {}'.format(r.status, url, fn))
def job(result):
    """Store one search result as an MP3 preview plus a JSON metadata file.

    Files are named after ``result.id`` and placed in the ``mp3`` and ``json``
    folders under ``args.base_dir``. A file that already exists is left alone.

    Args:
        result: One item of the search results; ``id``,
            ``previews.preview_hq_mp3`` and ``json_dict`` are read from it.

    Returns:
        The tuple ``(mp3file, jsonfile)`` of target paths. The paths are
        returned whether or not the files were actually written.
    """
    mp3file = path.join(args.base_dir, 'mp3', '{}.mp3'.format(result.id))
    if not path.isfile(mp3file):
        download(result.previews.preview_hq_mp3, mp3file)

    jsonfile = path.join(args.base_dir, 'json', '{}.json'.format(result.id))
    if not path.isfile(jsonfile):
        # Build the payload before opening the file: if reading the result
        # fails, no empty JSON file is left behind to be skipped on later runs.
        original = result.json_dict
        filtered = {key: original[key] for key in useful if key in original}
        with open(jsonfile, 'w') as f:
            json.dump(filtered, f, sort_keys=True)
    return (mp3file, jsonfile)
# --- Page through the search results ----------------------------------------
# Each page is fanned out to the thread pool; the progress bar advances by the
# number of results handled.
pool = Pool(args.n_connections)
pbar = tqdm(total=results.count)
# Time of the most recent page request (the initial search has just been made).
prev_time = time.time()
try:
    while True:
        # Process the current page first so that the last page -- the one
        # whose `next` is None -- is handled too instead of being skipped.
        mapped = pool.map(job, results)
        pbar.update(len(mapped))
        if results.next is None:
            break
        # Throttle: wait until at least `rate_limit` seconds have passed since
        # the previous page request, then stamp the new request time after
        # the sleep so every interval is measured from a real request.
        remaining_time = args.rate_limit - (time.time() - prev_time)
        if remaining_time > 0:
            time.sleep(remaining_time)
        prev_time = time.time()
        results = results.next_page()
finally:
    # Release the progress bar and worker threads even if a page fails.
    pbar.close()
    pool.close()
    pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment