Last active
October 5, 2024 06:36
-
-
Save someodd/1a6d69e3d82263b0aa863bfaaf277d40 to your computer and use it in GitHub Desktop.
Download random audio samples from Internet Archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# PLEASE READ ME BEFORE RUNNING!
# ------------------------------
#
# giamp3 v1 (2024-10-04)
#
# Random 30 second samples from Internet Archive.
#
# Tested on Debian Unstable (Sid) as of the release date.
# I think it also works in Debian 12.
#
# To use this script please first do this:
#
#   sudo apt-get update
#   sudo apt-get upgrade
#   sudo apt-get install internetarchive ffmpeg jq wget
#   chmod +x giamp3.sh
#
# Then you can use it like this:
#
#   ./giamp3.sh
#
# Each run makes NUM_SAMPLES attempts. An attempt that fails at any step
# (no search hit, no MP3 in the item, download error, recording too short)
# is skipped, so fewer than NUM_SAMPLES samples may be produced.
# The metadata JSON fetched for every inspected item is left in OUTPUT_DIR
# next to the samples.

# Catch typos in variable names. `set -e` is deliberately NOT enabled:
# a failing step must skip the current sample, not abort the whole run.
set -u

# Directory to store samples
OUTPUT_DIR="samples"

# Number of samples to download (number of attempts, see above)
NUM_SAMPLES=5

# Length of each extracted sample, in seconds
SAMPLE_LENGTH=30

# Search term and parameters
SEARCH_TERM='audio'
COLLECTION='collection:audio_music'
FORMAT='format:MP3'

#######################################
# Convert an ffmpeg-style "HH:MM:SS[.frac]" duration to whole seconds.
# The fractional part is truncated.
# Arguments: $1 - duration string
# Outputs:   whole seconds on stdout; 0 when $1 is not a duration
#            (for example "N/A" or an empty string)
#######################################
duration_to_seconds() {
  local raw=$1
  local re='^([0-9]+):([0-9]+):([0-9]+)(\.[0-9]*)?$'
  if [[ $raw =~ $re ]]; then
    # 10# forces base 10 so fields such as "08" are not parsed as octal.
    printf '%d\n' "$(( 10#${BASH_REMATCH[1]} * 3600 \
      + 10#${BASH_REMATCH[2]} * 60 \
      + 10#${BASH_REMATCH[3]} ))"
  else
    printf '0\n'
  fi
}

#######################################
# Pick a random start offset such that a sample still fits in the file.
# Arguments: $1 - total duration in whole seconds
#            $2 - sample length in whole seconds
# Outputs:   an integer in [0, $1 - $2] on stdout; 0 when the sample
#            is as long as (or longer than) the file
#######################################
random_start() {
  local total=$1 length=$2
  local span=$(( total - length + 1 ))
  if (( span <= 1 )); then
    # Exactly one valid position; also avoids a modulo-by-zero.
    printf '0\n'
    return 0
  fi
  # RANDOM yields only 15 bits (0..32767); combine two draws so that
  # recordings longer than ~9 hours can be sampled anywhere.
  printf '%d\n' "$(( ((RANDOM << 15) | RANDOM) % span ))"
}

#######################################
# Print the duration ffmpeg reports for a media file.
# Arguments: $1 - path to the media file
# Outputs:   the first "Duration:" value (e.g. 00:03:45.12) on stdout,
#            or nothing if ffmpeg reports none
#######################################
probe_duration() {
  # ffmpeg exits non-zero here because no output file is given; only the
  # banner it prints on stderr is of interest.
  ffmpeg -nostdin -i "$1" 2>&1 \
    | awk '/Duration:/ { sub(/,$/, "", $2); print $2; exit }'
}

#######################################
# Download one random MP3 and cut a SAMPLE_LENGTH-second sample from it.
# Globals:   OUTPUT_DIR, SAMPLE_LENGTH, SEARCH_TERM, COLLECTION, FORMAT (read)
# Outputs:   progress messages on stdout
# Returns:   0 if a sample was written, non-zero if the attempt was skipped
#######################################
fetch_sample() {
  local item metadata_file full_file mp3_names mp3_file mp3_url
  local duration duration_seconds start_time sample_file status

  # Get a random item from the search results
  item=$(ia search "$SEARCH_TERM" --parameters "$COLLECTION" "$FORMAT" \
    --itemlist --parameters rows=10 | shuf -n 1)
  if [[ -z "$item" ]]; then
    echo "No item found, skipping..."
    return 1
  fi
  echo "Found item: $item"

  metadata_file="$OUTPUT_DIR/$item.json"
  full_file="$OUTPUT_DIR/$item.mp3"

  # Fetch metadata for the item to find MP3 files
  ia metadata "$item" > "$metadata_file"

  # Extract mp3 file names; `[]?` tolerates metadata without a files list
  mp3_names=$(jq -r '.files[]? | select(.format == "VBR MP3") | .name' \
    "$metadata_file")
  if [[ -z "$mp3_names" ]]; then
    echo "No MP3s found for $item, skipping..."
    return 1
  fi

  # Select a random MP3 file
  mp3_file=$(shuf -n 1 <<<"$mp3_names")
  mp3_url="https://archive.org/download/$item/$mp3_file"
  echo "Downloading $mp3_url..."

  # `wget -O` creates the output file even when the download fails, so
  # test the exit status and the file size, not mere existence.
  if ! wget -q -O "$full_file" "$mp3_url" || [[ ! -s "$full_file" ]]; then
    echo "Failed to download $mp3_url, skipping..."
    rm -f -- "$full_file"
    return 1
  fi

  # Determine duration of the file
  duration=$(probe_duration "$full_file")
  duration_seconds=$(duration_to_seconds "$duration")
  if (( duration_seconds < SAMPLE_LENGTH )); then
    echo "MP3 duration is less than ${SAMPLE_LENGTH} seconds, skipping..."
    rm -f -- "$full_file"
    return 1
  fi

  # Pick a random start time for the sample
  start_time=$(random_start "$duration_seconds" "$SAMPLE_LENGTH")
  sample_file="$OUTPUT_DIR/${item}_sample_${start_time}s.mp3"
  echo "Creating ${SAMPLE_LENGTH}-second sample from $start_time seconds..."

  # -nostdin / -y keep ffmpeg from blocking on an interactive
  # "overwrite?" prompt when the same sample name already exists.
  ffmpeg -nostdin -y -ss "$start_time" -t "$SAMPLE_LENGTH" -i "$full_file" \
    -acodec copy "$sample_file"
  status=$?

  # Clean up the full mp3 download
  rm -f -- "$full_file"
  return "$status"
}

#######################################
# Entry point: verify required tools, then make NUM_SAMPLES attempts.
# Returns: 1 if a required tool is missing or OUTPUT_DIR cannot be created
#######################################
main() {
  local tool i
  for tool in ia jq ffmpeg wget shuf awk; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      echo "Missing required tool: $tool" >&2
      return 1
    fi
  done

  mkdir -p "$OUTPUT_DIR" || return 1

  # Loop to download multiple samples; a failed attempt is simply skipped.
  for ((i = 1; i <= NUM_SAMPLES; i++)); do
    fetch_sample
  done

  echo "Samples created in $OUTPUT_DIR."
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment