Using Python's built-in defaultdict (from the collections module) we can easily define a tree data structure:
def tree():
    """Return an autovivifying mapping: looking up any missing key creates
    and stores another tree, so arbitrarily deep paths can be built by
    plain assignment (e.g. ``t['a']['b']['c'] = 1``)."""
    return defaultdict(tree)
That's it!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Build archive.org authentication cookies from the environment.
# LOGGED_IN_SIG and LOGGED_IN_USER must be set; a missing variable raises
# KeyError immediately, failing fast instead of sending unauthenticated
# requests later.
import os
import json
import requests

# Cookie jar passed to requests calls to authenticate against archive.org.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
}
#!/usr/bin/env python
# Build archive.org authentication cookies from the environment, with the
# 'verbose' cookie enabled so the server returns extra diagnostic output.
# LOGGED_IN_SIG and LOGGED_IN_USER must be set; a missing variable raises
# KeyError immediately rather than sending unauthenticated requests later.
import os, sys
import requests
import json

# Cookie jar passed to requests calls to authenticate against archive.org.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
    'verbose': '1',
}
#!/bin/bash
# Download all Symlink Instruction files from a given list of identifiers.
# Usage: ./get-symlinks.sh itemlist.txt
# Requires LOGGED_IN_SIG and LOGGED_IN_USER in the environment for
# authenticated archive.org downloads.

# Create the output directory once, outside the loop (it is loop-invariant).
mkdir -p symlink_instructions

while read identifier
do
    URL="http://archive.org/download/${identifier}/${identifier}_symlinks.txt"
    COOK="Cookie: logged-in-sig=$LOGGED_IN_SIG; logged-in-user=$LOGGED_IN_USER"
    # NOTE(review): the original fragment was truncated after a trailing
    # backslash; passing $COOK as a request header is the evident intent,
    # since it is otherwise unused. Confirm against the original script.
    wget -q --header "$COOK" "$URL" -O "symlink_instructions/${identifier}_symlinks.txt"
done < "$1"
#!/usr/bin/env python
# Scrape/transform archive.org pages with lxml, forcing UTF-8 decoding so
# pages that omit or mis-declare their charset do not produce mojibake.
import os
import lxml.etree
import subprocess
import json
import urllib

# Directory the script was launched from; base for relative paths below.
ROOT_DIR = os.getcwd()
# HTML parser that decodes input as UTF-8 regardless of the page's own hints.
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')
#!/usr/bin/env python
""" Check if an item on archive.org has an acoustid.
Usage:
    ./check_for_acoustid.py {item}
Usage with GNU Parallel:
    cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}'
Using Python's built-in defaultdict (from the collections module) we can easily define a tree data structure:
def tree():
    # A tree is simply a defaultdict whose default value is another tree,
    # so any chain of lookups autovivifies the intermediate nodes.
    return defaultdict(tree)
That's it!
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import gevent
# Given an archive.org catalog task id, find the storage node holding the
# item and report the item's size in GB.
taskid="$1"
# Full task log for the given task id.
log=$(curl -s "http://www-tracey.us.archive.org/log_show.php?task_id=$taskid&full=1")
# NOTE(review): this lookbehind matches one specific hard-coded item path;
# generalize the pattern if this should work for arbitrary tasks.
# $log is quoted so the log's whitespace is not mangled by word-splitting.
filepath=$(echo "$log" | grep -Po '(?<=\[dir\]\ \=\>\ )/14/items/archiveteam-mobileme-hero-9' | head -1)
# Storage node name (ia6xxxxx) parsed from the [server...] field of the log.
node=$(echo "$log" | grep -Po '(?<=\[server).*?(?=.us.archive.org)' | head -1 | grep -Po 'ia6[0-9]{5}')
itemsize_url="http://$node.us.archive.org/item-size.php?path=$filepath"
# Item size in KB, scraped from the <size> element of the response.
_itemsize=$(curl -s "$itemsize_url" | grep -Po '(?<=\<size\>).*(?=\</size\>)')
# Convert to GB
itemsize=$(echo "$_itemsize/1000000" | bc)
#!/usr/bin/env python
#
# Provided a list of identifiers for items on archive.org, return all items
# that have an "acoustid" for every original audio file, but NOT a
# "mb_recording_id".
#
import sys
import logging
from datetime import datetime
#!/usr/bin/env python
#
# Find out the most used metadata fields on archive.org
#
import sys
import logging
from datetime import datetime
# Third-party fast JSON decoder; metadata dumps are large enough to matter.
import ujson as json
# cPickle is the Python 2 C-accelerated pickle — this script targets Python 2.
import cPickle as pickle