Using Python's built-in defaultdict (from the collections module) we can easily define a tree data structure:
def tree():
    """Return an autovivifying mapping: looking up any missing key creates
    and stores another tree, so arbitrarily deep paths can be built by
    plain assignment (e.g. ``t['a']['b']['c'] = 1``)."""
    return defaultdict(tree)
That's it!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Build archive.org authentication cookies from the environment.
# LOGGED_IN_SIG and LOGGED_IN_USER must be set; a missing variable raises
# KeyError immediately, failing fast instead of sending unauthenticated
# requests later.
import os
import json
import requests

# Cookie jar passed to requests calls to authenticate against archive.org.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
}
#!/usr/bin/env python
# Build archive.org authentication cookies from the environment, with the
# 'verbose' cookie enabled so the server returns extra diagnostic output.
# LOGGED_IN_SIG and LOGGED_IN_USER must be set; a missing variable raises
# KeyError immediately rather than sending unauthenticated requests later.
import os, sys
import requests
import json

# Cookie jar passed to requests calls to authenticate against archive.org.
COOKIES = {
    'logged-in-sig': os.environ['LOGGED_IN_SIG'],
    'logged-in-user': os.environ['LOGGED_IN_USER'],
    'verbose': '1',
}
#!/bin/bash
# Download all Symlink Instruction files from a given list of identifiers.
# Usage: ./get-symlinks.sh itemlist.txt
# Requires LOGGED_IN_SIG and LOGGED_IN_USER in the environment for
# authenticated archive.org downloads.

# Create the output directory once, outside the loop (it is loop-invariant).
mkdir -p symlink_instructions

while read identifier
do
    URL="http://archive.org/download/${identifier}/${identifier}_symlinks.txt"
    COOK="Cookie: logged-in-sig=$LOGGED_IN_SIG; logged-in-user=$LOGGED_IN_USER"
    # NOTE(review): the original fragment was truncated after a trailing
    # backslash; passing $COOK as a request header is the evident intent,
    # since it is otherwise unused. Confirm against the original script.
    wget -q --header "$COOK" "$URL" -O "symlink_instructions/${identifier}_symlinks.txt"
done < "$1"
#!/usr/bin/env python
# Scrape/transform archive.org pages with lxml, forcing UTF-8 decoding so
# pages that omit or mis-declare their charset do not produce mojibake.
import os
import lxml.etree
import subprocess
import json
import urllib

# Directory the script was launched from; base for relative paths below.
ROOT_DIR = os.getcwd()
# HTML parser that decodes input as UTF-8 regardless of the page's own hints.
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')
#!/usr/bin/env python
""" Check if an item on archive.org has an acoustid.
Usage:
    ./check_for_acoustid.py {item}
Usage with GNU Parallel:
    cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}'
Using Python's built-in defaultdict (from the collections module) we can easily define a tree data structure:
def tree():
    # A tree is simply a defaultdict whose default value is another tree,
    # so any chain of lookups autovivifies the intermediate nodes.
    return defaultdict(tree)
That's it!
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import gevent
# Given an archive.org catalog task id, find the storage node holding the
# item and report the item's size in GB.
taskid="$1"
# Full task log for the given task id.
log=$(curl -s "http://www-tracey.us.archive.org/log_show.php?task_id=$taskid&full=1")
# NOTE(review): this lookbehind matches one specific hard-coded item path;
# generalize the pattern if this should work for arbitrary tasks.
# $log is quoted so the log's whitespace is not mangled by word-splitting.
filepath=$(echo "$log" | grep -Po '(?<=\[dir\]\ \=\>\ )/14/items/archiveteam-mobileme-hero-9' | head -1)
# Storage node name (ia6xxxxx) parsed from the [server...] field of the log.
node=$(echo "$log" | grep -Po '(?<=\[server).*?(?=.us.archive.org)' | head -1 | grep -Po 'ia6[0-9]{5}')
itemsize_url="http://$node.us.archive.org/item-size.php?path=$filepath"
# Item size in KB, scraped from the <size> element of the response.
_itemsize=$(curl -s "$itemsize_url" | grep -Po '(?<=\<size\>).*(?=\</size\>)')
# Convert to GB
itemsize=$(echo "$_itemsize/1000000" | bc)
#!/usr/bin/env python
#
# Provided a list of identifiers for items on archive.org, return all items
# that have an "acoustid" for every original audio file, but NOT a
# "mb_recording_id".
#
import sys
import logging
from datetime import datetime
#!/usr/bin/env python
#
# Find out the most used metadata fields on archive.org
#
import sys
import logging
from datetime import datetime
# Third-party fast JSON decoder; metadata dumps are large enough to matter.
import ujson as json
# cPickle is the Python 2 C-accelerated pickle — this script targets Python 2.
import cPickle as pickle