jjjake/parse_audfprint_output.py

## parse_audfprint_output.py
#!/usr/bin/env python
"""Parse audfprint .out files.

example input:

    Fri Jan  8 00:07:47 2016 Reading hash table /1/2015/db-dem3/dem3-debate-aa.db
    NOMATCH precomp/1/2015/mp3s/ALJAZAM_20151219_000000_News.afpt 3659.9 sec 299066 raw hashes
    Matched    2.9 s starting at   35.1 s in precomp/1/2015/mp3s/ALJAZAM_20151220_040000_Weekend_News.afpt to time    0.8 s in /1/2015/dem3-mp4/2015-12-19-D-Debate-0050.mp4 with    76 of  1264 common hashes at rank  5

"""
import sys


def parse_match_line(line, padding=1.5, min_confidence=20):
    """Turn one audfprint ``Matched`` report line into output fields.

    The line is expected to follow the shape shown in the module docstring
    (a single line; wrapped here for readability)::

        Matched <dur> s starting at <start> s in <dir>/<identifier>.afpt
        to time <t> s in <dir>/<clip>.<ext> with <n> of <m> common hashes
        at rank <r>

    Args:
        line: A single line of text that starts with ``Matched``.
        padding: Seconds of padding applied on each side of the match: the
            start is moved ``padding`` earlier and the duration grows by
            ``2 * padding``.
        min_confidence: A match whose common-hash count (``<n>``) is less
            than or equal to this value is flagged.

    Returns:
        A list of five strings: the archive.org embed URL, the padded
        duration in seconds, the matched clip name (last path component up
        to its first ``.``), the common-hash count, and ``'true'`` or
        ``'false'`` depending on whether the match was flagged.

    Raises:
        ValueError, IndexError: If the expected fields are missing from
            ``line`` or are not numeric.
    """
    # Name of the fingerprinted item: last path component before '.afpt'.
    identifier = line.split('.afpt')[0].split('/')[-1]
    # Move the start earlier by the padding amount.
    start = float(line.split('starting at')[-1].split()[0]) - padding
    # Pad both ends; the start has already been shifted, so the duration
    # has to grow by twice the padding.
    seconds_matched = float(line.split()[1]) + 2 * padding
    clip_matched = line.split('/')[-1].split()[0].split('.')[0]
    confidence = line.split('with')[-1].split()[0]
    embed_url = 'https://archive.org/embed/{0}&start={1}&end={2}'.format(
        identifier, str(start), start + seconds_matched)

    # Flag matches that may be troublesome: the padded start falls before
    # the beginning of the item, or too few hashes matched.
    troublesome = start < 0 or int(confidence) <= min_confidence
    flag = 'true' if troublesome else 'false'

    return [embed_url, str(seconds_matched), clip_matched, confidence, flag]


if __name__ == '__main__':
    # The input path is the last command-line argument. The ``with`` block
    # guarantees the file is closed even if a line fails to parse.
    with open(sys.argv[-1]) as report:
        for line in report:
            if not line.startswith('Matched'):
                continue
            print('\t'.join(parse_match_line(line)))
	#!/usr/bin/env python
	"""Parse audfprint .out files.

	example input:

	Fri Jan 8 00:07:47 2016 Reading hash table /1/2015/db-dem3/dem3-debate-aa.db
	NOMATCH precomp/1/2015/mp3s/ALJAZAM_20151219_000000_News.afpt 3659.9 sec 299066 raw hashes
	Matched 2.9 s starting at 35.1 s in precomp/1/2015/mp3s/ALJAZAM_20151220_040000_Weekend_News.afpt to time 0.8 s in /1/2015/dem3-mp4/2015-12-19-D-Debate-0050.mp4 with 76 of 1264 common hashes at rank 5

	"""
	import sys


	def parse_match_line(line, padding=1.5, min_confidence=20):
		"""Turn one audfprint ``Matched`` report line into output fields.

		The line is expected to follow the shape shown in the module
		docstring (a single line; wrapped here for readability)::

			Matched <dur> s starting at <start> s in <dir>/<identifier>.afpt
			to time <t> s in <dir>/<clip>.<ext> with <n> of <m> common
			hashes at rank <r>

		Args:
			line: A single line of text that starts with ``Matched``.
			padding: Seconds of padding applied on each side of the match:
				the start is moved ``padding`` earlier and the duration
				grows by ``2 * padding``.
			min_confidence: A match whose common-hash count (``<n>``) is
				less than or equal to this value is flagged.

		Returns:
			A list of five strings: the archive.org embed URL, the padded
			duration in seconds, the matched clip name (last path component
			up to its first ``.``), the common-hash count, and ``'true'``
			or ``'false'`` depending on whether the match was flagged.

		Raises:
			ValueError, IndexError: If the expected fields are missing from
				``line`` or are not numeric.
		"""
		# Name of the fingerprinted item: last path component before '.afpt'.
		identifier = line.split('.afpt')[0].split('/')[-1]
		# Move the start earlier by the padding amount.
		start = float(line.split('starting at')[-1].split()[0]) - padding
		# Pad both ends; the start has already been shifted, so the duration
		# has to grow by twice the padding.
		seconds_matched = float(line.split()[1]) + 2 * padding
		clip_matched = line.split('/')[-1].split()[0].split('.')[0]
		confidence = line.split('with')[-1].split()[0]
		embed_url = 'https://archive.org/embed/{0}&start={1}&end={2}'.format(
			identifier, str(start), start + seconds_matched)

		# Flag matches that may be troublesome: the padded start falls before
		# the beginning of the item, or too few hashes matched.
		troublesome = start < 0 or int(confidence) <= min_confidence
		flag = 'true' if troublesome else 'false'

		return [embed_url, str(seconds_matched), clip_matched, confidence, flag]


	if __name__ == '__main__':
		# The input path is the last command-line argument. The ``with`` block
		# guarantees the file is closed even if a line fails to parse.
		with open(sys.argv[-1]) as report:
			for line in report:
				if not line.startswith('Matched'):
					continue
				print('\t'.join(parse_match_line(line)))