Skip to content

Instantly share code, notes, and snippets.

@symm
Last active December 17, 2021 00:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save symm/c0dfac5ca94dc70e6f392e1c36ec1330 to your computer and use it in GitHub Desktop.
Save symm/c0dfac5ca94dc70e6f392e1c36ec1330 to your computer and use it in GitHub Desktop.
Get SHA1 hashes from archive.org metadata
#!/usr/bin/env python
###############################
# Extract SHA1 file hashes from archive.org metadata files
#
# Usage: place archive.org {item_name}_files.xml in ./xmls/
# Run ./extract-hashes.py
# sha1 files get written to ./sha1
###############################
import xml.etree.ElementTree as ET
import os
import glob
# File extensions (lower-case, with leading dot) whose entries are left out
# of the generated .sha1 lists.
blacklist = [".xml", ".torrent", ".jpg", ".png", ".sqlite"]


def extract_hashes(xml_path, sha1_dir='sha1', skip_extensions=None):
    """Write a ``.sha1`` list for one ``*_files.xml`` and return its byte total.

    Every ``<file>`` element directly under the XML root that has a ``name``
    attribute, a non-empty ``<sha1>`` child and an extension not listed in
    *skip_extensions* produces one ``"<sha1> <basename>"`` line in
    ``<sha1_dir>/<xml base name without extension>.sha1``.

    Args:
        xml_path: Path of the metadata XML file to read.
        sha1_dir: Directory that receives the ``.sha1`` file; created when
            it does not exist yet.
        skip_extensions: Iterable of extensions to ignore (compared
            case-insensitively). Defaults to the module-level ``blacklist``.

    Returns:
        The sum of the integer ``<size>`` values of the entries that were
        written. Entries without a numeric ``<size>`` are still written but
        contribute nothing to the total.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_path* is not well-formed.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if skip_extensions is None:
        skip_extensions = blacklist
    # Normalise once instead of re-lowering the list for every entry.
    skipped = frozenset(ext.lower() for ext in skip_extensions)

    # Parse before touching the output so that a malformed XML file does not
    # leave an empty .sha1 file behind.
    root = ET.parse(xml_path).getroot()

    if not os.path.isdir(sha1_dir):
        os.makedirs(sha1_dir)
    # Swap only the real extension; str.replace would also rewrite any
    # '.xml' occurring in the middle of the base name.
    stem = os.path.splitext(os.path.basename(xml_path))[0]
    out_path = os.path.join(sha1_dir, stem + '.sha1')

    total_bytes = 0
    with open(out_path, 'w') as out:
        for entry in root.findall('file'):
            name = entry.get('name')
            if not name or os.path.splitext(name)[-1].lower() in skipped:
                continue
            sha1 = (entry.findtext('sha1') or '').strip()
            if not sha1:
                # Nothing to verify against, so the entry is not listed.
                continue
            out.write('%s %s\n' % (sha1, os.path.basename(name)))
            # NOTE(review): sizes are only tallied for entries that were
            # written, which is how the original statement order reads.
            size = (entry.findtext('size') or '').strip()
            if size.isdigit():
                total_bytes += int(size)
    return total_bytes


def main():
    """Process every ``xmls/*.xml`` file and print its total in decimal gigabytes."""
    for filename in glob.glob('xmls/*.xml'):
        total_bytes = extract_hashes(filename)
        gb = round(total_bytes / 1000 / 1000 / 1000)
        print('%s %s Gb' % (filename, gb))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment