Last active
December 17, 2021 00:35
-
-
Save symm/c0dfac5ca94dc70e6f392e1c36ec1330 to your computer and use it in GitHub Desktop.
Get SHA1 hashes from archive.org metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
###############################
# Extract SHA1 file hashes from archive.org metadata files
#
# Usage: place archive.org {item_name}_files.xml in ./xmls/
# Run ./extract-hashes.py
# One {item_name}_files.sha1 file per input gets written to ./sha1/
###############################
import xml.etree.ElementTree as ET | |
import os | |
import glob | |
# Input metadata files: one archive.org "{item_name}_files.xml" per item.
files = glob.glob('xmls/*.xml')
# Extensions (compared case-insensitively) whose entries are left out of the
# generated .sha1 lists.
blacklist = [".xml", ".torrent", ".jpg", ".png", ".sqlite"]


def write_sha1_file(xml_path, sha1_path, skip_extensions=None):
    """Write "<sha1> <basename>" lines for the files listed in *xml_path*.

    Every ``<file>`` child of the XML root is considered. An entry is skipped
    when it has no ``name`` attribute, when its extension is in
    *skip_extensions*, or when it has no non-empty ``<sha1>`` child.

    Args:
        xml_path: Path of the metadata XML file to read.
        sha1_path: Path of the hash list to create (overwritten if present).
        skip_extensions: Iterable of extensions such as ".jpg" to ignore,
            matched case-insensitively. Defaults to the module ``blacklist``.

    Returns:
        The sum of the ``<size>`` values (as integers) of all entries that
        were written. Entries without a ``<size>`` contribute nothing.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_path* is not valid XML.
        ValueError: If a ``<size>`` value is not an integer.
    """
    if skip_extensions is None:
        skip_extensions = blacklist
    # Lower-case once up front instead of on every entry.
    skipped = {ext.lower() for ext in skip_extensions}

    # Parse before opening the output so that a malformed XML file does not
    # leave an empty .sha1 file behind.
    root = ET.parse(xml_path).getroot()

    total_bytes = 0
    # utf-8 keeps non-ASCII file names writable regardless of the platform's
    # default encoding; the context manager closes the file even on errors.
    with open(sha1_path, 'w', encoding='utf-8') as out:
        for entry in root.findall('file'):
            name = entry.get('name')
            if not name:
                continue
            if os.path.splitext(name)[-1].lower() in skipped:
                continue
            sha1 = entry.findtext('sha1')
            if not sha1:
                continue
            out.write('%s %s\n' % (sha1, os.path.basename(name)))
            # Not every entry carries a <size>; count only those that do.
            size = entry.findtext('size')
            if size:
                total_bytes += int(size)
    return total_bytes


if files:
    # Create the output directory on demand instead of assuming it exists.
    os.makedirs('sha1', exist_ok=True)

for filename in files:
    # Swap only the final extension; str.replace would also rewrite any
    # earlier ".xml" inside the name and would miss an upper-case ".XML".
    sha1_name = os.path.splitext(os.path.basename(filename))[0] + '.sha1'
    accumulator = write_sha1_file(filename, os.path.join('sha1', sha1_name))
    # Report the size of the hashed files in decimal gigabytes, rounded to
    # the nearest whole number.
    gb = round(accumulator / 1000 / 1000 / 1000)
    print('%s %s Gb' % (filename, gb))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment