# Source: GitHub Gist nsapa/8c68bf74035d0e7bc8e09e7a572e4f91 (by @nsapa, last active October 29, 2020 18:36).
#!/usr/bin/env python3
import os
import sys
import xml.etree.ElementTree as ET
# XML file listing read when no path is given on the command line.
DEFAULT_FILES_XML = 'yahoogroup_info_grab_files.xml'

# Values of the <format> child element whose entries are never reported.
SKIPPED_FORMATS = frozenset(
    ['Item Tile', 'Item CDX Index', 'Item CDX Meta-Index', 'Metadata'])


def iter_checksum_entries(root):
    """Yield ``(source, format, sha1, name)`` for every reportable entry.

    *root* is the root element of the parsed listing; each direct child is
    treated as one file entry, e.g. an element with the attributes
    ``name="AO3_story_dump_continuing_reviews.xml" source="original"`` and
    child elements such as ``<sha1>`` and ``<format>``.

    An entry is reported only when all of the following hold:

    * its ``source`` attribute is exactly ``'original'`` (entries without a
      ``source`` attribute are skipped);
    * it has a non-empty ``<sha1>`` child;
    * its ``<format>`` text is not one of ``SKIPPED_FORMATS``.

    Raises:
        KeyError: if a reportable entry has no ``name`` attribute.
    """
    for entry in root:
        source = entry.get('source')
        if source != 'original':
            continue

        sha1 = ''
        fileformat = ''
        # Walk the children so that, as before, the last matching tag wins.
        for child in entry:
            if child.tag == 'sha1':
                # An empty element has text None; normalise it to '' so it is
                # treated as "no checksum" instead of being printed as "None".
                sha1 = child.text or ''
            elif child.tag == 'format':
                fileformat = child.text or ''

        if not sha1:
            continue
        if fileformat in SKIPPED_FORMATS:
            continue

        yield source, fileformat, sha1, entry.attrib['name']


def main(argv=None):
    """Print a comment line and a ``sha1<TAB>name`` line per reportable entry.

    *argv* is the list of command-line arguments without the program name
    (defaults to ``sys.argv[1:]``).  Its first item, when present, is the
    path of the XML listing; otherwise ``DEFAULT_FILES_XML`` is read from the
    current directory.
    """
    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else DEFAULT_FILES_XML
    root = ET.parse(path).getroot()
    for source, fileformat, sha1, name in iter_checksum_entries(root):
        print('#%s - %s\n%s\t%s' % (source, fileformat, sha1, name))


if __name__ == '__main__':
    main()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")