Skip to content

Instantly share code, notes, and snippets.

@tsudoko
Created October 11, 2018 22:27
Show Gist options
  • Save tsudoko/46faae62bbb7c58d033e3d688aca617e to your computer and use it in GitHub Desktop.
Save tsudoko/46faae62bbb7c58d033e3d688aca617e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import collections
import os
import sys
# Record kept for each opened CDX input:
#   sep    -- the one-byte field separator read from the start of the file
#   fields -- tuple of field identifiers parsed from the header line
#   file   -- the open binary file object the entries are read from
CDXFile = collections.namedtuple("CDXFile", "sep fields file")

# Field identifiers whose absence from an input triggers the warc-dedup
# warning.  NOTE(review): in the common CDX legend these are presumably
# a = original URL, k = checksum, u = URN/record id -- confirm against the
# warc-dedup consumer.
essential_fields = frozenset({"a", "k", "u"})

# When true, main() reports inputs that lack any of `essential_fields`.
warn_warc_dedup = True
def say(*args):
    """Print a diagnostic to stderr, prefixed with the script's name.

    The positional arguments are forwarded to ``print`` unchanged, so they
    are converted with ``str()`` and joined by single spaces.
    """
    # The prefix deliberately has no trailing space: print() already inserts
    # one separator between the prefix and the first argument, so a literal
    # "name: " would produce a doubled space ("name:  message").
    prefix = "{}:".format(os.path.basename(sys.argv[0]))
    print(prefix, *args, file=sys.stderr)
def main(*cdx, dest=sys.stdout.buffer):
    """Merge several CDX index files into one, keeping only shared fields.

    Each input must start with a one-byte separator followed by the magic
    ``CDX `` and a header line listing its field identifiers.  The output is
    written to *dest* as a space-separated CDX file whose header lists the
    fields present in every input, in the order they appear in the first
    input's header.  Entries are then copied from each input in argument
    order, reduced to those fields.

    Parameters:
        *cdx: paths of the CDX files to merge.  A path given more than once
            is read only once.
        dest: writable binary stream that receives the merged file.

    Raises:
        SystemExit: with status 1 when an input lacks the CDX magic.
        OSError: when an input cannot be opened or read.
        KeyError: when an entry has fewer columns than its file's header
            promises for one of the shared fields.
    """
    files = {}
    # Every handle we open, tracked separately from `files` so that the
    # finally clause can close them even when header parsing bails out.
    handles = []
    # Shared fields in output order; None until the first header is parsed.
    orderedfields = None

    try:
        for filename in cdx:
            if filename in files:
                # Opening the same path again would only leak the earlier
                # handle: the mapping keeps a single entry per filename.
                continue

            f = open(filename, "rb")
            handles.append(f)

            sep = f.read(1)
            if sep != b" ":
                # having multiple different separators in input files makes
                # stream processing much harder, you don't know beforehand if
                # there aren't any conflicts in field contents, e.g. some
                # field from file 1 containing a field separator from file 2
                # this could be solved with an escaping mechanism, but afaik
                # cdx doesn't have one
                say("{}: separator is not space: {}".format(filename, sep))

            magic = f.read(4)
            if magic != b"CDX ":
                say("{}: invalid cdx magic: {}".format(filename, magic))
                # sys.exit rather than the `site` helper exit(), which is
                # not installed under `python -S` or in frozen builds.
                sys.exit(1)

            fields = tuple(
                x.decode() for x in f.readline().rstrip().split(sep)
            )

            if orderedfields is None:
                # The first header fixes the column order.  Only distinct
                # single-character identifiers below code point 256 can be
                # shared fields; dict.fromkeys drops repeats while keeping
                # first-seen order.
                orderedfields = tuple(
                    x for x in dict.fromkeys(fields)
                    if len(x) == 1 and ord(x) < 256
                )
            else:
                present = frozenset(fields)
                orderedfields = tuple(
                    x for x in orderedfields if x in present
                )

            files[filename] = CDXFile(sep=sep, fields=fields, file=f)

        if orderedfields is None:
            # No inputs at all: emit a header with no fields.
            orderedfields = ()

        dest.write(b" CDX ")
        dest.write(b" ".join(x.encode() for x in orderedfields))
        dest.write(b"\n")

        for c in files.values():
            # Iterate the file object directly so large indexes are streamed
            # instead of being loaded into memory in one go.
            for line in c.file:
                stripped = line.rstrip()
                if not stripped:
                    # Blank lines carry no entry.
                    continue
                # zip() pairs header names with columns and ignores surplus
                # columns rather than failing on them.
                entry = dict(zip(c.fields, stripped.split(c.sep)))
                dest.write(b" ".join(entry[x] for x in orderedfields))
                dest.write(b"\n")
        dest.flush()

        if warn_warc_dedup:
            for filename, c in files.items():
                essential_missing = essential_fields - set(c.fields)
                if essential_missing:
                    say("{}: warning: missing fields for warc-dedup: {}".format(filename, set(essential_missing)))
    finally:
        for f in handles:
            f.close()
if __name__ == "__main__":
    # Script entry point: every command-line argument is a CDX file to merge.
    if len(sys.argv) < 2:
        # The usage text goes to stderr because stdout is main()'s default
        # destination for the merged data; printing it to stdout would put
        # the message into a redirected output file.
        print("usage: {} cdxfile...".format(os.path.basename(sys.argv[0])), file=sys.stderr)
        # sys.exit rather than the `site` helper exit(), which is not
        # installed under `python -S`.
        sys.exit(1)
    main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment