Skip to content

Instantly share code, notes, and snippets.

@edsu
Created July 21, 2022 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/2d80da5b1956ccac54dea197ee416507 to your computer and use it in GitHub Desktop.
Save edsu/2d80da5b1956ccac54dea197ee416507 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import json
# Druid identifiers to look for in the CDXJ index. Each one is compared
# against the druid string rebuilt from an index record's 'filename' path.
druids = ['bj330fg0526', 'bp312sd3142', 'bs648dv9357', 'bz893jg7695', 'bz922hc1158', 'cc095kz3027', 'ch908dt6803', 'cp809cz8166', 'cv292vs5727', 'dn752dz0508', 'dy271hk6968', 'fd892fn4310', 'fj109wp2130', 'fn912wb3725', 'fp815hx3553', 'fs415vb1264', 'fv812yp9241', 'fw782ks7983', 'gf100kp6588', 'gj901jn9353', 'hf001pb6273', 'hh929wg3298', 'hn217tx5368', 'hq140wy0905', 'hv642nf7717', 'hv698ks1475', 'hw434pj6642', 'hw645gv7743', 'jb739pj9696', 'jg940ts4575', 'jh597wr5998', 'jz331hr5976', 'kw186hs7975', 'kx196rt8122', 'ky214ft2956', 'ky357nb9554', 'mg249dy7051', 'mk879xr0461', 'mv110pd4781', 'mv300dt6569', 'mx349xb4098', 'mz415jv3453', 'nd087pt9085', 'nk906ht6735', 'nn453zz9250', 'nr015ch1092', 'nv773xq7981', 'pf139tj8228', 'pn628yn6194', 'pq169jd6716', 'px611qw1504', 'qd726vf4177', 'qk039cf4369', 'qw725qm9638', 'qx771bj6775', 'rv306cp2774', 'sd725cc2793', 'sk583gg2589', 'sn506gj4859', 'sq394vr6558', 'sq694nb4696', 'st474bt2800', 'tk364rs5190', 'tw357sy1852', 'tx189sh1771', 'tz511vp4591', 'vc422nk8372', 'vg960hm2114', 'vw888cw4006', 'wj386hv6416', 'ws693rv6937', 'wx052dy6761', 'wy674gp2809', 'xf312dk5839', 'xs464np9114', 'xw132hq6957', 'xx786xt4190', 'xy030xz3308', 'yn481wn9363', 'zf348pz3049', 'zf772rq0857', 'zr926dz8753', 'zs791bz3245', 'zw829wv5935']
# Location of the CDXJ index that is scanned for the druids of interest.
INDEX_PATH = '/web-archiving-stacks/data/indexes/cdxj/level3.cdxj'


def druid_from_filename(filename):
    """Rebuild a druid string from an index record's 'filename' path.

    The path is split on "/", the first and last components are dropped,
    and the remaining components are concatenated, e.g.
    "/bj/330/fg/0526/a.warc.gz" -> "bj330fg0526".
    """
    return ''.join(filename.split("/")[1:-1])


def find_seen_druids(lines, wanted):
    """Return the subset of *wanted* druids that occur in *lines*.

    Args:
        lines: iterable of index lines, each of the form
            "<surt> <offset> <json>", where the JSON object has a
            'filename' key.
        wanted: iterable of druid strings to look for.

    Returns:
        A set containing every wanted druid found in at least one line.

    Raises:
        ValueError: if a non-blank line lacks the three space-separated
            fields or its JSON block is malformed
            (json.JSONDecodeError is a ValueError subclass).
        KeyError: if a record's JSON object has no 'filename' key.
    """
    # Build the lookup set once so each membership test is O(1)
    # instead of a list scan per index line.
    wanted = set(wanted)
    found = set()
    for line in lines:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than failing the three-way unpack below.
            continue
        # Only the JSON payload is needed; the first two fields are ignored.
        _, _, json_block = line.split(" ", 2)
        record = json.loads(json_block)
        druid = druid_from_filename(record['filename'])
        if druid in wanted:
            found.add(druid)
    return found


def main():
    """Scan the index file and report which druids were and were not seen."""
    # The context manager guarantees the file is closed even on error.
    with open(INDEX_PATH) as fh:
        # The first line is skipped, as the original script did
        # (presumably a header line -- TODO confirm). The None default
        # avoids StopIteration on an empty file.
        next(fh, None)
        seen = find_seen_druids(fh, druids)
    print("SEEN: ", seen)
    print("NOT SEEN:", set(druids) - seen)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment