Skip to content

Instantly share code, notes, and snippets.

@borice
Created March 31, 2020 02:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save borice/e10e16a609b8fcb04f66c0382caf66e9 to your computer and use it in GitHub Desktop.
Save borice/e10e16a609b8fcb04f66c0382caf66e9 to your computer and use it in GitHub Desktop.
Example code for converting between the pairtree and stubbytree directory structures used by the HTRC Extracted Features dataset
#!/usr/bin/env python3
# Note: depends on `pairtree` package (`pip install pairtree`)
import os
import pairtree.pairtree_path as ppath
def stubby_to_pairtree(path: str, ef_ext: str = '.json.bz2') -> str:
    """Translate a stubbytree EF file path into its pairtree equivalent.

    The file name is expected to look like ``<lib_id>.<clean_volid><ef_ext>``
    and to sit two directory levels below the dataset root
    (``<root>/<dir>/<dir>/<file>``).  The library id is taken from the file
    name, not from the directory names.  For example::

        loc/a+30795/loc.ark+=13960=t70v90g5f.json.bz2
        -> loc/pairtree_root/ar/k+/=1/39/60/=t/70/v9/0g/5f/
           ark+=13960=t70v90g5f/loc.ark+=13960=t70v90g5f.json.bz2

    Args:
        path: Stubbytree path of the EF file; must end with ``ef_ext``.
        ef_ext: Extension carried by EF files.

    Returns:
        The pairtree-style path of the same file, built under the same
        dataset root as ``path``.

    Raises:
        AssertionError: If ``path`` does not end with ``ef_ext`` or its
            directory part splits into fewer than two components.
        ValueError: If the file name has no ``.`` separating the library id
            from the volume id.
    """
    assert path.endswith(ef_ext)
    stub_dir, filename = os.path.split(path)
    assert len(stub_dir.split(os.sep)) >= 2

    # Slice by length rather than with a negative index: `name[:-0]` would
    # empty the name when ef_ext is ''.
    stem = filename[:len(filename) - len(ef_ext)]
    lib_id, clean_volid = stem.split('.', 1)

    # NOTE(review): id_decode presumably turns the filename-safe id back into
    # the raw volume id that id_to_dirpath expects -- confirm against the
    # pairtree package documentation.
    volid = ppath.id_decode(clean_volid)

    # Drop the two trailing directories to reach the dataset root.  dirname()
    # keeps a leading separator, so an absolute path that is only two levels
    # deep (e.g. /loc/a+30795/...) stays absolute instead of silently
    # becoming relative.
    dataset_root = os.path.dirname(os.path.dirname(stub_dir))
    pairtree_root = os.path.join(dataset_root, lib_id, 'pairtree_root')
    pairtree_dir = ppath.id_to_dirpath(volid, pairtree_root)
    return os.path.join(pairtree_dir, clean_volid, filename)
def pairtree_to_stubby(path: str, ef_ext: str = '.json.bz2') -> str:
    """Translate a pairtree EF file path into its stubbytree equivalent.

    The file name is expected to look like ``<lib_id>.<clean_volid><ef_ext>``
    and the directory part must contain ``<lib_id>/pairtree_root/``.
    Everything before that marker is kept as the dataset root; the stubbytree
    directory is ``<lib_id>/<every third character of clean_volid>``.
    For example::

        loc/pairtree_root/ar/k+/=1/39/60/=t/70/v9/0g/5f/
            ark+=13960=t70v90g5f/loc.ark+=13960=t70v90g5f.json.bz2
        -> loc/a+30795/loc.ark+=13960=t70v90g5f.json.bz2

    Args:
        path: Pairtree path of the EF file; must end with ``ef_ext``.
        ef_ext: Extension carried by EF files.

    Returns:
        The stubbytree-style path of the same file, built under the same
        dataset root as ``path``.

    Raises:
        AssertionError: If ``path`` does not end with ``ef_ext`` or does not
            mention ``pairtree_root`` at all.
        ValueError: If the file name has no ``.`` separating the library id
            from the volume id, or if the directory part does not contain
            ``<lib_id>/pairtree_root/`` for the library id in the file name.
    """
    # Kept as asserts so callers that already see AssertionError for these
    # two preconditions keep seeing it.
    assert path.endswith(ef_ext)
    assert 'pairtree_root' in path

    pair_dir, filename = os.path.split(path)
    # Slice by length rather than with a negative index: `name[:-0]` would
    # empty the name when ef_ext is ''.
    stem = filename[:len(filename) - len(ef_ext)]
    lib_id, clean_volid = stem.split('.', 1)

    marker = lib_id + os.sep + 'pairtree_root' + os.sep
    marker_at = pair_dir.find(marker)
    if marker_at < 0:
        # str.find() returns -1 when the marker is missing; slicing with that
        # value would just chop one character off the directory and return a
        # plausible-looking but wrong path, so fail loudly instead.
        raise ValueError(
            'expected %r inside the directory part of %r' % (marker, path)
        )
    root = pair_dir[:marker_at]

    # The stub directory name is every third character of the clean volume id.
    stubby_path = os.path.join(lib_id, clean_volid[::3])
    return os.path.join(root, stubby_path, filename)
# Round-trip self-check on one known volume: the same EF file expressed in the
# stubbytree layout (s) and in the pairtree layout (p).
s = 'loc/a+30795/loc.ark+=13960=t70v90g5f.json.bz2'
p = 'loc/pairtree_root/ar/k+/=1/39/60/=t/70/v9/0g/5f/ark+=13960=t70v90g5f/loc.ark+=13960=t70v90g5f.json.bz2'

# stubbytree -> pairtree must reproduce p exactly.
pair = stubby_to_pairtree(s)
print(pair)
assert p == pair

# pairtree -> stubbytree must reproduce s exactly.
stubby = pairtree_to_stubby(p)
print(stubby)
assert s == stubby
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment