Skip to content

Instantly share code, notes, and snippets.

@rokroskar
Last active November 14, 2019 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rokroskar/9fd57f2a6428c1ae8444972394fc10f9 to your computer and use it in GitHub Desktop.
Save rokroskar/9fd57f2a6428c1ae8444972394fc10f9 to your computer and use it in GitHub Desktop.
htsget as a git filter

This is a demonstration of using a git filter set up to automatically fetch data from an arbitrary remote server, in this case from htsget.

The necessary setup:

  1. make renku-htsget.py executable and make it available on the PATH.
  2. create a project with a git repository
  3. add a .gitattributes file with the line
*.bam filter=htsget
  4. add the file NA12878_2.bam below and commit it to the repository
  5. set up the htsget filter in git with
git config --global filter.htsget.smudge "renku-htsget.py smudge %f"
git config --global filter.htsget.clean "renku-htsget.py clean %f"
  6. to replace the pointer file with the content from the server, run
git reset --hard

You should see a brief info screen with the content of the pointer file and the data should be downloaded from the server. However, if you do a git show you will still just see the pointer file contents.

You can now push the git repository to a remote and only the contents of the pointer file will be pushed. When you clone it, the content will be automatically downloaded.

# htsget pointer file
url: http://htsnexus.rnd.dnanex.us/v1/reads/BroadHiSeqX_b37/NA12878
reference_name: 2
start: 1000
end: 200000
#!/usr/bin/env python
import argparse
import sys
import hashlib
import warnings
from pathlib import Path
import htsget
import git
import ptvsd
from ruamel.yaml import YAML
# Shared ruamel.yaml instance; smudge() and convert() use it to parse pointer
# files and convert() uses it to write the stored pointer metadata.
yaml = YAML()
# Notice that smudge() writes to stderr before fetching. The placeholders are
# filled from the keys of the parsed pointer file, so the pointer must provide
# url, reference_name, start and end.
info_string = """
Reading from {url} with:
reference_name: {reference_name}
start: {start}
end: {end}
"""
def smudge(filename):
    """Write the data described by an htsget pointer to stdout.

    Acts as the git ``smudge`` filter: the blob stored in git is read from
    stdin. When it parses as a YAML mapping containing a ``url`` key, a short
    notice is written to stderr and the referenced data is fetched with
    ``htsget.get`` straight into stdout. Anything else -- bytes that are not
    valid text, text that is not valid YAML, an empty file, or YAML that is
    not a pointer mapping -- is written back to stdout unchanged, so the
    filter does not break the checkout of files that are not pointers.

    Args:
        filename: Path of the file being checked out, as passed on the
            command line. Not used here; the content arrives on stdin.

    Raises:
        KeyError: If the pointer lacks a field referenced by ``info_string``.
    """
    # Imported locally because the module only imports the YAML class.
    from ruamel.yaml.error import YAMLError

    raw = sys.stdin.buffer.read()
    try:
        data = yaml.load(raw.decode())
    except (UnicodeDecodeError, YAMLError):
        data = None

    # An empty document loads as None and arbitrary text may load as a scalar
    # or a list; only a mapping with a url can be handed to htsget.
    if not isinstance(data, dict) or "url" not in data:
        sys.stdout.buffer.write(raw)
        return

    sys.stderr.write(info_string.format(**data))
    sys.stderr.flush()
    # The pointer's keys are passed as keyword arguments, with stdout as the
    # binary sink for the downloaded data.
    htsget.get(output=sys.stdout.buffer, **data)
def clean(filename):
    """Replace file content with its stored htsget pointer, if one exists.

    Acts as the git ``clean`` filter: the working-tree content is read from
    stdin and its git blob hash is computed. If ``.renku/htsget/<hash>``
    exists (relative to the current directory), the bytes of that file are
    written to stdout in place of the content. Otherwise a warning is issued
    and the content is written to stdout unchanged.

    Args:
        filename: Path of the file being cleaned, as passed on the command
            line. Not used here; the content arrives on stdin.
    """
    blob = sys.stdin.buffer.read()
    sys.stdin.close()

    digest = compute_git_hash(blob)
    spec = Path(".renku/htsget") / digest
    try:
        # Read first and write afterwards, so a failure can never leave a
        # partial pointer followed by the raw blob on stdout.
        pointer = spec.read_bytes()
    except OSError:
        # Missing or unreadable metadata: fall back to storing the content.
        warnings.warn("No corresponding metadata found for hash {}".format(digest[:16]))
        sys.stdout.buffer.write(blob)
        return

    # Bytes in, bytes out: no newline translation or re-encoding of the pointer.
    sys.stdout.buffer.write(pointer)
def compute_git_hash(data):
    """Return the hex SHA-1 that git assigns to a blob containing ``data``.

    The digest covers the header ``blob <size>``, a NUL byte, and then the
    content itself.
    """
    header = "blob {}".format(len(data)).encode() + b"\0"
    digest = hashlib.sha1(header)
    digest.update(data)
    return digest.hexdigest()
def convert(filename):
    """Replace an htsget pointer file with the data it describes.

    The file is recognised as a pointer when bytes 2-7 spell ``htsget`` (the
    ``# htsget pointer file`` header line). Its YAML content is then passed to
    ``htsget.get`` with the file itself as output, and the pointer is saved
    under ``.renku/htsget/<git blob hash of the downloaded data>`` so that
    ``clean`` can later map the data back to the pointer.

    A file that is not a pointer is left untouched and a warning is issued.
    If the download fails, the original pointer content is written back
    before the error propagates.

    Args:
        filename: Path of the pointer file to convert in place.
    """
    target = Path(filename)
    with target.open("rb") as f:
        # Peek at the header before reading everything, so a large data file
        # is never loaded just to be rejected.
        if f.read(8)[2:] != b"htsget":
            warnings.warn(
                "{} is not an htsget pointer file; leaving it untouched".format(filename)
            )
            return
        f.seek(0)
        original = f.read()
    data = yaml.load(original.decode())

    try:
        with target.open("wb") as out:
            htsget.get(output=out, **data)
    except BaseException:
        # Opening for writing truncated the pointer; put it back so a failed
        # or interrupted download does not destroy it.
        target.write_bytes(original)
        raise

    # --no-filters hashes the raw bytes, which is what clean() hashes; without
    # it git would run the configured clean filter on the file first.
    digest = git.Repo().git.hash_object("--no-filters", filename)
    spec = Path(".renku/htsget") / digest
    # parents=True: the .renku directory may not exist yet.
    spec.parent.mkdir(parents=True, exist_ok=True)
    with spec.open("w") as f:
        yaml.dump(data, f)
if __name__ == "__main__":
    # Command-line entry point: parse the action and file name, optionally
    # wait for a debugger, then run the selected action.
    actions = {"smudge": smudge, "clean": clean, "convert": convert}

    parser = argparse.ArgumentParser(description="Convert htsget spec.")
    parser.add_argument(
        "action",
        # Restricting the choices makes a mistyped action a usage error
        # instead of a silent no-op.
        choices=sorted(actions),
        help="Which action to undertake - smudge, clean or convert",
    )
    parser.add_argument("filename", help="Name of the file")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    if args.debug:
        # smudge and clean write the file content to stdout, so the notice
        # must go to stderr to avoid corrupting it.
        print("Waiting for debugger attach", file=sys.stderr)
        ptvsd.enable_attach(address=("localhost", 5678), redirect_output=True)
        ptvsd.wait_for_attach()
        breakpoint()

    actions[args.action](args.filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment