Skip to content

Instantly share code, notes, and snippets.

@TimoRoth
Last active September 28, 2022 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TimoRoth/4c0eded7004b06659e53345e3f64dbdf to your computer and use it in GitHub Desktop.
Save TimoRoth/4c0eded7004b06659e53345e3f64dbdf to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import tarfile
import requests
import io
class SeekableHttpStream:
    """Read-only, seekable file-like object backed by HTTP Range requests.

    The total size is determined once with a HEAD request.  Every sized
    ``read()`` that cannot be served from the internal buffer downloads the
    requested bytes plus up to ``buf_size`` bytes of read-ahead, so that
    subsequent small sequential reads do not each cost a round trip.

    Attributes:
        transferred: Total number of body bytes downloaded so far
            (informational only).
    """

    def __init__(self, url, buf_size = 10 * 1024 * 1024):
        """Open *url* and query its size.

        Args:
            url: Location of the remote file.
            buf_size: Number of read-ahead bytes fetched with each request.

        Raises:
            requests.HTTPError: If the HEAD request fails.
            KeyError: If the response carries no ``content-length`` header.
        """
        self._url = url
        self._buf_size = buf_size
        self._pos = 0          # current logical read position
        self._buf = None       # read-ahead bytes, or None when invalid
        self._buf_pos = -1     # absolute file offset of self._buf[0]
        self.transferred = 0
        self._ses = requests.Session()
        # requests does not follow redirects for HEAD unless asked to; without
        # this the size would be taken from the redirect response itself.
        r = self._ses.head(self._url, allow_redirects=True)
        r.raise_for_status()
        self._size = int(r.headers["content-length"])

    def _fetch(self, start, end):
        """Download and return bytes ``start..end`` (both inclusive)."""
        r = self._ses.get(self._url, headers={"Range": f"bytes={start}-{end}"})
        r.raise_for_status()
        data = r.content
        # Informational metadata about actually transferred data
        self.transferred += len(data)
        if r.status_code != 206:
            # The server ignored the Range header and sent the whole file;
            # cut out the part that was actually asked for.
            data = data[start:end + 1]
        return data

    def read(self, size=-1):
        """Read up to *size* bytes from the current position.

        Args:
            size: Maximum number of bytes to read.  ``None`` or a negative
                value reads everything up to the end of the file.

        Returns:
            The bytes read; ``b""`` at or beyond the end of the file.

        Raises:
            requests.HTTPError: If a range request fails.
        """
        remaining = max(self._size - self._pos, 0)

        # If the request just asks for the entire file, download the whole
        # remaining range without any read-ahead:
        if size is None or size < 0:
            if remaining == 0:
                return b""
            data = self._fetch(self._pos, self._size - 1)
            self._pos += len(data)
            self._buf = None
            return data

        # Never ask for bytes past EOF: such a range is unsatisfiable and the
        # server would answer with an error instead of an empty body.
        size = min(size, remaining)
        if size == 0:
            return b""

        # Calculate position in our buffer, and use any matching data:
        offset = self._pos - self._buf_pos
        if self._buf and 0 <= offset < len(self._buf):
            res = self._buf[offset:offset + size]
        else:
            self._buf = None
            res = b""

        # Check if the entire request was served from buffer:
        if len(res) >= size:
            self._pos += len(res)
            return res

        # Fetch the remaining data plus a new full buffer (clamped to EOF)
        missing = size - len(res)
        start = self._pos + len(res)
        end = min(start + missing + self._buf_size, self._size) - 1
        data = self._fetch(start, end)

        # Fill remaining request data and buffer
        res += data[:missing]
        self._buf = data[missing:]

        # Both buffer and reading are at the same new pos now
        self._pos += len(res)
        self._buf_pos = self._pos
        return res

    def tell(self):
        """Return the current read position."""
        return self._pos

    def seek(self, offset, whence = io.SEEK_SET):
        """Move the read position and return the new absolute position.

        Args:
            offset: Offset relative to the position selected by *whence*.
            whence: ``io.SEEK_SET``, ``io.SEEK_CUR`` or ``io.SEEK_END``.

        Raises:
            ValueError: If *whence* is invalid or the resulting position
                would be negative.
        """
        if whence == io.SEEK_SET:
            newpos = offset
        elif whence == io.SEEK_CUR:
            newpos = self._pos + offset
        elif whence == io.SEEK_END:
            newpos = self._size + offset
        else:
            raise ValueError(f"invalid whence: {whence!r}")
        if newpos < 0:
            raise ValueError(f"negative seek position: {newpos}")
        self._pos = newpos
        return self._pos

    def readable(self):
        """Return True: the stream supports ``read()``."""
        return True

    def seekable(self):
        """Return True: the stream supports ``seek()`` and ``tell()``."""
        return True

    def close(self):
        """Drop the buffer and close the underlying HTTP session."""
        self._buf = None
        self._ses.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
# Demo: extract a single member from a remote tar archive without downloading
# the whole archive, then report how much of it was actually transferred.
s = SeekableHttpStream("https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/centerlines/RGI62/b_160/L1/RGI60-02/RGI60-02.00.tar")
# mode="r:" opens the archive for reading without compression.  The with
# block makes sure the TarFile is closed even if a lookup or read fails.
with tarfile.open(fileobj=s, mode="r:") as t:
    m = t.getmember('RGI60-02.00/RGI60-02.00751.tar.gz')
    print("Got member: " + m.name)
    f = t.extractfile(m)
    d = f.read()
print(len(d))
# Downloaded bytes as a fraction of the archive's total size
print(s.transferred/s._size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment