@NanoExplorer · Created March 20, 2023
A small Python script for reading multiple tar files from a SCSI tape drive.
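A typical invocation (hypothetical paths): `./tape_read.py /data/tape_dump -f /dev/nst0`. The script extracts each tar archive on the tape into /data/tape_dump in turn until it reaches end-of-data, then ejects the tape.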
#!/usr/bin/env python
import subprocess
import os
import shutil
from glob import glob
from pathlib import Path
import time
import argparse


def format_bytes(size):
    # Convert a byte count into a human-readable string.
    # https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
    power = 2**10  # 1024
    n = 0
    power_labels = {0: 'B ', 1: 'KiB', 2: 'MiB', 3: 'GiB', 4: 'TiB'}
    while size > power:
        size /= power
        n += 1
    return f"{size:6.2f} {power_labels[n]}"
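
# Quick sanity check (hypothetical values, e.g. in a REPL):
#   format_bytes(1536)      -> "  1.50 KiB"
#   format_bytes(3 * 2**30) -> "  3.00 GiB"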


def determine_file_name(f):
    """
    Given a filename f, return a file name that does not
    already exist. Most of the time this is the identity
    function (i.e. it returns f), but if file f already
    exists, return a new name, like "f~1", that can be
    used instead.
    """
    files = glob(f + "*")
    if len(files) == 0:
        return f
    # Take the numeric maximum of the existing "~N" suffixes;
    # a plain lexicographic sort would put "f~10" before "f~2".
    nums = [0]
    for name in files:
        try:
            nums.append(int(name.split('~')[-1]))
        except ValueError:
            # name is the bare file name, with no "~N" suffix
            pass
    return f + f"~{max(nums) + 1}"
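
# Example behavior (assuming these files already exist on disk):
#   no "out.tar" present             -> determine_file_name("out.tar") == "out.tar"
#   "out.tar" only                   -> "out.tar~1"
#   "out.tar" through "out.tar~10"   -> "out.tar~11"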


def du(path):
    """Given a path, return the total size in bytes of all
    files under that path, recursing into sub-directories."""
    # https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python
    working_dir = Path(path)
    files = working_dir.glob("**/*")
    return sum(f.stat().st_size for f in files if f.is_file())
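
# For instance, after extracting two 1 MiB files into "tmp", du("tmp")
# would return 2097152 (hypothetical numbers).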


def main(blksize=10240, dev='/dev/nst1', tarblk=320, tarB=False):
    # Set the tape block size. This value has worked for all 5 tapes
    # I have read so far, but it is not guaranteed to be right for yours.
    assert '/n' in dev, "Must supply a non-rewinding device! On Linux these look like /dev/nstX."
    print("setting blk size")
    subprocess.run(["mt", "-f", dev, "setblk", str(blksize)])
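    # Note: "mt setblk 0" switches the drive to variable-block mode, which
    # is why the --blksize help below suggests trying 0 when the true
    # block size is unknown.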
    keepgoing = True
    os.makedirs("tmp", exist_ok=True)
    os.chdir("tmp")
    while keepgoing:
        # Read the next tar file off the tape. Plain "tar -x" has worked
        # so far, but it's not unthinkable that some tars could be
        # compressed and need a -z or something.
        tar_args = ["tar", "-xf", dev, f"--blocking-factor={tarblk}"]
        if tarB:
            tar_args.append("-B")
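        # tar records are blocking-factor x 512 bytes, so the default of
        # 320 means 160 KiB reads, a whole multiple (16x) of the
        # 10240-byte tape block size set above.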
        with subprocess.Popen(
            tar_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        ) as cp:  # cp is the running tar process (a Popen object)
            print("started tar")
            last_size = 0
            start_time = time.time()
            last_good_time = time.time()
            # While tar is still running, print some progress stats.
            while cp.poll() is None:
                t = time.time()
                time.sleep(5)
                size = du('.')
                dt = time.time() - t
                f_size = format_bytes(size)
                ds = size - last_size
                speed = ds / dt
                f_speed = format_bytes(speed)
                avg_spd = size / (time.time() - start_time)
                f_avg = format_bytes(avg_spd)
                to_print = f"Read {f_size} [{f_speed}/s]. (Average speed {f_avg}/s) "
                if ds > 1:
                    last_good_time = time.time()
                else:
                    time_since = t - last_good_time
                    if time_since > 30:
                        to_print = f"Read {f_size} [{f_speed}/s]. (last successful read: {time_since:.0f}s ago)"
                    if time_since > 1200:  # abort after 20 minutes without progress
                        cp.kill()
                print(to_print, end="\r", flush=True)
                last_size = size
            print('\n tar finished')
            if cp.returncode != 0:
                print(f"tar returned with status code {cp.returncode}.")
                out, err = cp.communicate()
                print(out.decode(), err.decode())
                exit()
        # Go through all the files we just read and move them out of
        # the temporary folder we os.chdir'ed into earlier, renaming
        # them in the process if needed.
        for f in glob("*"):
            print(f"moving file {f}")
            moveto = os.path.join("../", f)
            # make sure we don't overwrite anything
            moveto = determine_file_name(moveto)
            shutil.move(f, moveto)
        # Check whether we are already at an EOF mark. I ran into a problem
        # where sometimes after tar runs we're already at an EOF, and
        # running FSF then would skip a whole file.
        cp = subprocess.run(
            ["mt", "-f", dev, "status"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        out = cp.stdout.decode()
        if "EOF" not in out:
            # Advance to the next file. All this does is move the
            # 'virtual' tape read head from before the EOF mark to after
            # it; it doesn't actually move the tape.
            cp = subprocess.run(["mt", "-f", dev, "fsf"])
        else:
            print("Somehow made it to EOF...")
        # Use FSR to test for end-of-data: going past the end of a file
        # doesn't automatically tell you whether you're at the end of the
        # data, so try to forward-space by one "record".
        """
        Example, I guess:
                                   a     b     c
        block | block | last_block | EOF | EOD |
        "a" is where you are when tar finishes. FSF moves you to b. FSR
        moves you to c, or to block 1 of the next file. If we get "EOD"
        we exit, and if not we use BSR to go back to block 0 of the
        next file.
        """
        print("An I/O error on this line is fine: ", end='', flush=True)
        subprocess.run(["mt", "-f", dev, 'fsr'])  # gives an I/O error at EOD
        print()
        # If we're at the end of data or tape, finish; otherwise back up
        # one record so the next tar starts at block 0 of the next file.
        cp = subprocess.run(
            ["mt", "-f", dev, "status"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        out = cp.stdout.decode()
        print(out)
        if "EOD" in out or "EOT" in out:
            keepgoing = False
        else:
            subprocess.run(['mt', '-f', dev, 'bsr'])
    # Done reading; eject the tape.
    cp = subprocess.run(["mt", "-f", dev, "eject"])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="tape_read",
        description="read all the tar files out of a tape",
    )
    parser.add_argument('dest_folder')
    parser.add_argument('-f', '--file', help="Device file of tape drive", default="/dev/nst0")
    parser.add_argument('-b', '--blksize', type=int, default=10240,
                        help="Block size on tape. Usually 10240 so far, but use 0 if unsure "
                             "(could slow everything down immensely but should at least work)")
    parser.add_argument('-x', '--tarblk', type=int, default=320,
                        help="Tar blocking factor. Suggested values are 20 (tar default); "
                             "112 or 256 (suggested by a GNU webpage on tapes); or 320 "
                             "(this script's default, somewhat arbitrary but seems to work decently well)")
    parser.add_argument('-B', '--tarB', action="store_true",
                        help="Tell tar to re-block partial reads into whole blocks. This may be "
                             "helpful if we think that short reads are slowing down the untar process")
    args = parser.parse_args()
    os.makedirs(args.dest_folder, exist_ok=True)
    os.chdir(args.dest_folder)
    main(blksize=args.blksize, dev=args.file, tarblk=args.tarblk, tarB=args.tarB)