Created
March 20, 2023 22:19
-
-
Save NanoExplorer/95463ca89697d2febb88ad33dc1fd821 to your computer and use it in GitHub Desktop.
A small Python script for reading multiple tar files from a SCSI tape drive.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import subprocess | |
import os | |
import shutil | |
from glob import glob | |
from pathlib import Path | |
import time | |
import argparse | |
def format_bytes(size):
    """Format a byte count as a fixed-width, human-readable string.

    The value is repeatedly divided by 1024 until it drops below 1024 or
    the largest known unit (TiB) is reached, then rendered as
    ``f"{value:6.2f} {unit}"``.

    Args:
        size: A number of bytes (int or float).

    Returns:
        A string such as ``"  1.50 KiB"``. Values of 1024 TiB and above are
        still expressed in TiB rather than raising an error.
    """
    # https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
    power = 2**10  # 1024
    power_labels = ('B ', 'KiB', 'MiB', 'GiB', 'TiB')
    n = 0
    # ">=" so that exactly 1024 B is shown as 1 KiB; the bound on n keeps
    # very large values from indexing past the last label.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return f"{size:6.2f} {power_labels[n]}"
def determine_file_name(f):
    """Return a path based on ``f`` that does not exist yet.

    If nothing exists at ``f``, ``f`` itself is returned. Otherwise the
    first free name of the form ``f~1``, ``f~2``, ... is returned.

    Existence is tested directly on each candidate path, so unrelated files
    that merely share the prefix of ``f``, names containing ``{``/``}`` or
    glob metacharacters, and suffixes of 10 and above are all handled.

    Args:
        f: The desired file or directory path.

    Returns:
        ``f`` or ``f~N`` for the smallest positive N that is unused.
    """
    # lexists (rather than exists) also treats a dangling symlink as taken.
    if not os.path.lexists(f):
        return f
    no = 1
    while os.path.lexists(f"{f}~{no}"):
        no += 1
    return f"{f}~{no}"
def du(path):
    """Return the total size in bytes of all regular files under ``path``.

    Files directly inside ``path`` and files in any nested directory are
    counted. Entries that disappear or become unreadable while the tree is
    being scanned are skipped instead of aborting the whole measurement,
    since this is called while another process is still writing files.

    Args:
        path: Directory to measure (str or path-like).

    Returns:
        The summed ``st_size`` of every file found.
    """
    # https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python
    total = 0
    for entry in Path(path).glob("**/*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            # The entry vanished or changed between listing and stat().
            continue
    return total
def _tape_status(dev):
    """Return the combined stdout/stderr text of ``mt -f <dev> status``."""
    result = subprocess.run(
        ["mt", "-f", dev, "status"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return result.stdout.decode(errors="replace")


def _run_tar_with_progress(tar_args):
    """Run tar in the current directory, printing progress about every 5 s.

    Returns a ``(returncode, stdout_text, stderr_text)`` tuple. tar is killed
    if the extracted size has not grown for more than 20 minutes.
    """
    import tempfile  # local import: only needed here

    # tar's output goes to temporary files rather than pipes: nothing reads
    # the pipes while we poll, so a chatty tar could fill the pipe buffer
    # and block forever.
    with tempfile.TemporaryFile() as out_file, tempfile.TemporaryFile() as err_file:
        with subprocess.Popen(tar_args, stdout=out_file, stderr=err_file) as cp:
            print("started tar")
            last_size = 0
            start_time = time.time()
            last_good_time = start_time
            # If tar isn't done, print some stats
            while cp.poll() is None:
                t = time.time()
                time.sleep(5)
                size = du('.')
                now = time.time()
                dt = now - t
                ds = size - last_size
                f_size = format_bytes(size)
                f_speed = format_bytes(ds / dt)
                f_avg = format_bytes(size / (now - start_time))
                to_print = f"Read {f_size} [{f_speed}/s]. (Average speed {f_avg}/s) "
                if ds > 1:
                    last_good_time = now
                else:
                    time_since = now - last_good_time
                    if time_since > 30:
                        to_print = f"Read {f_size} [{f_speed}/s]. (last successful read: {time_since:.0f}s ago)"
                    if time_since > 1200:  # abort after 20 minutes
                        cp.kill()
                print(to_print, end="\r", flush=True)
                last_size = size
        out_file.seek(0)
        err_file.seek(0)
        out = out_file.read().decode(errors="replace")
        err = err_file.read().decode(errors="replace")
    return cp.returncode, out, err


def main(blksize=10240,dev='/dev/nst1',tarblk=320, tarB=False):
    """Extract every tar archive found on a tape, one after another.

    Creates (if needed) and changes into a ``tmp`` directory below the
    current working directory, extracts each archive there with ``tar``,
    then moves the extracted entries up one level, renaming them via
    ``determine_file_name`` so nothing already present is overwritten.
    Between archives the tape is positioned with ``mt``; when ``mt status``
    reports EOD or EOT the loop ends and the tape is ejected.

    Args:
        blksize: Block size passed to ``mt setblk``.
        dev: Tape device file; must be a non-rewinding device.
        tarblk: Value for tar's ``--blocking-factor``.
        tarB: If true, pass ``-B`` to tar.

    Raises:
        AssertionError: If ``dev`` does not look like a non-rewinding device.
        SystemExit: With status 1 if tar exits with a non-zero status.
    """
    # Explicit raise instead of ``assert`` so the check is not stripped under
    # ``python -O``; the exception type is kept for compatibility.
    if '/n' not in dev:
        raise AssertionError("Must supply a non-rewinding device! On Linux these look like /dev/nstX.")

    # Set block size. Note that this has worked for
    # all 5 tapes I have read so far, but is not guaranteed
    print("setting blk size")
    subprocess.run(["mt", "-f", dev, "setblk", str(blksize)])

    os.makedirs("tmp", exist_ok=True)
    os.chdir("tmp")

    keepgoing = True
    while keepgoing:
        # Start reading tar files. Note again that this has
        # worked so far, but it's not unthinkable that some
        # tars could be compressed and need a z or something
        tar_args = ["tar", "-xf", dev, f"--blocking-factor={tarblk}"]
        if tarB:
            tar_args.append("-B")

        returncode, out, err = _run_tar_with_progress(tar_args)
        print('\n tar finished')
        if returncode != 0:
            print(f"tar returned with status code {returncode}.")
            print(out, err)
            # Exit with a failure status so callers can detect the error.
            raise SystemExit(1)

        # go through all the files we just read and move them out of
        # the temporary folder we os.chdir'ed into earlier, renaming
        # them in the process if needed.
        for f in glob("*"):
            print(f"moving file {f}")
            # make sure we don't overwrite anything
            moveto = determine_file_name(os.path.join("../", f))
            shutil.move(f, moveto)

        # Advance to the next file. Note that all this does is move the
        # 'virtual' tape read head from before the EOF mark to after,
        # but doesn't actually move the tape.
        # I ran into a problem where sometimes after tar runs we'll already be
        # at an EOF, and if we run FSF then, we'll skip a whole file.
        if "EOF" not in _tape_status(dev):
            subprocess.run(["mt", "-f", dev, "fsf"])
        else:
            print("Somehow made it to EOF...")

        # use fsr to test for eod? Apparently when you go past the end of the file
        # you don't automatically notice if you're at the end of the data. To achieve
        # that, I'll try to forward space by one "record"
        #
        # example I guess:
        #                             a     b     c
        #   block | block | last_block | EOF | EOD |
        #
        # "a" is where you are when tar finishes. FSF moves you to b. FSR moves
        # you to c, or to block 1 of the next file. If we get "EOD" we exit, and
        # if not we use "BSR" to go back to block 0 of the next file?
        print("An I/O error on this line is fine: ",end='',flush=True)
        subprocess.run(["mt","-f",dev,'fsr'])  # will give an io error at EOD
        print()

        # If we're at the end of data or tape, finish.
        out = _tape_status(dev)
        print(out)
        if "EOD" in out or "EOT" in out:
            keepgoing = False
        else:
            subprocess.run(['mt','-f',dev,'bsr'])

    subprocess.run(["mt", "-f", dev, "eject"])
if __name__ == "__main__":
    # Command-line entry point: parse the options, make sure the destination
    # folder exists, switch into it and start reading the tape.
    cli = argparse.ArgumentParser(
        prog="tape_read",
        description="read all the tar files out of a tape",
    )
    cli.add_argument('dest_folder')
    cli.add_argument(
        '-f', '--file',
        default="/dev/nst0",
        help="Device file of tape drive",
    )
    cli.add_argument(
        '-b', '--blksize',
        default=10240,
        help="Block size on tape. Usually 10240 so far, but use 0 if unsure (could slow everything down immensely but should at least work)",
    )
    cli.add_argument(
        '-x', '--tarblk',
        default=320,
        help="Tar blocking factor. Suggested values are 20 (tar default); 112 or 256 (suggested by a gnu webpage on tapes); or 320 (this script default, somewhat arbitrary but seems to work decently well)",
    )
    cli.add_argument(
        '-B', '--tarB',
        action="store_true",
        help="Tell tar to re-block partial blocks into whole blocks. This may be helpful if we think that block errors are slowing down the untar process",
    )
    options = cli.parse_args()

    destination = options.dest_folder
    try:
        os.makedirs(destination)
    except FileExistsError:
        # The destination already exists; reuse it.
        pass
    os.chdir(destination)

    main(
        blksize=options.blksize,
        dev=options.file,
        tarblk=options.tarblk,
        tarB=options.tarB,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment