NanoExplorer/tape_read.py

## tape_read.py
#!/usr/bin/env python

import subprocess
import os
import shutil
from glob import glob
from pathlib import Path
import time
import argparse


def format_bytes(size):
    """Format a byte count as a fixed-width, human-readable string.

    The value is divided by 1024 until it is below 1024 or the largest
    known unit (TiB) has been reached, then rendered as
    ``f"{value:6.2f} {unit}"`` with a three-character unit label.

    Args:
        size: A number of bytes (int or float); also used for byte rates.

    Returns:
        A string such as ``"  1.50 KiB"``.
    """
    # Adapted from
    # https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
    power = 2**10  # 1024
    power_labels = ('B  ', 'KiB', 'MiB', 'GiB', 'TiB')
    n = 0
    # ">=" so that exactly 1024 B is shown as 1.00 KiB, and stop at the last
    # label so values beyond the TiB range cannot index past the table.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return f"{size:6.2f} {power_labels[n]}"


def determine_file_name(f):
    """Return a path derived from *f* that does not already exist.

    If nothing exists at *f*, *f* itself is returned.  Otherwise the
    directory containing *f* is scanned for siblings named ``<f>~<N>`` and
    ``<f>~<N+1>`` is returned, where N is the highest number found
    (``<f>~1`` when there are none).

    Args:
        f: Desired path, as a string.

    Returns:
        *f*, or *f* followed by ``~<number>``.
    """
    # lexists() so that a dangling symlink also counts as "taken".
    if not os.path.lexists(f):
        return f
    directory, base = os.path.split(f)
    prefix = base + "~"
    highest = 0
    # Compare the suffixes numerically: a lexical sort would rank "~9" above
    # "~10" and hand out a name that is already in use.  Listing the
    # directory (instead of globbing on f) also keeps unrelated names that
    # merely share the prefix, and glob metacharacters in f, out of the way.
    for name in os.listdir(directory or "."):
        suffix = name[len(prefix):]
        if name.startswith(prefix) and suffix.isdecimal():
            highest = max(highest, int(suffix))
    return f"{f}~{highest + 1}"


def du(path):
    """Return the combined size, in bytes, of the files found under *path*.

    Every entry matched by the recursive pattern ``**/*`` below *path* is
    visited; entries for which ``is_file()`` is true contribute their
    ``st_size``.
    """
    # https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python
    total = 0
    for entry in Path(path).glob("**/*"):
        if entry.is_file():
            total += entry.stat().st_size
    return total


def _tape_status(dev):
    """Run ``mt -f <dev> status`` and return its stdout and stderr as one string."""
    result = subprocess.run(
        ["mt", "-f", dev, "status"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return result.stdout.decode()


def main(blksize=10240, dev='/dev/nst1', tarblk=320, tarB=False):
    """Extract every tar archive found on a tape, then eject the tape.

    Each archive is extracted with ``tar`` into a ``tmp`` sub-directory of
    the current working directory while progress statistics are printed.
    The extracted entries are then moved one level up (renamed via
    ``determine_file_name`` so nothing is overwritten) and the tape is
    positioned on the next archive with ``mt``.  The loop ends when
    ``mt status`` reports EOD or EOT.

    Side effects: the working directory is changed to ``tmp`` and is not
    restored.  If tar exits with a non-zero status (including after being
    killed for making no progress for 20 minutes) its output is printed and
    the process exits with status 1.

    Args:
        blksize: Block size handed to ``mt setblk``.
        dev: Tape device file; must be a non-rewinding device.
        tarblk: Value for tar's ``--blocking-factor``.
        tarB: When true, ``-B`` is added to the tar command line.
    """
    # Imported locally so this function does not depend on the module-level
    # import block.
    import sys
    import tempfile

    assert '/n' in dev, "Must supply a non-rewinding device! On Linux these look like /dev/nstX."
    # Set block size. Note that this has worked for
    # all 5 tapes I have read so far, but is not guaranteed
    print("setting blk size")
    subprocess.run(["mt", "-f", dev, "setblk", str(blksize)])
    os.makedirs("tmp", exist_ok=True)
    os.chdir("tmp")
    keepgoing = True
    while keepgoing:
        # Start reading tar files. Note again that this has
        # worked so far, but it's not unthinkable that some
        # tars could be compressed and need a z or something
        tar_args = ["tar", "-xf", dev, f"--blocking-factor={tarblk}"]
        if tarB:
            tar_args.append("-B")
        # tar's output goes to temporary files rather than pipes: nothing
        # reads the output while we poll, so a pipe could fill up and block
        # tar forever.
        with tempfile.TemporaryFile() as tar_out, tempfile.TemporaryFile() as tar_err:
            with subprocess.Popen(tar_args, stdout=tar_out, stderr=tar_err) as proc:
                print("started tar")
                last_size = 0
                start_time = time.time()
                last_good_time = time.time()
                # If tar isn't done, print some stats
                while proc.poll() is None:
                    t = time.time()
                    time.sleep(5)
                    size = du('.')
                    dt = time.time() - t
                    f_size = format_bytes(size)
                    ds = size - last_size
                    f_speed = format_bytes(ds / dt)
                    f_avg = format_bytes(size / (time.time() - start_time))
                    to_print = f"Read {f_size} [{f_speed}/s]. (Average speed {f_avg}/s)    "
                    if ds > 1:
                        last_good_time = time.time()
                    else:
                        time_since = t - last_good_time
                        if time_since > 30:
                            to_print = f"Read {f_size} [{f_speed}/s]. (last successful read: {time_since:.0f}s ago)"
                        if time_since > 1200:  # abort after 20 minutes
                            proc.kill()
                    print(to_print, end="\r", flush=True)
                    last_size = size
            print('\n tar finished')
            if proc.returncode != 0:
                print(f"tar returned with status code {proc.returncode}.")
                tar_out.seek(0)
                tar_err.seek(0)
                print(
                    tar_out.read().decode(errors="replace"),
                    tar_err.read().decode(errors="replace"),
                )
                # Report the failure to the caller of the script.
                sys.exit(1)
        # go through all the files we just read and move them out of
        # the temporary folder we os.chdir'ed into earlier, renaming
        # them in the process if needed.  os.listdir is used instead of
        # glob("*") so that dot-files are moved as well.
        for name in os.listdir("."):
            print(f"moving file {name}")
            # make sure we don't overwrite anything
            moveto = determine_file_name(os.path.join("../", name))
            shutil.move(name, moveto)
        # Advance to the next file. Note that all this does is move the
        # 'virtual' tape read head from before the EOF mark to after,
        # but doesn't actually move the tape.
        #
        # I ran into a problem where sometimes after tar runs we'll already be
        # at an EOF, and if we run FSF then, we'll skip a whole file.
        if "EOF" not in _tape_status(dev):
            subprocess.run(["mt", "-f", dev, "fsf"])
        else:
            print("Somehow made it to EOF...")
        # use fsr to test for eod? Apparently when you go past the end of the file
        # you don't automatically notice if you're at the end of the data. To achieve
        # that, I'll try to forward space by one "record"
        #
        # example I guess:
        #                      a        b     c
        # block | block | last_block | EOF | EOD |
        #
        # "a" is where you are when tar finishes. FSF moves you to b. FSR moves
        # you to c, or to block 1 of the next file. If we get "EOD" we exit, and
        # if not we use "BSR" to go back to block 0 of the next file?
        print("An I/O error on this line is fine: ", end='', flush=True)
        subprocess.run(["mt", "-f", dev, 'fsr'])  # will give an io error at EOD
        print()
        # If we're at the end of data or tape, finish.
        out = _tape_status(dev)
        print(out)
        if "EOD" in out or "EOT" in out:
            keepgoing = False
        else:
            subprocess.run(['mt', '-f', dev, 'bsr'])

    subprocess.run(["mt", "-f", dev, "eject"])


if __name__ == "__main__":
    # Command-line entry point: parse the options, make sure the destination
    # folder exists, change into it and hand over to main().
    parser = argparse.ArgumentParser(
        prog="tape_read",
        description="read all the tar files out of a tape",
    )
    parser.add_argument('dest_folder', help="Folder to extract into (created if it does not exist)")
    # NOTE: the default here is /dev/nst0 while main()'s own default is /dev/nst1.
    parser.add_argument('-f', '--file', help="Device file of tape drive", default="/dev/nst0")
    # Passed through unchanged; main() only formats it into the mt command line.
    parser.add_argument('-b', '--blksize', help="Block size on tape. Usually 10240 so far, but use 0 if unsure (could slow everything down immensely but should at least work)", default=10240)
    # type=int rejects a malformed blocking factor here, before any command
    # has been sent to the drive.
    parser.add_argument('-x', '--tarblk', type=int, help="Tar blocking factor. Suggested values are 20 (tar default); 112 or 256 (suggested by a gnu webpage on tapes); or 320 (this script default, somewhat arbitrary but seems to work decently well)", default=320)
    parser.add_argument('-B', '--tarB', help="Tell tar to re-block partial blocks into whole blocks. This may be helpful if we think that block errors are slowing down the untar process", action="store_true")
    args = parser.parse_args()
    os.makedirs(args.dest_folder, exist_ok=True)
    os.chdir(args.dest_folder)
    main(blksize=args.blksize, dev=args.file, tarblk=args.tarblk, tarB=args.tarB)
	#!/usr/bin/env python

	import subprocess
	import os
	import shutil
	from glob import glob
	from pathlib import Path
	import time
	import argparse


	def format_bytes(size):
	    """Format a byte count as a fixed-width, human-readable string.

	    The value is divided by 1024 until it is below 1024 or the largest
	    known unit (TiB) has been reached, then rendered as
	    ``f"{value:6.2f} {unit}"`` with a three-character unit label.

	    Args:
	        size: A number of bytes (int or float); also used for byte rates.

	    Returns:
	        A string such as ``"  1.50 KiB"``.
	    """
	    # Adapted from
	    # https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
	    power = 2**10  # 1024
	    # Every label is three characters wide so that progress lines that
	    # overwrite each other with "\r" keep the same width.
	    power_labels = ('B  ', 'KiB', 'MiB', 'GiB', 'TiB')
	    n = 0
	    # ">=" so that exactly 1024 B is shown as 1.00 KiB, and stop at the last
	    # label so values beyond the TiB range cannot index past the table.
	    while size >= power and n < len(power_labels) - 1:
	        size /= power
	        n += 1
	    return f"{size:6.2f} {power_labels[n]}"


	def determine_file_name(f):
	    """Return a path derived from *f* that does not already exist.

	    If nothing exists at *f*, *f* itself is returned.  Otherwise the
	    directory containing *f* is scanned for siblings named ``<f>~<N>`` and
	    ``<f>~<N+1>`` is returned, where N is the highest number found
	    (``<f>~1`` when there are none).

	    Args:
	        f: Desired path, as a string.

	    Returns:
	        *f*, or *f* followed by ``~<number>``.
	    """
	    # lexists() so that a dangling symlink also counts as "taken".
	    if not os.path.lexists(f):
	        return f
	    directory, base = os.path.split(f)
	    prefix = base + "~"
	    highest = 0
	    # Compare the suffixes numerically: a lexical sort would rank "~9" above
	    # "~10" and hand out a name that is already in use.  Listing the
	    # directory (instead of globbing on f) also keeps unrelated names that
	    # merely share the prefix, and glob metacharacters in f, out of the way.
	    for name in os.listdir(directory or "."):
	        suffix = name[len(prefix):]
	        if name.startswith(prefix) and suffix.isdecimal():
	            highest = max(highest, int(suffix))
	    return f"{f}~{highest + 1}"


	def du(path):
	    """Return the combined size, in bytes, of the files found under *path*.

	    Every entry matched by the recursive pattern ``**/*`` below *path* is
	    visited; entries for which ``is_file()`` is true contribute their
	    ``st_size``.
	    """
	    # https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python
	    total = 0
	    # "**/*" descends into sub-directories; the pattern "*/" does not, so
	    # files nested below the top level would not be counted.
	    for entry in Path(path).glob("**/*"):
	        if entry.is_file():
	            total += entry.stat().st_size
	    return total


	def _tape_status(dev):
	    """Run ``mt -f <dev> status`` and return its stdout and stderr as one string."""
	    result = subprocess.run(
	        ["mt", "-f", dev, "status"],
	        stdout=subprocess.PIPE,
	        stderr=subprocess.STDOUT,
	    )
	    return result.stdout.decode()


	def main(blksize=10240, dev='/dev/nst1', tarblk=320, tarB=False):
	    """Extract every tar archive found on a tape, then eject the tape.

	    Each archive is extracted with ``tar`` into a ``tmp`` sub-directory of
	    the current working directory while progress statistics are printed.
	    The extracted entries are then moved one level up (renamed via
	    ``determine_file_name`` so nothing is overwritten) and the tape is
	    positioned on the next archive with ``mt``.  The loop ends when
	    ``mt status`` reports EOD or EOT.

	    Side effects: the working directory is changed to ``tmp`` and is not
	    restored.  If tar exits with a non-zero status (including after being
	    killed for making no progress for 20 minutes) its output is printed and
	    the process exits with status 1.

	    Args:
	        blksize: Block size handed to ``mt setblk``.
	        dev: Tape device file; must be a non-rewinding device.
	        tarblk: Value for tar's ``--blocking-factor``.
	        tarB: When true, ``-B`` is added to the tar command line.
	    """
	    # Imported locally so this function does not depend on the module-level
	    # import block.
	    import sys
	    import tempfile

	    assert '/n' in dev, "Must supply a non-rewinding device! On Linux these look like /dev/nstX."
	    # Set block size. Note that this has worked for
	    # all 5 tapes I have read so far, but is not guaranteed
	    print("setting blk size")
	    subprocess.run(["mt", "-f", dev, "setblk", str(blksize)])
	    os.makedirs("tmp", exist_ok=True)
	    os.chdir("tmp")
	    keepgoing = True
	    while keepgoing:
	        # Start reading tar files. Note again that this has
	        # worked so far, but it's not unthinkable that some
	        # tars could be compressed and need a z or something
	        tar_args = ["tar", "-xf", dev, f"--blocking-factor={tarblk}"]
	        if tarB:
	            tar_args.append("-B")
	        # tar's output goes to temporary files rather than pipes: nothing
	        # reads the output while we poll, so a pipe could fill up and block
	        # tar forever.
	        with tempfile.TemporaryFile() as tar_out, tempfile.TemporaryFile() as tar_err:
	            with subprocess.Popen(tar_args, stdout=tar_out, stderr=tar_err) as proc:
	                print("started tar")
	                last_size = 0
	                start_time = time.time()
	                last_good_time = time.time()
	                # If tar isn't done, print some stats
	                while proc.poll() is None:
	                    t = time.time()
	                    time.sleep(5)
	                    size = du('.')
	                    dt = time.time() - t
	                    f_size = format_bytes(size)
	                    ds = size - last_size
	                    f_speed = format_bytes(ds / dt)
	                    f_avg = format_bytes(size / (time.time() - start_time))
	                    # The trailing spaces blank out leftovers of a longer
	                    # line previously written at the same position via "\r".
	                    to_print = f"Read {f_size} [{f_speed}/s]. (Average speed {f_avg}/s)    "
	                    if ds > 1:
	                        last_good_time = time.time()
	                    else:
	                        time_since = t - last_good_time
	                        if time_since > 30:
	                            to_print = f"Read {f_size} [{f_speed}/s]. (last successful read: {time_since:.0f}s ago)"
	                        if time_since > 1200:  # abort after 20 minutes
	                            proc.kill()
	                    print(to_print, end="\r", flush=True)
	                    last_size = size
	            print('\n tar finished')
	            if proc.returncode != 0:
	                print(f"tar returned with status code {proc.returncode}.")
	                tar_out.seek(0)
	                tar_err.seek(0)
	                print(
	                    tar_out.read().decode(errors="replace"),
	                    tar_err.read().decode(errors="replace"),
	                )
	                # Report the failure to the caller of the script.
	                sys.exit(1)
	        # go through all the files we just read and move them out of
	        # the temporary folder we os.chdir'ed into earlier, renaming
	        # them in the process if needed.  os.listdir is used instead of
	        # glob("*") so that dot-files are moved as well.
	        for name in os.listdir("."):
	            print(f"moving file {name}")
	            # make sure we don't overwrite anything
	            moveto = determine_file_name(os.path.join("../", name))
	            shutil.move(name, moveto)
	        # Advance to the next file. Note that all this does is move the
	        # 'virtual' tape read head from before the EOF mark to after,
	        # but doesn't actually move the tape.
	        #
	        # I ran into a problem where sometimes after tar runs we'll already be
	        # at an EOF, and if we run FSF then, we'll skip a whole file.
	        if "EOF" not in _tape_status(dev):
	            subprocess.run(["mt", "-f", dev, "fsf"])
	        else:
	            print("Somehow made it to EOF...")
	        # use fsr to test for eod? Apparently when you go past the end of the file
	        # you don't automatically notice if you're at the end of the data. To achieve
	        # that, I'll try to forward space by one "record"
	        #
	        # example I guess:
	        #                      a        b     c
	        # block | block | last_block | EOF | EOD |
	        #
	        # "a" is where you are when tar finishes. FSF moves you to b. FSR moves
	        # you to c, or to block 1 of the next file. If we get "EOD" we exit, and
	        # if not we use "BSR" to go back to block 0 of the next file?
	        print("An I/O error on this line is fine: ", end='', flush=True)
	        subprocess.run(["mt", "-f", dev, 'fsr'])  # will give an io error at EOD
	        print()
	        # If we're at the end of data or tape, finish.
	        out = _tape_status(dev)
	        print(out)
	        if "EOD" in out or "EOT" in out:
	            keepgoing = False
	        else:
	            subprocess.run(['mt', '-f', dev, 'bsr'])

	    subprocess.run(["mt", "-f", dev, "eject"])


	if __name__ == "__main__":
	    # Command-line entry point: parse the options, make sure the destination
	    # folder exists, change into it and hand over to main().
	    parser = argparse.ArgumentParser(
	        prog="tape_read",
	        description="read all the tar files out of a tape",
	    )
	    parser.add_argument('dest_folder', help="Folder to extract into (created if it does not exist)")
	    # NOTE: the default here is /dev/nst0 while main()'s own default is /dev/nst1.
	    parser.add_argument('-f', '--file', help="Device file of tape drive", default="/dev/nst0")
	    # Passed through unchanged; main() only formats it into the mt command line.
	    parser.add_argument('-b', '--blksize', help="Block size on tape. Usually 10240 so far, but use 0 if unsure (could slow everything down immensely but should at least work)", default=10240)
	    # type=int rejects a malformed blocking factor here, before any command
	    # has been sent to the drive.
	    parser.add_argument('-x', '--tarblk', type=int, help="Tar blocking factor. Suggested values are 20 (tar default); 112 or 256 (suggested by a gnu webpage on tapes); or 320 (this script default, somewhat arbitrary but seems to work decently well)", default=320)
	    parser.add_argument('-B', '--tarB', help="Tell tar to re-block partial blocks into whole blocks. This may be helpful if we think that block errors are slowing down the untar process", action="store_true")
	    args = parser.parse_args()
	    os.makedirs(args.dest_folder, exist_ok=True)
	    os.chdir(args.dest_folder)
	    main(blksize=args.blksize, dev=args.file, tarblk=args.tarblk, tarB=args.tarB)