jerry2605/extract.py

## extract.py
#! /usr/bin/env python
# -*- coding: UTF-8 -*-

"""A command line utility for recursively extracting nested tar archives."""
"""Don't know why tarfile can't extract .gz file, use gzip the extract the remain files"""

import os
import sys
import re
import tarfile
import gzip
from argparse import ArgumentParser

major_version = 1
minor_version = 1
error_count = 0

file_extensions = ('tar', 'tgz', 'tbz', 'tb2', 'tar.gz', 'tar.bz2')
# Edit this according to the archive types you want to extract. Keep in
# mind that these should be extractable by the tarfile module.

__all__ = ['ExtractNested', 'WalkTreeAndExtract']

def FileExtension(file_name):
    """Return the file extension of file

    'file' should be a string. It can be either the full path of
    the file or just its name (or any string as long it contains
    the file extension.)

    Example #1:
    input (file) -->  'abc.tar.gz'
    return value -->  'tar.gz'

    Example #2:
    input (file) -->  'abc.tar'
    return value -->  'tar'

    """
    match = re.compile(r"^.*?[.](?P<ext>tar[.]gz|tar[.]bz2|\w+)$",
      re.VERBOSE|re.IGNORECASE).match(file_name)

    if match:           # if match != None:
        ext = match.group('ext')
        return ext
    else:
        return ''       # there is no file extension to file_name

def AppropriateFolderName(folder_fullpath):
    """Return a folder (path) such that it can be safely created in
    without replacing any existing folder in it.

    Check if the folder folder_fullpath exists. If no, return folder_fullpath
    (without changing, because it can be safely created
    without replacing any already existing folder). If yes, append an
    appropriate number to the folder_fullpath such that this new folder_fullpath
    can be safely created.

    Examples:
    folder_name  = '/a/b/untitled folder'
    return value = '/a/b/untitled folder'   (no such folder already exists.)

    folder_name  = '/a/b/untitled folder'
    return value = '/a/b/untitled folder 1' (the folder '/a/b/untitled folder'
                                            already exists but no folder named
                                            '/a/b/untitled folder 1' exists.)

    folder_name  = '/a/b/untitled folder'
    return value = '/a/b/untitled folder 2' (the folders '/a/b/untitled folder'
                                            and '/a/b/untitled folder 1' both
                                            already exist but no folder
                                            '/a/b/untitled folder 2' exists.)

    """
    if os.path.exists(folder_fullpath):
        folder_name = os.path.basename(folder_fullpath)
        parent_fullpath = os.path.dirname(folder_fullpath)
        match = re.compile(r'^(?P<name>.*)[ ](?P<num>\d+)$').match(folder_name)
        if match:                           # if match != None:
            name = match.group('name')
            number = match.group('num')
            new_folder_name = '%s %d' %(name, int(number)+1)
            new_folder_fullpath = os.path.join(parent_fullpath, new_folder_name)
            return AppropriateFolderName(new_folder_fullpath)
            # Recursively call itself so that it can be check whether a
            # folder with path new_folder_fullpath already exists or not.
        else:
            new_folder_name = '%s 1' %folder_name
            new_folder_fullpath = os.path.join(parent_fullpath, new_folder_name)
            return AppropriateFolderName(new_folder_fullpath)
            # Recursively call itself so that it can be check whether a
            # folder with path new_folder_fullpath already exists or not.
    else:
        return folder_fullpath

def Extract(tarfile_fullpath, delete_tar_file=True):
    """Extract the tarfile_fullpath to an appropriate* folder of the same
    name as the tar file (without an extension) and return the path
    of this folder.

    If delete_tar_file is True, it will delete the tar file after
    its extraction; if False, it won`t. Default value is True as you
    would normally want to delete the (nested) tar files after
    extraction. Pass a False, if you don`t want to delete the
    tar file (after its extraction) you are passing.

    """
    try:
        print("Extracting '%s'" %tarfile_fullpath)
        tar = tarfile.open(tarfile_fullpath)
        extract_folder_fullpath = AppropriateFolderName(tarfile_fullpath[:\
          -1*len(FileExtension(tarfile_fullpath))-1])
        extract_folder_name = os.path.basename(extract_folder_fullpath)
        print("to '%s'..." %extract_folder_name)
        tar.extractall(extract_folder_fullpath)
        print("Done!")
        tar.close()
        if delete_tar_file: os.remove(tarfile_fullpath)
        return extract_folder_name

    except Exception:
        # Exceptions can occur while opening a damaged tar file.
        print('(Error)\n(%s)' %str(sys.exc_info()[1]).capitalize())
        global error_count
        error_count += 1

def WalkTreeAndExtract(parent_dir):
    """Recursively descend the directory tree rooted at parent_dir
    and extract each tar file on the way down (recursively)."""
    try:
        dir_contents = os.listdir(parent_dir)
    except OSError:
        # Exception can occur if trying to open some folder whose
        # permissions this program does not have.
        print('Error occured. Could not open folder %s\n%s'\
          %( parent_dir, str(sys.exc_info()[1]).capitalize()))
        global error_count
        error_count += 1
        return

    for content in dir_contents:
        content_fullpath = os.path.join(parent_dir, content)
        if os.path.isdir(content_fullpath):
            # If content is a folder, walk down it completely.
            WalkTreeAndExtract(content_fullpath)
        elif os.path.isfile(content_fullpath):
            # If content is a file, check if it is a tar file.
            if FileExtension(content_fullpath) in file_extensions:
                # If yes, extract its contents to a new folder.
                extract_folder_name = Extract(content_fullpath)
                if extract_folder_name:     # if extract_folder_name != None:
                    dir_contents.append(extract_folder_name)
                    # Append the newly extracted folder to dir_contents
                    # so that it can be later searched for more tar files
                    # to extract.
        else:
            # Unknown file type.
            print('Skipping %s. <Neither file nor folder>' % content_fullpath)

def ExtractNested(tarfile_fullpath):
    extract_folder_name = Extract(tarfile_fullpath, False)
    if extract_folder_name:         # if extract_folder_name != None
        extract_folder_fullpath = os.path.join(os.path.dirname(
          tarfile_fullpath), extract_folder_name)
        WalkTreeAndExtract(extract_folder_fullpath)
        # Given tar file is extracted to extract_folder_name. Now descend
        # down its directory structure and extract all other tar files
        # (recursively).

def un_gz(full_file_name):
    #get name and remove .gz
    f_name = full_file_name.replace(".gz", "")
    #create gzip object
    g_file = gzip.GzipFile(full_file_name)
    #open gzip object and use open to write
    open(f_name, "wb+").write(g_file.read())
    #close gzip object
    g_file.close()
    #remove the original zip file
    os.remove(full_file_name)

#list all the files in the path
def all_files(path):
    listOfFiles = list()
    for (dirpath, dirnames, filenames) in os.walk(path):
        listOfFiles += [os.path.join(dirpath, file) for file in filenames]
    return listOfFiles

if __name__ == '__main__':
    #change the path download and file name test.tar.gz to your settings
    file_path = os.path.join(os.getcwd(),'downloads') + '/' + 'test.tar.gz'
    ExtractNested(file_path)
    os.remove(file_path)

    dirName = os.path.join(os.getcwd(),'Downloads')

    listAllFiles = all_files(dirName)

    for elem in listAllFiles:
        if elem.find(".gz")!=-1:
            un_gz(elem)
            print('use GZIP extract %s' % elem)
        #file_list.append(elem)

    '''
    # Use a parser for parsing command line arguments
    parser = ArgumentParser(description='Nested tar archive extractor %d.%d'\
      %(major_version,minor_version))
    parser.add_argument('tar_paths', metavar='path', type=str, nargs='+',
      help='Path of the tar file to be extracted.')
    extraction_paths = parser.parse_args().tar_paths

    # Consider each argument passed as a file path and extract it.
    for argument in extraction_paths:
        if os.path.exists(argument):
            #print       # a blank line
            ExtractNested(argument)
        else:
            print('Not a valid path: %s' %argument)
            error_count += 1
    if error_count !=0: print('%d error(s) occured.' %error_count)
    '''
	#! /usr/bin/env python
	# -- coding: UTF-8 --

	"""A command line utility for recursively extracting nested tar archives."""
	"""Don't know why tarfile can't extract .gz file, use gzip the extract the remain files"""

	import os
	import sys
	import re
	import tarfile
	import gzip
	from argparse import ArgumentParser

	major_version = 1
	minor_version = 1
	error_count = 0

	file_extensions = ('tar', 'tgz', 'tbz', 'tb2', 'tar.gz', 'tar.bz2')
	# Edit this according to the archive types you want to extract. Keep in
	# mind that these should be extractable by the tarfile module.

	__all__ = ['ExtractNested', 'WalkTreeAndExtract']

	def FileExtension(file_name):
	"""Return the file extension of file

	'file' should be a string. It can be either the full path of
	the file or just its name (or any string as long it contains
	the file extension.)

	Example #1:
	input (file) --> 'abc.tar.gz'
	return value --> 'tar.gz'

	Example #2:
	input (file) --> 'abc.tar'
	return value --> 'tar'

	"""
	match = re.compile(r"^.*?[.](?P<ext>tar[.]gz\|tar[.]bz2\|\w+)$",
	re.VERBOSE\|re.IGNORECASE).match(file_name)

	if match: # if match != None:
	ext = match.group('ext')
	return ext
	else:
	return '' # there is no file extension to file_name

	def AppropriateFolderName(folder_fullpath):
	"""Return a folder (path) such that it can be safely created in
	without replacing any existing folder in it.

	Check if the folder folder_fullpath exists. If no, return folder_fullpath
	(without changing, because it can be safely created
	without replacing any already existing folder). If yes, append an
	appropriate number to the folder_fullpath such that this new folder_fullpath
	can be safely created.

	Examples:
	folder_name = '/a/b/untitled folder'
	return value = '/a/b/untitled folder' (no such folder already exists.)

	folder_name = '/a/b/untitled folder'
	return value = '/a/b/untitled folder 1' (the folder '/a/b/untitled folder'
	already exists but no folder named
	'/a/b/untitled folder 1' exists.)

	folder_name = '/a/b/untitled folder'
	return value = '/a/b/untitled folder 2' (the folders '/a/b/untitled folder'
	and '/a/b/untitled folder 1' both
	already exist but no folder
	'/a/b/untitled folder 2' exists.)

	"""
	if os.path.exists(folder_fullpath):
	folder_name = os.path.basename(folder_fullpath)
	parent_fullpath = os.path.dirname(folder_fullpath)
	match = re.compile(r'^(?P<name>.*)[ ](?P<num>\d+)$').match(folder_name)
	if match: # if match != None:
	name = match.group('name')
	number = match.group('num')
	new_folder_name = '%s %d' %(name, int(number)+1)
	new_folder_fullpath = os.path.join(parent_fullpath, new_folder_name)
	return AppropriateFolderName(new_folder_fullpath)
	# Recursively call itself so that it can be check whether a
	# folder with path new_folder_fullpath already exists or not.
	else:
	new_folder_name = '%s 1' %folder_name
	new_folder_fullpath = os.path.join(parent_fullpath, new_folder_name)
	return AppropriateFolderName(new_folder_fullpath)
	# Recursively call itself so that it can be check whether a
	# folder with path new_folder_fullpath already exists or not.
	else:
	return folder_fullpath

	def Extract(tarfile_fullpath, delete_tar_file=True):
	"""Extract the tarfile_fullpath to an appropriate* folder of the same
	name as the tar file (without an extension) and return the path
	of this folder.

	If delete_tar_file is True, it will delete the tar file after
	its extraction; if False, it won`t. Default value is True as you
	would normally want to delete the (nested) tar files after
	extraction. Pass a False, if you don`t want to delete the
	tar file (after its extraction) you are passing.

	"""
	try:
	print("Extracting '%s'" %tarfile_fullpath)
	tar = tarfile.open(tarfile_fullpath)
	extract_folder_fullpath = AppropriateFolderName(tarfile_fullpath[:\
	-1*len(FileExtension(tarfile_fullpath))-1])
	extract_folder_name = os.path.basename(extract_folder_fullpath)
	print("to '%s'..." %extract_folder_name)
	tar.extractall(extract_folder_fullpath)
	print("Done!")
	tar.close()
	if delete_tar_file: os.remove(tarfile_fullpath)
	return extract_folder_name

	except Exception:
	# Exceptions can occur while opening a damaged tar file.
	print('(Error)\n(%s)' %str(sys.exc_info()[1]).capitalize())
	global error_count
	error_count += 1

	def WalkTreeAndExtract(parent_dir):
	"""Recursively descend the directory tree rooted at parent_dir
	and extract each tar file on the way down (recursively)."""
	try:
	dir_contents = os.listdir(parent_dir)
	except OSError:
	# Exception can occur if trying to open some folder whose
	# permissions this program does not have.
	print('Error occured. Could not open folder %s\n%s'\
	%( parent_dir, str(sys.exc_info()[1]).capitalize()))
	global error_count
	error_count += 1
	return

	for content in dir_contents:
	content_fullpath = os.path.join(parent_dir, content)
	if os.path.isdir(content_fullpath):
	# If content is a folder, walk down it completely.
	WalkTreeAndExtract(content_fullpath)
	elif os.path.isfile(content_fullpath):
	# If content is a file, check if it is a tar file.
	if FileExtension(content_fullpath) in file_extensions:
	# If yes, extract its contents to a new folder.
	extract_folder_name = Extract(content_fullpath)
	if extract_folder_name: # if extract_folder_name != None:
	dir_contents.append(extract_folder_name)
	# Append the newly extracted folder to dir_contents
	# so that it can be later searched for more tar files
	# to extract.
	else:
	# Unknown file type.
	print('Skipping %s. <Neither file nor folder>' % content_fullpath)

	def ExtractNested(tarfile_fullpath):
	extract_folder_name = Extract(tarfile_fullpath, False)
	if extract_folder_name: # if extract_folder_name != None
	extract_folder_fullpath = os.path.join(os.path.dirname(
	tarfile_fullpath), extract_folder_name)
	WalkTreeAndExtract(extract_folder_fullpath)
	# Given tar file is extracted to extract_folder_name. Now descend
	# down its directory structure and extract all other tar files
	# (recursively).

	def un_gz(full_file_name):
	#get name and remove .gz
	f_name = full_file_name.replace(".gz", "")
	#create gzip object
	g_file = gzip.GzipFile(full_file_name)
	#open gzip object and use open to write
	open(f_name, "wb+").write(g_file.read())
	#close gzip object
	g_file.close()
	#remove the original zip file
	os.remove(full_file_name)

	#list all the files in the path
	def all_files(path):
	listOfFiles = list()
	for (dirpath, dirnames, filenames) in os.walk(path):
	listOfFiles += [os.path.join(dirpath, file) for file in filenames]
	return listOfFiles

	if __name__ == '__main__':
	#change the path download and file name test.tar.gz to your settings
	file_path = os.path.join(os.getcwd(),'downloads') + '/' + 'test.tar.gz'
	ExtractNested(file_path)
	os.remove(file_path)

	dirName = os.path.join(os.getcwd(),'Downloads')

	listAllFiles = all_files(dirName)

	for elem in listAllFiles:
	if elem.find(".gz")!=-1:
	un_gz(elem)
	print('use GZIP extract %s' % elem)
	#file_list.append(elem)

	'''
	# Use a parser for parsing command line arguments
	parser = ArgumentParser(description='Nested tar archive extractor %d.%d'\
	%(major_version,minor_version))
	parser.add_argument('tar_paths', metavar='path', type=str, nargs='+',
	help='Path of the tar file to be extracted.')
	extraction_paths = parser.parse_args().tar_paths

	# Consider each argument passed as a file path and extract it.
	for argument in extraction_paths:
	if os.path.exists(argument):
	#print # a blank line
	ExtractNested(argument)
	else:
	print('Not a valid path: %s' %argument)
	error_count += 1
	if error_count !=0: print('%d error(s) occured.' %error_count)
	'''