Skip to content

Instantly share code, notes, and snippets.

@jobinlawrance
Created July 14, 2017 14:38
Show Gist options
  • Save jobinlawrance/e76f6a661d89680d4e29d2d6eaf8fa2d to your computer and use it in GitHub Desktop.
Save jobinlawrance/e76f6a661d89680d4e29d2d6eaf8fa2d to your computer and use it in GitHub Desktop.
A python script to recursively extract .tar.gz files and .gz files : reference - http://guanidene.blogspot.in/2011/06/nested-tar-archives-extractor.html
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""A command line utility for recusively extracting nested tar archives."""
__author__ = "Pushpak Dagade (पुष्पक दगड़े)"
__date__ = "$4 July, 2011 3:00:00 PM$"
import os
import sys
import re
import tarfile
import gzip
from argparse import ArgumentParser
# Version of this utility; both parts are interpolated into the
# command-line parser's description string.
major_version = 1
minor_version = 1
# Running total of failures. The extract/walk functions increment it via
# `global`, and the command-line entry point reports it at the end of a run.
error_count = 0
# Extensions (without the leading dot) that are treated as tar archives
# and handed to the tarfile-based extractor.
file_extensions = ('tar', 'tgz', 'tbz', 'tb2', 'tar.gz', 'tar.bz2')
# Edit this according to the archive types you want to extract. Keep in
# mind that these should be extractable by the tarfile module.
# Names exported by `from <module> import *`.
__all__ = ['ExtractNested', 'WalkTreeAndExtract']
def FileExtension(file_name):
    """Return the lower-cased file extension of file_name ('' if none).

    file_name should be a string. It can be either the full path of the
    file or just its name (or any string, as long as it ends with the
    file extension). The compound extensions 'tar.gz' and 'tar.bz2' are
    returned whole instead of just their last component.

    The extension is matched case-insensitively and returned in lower
    case, so that 'ABC.TAR.GZ' yields 'tar.gz' and can be compared
    directly against lower-case extension lists. The returned string
    always has the same length as the extension found in file_name.

    Example #1:
        input  --> 'abc.tar.gz'
        return --> 'tar.gz'
    Example #2:
        input  --> 'abc.tar'
        return --> 'tar'
    Example #3:
        input  --> 'ABC.TGZ'
        return --> 'tgz'
    Example #4:
        input  --> 'README'
        return --> ''
    """
    # The lazy prefix lets the compound alternatives win over the generic
    # `\w+` whenever the name ends in '.tar.gz' or '.tar.bz2'.
    match = re.match(r"^.*?[.](?P<ext>tar[.]gz|tar[.]bz2|\w+)$",
                     file_name, re.IGNORECASE)
    if match is None:
        return ''  # there is no file extension in file_name
    return match.group('ext').lower()
def AppropriateFolderName(folder_fullpath):
    """Return a path, derived from folder_fullpath, that does not exist yet.

    If nothing exists at folder_fullpath it is returned unchanged.
    Otherwise a number is appended to (or, when the name already ends in
    ' <number>', incremented on) the last path component, and the check is
    repeated until a free path is found.

    Examples (for folder_fullpath = '/a/b/untitled folder'):
        nothing exists there                  --> '/a/b/untitled folder'
        '/a/b/untitled folder' exists         --> '/a/b/untitled folder 1'
        '... folder' and '... folder 1' exist --> '/a/b/untitled folder 2'
    """
    numbered = re.compile(r'^(?P<name>.*)[ ](?P<num>\d+)$')
    candidate = folder_fullpath
    # Keep bumping the numeric suffix until the candidate path is free.
    while os.path.exists(candidate):
        parent_fullpath, leaf = os.path.split(candidate)
        found = numbered.match(leaf)
        if found is None:
            # No numeric suffix yet: start counting at 1.
            leaf = '%s 1' % leaf
        else:
            leaf = '%s %d' % (found.group('name'),
                              int(found.group('num')) + 1)
        candidate = os.path.join(parent_fullpath, leaf)
    return candidate
def Extract(tarfile_fullpath, delete_tar_file=True):
    """Extract a tar archive into a fresh folder next to it.

    The folder is named after the archive with its extension removed; if
    that path already exists (or the archive has no extension) a numeric
    suffix is chosen by AppropriateFolderName so nothing is overwritten.

    If delete_tar_file is True, the archive is deleted after a successful
    extraction; if False, it is kept. The default is True because nested
    archives are normally not wanted once unpacked. Pass False for an
    archive you want to keep.

    Returns the base name (not the full path) of the folder extracted to.
    On any failure the error is printed, the global error_count is
    incremented and None is returned.
    """
    global error_count
    try:
        # Progress is written piecewise so the user sees which archive is
        # being worked on before a potentially slow extraction starts.
        sys.stdout.write("Extracting '%s' " % tarfile_fullpath)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(tarfile_fullpath) as tar:
            extension = FileExtension(tarfile_fullpath)
            if extension:
                # Drop the extension together with its leading dot.
                stem = tarfile_fullpath[:-(len(extension) + 1)]
            else:
                # No extension to strip; AppropriateFolderName will add a
                # numeric suffix because the archive itself has this path.
                stem = tarfile_fullpath
            extract_folder_fullpath = AppropriateFolderName(stem)
            extract_folder_name = os.path.basename(extract_folder_fullpath)
            sys.stdout.write("to '%s'... " % extract_folder_name)
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape the destination folder
                # (absolute paths, '..' components, unsafe links).
                tar.extractall(extract_folder_fullpath, filter='data')
            else:
                # NOTE(review): this Python has no extraction filters, so
                # members are extracted unchecked; only use on trusted
                # archives.
                tar.extractall(extract_folder_fullpath)
        print("Done!")
        if delete_tar_file:
            os.remove(tarfile_fullpath)
        return extract_folder_name
    except Exception:
        # Exceptions can occur while opening a damaged tar file.
        print('(Error)\n(%s)' % str(sys.exc_info()[1]).capitalize())
        error_count += 1
        return None
def ExtractGz(tarfile_fullpath, delete_tar_file=True):
    """Decompress a plain gzip file into a new file next to it.

    The output file is named after tarfile_fullpath with its extension
    removed; AppropriateFolderName picks a numbered variant if that path
    already exists (or if there is no extension to remove).

    If delete_tar_file is True, the gzip file is deleted after it has been
    decompressed successfully; if False, it is kept.

    Returns the base name (not the full path) of the decompressed file.
    On any failure the partially written output is removed, the error is
    printed, the global error_count is incremented and None is returned.
    """
    global error_count
    chunk_size = 1024 * 1024  # copy in 1 MiB pieces to bound memory use
    try:
        sys.stdout.write("Extracting '%s' " % tarfile_fullpath)
        # Context managers close both files even when decompression fails.
        with gzip.open(tarfile_fullpath) as gz:
            extension = FileExtension(tarfile_fullpath)
            if extension:
                # Drop the extension together with its leading dot.
                stem = tarfile_fullpath[:-(len(extension) + 1)]
            else:
                stem = tarfile_fullpath
            output_fullpath = AppropriateFolderName(stem)
            output_name = os.path.basename(output_fullpath)
            sys.stdout.write("to '%s'... " % output_name)
            try:
                with open(output_fullpath, "wb") as output:
                    while True:
                        chunk = gz.read(chunk_size)
                        if not chunk:
                            break
                        output.write(chunk)
            except Exception:
                # Do not leave a truncated file behind; it would look like
                # a valid extraction result to later passes.
                if os.path.exists(output_fullpath):
                    os.remove(output_fullpath)
                raise
        print("Done!")
        if delete_tar_file:
            os.remove(tarfile_fullpath)
        return output_name
    except Exception:
        # Exceptions can occur while opening or reading a damaged gzip file.
        print('(Error)\n(%s)' % str(sys.exc_info()[1]).capitalize())
        error_count += 1
        return None
def WalkTreeAndExtract(parent_dir):
    """Recursively descend the directory tree rooted at parent_dir
    and extract each tar / gzip file found on the way down.

    Whatever an extraction produces (a folder for tar archives, a file for
    plain gzip files) is queued for inspection as well, so archives nested
    inside archives are unpacked too. Successfully extracted archives are
    deleted (the extractors are called with their default
    delete_tar_file=True).

    Returns None. If parent_dir cannot be listed, the error is printed,
    the global error_count is incremented and the folder is skipped.
    """
    from collections import deque

    global error_count
    try:
        dir_contents = os.listdir(parent_dir)
    except OSError:
        # Exception can occur if trying to open some folder whose
        # permissions this program does not have.
        print('Error occured. Could not open folder %s\n%s'
              % (parent_dir, str(sys.exc_info()[1]).capitalize()))
        error_count += 1
        return

    # Explicit work queue: entries created by extraction are appended and
    # processed after the entries that were already present.
    pending = deque(dir_contents)
    while pending:
        content = pending.popleft()
        content_fullpath = os.path.join(parent_dir, content)
        if os.path.isdir(content_fullpath):
            # If content is a folder, walk down it completely.
            WalkTreeAndExtract(content_fullpath)
        elif os.path.isfile(content_fullpath):
            # Compare in lower case so 'X.TAR.GZ' is recognised as well.
            extension = FileExtension(content_fullpath).lower()
            if extension in file_extensions:
                extracted_name = Extract(content_fullpath)
            elif extension == 'gz':
                extracted_name = ExtractGz(content_fullpath)
            else:
                extracted_name = None  # not an archive; nothing to do
            if extracted_name:
                # Revisit the extraction result so that it can be searched
                # for more archives to extract.
                pending.append(extracted_name)
        else:
            # Unknown file type.
            print('Skipping %s. <Neither file nor folder>' % content_fullpath)
def ExtractNested(tarfile_fullpath):
    """Extract the tar file tarfile_fullpath and every archive nested in it.

    The given tar file itself is kept (Extract is called with
    delete_tar_file=False). If it cannot be extracted, nothing further is
    done. Returns None.
    """
    extracted_name = Extract(tarfile_fullpath, False)
    if not extracted_name:
        # Extraction failed; Extract has already reported the error.
        return
    # Extract returns only the folder's base name; the folder lives next
    # to the tar file, so rebuild its full path before descending into it
    # to extract all other archives (recursively).
    containing_dir = os.path.dirname(tarfile_fullpath)
    WalkTreeAndExtract(os.path.join(containing_dir, extracted_name))
if __name__ == '__main__':
    # Command-line entry point: every positional argument is treated as
    # the path of a tar file to extract (including nested archives).
    parser = ArgumentParser(
        description='Nested tar archive extractor %d.%d'
        % (major_version, minor_version))
    parser.add_argument('tar_paths', metavar='path', type=str, nargs='+',
                        help='Path of the tar file to be extracted.')
    extraction_paths = parser.parse_args().tar_paths
    # Consider each argument passed as a file path and extract it.
    for argument in extraction_paths:
        if os.path.exists(argument):
            ExtractNested(argument)
        else:
            print('Not a valid path: %s' % argument)
            error_count += 1
    # Summarise failures counted here and inside the extract/walk functions.
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    if error_count != 0:
        print('%d error(s) occured.' % error_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment