Created
July 14, 2017 14:38
-
-
Save jobinlawrance/05069d54a76fa91b8eb9e0fb5b214a3f to your computer and use it in GitHub Desktop.
A python script to recursively extract .tar.gz files and .gz files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
"""A command line utility for recusively extracting nested tar archives.""" | |
__author__ = "Pushpak Dagade (पुष्पक दगड़े)" | |
__date__ = "$4 July, 2011 3:00:00 PM$" | |
import os | |
import sys | |
import re | |
import tarfile | |
import gzip | |
from argparse import ArgumentParser | |
# Version of this utility; both numbers are shown in the command line
# description built in the __main__ section.
major_version = 1
minor_version = 1
# Running total of failures. The extraction and directory-walking functions
# increment it, and the __main__ section reports it at the end of the run.
error_count = 0
# File extensions that are treated as tar archives.
# Edit this according to the archive types you want to extract. Keep in
# mind that these should be extractable by the tarfile module.
file_extensions = ('tar', 'tgz', 'tbz', 'tb2', 'tar.gz', 'tar.bz2')
# Names exported by a star-import of this module.
__all__ = ['ExtractNested', 'WalkTreeAndExtract']
def FileExtension(file_name):
    """Return the extension found at the end of file_name.

    'file_name' should be a string; it may be a bare file name or a full
    path. The compound suffixes 'tar.gz' and 'tar.bz2' are reported as a
    whole, otherwise the run of word characters after the last dot is
    returned. Matching ignores case and the extension is returned exactly
    as it is spelled in file_name. An empty string is returned when
    file_name does not end in an extension.

    Example #1:
        input (file_name) --> 'abc.tar.gz'
        return value      --> 'tar.gz'
    Example #2:
        input (file_name) --> 'abc.tar'
        return value      --> 'tar'
    """
    pattern = r"^.*?[.](?P<ext>tar[.]gz|tar[.]bz2|\w+)$"
    found = re.match(pattern, file_name, re.VERBOSE | re.IGNORECASE)
    return found.group('ext') if found else ''
def AppropriateFolderName(folder_fullpath):
    """Return a path, derived from folder_fullpath, that does not exist yet.

    If nothing exists at folder_fullpath it is returned unchanged, since it
    can be created without replacing anything. Otherwise a number is
    appended to the last path component (or an existing trailing number is
    incremented) until a path is found at which nothing exists.

    Examples:
        folder_fullpath = '/a/b/untitled folder'
        return value    = '/a/b/untitled folder'
            (nothing exists at that path)

        folder_fullpath = '/a/b/untitled folder'
        return value    = '/a/b/untitled folder 1'
            ('/a/b/untitled folder' exists, '/a/b/untitled folder 1'
             does not)

        folder_fullpath = '/a/b/untitled folder'
        return value    = '/a/b/untitled folder 2'
            ('/a/b/untitled folder' and '/a/b/untitled folder 1' both
             exist, '/a/b/untitled folder 2' does not)
    """
    numbered = re.compile(r'^(?P<name>.*)[ ](?P<num>\d+)$')
    candidate = folder_fullpath
    # Keep generating the next candidate until it names nothing on disk.
    while os.path.exists(candidate):
        parent, leaf = os.path.split(candidate)
        hit = numbered.match(leaf)
        if hit is None:
            # No trailing number yet: start counting at 1.
            leaf = '%s 1' % leaf
        else:
            # Already numbered: bump the trailing number by one.
            leaf = '%s %d' % (hit.group('name'), int(hit.group('num')) + 1)
        candidate = os.path.join(parent, leaf)
    return candidate
def Extract(tarfile_fullpath, delete_tar_file=True):
    """Extract a tar archive into a new folder next to it.

    The folder is named after the archive without its extension; if that
    name is already taken, AppropriateFolderName picks a numbered variant
    so nothing existing is overwritten.

    If delete_tar_file is True, the archive is deleted after a successful
    extraction; if False, it is kept. The default is True because nested
    archives are normally not wanted once they have been unpacked. Pass
    False to keep the archive you are passing.

    Returns the base name (not the full path) of the folder the archive
    was extracted to. On any failure an error message is printed, the
    module-level error_count is incremented and None is returned.
    """
    global error_count
    # Progress text is written with sys.stdout.write so the pieces stay on
    # one line ("Extracting 'x' to 'y'... Done!") on Python 2 and Python 3.
    sys.stdout.write("Extracting '%s' " % tarfile_fullpath)
    try:
        extension = FileExtension(tarfile_fullpath)
        if extension:
            # Drop the extension and the dot in front of it.
            stem = tarfile_fullpath[:-(len(extension) + 1)]
        else:
            # No extension: use the path as-is instead of chopping off its
            # last character; AppropriateFolderName will number it.
            stem = tarfile_fullpath
        # The with-block closes the archive even if extraction fails.
        with tarfile.open(tarfile_fullpath) as tar:
            extract_folder_fullpath = AppropriateFolderName(stem)
            extract_folder_name = os.path.basename(extract_folder_fullpath)
            sys.stdout.write("to '%s'... " % extract_folder_name)
            # NOTE(review): extractall trusts member names; an archive from
            # an untrusted source may write outside extract_folder_fullpath.
            tar.extractall(extract_folder_fullpath)
        sys.stdout.write("Done!\n")
        if delete_tar_file:
            os.remove(tarfile_fullpath)
        return extract_folder_name
    except Exception as exc:
        # Exceptions can occur while opening a damaged tar file.
        sys.stdout.write('(Error)\n(%s)\n' % str(exc).capitalize())
        error_count += 1
        return None
def ExtractGz(tarfile_fullpath, delete_tar_file=True):
    """Decompress a plain gzip file into a new file next to it.

    The output file is named after tarfile_fullpath without its extension;
    if that name is already taken, AppropriateFolderName picks a numbered
    variant so nothing existing is overwritten.

    If delete_tar_file is True, the gzip file is deleted after a successful
    decompression; if False, it is kept.

    Returns the base name (not the full path) of the decompressed file. On
    any failure an error message is printed, the module-level error_count
    is incremented and None is returned.
    """
    global error_count
    # Progress text is written with sys.stdout.write so the pieces stay on
    # one line on Python 2 and Python 3.
    sys.stdout.write("Extracting '%s' " % tarfile_fullpath)
    try:
        extension = FileExtension(tarfile_fullpath)
        if extension:
            # Drop the extension and the dot in front of it.
            stem = tarfile_fullpath[:-(len(extension) + 1)]
        else:
            # No extension: use the path as-is instead of chopping off its
            # last character; AppropriateFolderName will number it.
            stem = tarfile_fullpath
        # Both with-blocks close their files even if decompression fails.
        with gzip.open(tarfile_fullpath) as gz:
            extract_file_fullpath = AppropriateFolderName(stem)
            extract_file_name = os.path.basename(extract_file_fullpath)
            sys.stdout.write("to '%s'... " % extract_file_name)
            with open(extract_file_fullpath, "wb") as output:
                # Copy in chunks so a large payload is never held in
                # memory all at once.
                while True:
                    chunk = gz.read(1024 * 1024)
                    if not chunk:
                        break
                    output.write(chunk)
        sys.stdout.write("Done!\n")
        if delete_tar_file:
            os.remove(tarfile_fullpath)
        return extract_file_name
    except Exception as exc:
        # Exceptions can occur while reading a damaged gzip file.
        sys.stdout.write('(Error)\n(%s)\n' % str(exc).capitalize())
        error_count += 1
        return None
def WalkTreeAndExtract(parent_dir):
    """Recursively descend the directory tree rooted at parent_dir
    and extract each tar or gzip file found on the way down.

    Whatever an extraction produces is queued for inspection as well, so
    archives nested inside archives are unpacked too. Extensions are
    compared case-insensitively, so 'A.TAR.GZ' is handled like 'a.tar.gz'.

    Returns None. If parent_dir cannot be listed, an error message is
    printed and the module-level error_count is incremented.
    """
    global error_count
    try:
        dir_contents = os.listdir(parent_dir)
    except OSError as exc:
        # Can occur for a folder this program has no permission to open.
        print('Error occured. Could not open folder %s\n%s'
              % (parent_dir, str(exc).capitalize()))
        error_count += 1
        return

    # FileExtension matches regardless of case, so compare in lower case;
    # otherwise upper-case archive names would be silently skipped.
    tar_extensions = tuple(ext.lower() for ext in file_extensions)

    # dir_contents is deliberately appended to while it is being iterated:
    # the loop then also visits every entry added by an extraction.
    for content in dir_contents:
        content_fullpath = os.path.join(parent_dir, content)
        if os.path.isdir(content_fullpath):
            # If content is a folder, walk down it completely.
            WalkTreeAndExtract(content_fullpath)
        elif os.path.isfile(content_fullpath):
            # If content is a file, check whether it is an archive.
            extension = FileExtension(content_fullpath).lower()
            if extension in tar_extensions:
                extracted_name = Extract(content_fullpath)
            elif extension == 'gz':
                extracted_name = ExtractGz(content_fullpath)
            else:
                extracted_name = None
            if extracted_name:
                # Queue the newly extracted entry so that it is searched
                # later for more archives to extract.
                dir_contents.append(extracted_name)
        else:
            # Unknown file type.
            print('Skipping %s. <Neither file nor folder>' % content_fullpath)
def ExtractNested(tarfile_fullpath):
    """Extract the tar file at tarfile_fullpath and every archive nested
    inside it.

    The given tar file itself is kept (it is extracted with
    delete_tar_file=False). If its extraction fails, nothing further is
    done.
    """
    top_folder_name = Extract(tarfile_fullpath, False)
    if not top_folder_name:
        return
    # The archive was extracted next to itself. Now descend the extracted
    # directory structure and extract all other archives (recursively).
    containing_dir = os.path.dirname(tarfile_fullpath)
    WalkTreeAndExtract(os.path.join(containing_dir, top_folder_name))
if __name__ == '__main__':
    # Command line interface: one or more paths of tar files to extract.
    banner = 'Nested tar archive extractor %d.%d' % (major_version,
                                                     minor_version)
    parser = ArgumentParser(description=banner)
    parser.add_argument('tar_paths', metavar='path', type=str, nargs='+',
                        help='Path of the tar file to be extracted.')
    parsed = parser.parse_args()

    # Treat each argument as a file path and extract it.
    for argument in parsed.tar_paths:
        if not os.path.exists(argument):
            print('Not a valid path: %s' % argument)
            error_count += 1
            continue
        ExtractNested(argument)

    # Summarise failures collected during the whole run.
    if error_count:
        print('%d error(s) occured.' % error_count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment