CompressedArchive: a Galaxy datatype whose files will not be unpacked or sniffed during upload.
diff -r eb61b02da186 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample Sun Jan 11 21:43:13 2015 -0500
+++ b/config/datatypes_conf.xml.sample Mon Jan 12 17:37:25 2015 +0100
@@ -140,6 +140,8 @@
<datatype extension="rgb" type="galaxy.datatypes.images:Rgb" mimetype="image/rgb"/>
<datatype extension="pbm" type="galaxy.datatypes.images:Pbm" mimetype="image/pbm"/>
<datatype extension="pgm" type="galaxy.datatypes.images:Pgm" mimetype="image/pgm"/>
+ <datatype extension="searchgui_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
+ <datatype extension="peptideshaker_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
<datatype extension="eps" type="galaxy.datatypes.images:Eps" mimetype="image/eps"/>
<datatype extension="rast" type="galaxy.datatypes.images:Rast" mimetype="image/rast"/>
<datatype extension="laj" type="galaxy.datatypes.images:Laj"/>
diff -r eb61b02da186 lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py Sun Jan 11 21:43:13 2015 -0500
+++ b/lib/galaxy/datatypes/binary.py Mon Jan 12 17:37:25 2015 +0100
@@ -106,6 +106,30 @@
Binary.register_unsniffable_binary_ext("ab1")
+class CompressedArchive( Binary ):
+ """
+ Class describing a compressed binary file.
+ This class can be subclassed to implement archive filetypes that will not be unpacked by upload.py.
+ """
+ file_ext = "compressed_archive"
+ compressed = True
+
+ def set_peek( self, dataset, is_multi_byte=False ):
+ if not dataset.dataset.purged:
+ dataset.peek = "Compressed binary file"
+ dataset.blurb = data.nice_size( dataset.get_size() )
+ else:
+ dataset.peek = 'file does not exist'
+ dataset.blurb = 'file purged from disk'
+
+ def display_peek( self, dataset ):
+ try:
+ return dataset.peek
+ except:
+ return "Compressed binary file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+Binary.register_unsniffable_binary_ext("compressed_archive")
+
class GenericAsn1Binary( Binary ):
"""Class for generic ASN.1 binary format"""
diff -r eb61b02da186 tools/data_source/upload.py
--- a/tools/data_source/upload.py Sun Jan 11 21:43:13 2015 -0500
+++ b/tools/data_source/upload.py Mon Jan 12 17:37:25 2015 +0100
@@ -120,171 +120,176 @@
data_type = type_info[0]
ext = type_info[1]
if not data_type:
- # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
- is_gzipped, is_valid = check_gzip( dataset.path )
- if is_gzipped and not is_valid:
- file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
- return
- elif is_gzipped and is_valid:
- if link_data_only == 'copy_files':
- # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
- CHUNK_SIZE = 2**20 # 1Mb
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
- while 1:
- try:
- chunk = gzipped_file.read( CHUNK_SIZE )
- except IOError:
- os.close( fd )
- os.remove( uncompressed )
- file_err( 'Problem decompressing gzipped data', dataset, json_file )
- return
- if not chunk:
- break
- os.write( fd, chunk )
- os.close( fd )
- gzipped_file.close()
- # Replace the gzipped file with the decompressed file if it's safe to do so
- if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
- dataset.path = uncompressed
- else:
- shutil.move( uncompressed, dataset.path )
- os.chmod(dataset.path, 0644)
- dataset.name = dataset.name.rstrip( '.gz' )
- data_type = 'gzip'
- if not data_type and bz2 is not None:
- # See if we have a bz2 file, much like gzip
- is_bzipped, is_valid = check_bz2( dataset.path )
- if is_bzipped and not is_valid:
+ root_datatype = registry.get_datatype_by_extension( dataset.file_type )
+ if getattr( root_datatype, 'compressed', False ):
+ data_type = 'compressed archive'
+ ext = dataset.file_type
+ else:
+ # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
+ is_gzipped, is_valid = check_gzip( dataset.path )
+ if is_gzipped and not is_valid:
file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
return
- elif is_bzipped and is_valid:
+ elif is_gzipped and is_valid:
if link_data_only == 'copy_files':
- # We need to uncompress the temp_name file
+ # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
CHUNK_SIZE = 2**20 # 1Mb
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
while 1:
try:
- chunk = bzipped_file.read( CHUNK_SIZE )
+ chunk = gzipped_file.read( CHUNK_SIZE )
except IOError:
os.close( fd )
os.remove( uncompressed )
- file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+ file_err( 'Problem decompressing gzipped data', dataset, json_file )
return
if not chunk:
break
os.write( fd, chunk )
os.close( fd )
- bzipped_file.close()
- # Replace the bzipped file with the decompressed file if it's safe to do so
+ gzipped_file.close()
+ # Replace the gzipped file with the decompressed file if it's safe to do so
if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
dataset.path = uncompressed
else:
shutil.move( uncompressed, dataset.path )
os.chmod(dataset.path, 0644)
- dataset.name = dataset.name.rstrip( '.bz2' )
- data_type = 'bz2'
- if not data_type:
- # See if we have a zip archive
- is_zipped = check_zip( dataset.path )
- if is_zipped:
- if link_data_only == 'copy_files':
- CHUNK_SIZE = 2**20 # 1Mb
- uncompressed = None
- uncompressed_name = None
- unzipped = False
- z = zipfile.ZipFile( dataset.path )
- for name in z.namelist():
- if name.endswith('/'):
- continue
- if unzipped:
- stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
- break
- fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
- if sys.version_info[:2] >= ( 2, 6 ):
- zipped_file = z.open( name )
- while 1:
+ dataset.name = dataset.name.rstrip( '.gz' )
+ data_type = 'gzip'
+ if not data_type and bz2 is not None:
+ # See if we have a bz2 file, much like gzip
+ is_bzipped, is_valid = check_bz2( dataset.path )
+ if is_bzipped and not is_valid:
+ file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
+ return
+ elif is_bzipped and is_valid:
+ if link_data_only == 'copy_files':
+ # We need to uncompress the temp_name file
+ CHUNK_SIZE = 2**20 # 1Mb
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
+ while 1:
+ try:
+ chunk = bzipped_file.read( CHUNK_SIZE )
+ except IOError:
+ os.close( fd )
+ os.remove( uncompressed )
+ file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
+ return
+ if not chunk:
+ break
+ os.write( fd, chunk )
+ os.close( fd )
+ bzipped_file.close()
+ # Replace the bzipped file with the decompressed file if it's safe to do so
+ if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+ dataset.path = uncompressed
+ else:
+ shutil.move( uncompressed, dataset.path )
+ os.chmod(dataset.path, 0644)
+ dataset.name = dataset.name.rstrip( '.bz2' )
+ data_type = 'bz2'
+ if not data_type:
+ # See if we have a zip archive
+ is_zipped = check_zip( dataset.path )
+ if is_zipped:
+ if link_data_only == 'copy_files':
+ CHUNK_SIZE = 2**20 # 1Mb
+ uncompressed = None
+ uncompressed_name = None
+ unzipped = False
+ z = zipfile.ZipFile( dataset.path )
+ for name in z.namelist():
+ if name.endswith('/'):
+ continue
+ if unzipped:
+ stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+ break
+ fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False )
+ if sys.version_info[:2] >= ( 2, 6 ):
+ zipped_file = z.open( name )
+ while 1:
+ try:
+ chunk = zipped_file.read( CHUNK_SIZE )
+ except IOError:
+ os.close( fd )
+ os.remove( uncompressed )
+ file_err( 'Problem decompressing zipped data', dataset, json_file )
+ return
+ if not chunk:
+ break
+ os.write( fd, chunk )
+ os.close( fd )
+ zipped_file.close()
+ uncompressed_name = name
+ unzipped = True
+ else:
+ # python < 2.6 doesn't have a way to read members in chunks(!)
try:
- chunk = zipped_file.read( CHUNK_SIZE )
+ outfile = open( uncompressed, 'wb' )
+ outfile.write( z.read( name ) )
+ outfile.close()
+ uncompressed_name = name
+ unzipped = True
except IOError:
os.close( fd )
os.remove( uncompressed )
file_err( 'Problem decompressing zipped data', dataset, json_file )
return
- if not chunk:
- break
- os.write( fd, chunk )
- os.close( fd )
- zipped_file.close()
- uncompressed_name = name
- unzipped = True
+ z.close()
+ # Replace the zipped file with the decompressed file if it's safe to do so
+ if uncompressed is not None:
+ if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
+ dataset.path = uncompressed
+ else:
+ shutil.move( uncompressed, dataset.path )
+ os.chmod(dataset.path, 0644)
+ dataset.name = uncompressed_name
+ data_type = 'zip'
+ if not data_type:
+ # TODO refactor this logic. check_binary isn't guaranteed to be
+ # correct since it only looks at whether the first 100 chars are
+ # printable or not. If someone specifies a known unsniffable
+ # binary datatype and check_binary fails, the file gets mangled.
+ if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
+ # We have a binary dataset, but it is not Bam, Sff or Pdf
+ data_type = 'binary'
+ #binary_ok = False
+ parts = dataset.name.split( "." )
+ if len( parts ) > 1:
+ ext = parts[-1].strip().lower()
+ if not Binary.is_ext_unsniffable(ext):
+ file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
+ return
+ elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
+ err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
+ file_err( err_msg, dataset, json_file )
+ return
+ if not data_type:
+ # We must have a text file
+ if check_html( dataset.path ):
+ file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
+ return
+ if data_type != 'binary':
+ if link_data_only == 'copy_files':
+ if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
+ in_place = False
+ # Convert universal line endings to Posix line endings, but allow the user to turn it off,
+ # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
+ # corrupting the content of those files.
+ if dataset.to_posix_lines:
+ tmpdir = output_adjacent_tmpdir( output_path )
+ tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
+ if dataset.space_to_tab:
+ line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
else:
- # python < 2.5 doesn't have a way to read members in chunks(!)
- try:
- outfile = open( uncompressed, 'wb' )
- outfile.write( z.read( name ) )
- outfile.close()
- uncompressed_name = name
- unzipped = True
- except IOError:
- os.close( fd )
- os.remove( uncompressed )
- file_err( 'Problem decompressing zipped data', dataset, json_file )
- return
- z.close()
- # Replace the zipped file with the decompressed file if it's safe to do so
- if uncompressed is not None:
- if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
- dataset.path = uncompressed
- else:
- shutil.move( uncompressed, dataset.path )
- os.chmod(dataset.path, 0644)
- dataset.name = uncompressed_name
- data_type = 'zip'
- if not data_type:
- # TODO refactor this logic. check_binary isn't guaranteed to be
- # correct since it only looks at whether the first 100 chars are
- # printable or not. If someone specifies a known unsniffable
- # binary datatype and check_binary fails, the file gets mangled.
- if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
- # We have a binary dataset, but it is not Bam, Sff or Pdf
- data_type = 'binary'
- #binary_ok = False
- parts = dataset.name.split( "." )
- if len( parts ) > 1:
- ext = parts[-1].strip().lower()
- if not Binary.is_ext_unsniffable(ext):
- file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
- return
- elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
- err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
- file_err( err_msg, dataset, json_file )
- return
- if not data_type:
- # We must have a text file
- if check_html( dataset.path ):
- file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
- return
- if data_type != 'binary':
- if link_data_only == 'copy_files':
- if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]:
- in_place = False
- # Convert universal line endings to Posix line endings, but allow the user to turn it off,
- # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
- # corrupting the content of those files.
- if dataset.to_posix_lines:
- tmpdir = output_adjacent_tmpdir( output_path )
- tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
- if dataset.space_to_tab:
- line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
- else:
- line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
- if dataset.file_type == 'auto':
- ext = sniff.guess_ext( dataset.path, registry.sniff_order )
- else:
- ext = dataset.file_type
- data_type = ext
+ line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
+ if dataset.file_type == 'auto':
+ ext = sniff.guess_ext( dataset.path, registry.sniff_order )
+ else:
+ ext = dataset.file_type
+ data_type = ext
# Save job info for the framework
if ext == 'auto' and dataset.ext:
ext = dataset.ext
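
The upload.py hunk is large but mostly re-indentation: the existing gzip/bz2/zip/binary/text handling moves into an else: branch, and the only new logic is the check at its top. If the user-selected file type resolves to a datatype whose compressed attribute is true, the file is accepted verbatim and every decompression and sniffing branch is skipped. A condensed sketch of that dispatch (names follow the patch; the surrounding upload machinery is elided):

def resolve_data_type( dataset, registry ):
    # Condensed sketch of the added dispatch, not the full upload.py logic.
    root_datatype = registry.get_datatype_by_extension( dataset.file_type )
    if getattr( root_datatype, 'compressed', False ):
        # Declared-compressed datatypes (e.g. searchgui_archive) bypass
        # the gzip/bz2/zip handling and keep their bytes untouched.
        return 'compressed archive', dataset.file_type
    # Otherwise fall through to the usual checks: gzip, bz2, zip,
    # unsniffable binary, then text with optional newline conversion.
    return None, None
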