# kamawanu/xls2plain.py

## xls2plain.py
#!python
# -*- coding: utf-8 -*-

import sys, unicodedata, logging, re, os.path

# Directory part of the path this script was started with (argv[0]).
basepath = os.path.dirname(sys.argv[0])

# The pyExcelerator archive is looked up next to the script; adding the zip
# file to sys.path is what allows the "import pyExcelerator" below to be
# satisfied from it.
# Project page: http://sourceforge.net/projects/pyexcelerator/
bundled_archive = os.path.join(basepath, "pyexcelerator-0.6.4.1.zip")
sys.path.append(bundled_archive)

import pyExcelerator

class patchedReader(pyExcelerator.CompoundDoc.Reader):  # customises pyExcelerator internals
    """Compound-document reader with a fault-tolerant ``get_stream_data``.

    Only ``get_stream_data`` is overridden; everything else is inherited from
    ``pyExcelerator.CompoundDoc.Reader``.
    """

    def get_stream_data(self, data, SAT, start_sid, sect_size):
        """Collect the bytes of the sector chain that starts at ``start_sid``.

        Arguments:
            data: sliceable buffer holding the sectors; sector ``n`` is
                ``data[n*sect_size:(n+1)*sect_size]``.
            SAT: table indexed by sector id whose entry is the id of the next
                sector in the chain; a negative entry ends the chain
                (presumably the sector allocation table -- confirm against
                the caller).
            start_sid: id of the first sector of the stream.
            sect_size: size of one sector, in the units ``data`` is sliced by.

        Returns:
            The sectors of the chain joined in chain order.

        The walk is deliberately best effort: when the chain points outside
        ``SAT`` or loops back onto a sector already visited, it stops and the
        sectors collected so far are returned instead of raising.
        """
        sid = start_sid
        # Inclusive (first, last) runs of physically consecutive sectors.
        chunks = [(sid, sid)]
        # Sectors already on the chain; guards against cyclic tables.
        visited = set([sid])

        try:
            while SAT[sid] >= 0:
                next_in_chain = SAT[sid]
                if next_in_chain in visited:
                    # A cycle would otherwise keep this loop running forever.
                    break
                visited.add(next_in_chain)
                run_start, run_end = chunks[-1]
                if next_in_chain == run_end + 1:
                    # Directly follows the current run: extend it.
                    chunks[-1] = (run_start, next_in_chain)
                else:
                    # Any other jump (forward gap or backward) starts a new
                    # run.  Merging a backward jump into the current run, as
                    # a "difference <= 1" test does, shrinks the run and
                    # drops sectors from the result.
                    chunks.append((next_in_chain, next_in_chain))
                sid = next_in_chain
        except (IndexError, KeyError, TypeError):
            # Broken chain: there is nothing better to do than ignore it and
            # use what has been collected.  Only lookup failures are caught
            # so that Ctrl-C and SystemExit still propagate.
            logging.debug("sector chain broken at sid %r; keeping %d run(s)",
                          sid, len(chunks))

        return ''.join([data[s * sect_size:(f + 1) * sect_size]
                        for s, f in chunks])

# Monkey-patch: rebind pyExcelerator's CompoundDoc.Reader name to patchedReader.
pyExcelerator.CompoundDoc.Reader = patchedReader  # swapped in because the stock reader raises an exception inside pyExcelerator

def extractxlsbook(filename1):  # per-workbook processing
    """Print a summary line for one .xls file, then dump each of its sheets.

    Arguments:
        filename1: path of the workbook to read.

    Output (stdout):
        ``# <path> <size> bytes modtime=<YYYYmmdd-HHMMSS> md5=<hex> ``
        followed, for every sheet returned by ``pyExcelerator.parse_xls``, by
        a banner line carrying the UTF-8 encoded sheet name and the sheet
        contents as printed by ``extractxlssheet``.
    """
    import hashlib
    import os.path
    import time

    xlsbooksheetdicts = pyExcelerator.parse_xls(filename1)  # pyExcelerator/ImportXLS.py

    # Hash the raw bytes: the file is binary, so text mode could alter or
    # truncate what is read on platforms that translate line endings.  The
    # with-block also closes the handle, which the bare open().read() did not.
    with open(filename1, "rb") as xlsfile:
        digest = hashlib.md5(xlsfile.read()).hexdigest()

    modtime = time.strftime("%Y%m%d-%H%M%S",
                            time.localtime(os.path.getmtime(filename1)))
    print("# %s %d bytes modtime=%s md5=%s " % (filename1,
                                                os.path.getsize(filename1),
                                                modtime,
                                                digest))

    for (sheetName, sheetcellsdict) in xlsbooksheetdicts:
        print("*" * 50 + " " + sheetName.encode("utf-8") + " " + "*" * 50)
        extractxlssheet(sheetName, sheetcellsdict)

def extractxlssheet(sheetName, sheetcellsdict):  # per-sheet processing
    """Print the cells of one sheet as UTF-8 text, one line per populated row.

    Arguments:
        sheetName: not used by this function; kept so callers need not change.
        sheetcellsdict: mapping of ``(row, col)`` to the cell value.

    Layout rules:
        * Cells are visited in sorted ``(row, col)`` order.
        * The first cell of a row is indented by two spaces per column index.
        * Each further cell is appended after ``max(col - len(line), 1)``
          two-space units.
        * Once a row's line is longer than 1000 characters, the remaining
          cells of that row are skipped.
        * Every row without cells between two populated rows yields one
          empty line.
    """
    lastrow = None
    lastbuffer = None

    for (row, col) in sorted(sheetcellsdict):
        value = unicode(sheetcellsdict[(row, col)])

        if row == lastrow:
            if len(lastbuffer) > 1000:
                # Width cap reached: drop the rest of THIS row only.  Leaving
                # the loop here would silently discard every following row
                # of the sheet as well.
                continue
            padding = max(col - len(lastbuffer), 1)
            lastbuffer = lastbuffer + "  " * padding + value
        else:
            # A new row starts: flush the finished one first.
            if lastbuffer is not None:
                print(lastbuffer.encode("utf-8"))
            if lastrow is not None:
                # One empty line per skipped (cell-less) row.
                for _ in xrange(lastrow, row - 1):
                    print("")
            lastbuffer = "  " * col + value
            lastrow = row

    # Flush the last row, if the sheet had any cells at all.
    if lastbuffer is not None:
        print(lastbuffer.encode("utf-8"))

# Script body: every command-line argument is taken as a workbook path and
# dumped in the order given.
workbook_args = sys.argv[1:]
for filename1 in workbook_args:
    extractxlsbook(filename1)
	#!python
	# -*- coding: utf-8 -*-

	import sys, unicodedata, logging, re, os.path

	# Directory part of the path this script was started with (argv[0]).
	basepath = os.path.dirname(sys.argv[0])

	# The pyExcelerator archive is looked up next to the script; adding the zip
	# file to sys.path is what allows the "import pyExcelerator" below to be
	# satisfied from it.
	# Project page: http://sourceforge.net/projects/pyexcelerator/
	bundled_archive = os.path.join(basepath, "pyexcelerator-0.6.4.1.zip")
	sys.path.append(bundled_archive)

	import pyExcelerator

	class patchedReader(pyExcelerator.CompoundDoc.Reader):  # customises pyExcelerator internals
	    """Compound-document reader with a fault-tolerant ``get_stream_data``.

	    Only ``get_stream_data`` is overridden; everything else is inherited from
	    ``pyExcelerator.CompoundDoc.Reader``.
	    """

	    def get_stream_data(self, data, SAT, start_sid, sect_size):
	        """Collect the bytes of the sector chain that starts at ``start_sid``.

	        Arguments:
	            data: sliceable buffer holding the sectors; sector ``n`` is
	                ``data[n*sect_size:(n+1)*sect_size]``.
	            SAT: table indexed by sector id whose entry is the id of the next
	                sector in the chain; a negative entry ends the chain
	                (presumably the sector allocation table -- confirm against
	                the caller).
	            start_sid: id of the first sector of the stream.
	            sect_size: size of one sector, in the units ``data`` is sliced by.

	        Returns:
	            The sectors of the chain joined in chain order.

	        The walk is deliberately best effort: when the chain points outside
	        ``SAT`` or loops back onto a sector already visited, it stops and the
	        sectors collected so far are returned instead of raising.
	        """
	        sid = start_sid
	        # Inclusive (first, last) runs of physically consecutive sectors.
	        chunks = [(sid, sid)]
	        # Sectors already on the chain; guards against cyclic tables.
	        visited = set([sid])

	        try:
	            while SAT[sid] >= 0:
	                next_in_chain = SAT[sid]
	                if next_in_chain in visited:
	                    # A cycle would otherwise keep this loop running forever.
	                    break
	                visited.add(next_in_chain)
	                run_start, run_end = chunks[-1]
	                if next_in_chain == run_end + 1:
	                    # Directly follows the current run: extend it.
	                    chunks[-1] = (run_start, next_in_chain)
	                else:
	                    # Any other jump (forward gap or backward) starts a new
	                    # run.  Merging a backward jump into the current run, as
	                    # a "difference <= 1" test does, shrinks the run and
	                    # drops sectors from the result.
	                    chunks.append((next_in_chain, next_in_chain))
	                sid = next_in_chain
	        except (IndexError, KeyError, TypeError):
	            # Broken chain: there is nothing better to do than ignore it and
	            # use what has been collected.  Only lookup failures are caught
	            # so that Ctrl-C and SystemExit still propagate.
	            logging.debug("sector chain broken at sid %r; keeping %d run(s)",
	                          sid, len(chunks))

	        # The slice bounds are sector index times sector size; the "*"
	        # operators were missing from this copy of the code.
	        return ''.join([data[s * sect_size:(f + 1) * sect_size]
	                        for s, f in chunks])

	# Monkey-patch: rebind pyExcelerator's CompoundDoc.Reader name to patchedReader.
	pyExcelerator.CompoundDoc.Reader = patchedReader  # swapped in because the stock reader raises an exception inside pyExcelerator

	def extractxlsbook(filename1):  # per-workbook processing
	    """Print a summary line for one .xls file, then dump each of its sheets.

	    Arguments:
	        filename1: path of the workbook to read.

	    Output (stdout):
	        ``# <path> <size> bytes modtime=<YYYYmmdd-HHMMSS> md5=<hex> ``
	        followed, for every sheet returned by ``pyExcelerator.parse_xls``, by
	        a banner line carrying the UTF-8 encoded sheet name and the sheet
	        contents as printed by ``extractxlssheet``.
	    """
	    import hashlib
	    import os.path
	    import time

	    xlsbooksheetdicts = pyExcelerator.parse_xls(filename1)  # pyExcelerator/ImportXLS.py

	    # Hash the raw bytes: the file is binary, so text mode could alter or
	    # truncate what is read on platforms that translate line endings.  The
	    # with-block also closes the handle, which the bare open().read() did not.
	    with open(filename1, "rb") as xlsfile:
	        digest = hashlib.md5(xlsfile.read()).hexdigest()

	    modtime = time.strftime("%Y%m%d-%H%M%S",
	                            time.localtime(os.path.getmtime(filename1)))
	    print("# %s %d bytes modtime=%s md5=%s " % (filename1,
	                                                os.path.getsize(filename1),
	                                                modtime,
	                                                digest))

	    for (sheetName, sheetcellsdict) in xlsbooksheetdicts:
	        # Banner of 50 asterisks on each side of the sheet name.
	        print("*" * 50 + " " + sheetName.encode("utf-8") + " " + "*" * 50)
	        extractxlssheet(sheetName, sheetcellsdict)

	def extractxlssheet(sheetName, sheetcellsdict):  # per-sheet processing
	    """Print the cells of one sheet as UTF-8 text, one line per populated row.

	    Arguments:
	        sheetName: not used by this function; kept so callers need not change.
	        sheetcellsdict: mapping of ``(row, col)`` to the cell value.

	    Layout rules:
	        * Cells are visited in sorted ``(row, col)`` order.
	        * The first cell of a row is indented by two spaces per column index.
	        * Each further cell is appended after ``max(col - len(line), 1)``
	          two-space units.
	        * Once a row's line is longer than 1000 characters, the remaining
	          cells of that row are skipped.
	        * Every row without cells between two populated rows yields one
	          empty line.
	    """
	    lastrow = None
	    lastbuffer = None

	    for (row, col) in sorted(sheetcellsdict):
	        value = unicode(sheetcellsdict[(row, col)])

	        if row == lastrow:
	            if len(lastbuffer) > 1000:
	                # Width cap reached: drop the rest of THIS row only.  Leaving
	                # the loop here would silently discard every following row
	                # of the sheet as well.
	                continue
	            padding = max(col - len(lastbuffer), 1)
	            # The padding unit is two spaces, as in the intact copy of this
	            # function earlier in the file.
	            lastbuffer = lastbuffer + "  " * padding + value
	        else:
	            # A new row starts: flush the finished one first.
	            if lastbuffer is not None:
	                print(lastbuffer.encode("utf-8"))
	            if lastrow is not None:
	                # One empty line per skipped (cell-less) row.
	                for _ in xrange(lastrow, row - 1):
	                    print("")
	            lastbuffer = "  " * col + value
	            lastrow = row

	    # Flush the last row, if the sheet had any cells at all.
	    if lastbuffer is not None:
	        print(lastbuffer.encode("utf-8"))

	# Script body: every command-line argument is taken as a workbook path and
	# dumped in the order given.
	for filename1 in sys.argv[1:]:
	    extractxlsbook(filename1)