Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python scripts to download, extract, and output oil, gas, and/or fracking well information from various states in the United States of America.
# -*- coding: utf-8 -*-
'''
This Python script is oriented towards oil, gas, and fracking well
information download, extract, and output at the state-level within
the United States of America. Not all states are present as not all
states either allow fracking or have any sites available for download.
This Python script is depended upon by another Python script entitled
PyFrackETL.py, which is also available as a Gist under the same name.
'''
import collections
import csv
import ogr
import os
import pyodbc
import shutil
import tempfile
import time
import xlrd
import zipfile
# from dbfpy import dbf
class Src2Tgt:
cp1252 = {
# from http://www.microsoft.com/typography/unicode/1252.htm
u"\u20AC": u"\x80", # EURO SIGN
u"\u201A": u"\x82", # SINGLE LOW-9 QUOTATION MARK
u"\u0192": u"\x83", # LATIN SMALL LETTER F WITH HOOK
u"\u201E": u"\x84", # DOUBLE LOW-9 QUOTATION MARK
u"\u2026": u"\x85", # HORIZONTAL ELLIPSIS
u"\u2020": u"\x86", # DAGGER
u"\u2021": u"\x87", # DOUBLE DAGGER
u"\u02C6": u"\x88", # MODIFIER LETTER CIRCUMFLEX ACCENT
u"\u2030": u"\x89", # PER MILLE SIGN
u"\u0160": u"\x8A", # LATIN CAPITAL LETTER S WITH CARON
u"\u2039": u"\x8B", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
u"\u0152": u"\x8C", # LATIN CAPITAL LIGATURE OE
u"\u017D": u"\x8E", # LATIN CAPITAL LETTER Z WITH CARON
u"\u2018": u"\x91", # LEFT SINGLE QUOTATION MARK
u"\u2019": u"\x92", # RIGHT SINGLE QUOTATION MARK
u"\u201C": u"\x93", # LEFT DOUBLE QUOTATION MARK
u"\u201D": u"\x94", # RIGHT DOUBLE QUOTATION MARK
u"\u2022": u"\x95", # BULLET
u"\u2013": u"\x96", # EN DASH
u"\u2014": u"\x97", # EM DASH
u"\u02DC": u"\x98", # SMALL TILDE
u"\u2122": u"\x99", # TRADE MARK SIGN
u"\u0161": u"\x9A", # LATIN SMALL LETTER S WITH CARON
u"\u203A": u"\x9B", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
u"\u0153": u"\x9C", # LATIN SMALL LIGATURE OE
u"\u017E": u"\x9E", # LATIN SMALL LETTER Z WITH CARON
u"\u0178": u"\x9F", # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
def __init__(self):
return
# -------------------------------------------------------------------------
# define the CSV to KML file method
# -------------------------------------------------------------------------
def csv2kml(self, csvFile, tgtPath, kmlFile, maxRecords, flushCount):
print ("")
print ("========================")
print ("CSV to KML conversion...")
print ("------------------------")
ogr.UseExceptions()
# expand any leading tilde
# to the user's home path
csvFile = os.path.expanduser(csvFile)
kmlFile = os.path.expanduser(kmlFile)
# verify that SHP file exists
if not os.path.exists(csvFile):
print ("CSV file '%s' does NOT exist!" % csvFile)
return
# make sure the target folder exists,
# creating it recursively if it does not
if kmlFile != "":
tgtFldr = os.path.dirname(kmlFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
print ("csvFile = %s" % csvFile)
if kmlFile == "":
kmlFile = os.path.join(tgtPath, os.path.basename(csvFile).lower().replace('.csv','.kml').replace('.txt','.kml'))
if os.path.dirname(kmlFile) == "":
kmlFile = os.path.join(tgtPath, kmlFile)
print ("kmlFile = %s" % kmlFile)
csvreader=csv.reader(open(csvFile,'r'))
headers=csvreader.next()
kmlDs = ogr.GetDriverByName('KML').CreateDataSource(kmlFile)
kmlLyr = kmlDs.CreateLayer(os.path.splitext(os.path.basename(kmlFile))[0])
for field in headers[:-1]: #skip kmlgeometry (assumed to be in last column)
field_def = ogr.FieldDefn(field)
kmlLyr.CreateField(field_def)
bgnTime = time.time()
rows = 0
for rec in csvreader:
feat = ogr.Feature(kmlLyr.GetLayerDefn())
for i,field in enumerate(headers[:-1]): #skip kmlgeometry
feat.SetField(field, rec[i])
feat.SetGeometry(ogr.CreateGeometryFromGML(rec[-1]))
kmlLyr.CreateFeature(feat)
rows += 1
if rows % flushCount == 0:
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
#clean up
del kmlLyr,kmlDs
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the DBF file to CSV file method
# -------------------------------------------------------------------------
def dbf2csv(self, dbfFile, tgtPath, csvFile, maxRecords, flushCount, csvDelimiter=','):
print("")
print("=============================")
print("DBF to CSV file conversion...")
print("-----------------------------")
print("dbfFile '%s'" % dbfFile)
print("csvFile '%s'" % csvFile)
# expand any leading tilde
# to the user's home path
dbfFile = os.path.expanduser(dbfFile)
csvFile = os.path.expanduser(csvFile)
# verify that DBF file exists
if not os.path.exists(dbfFile):
print ("DBF file '%s' does NOT exist!" % dbfFile)
return
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(dbfFile).lower().replace('.dbf','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
dbffile = dbf.Dbf(dbfFile)
colNames = []
for colName in dbffile.header.fields:
colNames.append(colName)
fWriter = open(csvFile,'w',newline='')
csvwriter=csv.Writer(fWriter, delimiter=csvDelimiter)
csvwriter.writerow(colNames)
bgnTime = time.time()
rows = 0
for row in dbffile:
csvwriter.writerow(row)
rows += 1
if rows % flushCount == 0:
fWriter.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
#clean up
dbffile.close()
fWriter.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the KML to CSV file method
# -------------------------------------------------------------------------
def kml2csv(self, kmlFile, tgtPath, csvFile, maxRecords, flushCount):
return
# -------------------------------------------------------------------------
# define the MDB to CSV file method
# -------------------------------------------------------------------------
def mdb2csv(self, mdbFile, tgtPath, csvFile, tblName, maxRecords, flushCount, csvDelimiter=','):
print("")
print("=============================")
print("MDB to CSV file conversion...")
print("-----------------------------")
print("mdbFile '%s'" % mdbFile)
print("csvFile '%s'" % csvFile)
print("tblName '%s'" % tblName)
# expand any leading tilde
# to the user's home path
mdbFile = os.path.expanduser(mdbFile)
csvFile = os.path.expanduser(csvFile)
# verify that MDB file exists
if not os.path.exists(mdbFile):
print ("TAB file '%s' does NOT exist!" % mdbFile)
return
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(mdbFile).lower().replace('.mdb','.csv').replace('.accdb','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
bgnTime = time.time()
mdbFile = os.path.abspath(mdbFile)
connString = "Driver={Microsoft Access Driver (*.mdb)};DBQ=%s;" % mdbFile
print ("pyODBC connString: %s" % connString)
conn = pyodbc.connect(connString)
cursor = conn.cursor()
colNames = collections.OrderedDict()
colValues = collections.OrderedDict()
for row in cursor.columns(table='%s' % tblName):
colNames[row.column_name] = row.column_name
fWriter = open(csvFile,'w',newline='')
csvwriter=csv.DictWriter(fWriter, delimiter=csvDelimiter, fieldnames=colNames)
csvwriter.writeheader()
cursor.execute("select * from [%s]" % tblName)
rows = 0
for row in cursor:
colValues.clear()
idx = 0
for colName in colNames:
colValues[colName] = row[idx]
idx += 1
csvwriter.writerow(colValues)
rows += 1
if rows % flushCount == 0:
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
conn.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the MDB to CSV file method
# -------------------------------------------------------------------------
def pdf2csv(self, pdfFile, tgtPath, csvFile, maxRecords, flushCount):
return
# -------------------------------------------------------------------------
# define the SHP to CSV file method
# -------------------------------------------------------------------------
def shp2csv(self, shpFile, tgtPath, csvFile, maxRecords, flushCount):
print ("")
print ("========================")
print ("SHP to CSV conversion...")
print ("------------------------")
# expand any leading tilde
# to the user's home path
shpFile = os.path.expanduser(shpFile)
csvFile = os.path.expanduser(csvFile)
# verify that SHP file exists
if not os.path.exists(shpFile):
print ("SHP file '%s' does NOT exist!" % shpFile)
return
elif not shpFile.lower().endswith(".shp"):
print ("SHP file name '%s' does NOT end with '.shp'!" % shpFile)
return
# make sure the target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
print ("shpFile = %s" % shpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(shpFile).lower().replace('.shp','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# Open files
csvfile=open(csvFile,'w', newline='')
shpDs=ogr.Open(shpFile)
shpLayer=shpDs.GetLayer()
# Get field names
shpDfn=shpLayer.GetLayerDefn()
nfields=shpDfn.GetFieldCount()
headers=[]
for i in range(nfields):
headers.append(shpDfn.GetFieldDefn(i).GetName())
headers.append('kmlgeometry')
csvwriter = csv.DictWriter(csvfile, headers)
try:
csvwriter.writeheader() #python 2.7+
except:
csvfile.write(','.join(headers)+'\n')
bgnTime = time.time()
# Write attributes and kml geometry out to csv
rows = 0
for shpFeat in shpLayer:
try:
attributes=shpFeat.items()
shpGeom=shpFeat.GetGeometryRef()
try:
attributes['kmlgeometry']=shpGeom.ExportToKML()
except AttributeError as e:
attributes['kmlgeometry']=""
print (e)
csvwriter.writerow(attributes)
except UnicodeEncodeError as e:
print (e)
rows += 1
if rows % flushCount == 0:
csvfile.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
# clean up
del csvwriter,shpLayer,shpDs
csvfile.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the SHP to KML file method
# -------------------------------------------------------------------------
def shp2kml(self, shpFile, tgtPath, kmlFile, maxRecords, flushCount):
print ("")
print ("========================")
print ("SHP to KML conversion...")
print ("------------------------")
# expand any leading tilde
# to the user's home path
shpFile = os.path.expanduser(shpFile)
kmlFile = os.path.expanduser(kmlFile)
# verify that SHP file exists
if not os.path.exists(shpFile):
print ("SHP file '%s' does NOT exist!" % shpFile)
return
elif not shpFile.lower().endswith(".shp"):
print ("SHP file name '%s' does NOT end with '.shp'!" % shpFile)
return
# make sure the KML target folder exists,
# creating it recursively if it does not
if kmlFile != "":
tgtFldr = os.path.dirname(kmlFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
print ("shpFile = %s" % shpFile)
if kmlFile == "":
kmlFile = os.path.join(tgtPath, os.path.basename(shpFile).lower().replace('.shp','.kml'))
if os.path.dirname(kmlFile) == "":
kmlFile = os.path.join(tgtPath, kmlFile)
print ("kmlFile = %s" % kmlFile)
# Open files
shpDs=ogr.Open(shpFile)
shpLayer=shpDs.GetLayer()
kmlDs = ogr.GetDriverByName('KML').CreateDataSource(kmlFile)
kmlLayer = kmlDs.CreateLayer(os.path.splitext(os.path.basename(kmlFile))[0])
# Get field names
shpDfn=shpLayer.GetLayerDefn()
nfields=shpDfn.GetFieldCount()
headers=[]
for i in range(nfields):
headers.append(shpDfn.GetFieldDefn(i).GetName())
field = shpDfn.GetFieldDefn(i).GetName()
field_def = ogr.FieldDefn(field)
kmlLayer.CreateField(field_def)
headers.append('kmlgeometry')
bgnTime = time.time()
# Write attributes and kml out to csv
rows = 0
for shpFeat in shpLayer:
attributes=shpFeat.items()
shpGeom=shpFeat.GetGeometryRef()
attributes['kmlgeometry']=shpGeom.ExportToKML()
# print (attributes)
kmlFeat = ogr.Feature(kmlLayer.GetLayerDefn())
for field in headers[:-1]: #skip kmlgeometry (assumed to be in last column)
kmlFeat.SetField(field, attributes[field])
kmlFeat.SetGeometry(ogr.CreateGeometryFromGML(attributes['kmlgeometry']))
kmlLayer.CreateFeature(kmlFeat)
rows += 1
if rows % flushCount == 0:
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("Rows: {:,}".format(rows))
if maxRecords > 0 and rows >= maxRecords:
break
# clean up
del shpLayer,shpDs,kmlLayer,kmlDs
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the SHP to KML and CSV files method
# -------------------------------------------------------------------------
def shp2kmlcsv(self, shpFile, tgtPath, kmlFile, csvFile, maxRecords, flushCount):
print ("")
print ("================================")
print ("SHP to KML and CSV conversion...")
print ("--------------------------------")
# expand any leading tilde
# to the user's home path
shpFile = os.path.expanduser(shpFile)
kmlFile = os.path.expanduser(kmlFile)
csvFile = os.path.expanduser(csvFile)
# verify that SHP file exists
if not os.path.exists(shpFile):
print ("SHP file '%s' does NOT exist!" % shpFile)
return
elif not shpFile.lower().endswith(".shp"):
print ("SHP file name '%s' does NOT end with '.shp'!" % shpFile)
return
# make sure the KML target folder exists,
# creating it recursively if it does not
if kmlFile != "":
tgtFldr = os.path.dirname(kmlFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
print ("shpFile = %s" % shpFile)
if kmlFile == "":
kmlFile = os.path.join(tgtPath, os.path.basename(shpFile).lower().replace('.shp','.kml'))
if os.path.dirname(kmlFile) == "":
kmlFile = os.path.join(tgtPath, kmlFile)
print ("kmlFile = %s" % kmlFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(shpFile).lower().replace('.shp','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# Open files
csvfile=open(csvFile,'w', newline='')
shpDs=ogr.Open(shpFile)
shpLayer=shpDs.GetLayer()
kmlDs = ogr.GetDriverByName('KML').CreateDataSource(kmlFile)
kmlLayer = kmlDs.CreateLayer(os.path.splitext(os.path.basename(kmlFile))[0])
# Get field names
shpDfn=shpLayer.GetLayerDefn()
nfields=shpDfn.GetFieldCount()
headers=[]
for i in range(nfields):
headers.append(shpDfn.GetFieldDefn(i).GetName())
field = shpDfn.GetFieldDefn(i).GetName()
field_def = ogr.FieldDefn(field)
kmlLayer.CreateField(field_def)
headers.append('kmlgeometry')
csvwriter = csv.DictWriter(csvfile, headers)
try:
csvwriter.writeheader() #python 2.7+
except:
header = ','.join(headers) + '\n'
csvfile.write(header.encode('utf-8'))
bgnTime = time.time()
# Write attributes and kml out to csv
rows = 0
for shpFeat in shpLayer:
attributes=shpFeat.items()
shpGeom=shpFeat.GetGeometryRef()
attributes['kmlgeometry']=shpGeom.ExportToKML()
csvwriter.writerow(attributes)
kmlFeat = ogr.Feature(kmlLayer.GetLayerDefn())
for field in headers[:-1]: #skip kmlgeometry (assumed to be in last column)
kmlFeat.SetField(field, attributes[field])
kmlFeat.SetGeometry(ogr.CreateGeometryFromGML(attributes['kmlgeometry']))
kmlLayer.CreateFeature(kmlFeat)
rows += 1
if rows % flushCount == 0:
csvfile.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
# clean up
del csvwriter,shpLayer,shpDs,kmlLayer,kmlDs
csvfile.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the TAB-delimited file to CSV file method
# -------------------------------------------------------------------------
def tab2csv(self, tabFile, tgtPath, csvFile, maxRecords, flushCount, tabDelimiter='\t', csvDelimiter=','):
print("")
print("=============================")
print("TAB to CSV file conversion...")
print("-----------------------------")
print("tabFile '%s'" % tabFile)
print("csvFile '%s'" % csvFile)
# expand any leading tilde
# to the user's home path
tabFile = os.path.expanduser(tabFile)
csvFile = os.path.expanduser(csvFile)
# verify that TAB file exists
if not os.path.exists(tabFile):
print ("TAB file '%s' does NOT exist!" % tabFile)
return
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tabFile).lower().replace('.tab','.csv').replace('.txt','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
fReader = open(tabFile,'r')
csvreader=csv.DictReader(fReader, delimiter=tabDelimiter)
fWriter = open(csvFile,'w',newline='')
csvwriter=csv.DictWriter(fWriter, delimiter=csvDelimiter, fieldnames=csvreader.fieldnames)
csvwriter.writeheader()
bgnTime = time.time()
rows = 0
cols = dict()
for rowDict in csvreader:
cols.clear()
for colName in csvreader.fieldnames:
cols[colName] = ' '.join(rowDict[colName].strip().split())
csvwriter.writerow(cols)
rows += 1
if rows % flushCount == 0:
fWriter.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
#clean up
fReader.close()
fWriter.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the TXT file to CSV file method
# -------------------------------------------------------------------------
def txt2csv(self, txtFile, tgtPath, csvFile, maxRecords, flushCount, txtDelimiter=',', csvDelimiter=','):
print("")
print("=============================")
print("TXT to CSV file conversion...")
print("-----------------------------")
print("txtFile '%s'" % txtFile)
print("csvFile '%s'" % csvFile)
# expand any leading tilde
# to the user's home path
txtFile = os.path.expanduser(txtFile)
csvFile = os.path.expanduser(csvFile)
# verify that TXT file exists
if not os.path.exists(txtFile):
print ("TXT file '%s' does NOT exist!" % txtFile)
return
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(txtFile).lower().replace('.txt','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
fReader = open(txtFile,'r')
csvreader=csv.DictReader(fReader, delimiter=txtDelimiter)
fWriter = open(csvFile,'w',newline='')
csvwriter=csv.DictWriter(fWriter, delimiter=csvDelimiter, fieldnames=csvreader.fieldnames)
csvwriter.writeheader()
bgnTime = time.time()
rows = 0
cols = dict()
for rowDict in csvreader:
cols.clear()
for colName in csvreader.fieldnames:
try:
cols[colName] = ' '.join(rowDict[colName].strip().split())
except AttributeError as e:
cols[colName] = ''
print (e)
csvwriter.writerow(cols)
rows += 1
if rows % flushCount == 0:
fWriter.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
#clean up
fReader.close()
fWriter.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the XLS to CSV file method
# -------------------------------------------------------------------------
def xls2csv(self, xlsFile, tgtPath, csvFile, maxRecords, flushCount):
print("")
print("=============================")
print("XLS to CSV file conversion...")
print("-----------------------------")
print("xlsFile '%s'" % xlsFile)
print("csvFile '%s'" % csvFile)
# expand any leading tilde
# to the user's home path
xlsFile = os.path.expanduser(xlsFile)
csvFile = os.path.expanduser(csvFile)
# verify that TAB file exists
if not os.path.exists(xlsFile):
print ("XLS file '%s' does NOT exist!" % xlsFile)
return
# make sure the CSV target folder exists,
# creating it recursively if it does not
if csvFile != "":
tgtFldr = os.path.dirname(csvFile)
if tgtFldr == "":
tgtFldr = tgtPath
if not os.path.exists(tgtFldr):
os.makedirs(tgtFldr)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(xlsFile).lower().replace('.xls','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
xlsWB = xlrd.open_workbook(xlsFile)
xlsSH = xlsWB.sheet_by_index(0)
nbrRows = xlsSH.nrows
fWriter = open(csvFile,'w',newline='')
csvwriter=csv.writer(fWriter, delimiter=',')
bgnTime = time.time()
rows = 0
values = []
for rowNbr in range(0, nbrRows):
values.clear()
for entry in xlsSH.row_values(rowNbr):
values.append(str(entry).strip())
try:
csvwriter.writerow(values)
except UnicodeEncodeError as e:
print (e)
rows += 1
if rows % flushCount == 0:
fWriter.flush()
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
if maxRecords > 0 and rows >= maxRecords:
break
fWriter.close()
print ("-----------------------------------")
endTime = time.time()
rcdsPerSec = rows / (endTime - bgnTime)
print ("Rows: {:,} @ {:,.0f} records/second".format(rows, rcdsPerSec))
print ("")
return
# -------------------------------------------------------------------------
# define the unZIP a file to a target folder method
# -------------------------------------------------------------------------
def zip2dir(self, zipFile, tgtPath):
print("")
print("=================================")
print("ZIP to TGT directory expansion...")
print("---------------------------------")
print("zipFile '%s'" % zipFile)
print("tgtPath '%s'" % tgtPath)
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
# create the target folder
# recursively if not extant
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# extract all of the files/folders
# within the zip file to the target folder
with zipfile.ZipFile(zipFile, "r") as z:
z.extractall(tgtPath)
return
# -------------------------------------------------------------------------
# define the ZIP file to a TXT file and then to a CSV file method
# -------------------------------------------------------------------------
def zip2csv2csv(self, zipFile, tgtPath, csvFile, maxRecords, flushCount, unzPath, srcDelimiter=',', csvDelimiter=','):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the SHP file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".csv"):
tmpFile = os.path.join(unzPath, aFile)
break
# verify that a SHP file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a CSV file!" % zipFile)
return
else:
print ("zipFile = %s" % zipFile)
print ("srcFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower())
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV file from the SRC file
self.txt2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount, srcDelimiter, csvDelimiter)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# defin the ZIP file to a DBF file, then to a CSV file method
# -------------------------------------------------------------------------
def zip2dbf2csv(self, zipFile, tgtPath, csvFile, maxRecords, flushCount, unzPath, csvDelimiter=','):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the TAB or TXT file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".dbf"):
tmpFile = os.path.join(unzPath, aFile)
print ("zipFile = %s" % zipFile)
print ("dbfFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.dbf','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV file from the DBF file
self.dbf2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount, csvDelimiter)
# verify that a TAB or TXT file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a DBF file!" % zipFile)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# defin the ZIP file to a MDB file, then to a CSV file method
# -------------------------------------------------------------------------
def zip2mdb2csv(self, zipFile, tgtPath, csvFile, tblName, maxRecords, flushCount, unzPath, csvDelimiter=','):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the TAB or TXT file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".mdb") or aFile.lower().endswith(".accdb"):
tmpFile = os.path.join(unzPath, aFile)
print ("zipFile = %s" % zipFile)
print ("mdbFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.mdb','.csv').replace('.accdb','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV files from the TAB file
self.mdb2csv(tmpFile, tgtPath, csvFile, tblName, maxRecords, flushCount, csvDelimiter)
# verify that a TAB or TXT file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a MDB or ACCDB file!" % zipFile)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# define the ZIP file to a SHP file, then to CSV files method
# -------------------------------------------------------------------------
def zip2shp2csv(self, zipFile, tgtPath, csvFile, maxRecords, flushCount, unzPath):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the SHP file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".shp"):
tmpFile = os.path.join(unzPath, aFile)
break
# verify that a SHP file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a SHP file!" % zipFile)
return
else:
print ("zipFile = %s" % zipFile)
print ("shpFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.shp','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV file from the SHP file
self.shp2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# define the ZIP file to a SHP file, then to KML and CSV files method
# -------------------------------------------------------------------------
def zip2shp2kmlcsv(self, zipFile, tgtPath, kmlFile, csvFile, maxRecords, flushCount, unzPath):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the SHP file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".shp"):
tmpFile = os.path.join(unzPath, aFile)
break
# verify that a SHP file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a SHP file!" % zipFile)
return
else:
print ("zipFile = %s" % zipFile)
print ("shpFile = %s" % tmpFile)
if kmlFile == "":
kmlFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.shp','.kml'))
if os.path.dirname(kmlFile) == "":
kmlFile = os.path.join(tgtPath, kmlFile)
print ("kmlFile = %s" % kmlFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.shp','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the KML and CSV files from the SHP file
self.shp2kmlcsv(tmpFile, tgtPath, kmlFile, csvFile, maxRecords, flushCount)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# defin the ZIP file to a TAB file, then to a CSV file method
# -------------------------------------------------------------------------
def zip2tab2csv(self, zipFile, tgtPath, tabFile, csvFile, maxRecords, flushCount, unzPath, tabDelimiter='\t', csvDelimiter=','):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the TAB or TXT file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".tab") or aFile.lower().endswith(".txt"):
tmpFile = os.path.join(unzPath, aFile)
print ("zipFile = %s" % zipFile)
print ("tabFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.tab','.csv').replace('.txt','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV files from the TAB file
self.tab2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount, tabDelimiter, csvDelimiter)
# verify that a TAB or TXT file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a TAB or TXT file!" % zipFile)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# define the ZIP file to a TXT file and then to a CSV file method
# -------------------------------------------------------------------------
def zip2txt2csv(self, zipFile, tgtPath, csvFile, maxRecords, flushCount, unzPath, txtDelimiter=',', csvDelimiter=','):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the SHP file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".txt"):
tmpFile = os.path.join(unzPath, aFile)
break
# verify that a SHP file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a TXT file!" % zipFile)
return
else:
print ("zipFile = %s" % zipFile)
print ("txtFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.txt','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV file from the TXT file
self.txt2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount, txtDelimiter, csvDelimiter)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
# -------------------------------------------------------------------------
# defin the ZIP file to a XLS file, then to a CSV file method
# -------------------------------------------------------------------------
def zip2xls2csv(self, zipFile, tgtPath, csvFile, maxRecords, flushCount, unzPath):
# expand any leading tilde
# to the user's home path
zipFile = os.path.expanduser(zipFile)
tgtPath = os.path.expanduser(tgtPath)
unzPath = os.path.expanduser(unzPath)
# verify that ZIP file exists
if not os.path.exists(zipFile):
print ("ZIP file '%s' does NOT exist!" % zipFile)
return
rmvFldr = False
# if no target folder specified,
if tgtPath == "":
# output an error message
print ("Target folder was NOT specified!")
else:
print ("Target folder '%s'was specified." % tgtPath)
# make sure the target folder exists,
# creating it recursively if it does not
if not os.path.exists(tgtPath):
os.makedirs(tgtPath)
# if UNZIP folder was NOT specifed,
# generate a temporary UNZIP folder
# to be removed after processing
if unzPath == "":
unzPath = tempfile.mkdtemp()
rmvFldr = True
print ("Unzip folder '%s' was generated." % unzPath)
else:
print ("Unzip folder '%s' was specified." % unzPath)
# make sure the UNZIP folder exists,
# creating it recursively if it does not
if not os.path.exists(unzPath):
os.makedirs(unzPath)
# unzip the ZIP file
# to the UNZIP folder
self.zip2dir(zipFile, unzPath)
# find the TAB or TXT file in
# the specified UNZIP folder
tmpFile = ""
for aFile in os.listdir(unzPath):
if aFile.lower().endswith(".xls") or aFile.lower().endswith(".xlsx"):
tmpFile = os.path.join(unzPath, aFile)
print ("zipFile = %s" % zipFile)
print ("xlsFile = %s" % tmpFile)
if csvFile == "":
csvFile = os.path.join(tgtPath, os.path.basename(tmpFile).lower().replace('.xls','.csv').replace('.xlsx','.csv'))
if os.path.dirname(csvFile) == "":
csvFile = os.path.join(tgtPath, csvFile)
print ("csvFile = %s" % csvFile)
# create the CSV files from the XLS file
self.xls2csv(tmpFile, tgtPath, csvFile, maxRecords, flushCount)
# verify that a TAB or TXT file was found
if tmpFile.strip() == "":
print ("Zip file '%s' did NOT contain a XLS file!" % zipFile)
# if UNZIP folder removal specified,
# remove the UNZIP folder in question
if rmvFldr:
shutil.rmtree(unzPath)
return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.