# mtrcn/compare_dfs.py

## compare_dfs.py
def _normalize_column_name(name):
    """Return *name* lower-cased with every space character removed."""
    return name.lower().replace(" ", "")


def compareFields(jdbcDF, domainDf):
    """Print a schema comparison report for a source and a destination frame.

    Source column names are normalised (lower-cased, spaces removed) before
    being matched against the destination column names, which are used as-is.
    The report is written to stdout in this order:

    * "Missing Columns:"   - normalised source names absent from the destination.
    * "Redundant Columns:" - destination names with no matching source column.
    * the source and destination column counts.
    * "Mismatched Types:"  - a dict mapping ``str(source dataType)`` to the
      list of normalised column names whose destination ``dataType`` has a
      different string form.

    Args:
        jdbcDF: Source frame. Must expose ``columns`` (a list of names) and
            ``schema.fields`` (a parallel sequence of objects with a
            ``dataType`` attribute), e.g. a Spark DataFrame.
        domainDf: Destination frame exposing the same attributes.

    Returns:
        None. The report is only printed.
    """
    # Read the column lists once; every later step works on these snapshots.
    source_columns = list(jdbcDF.columns)
    destination_columns = list(domainDf.columns)
    normalized_source = [_normalize_column_name(c) for c in source_columns]

    # Sets give O(1) membership tests instead of rescanning a list per column.
    source_lookup = set(normalized_source)
    destination_lookup = set(destination_columns)

    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print("Missing Columns:")
    print([c for c in normalized_source if c not in destination_lookup])
    print("Redundant Columns:")
    print([c for c in destination_columns if c not in source_lookup])

    source_fields = jdbcDF.schema.fields
    destination_fields = domainDf.schema.fields

    # A plain dict (not defaultdict) keeps the printed repr a bare ``{...}``.
    mismatched_types = {}
    for col in source_columns:
        col_lower_name = _normalize_column_name(col)
        # Only columns present on both sides can have a type mismatch. The
        # check is made against domainDf itself, not an unrelated global frame.
        if col_lower_name not in destination_lookup:
            continue
        # Fields are looked up by the position of the first column carrying
        # the name, so duplicate names resolve to the first occurrence.
        destination_type = str(
            destination_fields[destination_columns.index(col_lower_name)].dataType
        )
        source_type = str(source_fields[source_columns.index(col)].dataType)
        if source_type != destination_type:
            mismatched_types.setdefault(source_type, []).append(col_lower_name)

    print(
        "Source Column Count: " + str(len(source_columns))
        + " Destination Column Count: " + str(len(destination_columns))
    )
    print("Mismatched Types:")
    print(mismatched_types)