Created
November 23, 2023 13:31
-
-
Save orellabac/3e451ec15e08f0edfcde8a765d893938 to your computer and use it in GitHub Desktop.
Snowpark Java SPSS\.sav files metadata extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Go to https://mvnrepository.com/artifact/com.bedatadriven.spss/spss-reader | |
and download the jar file. | |
And upload it into an stage | |
*/ | |
create or replace function read_spss_metadata(filename string) | |
RETURNS TABLE(COLUMN_NAME TEXT,TYPE TEXT,NULLABLE BOOLEAN,EXPRESSION TEXT,FILENAMES TEXT,ORDER_ID INT) | |
language java | |
imports = ('@mystage/spss-reader-1.3.jar') | |
handler='SPSSMetadataUDTF' | |
target_path='@mystage/spss-reader-metadata.jar' | |
as $$ | |
import com.bedatadriven.spss.SpssDataFileReader; | |
import com.bedatadriven.spss.SpssVariable; | |
import java.util.stream.Stream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.util.*; | |
import com.snowflake.snowpark_java.types.SnowflakeFile; | |
class OutputRowMetadata { | |
public String COLUMN_NAME; | |
public String TYPE; | |
public boolean NULLABLE; | |
public String EXPRESSION; | |
public String FILENAMES; | |
public int ORDER_ID; | |
public OutputRowMetadata(String COLUMN_NAME, String TYPE, boolean NULLABLE, String EXPRESSION, String FILENAMES, int ORDER_ID) { | |
this.COLUMN_NAME = COLUMN_NAME; | |
this.TYPE = TYPE; | |
this.NULLABLE = NULLABLE; | |
this.EXPRESSION = EXPRESSION; | |
this.FILENAMES = FILENAMES; | |
this.ORDER_ID = ORDER_ID; | |
} | |
} | |
class SPSSDecoder { | |
public static Stream<OutputRowMetadata> getMedatata(InputStream inputStream) { | |
try { | |
var reader = new SpssDataFileReader(inputStream); | |
var result = new ArrayList<OutputRowMetadata>(); | |
for(var variable: reader.getVariables()) | |
{ | |
String snowflakeType = getType(variable); | |
var columnName = variable.getVariableName(); | |
var expression = "$1:"+ columnName+ "::"+ snowflakeType; | |
var metadataInfo = new OutputRowMetadata( | |
columnName, | |
snowflakeType, | |
false, | |
expression,null, | |
variable.getIndex()); | |
result.add(metadataInfo); | |
} | |
return result.stream(); | |
} catch (IOException e) { | |
return null; | |
} | |
} | |
private static String getType(SpssVariable variable) { | |
var format =variable.getPrintFormat(); | |
switch(format.getType()) | |
{ | |
case 1: return "STRING"; | |
case 5: return "NUMBER(38," + format.getDecimalPlaces() + ")"; | |
case 21: return "TIME"; | |
case 22: return "DATETIME"; | |
case 38: return "DATE"; | |
} | |
return null; | |
} | |
} | |
class SPSSMetadataUDTF { | |
public static Class getOutputClass() { | |
return OutputRowMetadata.class; | |
} | |
public Stream<OutputRowMetadata> process(String inputFile) { | |
SnowflakeFile file = SnowflakeFile.newInstance(inputFile, false); | |
return SPSSDecoder.getMedatata(file.getInputStream()); | |
} | |
public Stream<OutputRowMetadata> endPartition() { | |
return Stream.empty(); | |
} | |
} | |
$$; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment