Skip to content

Instantly share code, notes, and snippets.

@r39132
Created May 23, 2023 01:16
Show Gist options
  • Save r39132/f5324fa46449f6e6f78114239d0b8035 to your computer and use it in GitHub Desktop.
ChatGPT Json-to-Parquet-Converter
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * Converts a JSON file ("data.json") holding an array of records into a
 * Parquet file ("data.parquet"), using Avro {@link GenericRecord}s as the
 * intermediate representation.
 *
 * <p>NOTE(review): {@link #createAvroSchema(JsonNode)} parses the FIRST
 * element of the JSON array as a literal Avro schema definition — it does
 * NOT infer a schema from the data, despite what the original comments
 * suggested. Confirm the input file's first element really is an Avro
 * schema object before relying on this.
 */
public class JsonToParquetConverter {

    public static void main(String[] args) throws IOException {
        // Load the entire JSON document into a Jackson tree.
        ObjectMapper mapper = new ObjectMapper();
        JsonNode rootNode = mapper.readTree(new File("data.json"));

        // Derive the Avro schema used for every record (see class note).
        Schema schema = createAvroSchema(rootNode);

        Configuration conf = new Configuration();
        Path outputPath = new Path("data.parquet");

        // try-with-resources guarantees the writer is closed (and the
        // Parquet footer flushed) even when a record fails to convert or
        // write — the original leaked the writer on any exception.
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
                .<GenericRecord>builder(outputPath)
                .withSchema(schema)
                .withConf(conf)
                .build()) {
            for (JsonNode jsonNode : rootNode) {
                writer.write(createGenericRecord(jsonNode, schema));
            }
        }
    }

    /**
     * Parses an Avro {@link Schema} from the first element of the JSON array.
     *
     * @param jsonNode root array node; element 0 must be a valid Avro schema
     *                 definition in JSON form
     * @return the parsed Avro schema
     * @throws org.apache.avro.SchemaParseException if element 0 is not a
     *         well-formed Avro schema
     */
    private static Schema createAvroSchema(JsonNode jsonNode) {
        Schema.Parser parser = new Schema.Parser();
        return parser.parse(jsonNode.get(0).toString());
    }

    /**
     * Converts one JSON object into a {@link GenericRecord} matching the
     * given schema. Fields absent from the JSON are left unset, so the
     * builder falls back to the schema's field defaults (and throws if a
     * missing field has no default).
     *
     * @throws IOException if a BYTES-typed field's base64 payload cannot be
     *         decoded (propagated from {@link JsonNode#binaryValue()}; the
     *         original code omitted this declaration and did not compile)
     */
    private static GenericRecord createGenericRecord(JsonNode jsonNode, Schema schema)
            throws IOException {
        GenericRecordBuilder recordBuilder = new GenericRecordBuilder(schema);
        for (Schema.Field field : schema.getFields()) {
            String fieldName = field.name();
            if (jsonNode.has(fieldName)) {
                JsonNode fieldValue = jsonNode.get(fieldName);
                recordBuilder.set(fieldName, createAvroValue(fieldValue, field.schema()));
            }
        }
        return recordBuilder.build();
    }

    /**
     * Converts a single JSON value to the Java value Avro expects for the
     * given schema type. Supports scalar types, BYTES, nested RECORDs, and
     * NULL; unions, arrays, maps, and enums are not handled and raise
     * {@link IllegalArgumentException}.
     *
     * @throws IOException if decoding a BYTES value fails
     */
    private static Object createAvroValue(JsonNode jsonNode, Schema schema)
            throws IOException {
        switch (schema.getType()) {
            case NULL:
                // Generalization: null-typed fields were previously rejected.
                return null;
            case BOOLEAN:
                return jsonNode.booleanValue();
            case INT:
                return jsonNode.intValue();
            case LONG:
                return jsonNode.longValue();
            case FLOAT:
                return jsonNode.floatValue();
            case DOUBLE:
                return jsonNode.doubleValue();
            case STRING:
                return jsonNode.textValue();
            case BYTES:
                // binaryValue() base64-decodes the JSON text; may throw IOException.
                return jsonNode.binaryValue();
            case RECORD:
                return createGenericRecord(jsonNode, schema);
            default:
                throw new IllegalArgumentException(
                        "Unsupported Avro schema type: " + schema.getType());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment