Skip to content

Instantly share code, notes, and snippets.

@daz10000
Created May 17, 2024 06:32
Show Gist options
  • Save daz10000/d65ede204e845ed097d58f750a797ec2 to your computer and use it in GitHub Desktop.
Save daz10000/d65ede204e845ed097d58f750a797ec2 to your computer and use it in GitHub Desktop.
Make a synthetic Parquet file, then query it using the Python and .NET (dotnet) wrappers for DuckDB
#r "nuget:DuckDB.NET.Data.Full"
open DuckDB.NET.Data
open System
open System.IO
// One lookup key per line; this file is produced by the generator script below.
let keys = File.ReadAllLines("keys.txt")
/// Queries the parquet file at [path] for all rows whose col71 equals [key],
/// returning the key column (col71) of every matching row.
/// Opens a fresh in-memory DuckDB connection per call.
/// NOTE(review): the Python version of this query selects col1 first, not
/// col71 — confirm which column list is intended.
let getGeneActualMutations (path:string) (key:string) =
    use connection = new DuckDBConnection("DataSource=:memory:")
    connection.Open()
    use command = connection.CreateCommand()
    // The key comes from an external file, so bind it as a named parameter
    // instead of splicing it into the SQL text (quoting bugs / injection).
    command.CommandText <-
        $"SELECT col71,col2,col3,col20,col21,col34, col8,col51,col60,col72,col43,col19 FROM read_parquet('{path}') where col71 = $key"
    command.Parameters.Add(DuckDBParameter("key", key)) |> ignore
    [| use reader = command.ExecuteReader()
       while reader.Read() do
           reader.GetString(0) |]
// Query once per key, timing each lookup and the whole run.
printfn "Starting"
let start = DateTime.Now
keys
|> Array.iter (fun g ->
    let t0 = DateTime.Now
    let hits = getGeneActualMutations "test.pq" g
    printfn $"{g}: {hits.Length} {(DateTime.Now-t0).TotalSeconds}")
printfn $"Total time={(DateTime.Now-start).TotalSeconds}"
import duckdb
import time

# One key per line; strip newlines up front, and close the file handle
# (the original leaked it via a bare open().readlines()).
with open('keys.txt') as fh:
    keys = [line.strip() for line in fh]

# NOTE(review): the F# version of this query selects col71 first, not col1 —
# confirm which column list is intended.
start = time.time()
for g in keys:
    start2 = time.time()
    # Bind the key as a prepared-statement parameter instead of splicing it
    # into the SQL with an f-string (quoting bugs / injection).
    x = duckdb.execute(
        "SELECT col1,col2,col3,col20,col21,col34, col8,col51,col60,col72,col43,col19 "
        "FROM read_parquet('test.pq') WHERE col71 = ?",
        [g],
    ).df()
    print(f"{g}: {len(x)} {time.time()-start2:.2f} seconds")
print(f"Total time={time.time()-start}")
#r "nuget:Parquet.Net"
open Parquet
open Parquet.Schema
open Parquet.Data
open System.IO
open System
open System.Text
// Output parquet path, and the file the pool of key values is written to
// (read back by the query scripts above).
let outFile = "test.pq"
let keyFile = "keys.txt"
// Physical type of each column, in column order (col0, col1, ...).
let colPatterns = "INT64,DOUBLE,INT64,INT64,BYTE_ARRAY,INT64,INT64,DOUBLE,DOUBLE,DOUBLE,INT64,INT64,DOUBLE,INT64,INT64,DOUBLE,INT64,INT64,DOUBLE,BYTE_ARRAY,BYTE_ARRAY,BYTE_ARRAY,BYTE_ARRAY,INT64,INT64,INT64,INT64,DOUBLE,DOUBLE,DOUBLE,INT64,DOUBLE,DOUBLE,BOOLEAN,INT64,DOUBLE,DOUBLE,INT64,FLOAT,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,INT64,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,DOUBLE,INT64,FLOAT,INT64,FLOAT,INT64,DOUBLE,BYTE_ARRAY,BOOLEAN,STRING,BYTE_ARRAY".Split(",")
// Max string lengths for the BYTE_ARRAY columns, consumed in declaration
// order within each row group (appears to be one entry per BYTE_ARRAY
// column — TODO confirm the count matches colPatterns).
let byteArraySizes = [| 5; 10 ; 4; 4600 ; 5 ; 6;3 |]
// Index of the key column (col71, the STRING column).
// NOTE(review): not referenced anywhere below — confirm it is still needed.
let keyField = 71
// rowGroups row groups of groupSize rows each (10M rows total).
let rowGroups = 20
let groupSize = 500000
let rnd = new Random()
/// Returns a random alphanumeric string of length 1..maxLength inclusive.
/// Random.Next's upper bound is exclusive, hence the +1 — the original
/// rnd.Next(1, maxLength) could never actually produce a string of the
/// full maxLength.
let randomString (maxLength:int) =
    let chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    let length = rnd.Next(1, maxLength + 1)
    String(Array.init length (fun _ -> chars.[rnd.Next(chars.Length)]))
/// UTF-8 encoding of a random string of at most maxLength characters.
let randomStringAsBytes (maxLength:int) =
    Encoding.UTF8.GetBytes(randomString maxLength)
// Pre-generate the key pool so the dataset draws from a fixed, realistically
// sized set of key values, then persist it for the query scripts to read.
let keyValues = [| for _ in 1 .. 10000 -> randomString 10 |]
File.WriteAllLines(keyFile, keyValues)
// Build the parquet schema from the column-type table.
// Two fixes vs. the original:
//  * FLOAT columns use float32 — in F# `float` is System.Double, so
//    DataField<float> silently declared a DOUBLE column for every FLOAT
//    entry in colPatterns.
//  * An unrecognized type string fails fast instead of being silently
//    mapped to int (consistent with the writer loop below).
let finalSchema =
    colPatterns
    |> Array.mapi (fun i x ->
        match x with
        | "INT64" -> new DataField<int64>($"col{i}") :> Field
        | "DOUBLE" -> new DataField<double>($"col{i}") :> Field
        | "BYTE_ARRAY" -> new DataField<byte[]>($"col{i}") :> Field
        | "STRING" -> new DataField<string>($"col{i}") :> Field
        | "BOOLEAN" -> new DataField<bool>($"col{i}") :> Field
        | "FLOAT" -> new DataField<float32>($"col{i}") :> Field
        | other -> failwithf "unknown column type '%s' at col%d" other i)

// Generate and write the file: rowGroups groups of groupSize random rows.
task {
    let outSchema = new ParquetSchema(finalSchema)
    use outFs = File.OpenWrite(outFile)
    use! writer = ParquetWriter.CreateAsync(outSchema, outFs)
    for _ in 1 .. rowGroups do
        use outWriteGroup = writer.CreateRowGroup()
        // BYTE_ARRAY columns consume byteArraySizes entries in order;
        // the index restarts for every row group.
        let mutable nextByteArrayIndex = 0
        for i in 0 .. colPatterns.Length - 1 do
            let field = finalSchema.[i] :?> DataField
            match colPatterns.[i] with
            | "INT64" ->
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init<int64> groupSize (fun _ -> int64 (rnd.Next(0, 1000000)))))
            | "DOUBLE" ->
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init<double> groupSize (fun _ -> rnd.NextDouble())))
            | "STRING" ->
                // The key column: draw each row's key from the pre-generated pool.
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init groupSize (fun _ -> keyValues.[rnd.Next(0, keyValues.Length)])))
            | "BYTE_ARRAY" ->
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init groupSize (fun _ -> randomStringAsBytes (byteArraySizes.[nextByteArrayIndex]))))
                nextByteArrayIndex <- nextByteArrayIndex + 1
            | "BOOLEAN" ->
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init<bool> groupSize (fun _ -> rnd.Next(0, 2) = 1)))
            | "FLOAT" ->
                // float32 values to match the FLOAT physical type in the schema.
                do! outWriteGroup.WriteColumnAsync(DataColumn(field, Array.init<float32> groupSize (fun _ -> float32 (rnd.NextDouble()))))
            | other -> failwithf "unknown column type '%s' at col%d" other i
    return ()
} |> Async.AwaitTask |> Async.RunSynchronously
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment