Created
May 17, 2024 06:32
-
-
Save daz10000/d65ede204e845ed097d58f750a797ec2 to your computer and use it in GitHub Desktop.
Make a synthetic parquet file, query using python and dotnet wrappers for duckdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "nuget:DuckDB.NET.Data.Full" | |
open DuckDB.NET.Data | |
open System | |
open System.IO | |
let keys = File.ReadAllLines("keys.txt") | |
let getGeneActualMutations (path:string) (key:string) = | |
use connection = new DuckDBConnection("DataSource=:memory:") | |
connection.Open() | |
use command = connection.CreateCommand() | |
command.CommandText <- $"SELECT col71,col2,col3,col20,col21,col34, col8,col51,col60,col72,col43,col19 FROM read_parquet('{path}') where col71 = '{key}'" | |
[| use reader = command.ExecuteReader() | |
while reader.Read() do | |
reader.GetString(0) | |
|] | |
printfn "Starting" | |
let start = DateTime.Now | |
for g in keys do | |
let localNow = DateTime.Now | |
let x = getGeneActualMutations "test.pq" g | |
printfn $"{g}: {x.Length} {(DateTime.Now-localNow).TotalSeconds}" | |
printfn $"Total time={(DateTime.Now-start).TotalSeconds}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import duckdb | |
keys = open('keys.txt').readlines()#[:10] | |
import time | |
start = time.time() | |
for g in keys: | |
g = g.strip() | |
start2 = time.time() | |
x = duckdb.sql(f"""SELECT col1,col2,col3,col20,col21,col34, col8,col51,col60,col72,col43,col19 FROM read_parquet('test.pq') where col71 = '{g}'""").df() | |
print(f"{g}: {len(x)} {time.time()-start2:.2f} seconds") | |
print(f"Total time={time.time()-start}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "nuget:Parquet.Net" | |
open Parquet | |
open Parquet.Schema | |
open Parquet.Data | |
open System.IO | |
open System | |
open System.Text | |
let outFile = "test.pq" | |
let keyFile = "keys.txt" | |
let colPatterns = "INT64,DOUBLE,INT64,INT64,BYTE_ARRAY,INT64,INT64,DOUBLE,DOUBLE,DOUBLE,INT64,INT64,DOUBLE,INT64,INT64,DOUBLE,INT64,INT64,DOUBLE,BYTE_ARRAY,BYTE_ARRAY,BYTE_ARRAY,BYTE_ARRAY,INT64,INT64,INT64,INT64,DOUBLE,DOUBLE,DOUBLE,INT64,DOUBLE,DOUBLE,BOOLEAN,INT64,DOUBLE,DOUBLE,INT64,FLOAT,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,DOUBLE,INT64,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,BOOLEAN,DOUBLE,INT64,FLOAT,INT64,FLOAT,INT64,DOUBLE,BYTE_ARRAY,BOOLEAN,STRING,BYTE_ARRAY".Split(",") | |
let byteArraySizes = [| 5; 10 ; 4; 4600 ; 5 ; 6;3 |] | |
let keyField = 71 | |
let rowGroups = 20 | |
let groupSize = 500000 | |
let rnd = new Random() | |
let randomString (maxLength:int) = | |
let length = rnd.Next(1,maxLength) | |
let chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" | |
let sb = new System.Text.StringBuilder() | |
for i in 1..length do | |
let idx = rnd.Next(0,chars.Length) | |
sb.Append(chars.[idx]) |> ignore | |
sb.ToString() | |
let randomStringAsBytes (maxLength:int) = randomString maxLength |> Encoding.UTF8.GetBytes | |
// pre make these to get a realistic total size of keys across the data | |
let keyValues = Array.init<string> 10000 (fun _ -> randomString 10) | |
// write keys to file | |
File.WriteAllLines(keyFile,keyValues) | |
let finalSchema = | |
colPatterns | |
|> Array.mapi (fun i x -> | |
match x with | |
| "INT64" -> new DataField<int64>($"col{i}") :> Field | |
| "DOUBLE" -> new DataField<double>($"col{i}") :> Field | |
| "BYTE_ARRAY" -> new DataField<byte[]>($"col{i}") :> Field | |
| "STRING" -> new DataField<string>($"col{i}") :> Field | |
| "BOOLEAN" -> new DataField<bool>($"col{i}") :> Field | |
| "FLOAT" -> new DataField<float>($"col{i}") :> Field | |
| _ -> new DataField<int>($"col{i}") :> Field ) | |
task { | |
let outSchema = new ParquetSchema(finalSchema) | |
use outFs = File.OpenWrite(outFile) | |
use! writer = ParquetWriter.CreateAsync(outSchema,outFs) | |
// write columns | |
for _ in 1..rowGroups do | |
use outWriteGroup = writer.CreateRowGroup() | |
let mutable nextByteArrayIndex = 0 | |
for i in 0..colPatterns.Length-1 do | |
match colPatterns.[i] with | |
| "INT64" -> do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init<int64> groupSize (fun _ -> int64 (rnd.Next(0,1000000))))) | |
| "DOUBLE" -> do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init<double> groupSize (fun _ -> rnd.NextDouble()))) | |
| "STRING" -> | |
// choose a random key value | |
do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init groupSize (fun _ -> keyValues.[rnd.Next(0,keyValues.Length)]))) | |
| "BYTE_ARRAY" -> | |
// generate random strings | |
do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init groupSize (fun _ -> randomStringAsBytes (byteArraySizes.[nextByteArrayIndex])))) | |
nextByteArrayIndex <- nextByteArrayIndex + 1 | |
| "BOOLEAN" -> do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init<bool> groupSize (fun _ -> rnd.Next(0,2) = 1))) | |
| "FLOAT" -> do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,Array.init<float> groupSize (fun _ -> float (rnd.NextDouble() )))) | |
| _ -> failwithf "unknowntype" // do! outWriteGroup.WriteColumnAsync(DataColumn(finalSchema[i] :?> DataField,int i) | |
return () | |
} |> Async.AwaitTask |> Async.RunSynchronously |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment