Created
July 16, 2013 04:41
-
-
Save bohdanszymanik/6005813 to your computer and use it in GitHub Desktop.
Sample using both Windows Azure Table Storage and a bit of simulation with MathNet.Numerics.Statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// We're going to simulate some financial transactions and experiment with storing them in Azure table storage
// | |
//#r "..\packages\Fog.0.1.3.1\Lib\Net40\Fog.dll" | |
//#r "..\packages\FogMyBuild\Debug\Fog.dll" | |
//#r "Microsoft.WindowsAzure.Diagnostics" | |
#r @"C:\wd\AzureTxnUploader\packages\WindowsAzure.Storage.2.0.6.0\lib\net40\Microsoft.WindowsAzure.Storage.dll" | |
//#r @"C:\wd\AzureTxnUploader\packages\Microsoft.Data.Edm.5.5.0\lib\net40\Microsoft.Data.Edm.dll" | |
//#r @"C:\wd\AzureTxnUploader\packages\System.Spatial.5.5.0\lib\net40\System.Spatial.dll" | |
#r @"C:\wd\AzureTxnUploader\packages\Microsoft.Data.OData.5.2.0\lib\net40\Microsoft.Data.OData.dll" | |
#r @"C:\wd\AzureTxnUploader\packages\Microsoft.Data.Edm.5.2.0\lib\net40\Microsoft.Data.Edm.dll" | |
#r @"C:\wd\AzureTxnUploader\packages\System.Spatial.5.2.0\lib\net40\System.Spatial.dll" | |
#r "Microsoft.WindowsAzure.ServiceRuntime" | |
#r "System.Data.Services.Client" | |
#r "System.Linq" | |
open System | |
open Microsoft.WindowsAzure | |
open Microsoft.WindowsAzure.Storage | |
open System.Windows.Forms | |
// Account name and key. Modify for your account. | |
let accountName = "someAccountName"
let accountKey = "someAccountKey"

// Get a reference to the storage account, with authentication credentials.
// The second constructor argument (`true`) asks for HTTPS endpoints.
let credentials = new Auth.StorageCredentials(accountName, accountKey)
let storageAccount = new CloudStorageAccount(credentials, true)

// NOTE: the script used to contain three lines of the form
//     storageAccount.BlobEndpoint = Uri("http://bohszy.blob.core.windows.net/")
// In F# a bare `=` in statement position is an equality test, not an assignment
// (assignment is `<-`), so those lines only computed a boolean and threw it away.
// They have been removed; the account keeps the endpoints it was constructed with.
// NOTE(review): if custom endpoints are really needed, CloudStorageAccount appears to
// offer a constructor overload taking explicit blob/queue/table URIs - confirm against
// the storage client version referenced above.

// Create a new client and use it to create the table.
let tableClient = storageAccount.CreateCloudTableClient()
let table = tableClient.GetTableReference("someTable")
// CreateIfNotExists returns a bool; we do not use it, so discard it explicitly.
table.CreateIfNotExists() |> ignore
// quickly experiment putting some entities into the table | |
// for our test scenario let's consider fictional transactions
// we'll use the partition and row keys to identify an individual month's worth of transactions for a customer's account | |
//[<DataServiceKey("PartitionKey", "RowKey")>] | |
/// Table entity used for the first quick experiment: it adds a single
/// string property on top of what Table.TableEntity already provides.
type YearMonthTxns() =
    inherit Table.TableEntity()

    /// Free-form transaction payload; starts out as the empty string.
    member val txns : string = "" with get, set
// we have a few choices for how to identify a customer-account-year-month's worth of transactions | |
// we can make the partition key more or less fine grained | |
// that is, we could use cust-acct-year as a partition key, month as row key, or | |
// cust-year as partition key, account-month as row key, or even | |
// How to decide??? Lots of good advice here: http://msdn.microsoft.com/en-us/library/windowsazure/hh508997.aspx
// Partition key is for resiliency to enable partitioning and replication of data | |
// Entity Group Transactions requires the same partition key for the batch - but that's probably not important in this scenario | |
// For best performance you want items likely to be retrieved together within a single partition | |
// but partitions are local to servers - so retrieving too much, too often could impact performance (or hit partition rate limit of 500 requests/sec) | |
// So I think in real use you'd need to try a variety of strategies, also data is returned sorted by key so perhaps we need to decide on the order of the | |
// parts making up the keys | |
// How about we run with partition key being year-customer and use month-account for the row key | |
// note that a number of characters are not allowed within the key eg /\#? and it looks like all the control characters | |
// Build one sample entity (partition key = year-customer, row key = month-account)
// and insert it to check that the round trip to the table works.
let someYearMonthTxns =
    YearMonthTxns(PartitionKey = "2013-Fred", RowKey = "06-Savings", txns = "test")
let cmd = Table.TableOperation.Insert(someYearMonthTxns)
let tableResult = table.Execute(cmd)
// Output table URI to debug window.
// (This used to reference `container`, which is not defined anywhere in this script;
// `table` is the storage object we actually have in scope.)
System.Diagnostics.Debug.WriteLine(string table.Uri)
// what I need to do now is generate some higher volume data to get a handle on pricing | |
// this means a bit of simulation | |
// let's create a set of ~1000 test customers | |
// Customer ids 1 to 1000, produced lazily.
let cs = seq { for customerId in 1 .. 1000 -> customerId }
// with a random number of accounts centered around 2, log-normal like so there's a good tail to it | |
// and some random txns spread across months in a single year, maybe centered on 10, let's use MathNet.Numerics | |
#r @"C:\wd\AzureTxnUploader\packages\MathNet.Numerics.2.5.0\lib\net40\MathNet.Numerics.dll" | |
#r @"C:\wd\AzureTxnUploader\packages\MathNet.Numerics.FSharp.2.5.0\lib\net40\MathNet.Numerics.FSharp.dll" | |
open MathNet.Numerics.Distributions | |
open MathNet.Numerics.Random | |
open MathNet.Numerics.Statistics | |
#load @"C:\wd\AzureTxnUploader\packages\FSharp.Charting.0.83\FSharp.Charting.fsx" | |
open FSharp.Charting | |
// Distribution used for the number of accounts per customer.
let logNormal = new LogNormal(1.0, 0.5)
// Histogram of 10000 samples: 20 buckets spanning 0.0 to 20.0.
let ha = MathNet.Numerics.Statistics.Histogram( (logNormal.Samples() |> Seq.take 10000), 20, 0.0, 20.0)
// Histogram buckets are indexed from 0, so start at 0; starting at 1 left the
// first bucket out of the chart.
Chart.Column( seq { for b in 0 .. ha.BucketCount - 1 do yield float b, ha.Item(b).Count } ) // looks good!
// and for transactions we can try weibull, just to be different | |
// Distribution used for the number of transactions per account per month.
let weibull = new Weibull(4.0, 11.0)
// Histogram of 10000 samples: 20 buckets spanning 0.0 to 20.0.
let ht = MathNet.Numerics.Statistics.Histogram( (weibull.Samples() |> Seq.take 10000), 20, 0.0, 20.0)
// Histogram buckets are indexed from 0, so start at 0; starting at 1 left the
// first bucket out of the chart.
Chart.Column( seq { for b in 0 .. ht.BucketCount - 1 do yield float b, ht.Item(b).Count } ) // honestly, this doesn't look that great... but we'll use it
// going to be generating a few random numbers ahead, for both performance and randomness better to have one instance which we use across future code | |
// creating System.Random() within various functions called repeatedly over short intervals just generates the same number sequence - and actually, I | |
// find on this laptop timing is so quick that the time-based seeding of the default constructor starts the sequence at the same value for most of the iterations
// Single shared generator, reused by every function below that needs randomness.
let random = System.Random()
// clever little function from: http://stackoverflow.com/questions/1855150/randomly-choose-an-instance-from-union-in-f | |
/// Returns a randomly chosen case of the union type 't, using the shared `random`.
/// The case is built with an empty argument array, so this is only suitable for
/// unions whose cases carry no fields.
let randInst<'t>() =
    let unionCases = Reflection.FSharpType.GetUnionCases(typeof<'t>)
    let chosenCase = unionCases.[random.Next(unionCases.Length)]
    // MakeUnion returns obj, so downcast to the requested union type.
    Reflection.FSharpValue.MakeUnion(chosenCase, [||]) :?> 't
// do I need to make all these types classes? | |
/// The kinds of transaction the simulation can generate.
type TxnType =
    | ATM
    | POS
    | BP
    | Cash
    | Deposit
/// A single simulated transaction.
type Txn =
    { Id : Guid              // unique identifier of the transaction
      Posted : DateTime      // date the transaction was posted
      Value : decimal        // transaction amount
      Type : TxnType         // kind of transaction
      Description : string } // free-text description
// some imaginative sample text data | |
// Pool of descriptions that generated transactions draw from.
let txnDescLookup =
    [ "payment to fred"
      "money for me"
      "savings for house"
      "supermarket"
      "petrol"
      "loan repayment" ]
/// Generates a random list of transactions for the given year and month,
/// sorted by posted date.
///
/// The number of transactions is a sample from `weibull`, truncated to an int.
/// Each transaction gets a fresh Guid, a random day in the month, a random value
/// from 0.00 to 999.99, a random TxnType and a random description from
/// `txnDescLookup`.
let createRandMonthlyTxnList year month =
    let daysInMonth = DateTime.DaysInMonth(year, month)
    let descriptionCount = List.length txnDescLookup
    // we use the sample from our distribution for txns/month to give the number of times we randomly choose a day in the month
    [ for _ in 1 .. int (weibull.Sample()) do
        yield
            { Txn.Id = Guid.NewGuid()
              // Random.Next's upper bound is exclusive, so add 1 to make the
              // last day of the month reachable.
              Txn.Posted = DateTime(year, month, random.Next(1, daysInMonth + 1))
              Txn.Value = decimal (random.Next(0, 100000)) / 100M
              Txn.Type = randInst<TxnType>()
              // Same exclusive upper bound: pass the full length so the last
              // description can be picked too.
              Txn.Description = txnDescLookup.[random.Next(0, descriptionCount)] } ]
    |> List.sortBy (fun t -> t.Posted)
/// An account together with all of its transactions.
type Account =
    { Id : int
      // going to deal with List here but change to Array before submitting to Azure
      Txns : Txn list }
/// Builds a random-length list of accounts, each with twelve months of 2012
/// transactions. The number of accounts is a sample from `logNormal`, truncated
/// to an int.
///
/// `dummy` is ignored. It exists only so that this is a function that is
/// re-evaluated on every call; without a parameter it would be a plain value,
/// computed once.
let createRandomAccountList dummy =
    let accountCount = int (logNormal.Sample())
    [ for accountId in 1 .. accountCount ->
        { Account.Id = accountId
          Account.Txns =
            [ for month in 1 .. 12 do
                yield! createRandMonthlyTxnList 2012 month ] } ]
/// A customer and the accounts they hold.
type Customer =
    { Id : int
      Accounts : Account list }
/// The simulated data set: customers 1 to 100, each with a random number of
/// accounts (a `logNormal` sample truncated to an int), each account holding
/// twelve months of 2012 transactions.
///
/// This is a value, not a function, so it is built once and every later use
/// sees the same customers.
let createCustomers =
    // Local generator kept here rather than calling a top-level helper with a
    // dummy argument.
    let randomAccounts () =
        [ for accountId in 1 .. int (logNormal.Sample()) ->
            { Account.Id = accountId
              Account.Txns =
                [ for month in 1 .. 12 do
                    yield! createRandMonthlyTxnList 2012 month ] } ]
    [ for customerId in 1 .. 100 ->
        { Customer.Id = customerId
          Customer.Accounts = randomAccounts () } ]
// so, now I have a whole lot of customers, accounts and transactions, time to put them into azure | |
// and I was intending year-customer for the partition key and month-account for the row key with transactions in an entity, doh! I'm going to have to reformat all this
// fortunately, I'm only storing data for one year - 2012 | |
// so here's my customers for 2012 with their accounts, month and txns by posted date | |
// Print one line per transaction, walking customer -> account -> month group.
for c in createCustomers do
    for a in c.Accounts do
        let txnsByMonth = a.Txns |> Seq.groupBy (fun t -> t.Posted.Month)
        for (m, ts) in txnsByMonth do
            for t in ts do
                printfn "Cust %i, Account %i, Month %i, Txn %A" c.Id a.Id m t.Posted
// and looking at it as the number of txns in each cust acct month group | |
// Print one line per customer/account/month group with its transaction count.
for c in createCustomers do
    for a in c.Accounts do
        let txnsByMonth = a.Txns |> Seq.groupBy (fun t -> t.Posted.Month)
        for (m, ts) in txnsByMonth do
            let txnCount = Seq.length ts
            printfn "Cust %i, Account %i, Month %i, Txns %A" c.Id a.Id m txnCount
// so I think this is the entity type we need to use | |
// first attempt was to use <Txn list> but that wouldn't serialize and end up in azure | |
// then I tried Txn[] but that wouldn't work either | |
// then I figured why not serialise the txn list to a string first and put that in table - of course you can't then query it within azure either | |
/// Table entity for one customer/account/month group of transactions.
/// The transactions travel as a single JSON string in `txnJson`.
type YearCustomerMonthAccountTxns() =
    inherit Table.TableEntity()

    /// JSON text of the transaction list; starts out as the empty string.
    member val txnJson = "" with get, set
#r "System.Xml" | |
#r "System.Runtime.Serialization" | |
open System.Text | |
open System.Runtime.Serialization.Json | |
open System.IO | |
/// Serialises a list of transactions with DataContractJsonSerializer and
/// returns the result as a UTF-8 decoded string.
let serializeTxnListToJson (txnList: Txn list) =
    let serializer = DataContractJsonSerializer(typeof<Txn list>)
    // `use` disposes the stream when the function returns.
    use stream = new MemoryStream()
    serializer.WriteObject(stream, txnList)
    Encoding.UTF8.GetString(stream.ToArray())
// let's populate a bunch of entities and send to azure | |
// Send one entity per customer/account/month group to the table.
//   PartitionKey = "<year>-<customer id>"
//   RowKey       = "<two-digit month>-<account id>"
// The year is taken from the transactions themselves; the previous hard-coded
// "2007" did not match the generated data, which is all for 2012.
// The month is zero-padded, like the "06-Savings" sample key, so that row keys
// sort in calendar order (01, 02, ... 12 rather than 1, 10, 11, 12, 2, ...).
for c in createCustomers do
    for a in c.Accounts do
        let txnsByYearMonth =
            a.Txns |> Seq.groupBy (fun t -> (t.Posted.Year, t.Posted.Month))
        for ((year, month), ts) in txnsByYearMonth do
            let entity =
                YearCustomerMonthAccountTxns(
                    PartitionKey = (sprintf "%i-%i" year c.Id),
                    RowKey = (sprintf "%02i-%i" month a.Id),
                    txnJson = (ts |> List.ofSeq |> serializeTxnListToJson))
            // InsertOrMerge, so re-running the script updates existing rows.
            // The TableResult is not used, so discard it explicitly.
            table.Execute(Table.TableOperation.InsertOrMerge(entity)) |> ignore
            printfn "Cust %i, Account %i, Month %i, Txns %A" c.Id a.Id month (Seq.length ts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment