Skip to content

Instantly share code, notes, and snippets.

@KenBonny
Last active January 5, 2024 16:37
Show Gist options
  • Save KenBonny/d283123af1303764b7bb447baf0ca368 to your computer and use it in GitHub Desktop.
Save KenBonny/d283123af1303764b7bb447baf0ca368 to your computer and use it in GitHub Desktop.
One Billion Rows F#
open System
open System.Diagnostics
open System.IO
// Path of the CSV file that the generator script writes its rows to.
[<Literal>]
let file = "./measurements.csv"
// Weather station names; the generator picks one at random for every row it writes.
let stations = [
"Abha"
"Abidjan"
"Abéché"
"Accra"
"Addis Ababa"
"Adelaide"
"Aden"
"Ahvaz"
"Albuquerque"
"Alexandra"
"Alexandria"
"Algiers"
"Alice Springs"
"Almaty"
"Amsterdam"
"Anadyr"
"Anchorage"
"Andorra la Vella"
"Ankara"
"Antananarivo"
"Antsiranana"
"Arkhangelsk"
"Ashgabat"
"Asmara"
"Assab"
"Astana"
"Athens"
"Atlanta"
"Auckland"
"Austin"
"Baghdad"
"Baguio"
"Baku"
"Baltimore"
"Bamako"
"Bangkok"
"Bangui"
"Banjul"
"Barcelona"
"Bata"
"Batumi"
"Beijing"
"Beirut"
"Belgrade"
"Belize City"
"Benghazi"
"Bergen"
"Berlin"
"Bilbao"
"Birao"
"Bishkek"
"Bissau"
"Blantyre"
"Bloemfontein"
"Boise"
"Bordeaux"
"Bosaso"
"Boston"
"Bouaké"
"Bratislava"
"Brazzaville"
"Bridgetown"
"Brisbane"
"Brussels"
"Bucharest"
"Budapest"
"Bujumbura"
"Bulawayo"
"Burnie"
"Busan"
"Cabo San Lucas"
"Cairns"
"Cairo"
"Calgary"
"Canberra"
"Cape Town"
"Changsha"
"Charlotte"
"Chiang Mai"
"Chicago"
"Chihuahua"
"Chișinău"
"Chittagong"
"Chongqing"
"Christchurch"
"City of San Marino"
"Colombo"
"Columbus"
"Conakry"
"Copenhagen"
"Cotonou"
"Cracow"
"Da Lat"
"Da Nang"
"Dakar"
"Dallas"
"Damascus"
"Dampier"
"Dar es Salaam"
"Darwin"
"Denpasar"
"Denver"
"Detroit"
"Dhaka"
"Dikson"
"Dili"
"Djibouti"
"Dodoma"
"Dolisie"
"Douala"
"Dubai"
"Dublin"
"Dunedin"
"Durban"
"Dushanbe"
"Edinburgh"
"Edmonton"
"El Paso"
"Entebbe"
"Erbil"
"Erzurum"
"Fairbanks"
"Fianarantsoa"
"Flores, Petén"
"Frankfurt"
"Fresno"
"Fukuoka"
"Gabès"
"Gaborone"
"Gagnoa"
"Gangtok"
"Garissa"
"Garoua"
"George Town"
"Ghanzi"
"Gjoa Haven"
"Guadalajara"
"Guangzhou"
"Guatemala City"
"Halifax"
"Hamburg"
"Hamilton"
"Hanga Roa"
"Hanoi"
"Harare"
"Harbin"
"Hargeisa"
"Hat Yai"
"Havana"
"Helsinki"
"Heraklion"
"Hiroshima"
"Ho Chi Minh City"
"Hobart"
"Hong Kong"
"Honiara"
"Honolulu"
"Houston"
"Ifrane"
"Indianapolis"
"Iqaluit"
"Irkutsk"
"Istanbul"
"İzmir"
"Jacksonville"
"Jakarta"
"Jayapura"
"Jerusalem"
"Johannesburg"
"Jos"
"Juba"
"Kabul"
"Kampala"
"Kandi"
"Kankan"
"Kano"
"Kansas City"
"Karachi"
"Karonga"
"Kathmandu"
"Khartoum"
"Kingston"
"Kinshasa"
"Kolkata"
"Kuala Lumpur"
"Kumasi"
"Kunming"
"Kuopio"
"Kuwait City"
"Kyiv"
"Kyoto"
"La Ceiba"
"La Paz"
"Lagos"
"Lahore"
"Lake Havasu City"
"Lake Tekapo"
"Las Palmas de Gran Canaria"
"Las Vegas"
"Launceston"
"Lhasa"
"Libreville"
"Lisbon"
"Livingstone"
"Ljubljana"
"Lodwar"
"Lomé"
"London"
"Los Angeles"
"Louisville"
"Luanda"
"Lubumbashi"
"Lusaka"
"Luxembourg City"
"Lviv"
"Lyon"
"Madrid"
"Mahajanga"
"Makassar"
"Makurdi"
"Malabo"
"Malé"
"Managua"
"Manama"
"Mandalay"
"Mango"
"Manila"
"Maputo"
"Marrakesh"
"Marseille"
"Maun"
"Medan"
"Mek'ele"
"Melbourne"
"Memphis"
"Mexicali"
"Mexico City"
"Miami"
"Milan"
"Milwaukee"
"Minneapolis"
"Minsk"
"Mogadishu"
"Mombasa"
"Monaco"
"Moncton"
"Monterrey"
"Montreal"
"Moscow"
"Mumbai"
"Murmansk"
"Muscat"
"Mzuzu"
"N'Djamena"
"Naha"
"Nairobi"
"Nakhon Ratchasima"
"Napier"
"Napoli"
"Nashville"
"Nassau"
"Ndola"
"New Delhi"
"New Orleans"
"New York City"
"Ngaoundéré"
"Niamey"
"Nicosia"
"Niigata"
"Nouadhibou"
"Nouakchott"
"Novosibirsk"
"Nuuk"
"Odesa"
"Odienné"
"Oklahoma City"
"Omaha"
"Oranjestad"
"Oslo"
"Ottawa"
"Ouagadougou"
"Ouahigouya"
"Ouarzazate"
"Oulu"
"Palembang"
"Palermo"
"Palm Springs"
"Palmerston North"
"Panama City"
"Parakou"
"Paris"
"Perth"
"Petropavlovsk-Kamchatsky"
"Philadelphia"
"Phnom Penh"
"Phoenix"
"Pittsburgh"
"Podgorica"
"Pointe-Noire"
"Pontianak"
"Port Moresby"
"Port Sudan"
"Port Vila"
"Port-Gentil"
"Portland (OR)"
"Porto"
"Prague"
"Praia"
"Pretoria"
"Pyongyang"
"Rabat"
"Rangpur"
"Reggane"
"Reykjavík"
"Riga"
"Riyadh"
"Rome"
"Roseau"
"Rostov-on-Don"
"Sacramento"
"Saint Petersburg"
"Saint-Pierre"
"Salt Lake City"
"San Antonio"
"San Diego"
"San Francisco"
"San Jose"
"San José"
"San Juan"
"San Salvador"
"Sana'a"
"Santo Domingo"
"Sapporo"
"Sarajevo"
"Saskatoon"
"Seattle"
"Ségou"
"Seoul"
"Seville"
"Shanghai"
"Singapore"
"Skopje"
"Sochi"
"Sofia"
"Sokoto"
"Split"
"St. John's"
"St. Louis"
"Stockholm"
"Surabaya"
"Suva"
"Suwałki"
"Sydney"
"Tabora"
"Tabriz"
"Taipei"
"Tallinn"
"Tamale"
"Tamanrasset"
"Tampa"
"Tashkent"
"Tauranga"
"Tbilisi"
"Tegucigalpa"
"Tehran"
"Tel Aviv"
"Thessaloniki"
"Thiès"
"Tijuana"
"Timbuktu"
"Tirana"
"Toamasina"
"Tokyo"
"Toliara"
"Toluca"
"Toronto"
"Tripoli"
"Tromsø"
"Tucson"
"Tunis"
"Ulaanbaatar"
"Upington"
"Ürümqi"
"Vaduz"
"Valencia"
"Valletta"
"Vancouver"
"Veracruz"
"Vienna"
"Vientiane"
"Villahermosa"
"Vilnius"
"Virginia Beach"
"Vladivostok"
"Warsaw"
"Washington, D.C."
"Wau"
"Wellington"
"Whitehorse"
"Wichita"
"Willemstad"
"Winnipeg"
"Wrocław"
"Xi'an"
"Yakutsk"
"Yangon"
"Yaoundé"
"Yellowknife"
"Yerevan"
"Yinchuan"
"Zagreb"
"Zanzibar City"
"Zürich"
]
// Generator script body: writes `rows` lines of the form "<station>;<temperature>"
// to `file` (overwriting it) and prints how long that took.
// The row count is the first script argument; it defaults to one billion.
let rnd = Random(int DateTime.Now.Ticks)

/// Random temperature between -10.0 and 49.9 with one decimal place
/// (the upper bound of Random.Next is exclusive).
let temp () = decimal (rnd.Next(-100, 500)) / 10m

let writer = new StreamWriter(file, false)
let stopwatch = Stopwatch.StartNew ()
// fsi.CommandLineArgs starts with the script name; drop it to keep only user arguments.
let args = fsi.CommandLineArgs |> Array.tail
let rows = if args.Length > 0 then int args[0] else 1_000_000_000

// F# lists have O(n) indexing and O(n) Length, which would be paid on every row.
// Copy the names into an array once so each pick is O(1).
let stationNames = List.toArray stations

try
    // A plain counting loop: `[1..rows] |> List.map ...` would first materialise a
    // billion-element list and then a second billion-element list of unit results.
    for _ in 1..rows do
        let station = stationNames[rnd.Next(stationNames.Length)]
        writer.WriteLine($"{station};%g{temp ()}")
finally
    // Flush and close the file even when a write fails part-way through.
    writer.Dispose ()

stopwatch.Stop ()
printfn $"Time to create data: {stopwatch.Elapsed}"
/// Running aggregate of the values recorded for one station.
/// All fields are mutable so that `update` can fold new values in place.
type Numbers = {
    mutable Min: decimal
    mutable Max: decimal
    mutable Total: decimal
    mutable Count: decimal
}

/// Folds `newNumber` into `numbers` in place: lowers Min or raises Max when the
/// value falls outside the current range, adds it to Total and bumps Count by one.
let update newNumber numbers =
    match newNumber with
    | lower when lower < numbers.Min -> numbers.Min <- lower
    | higher when higher > numbers.Max -> numbers.Max <- higher
    | _ -> ()

    numbers.Total <- numbers.Total + newNumber
    numbers.Count <- numbers.Count + 1m

/// Average of the recorded values (Total / Count), rounded to two decimal places.
let mean numbers =
    let average = numbers.Total / numbers.Count
    System.Decimal.Round(average, 2)
// Parser script body: reads "<station>;<temperature>" lines from the file named on
// the command line, aggregates min / mean / max per station, and prints the
// stations sorted by name followed by the elapsed time.
let file = fsi.CommandLineArgs |> Array.tail |> String.concat ""
let reader = new System.IO.StreamReader(file)
let stopwatch = System.Diagnostics.Stopwatch.StartNew()
let stations = System.Collections.Generic.Dictionary<string, Numbers>()

try
    while not reader.EndOfStream do
        let line = reader.ReadLine()
        // Skip blank lines instead of crashing on them.
        if not (System.String.IsNullOrWhiteSpace line) then
            let semicolonIndex = line.IndexOf ';'
            // Fail with the offending line rather than an opaque Substring range error.
            if semicolonIndex < 0 then
                failwith $"Malformed line (missing ';'): {line}"
            let key = line.Substring(0, semicolonIndex)
            let temp = decimal (line.Substring(semicolonIndex + 1))
            // TryGetValue does one hash lookup where ContainsKey + indexer did two.
            match stations.TryGetValue key with
            | true, numbers -> update temp numbers
            | false, _ -> stations.Add(key, {Min = temp; Max = temp; Total = temp; Count = 1m})
finally
    // Release the file handle even when parsing fails part-way through.
    reader.Dispose()

for item in stations |> Seq.sortBy (_.Key) do
    let numbers = item.Value
    printfn $"{item.Key} = {numbers.Min} / {mean numbers} / {item.Value.Max}"

stopwatch.Stop()
printfn $"Elapsed: {stopwatch.Elapsed}"
dotnet fsi .\generate-rows.fsx 1000000000
dotnet fsi .\pretty-slow-solution-to-parsing-one-billion-rows.fsx measurements.csv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment