Skip to content

Instantly share code, notes, and snippets.

@simonbyrne
Last active October 1, 2018 23:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonbyrne/fb33de343bf870fd670313a118ca104e to your computer and use it in GitHub Desktop.
Save simonbyrne/fb33de343bf870fd670313a118ca104e to your computer and use it in GitHub Desktop.
CSV benchmark
using Random, DataFrames, CSV, CSVFiles, Pandas, CSVReader, TextParse, RCall
R"library(data.table)"
# only use a few decimal places: we're trying to avoid triggering slow paths
# Build an `n`-row table and save it to "data.csv" with `CSV.write`.
# Columns `a`, `b` and `c` are draws from `0:1e6` divided by 100; column `d`
# holds one `randstring(10)` value per row.
function writedata(n)
    # Draw the columns in a, b, c, d order so the random stream is consumed
    # in the same sequence as the column layout.
    col_a = rand(0:1e6, n) ./ 100
    col_b = rand(0:1e6, n) ./ 100
    col_c = rand(0:1e6, n) ./ 100
    col_d = [randstring(10) for _ in 1:n]
    table = DataFrames.DataFrame(a=col_a, b=col_b, c=col_c, d=col_d)
    return CSV.write("data.csv", table)
end
# Start with a 10_000-row data.csv so that every reader below can be called on it.
writedata(10_000)
# Precompile: this is the first call to each reader in the session, so these
# timings include whatever one-time cost that first call incurs.
@time CSV.read("data.csv"); # CSV
@time DataFrames.DataFrame(CSVFiles.load("data.csv")); #CSVFiles
@time Pandas.read_csv("data.csv"); # Pandas
@time CSVReader.read_csv("data.csv"); # CSVReader
@time TextParse.csvread("data.csv", pooledstrings=false, type_detect_rows=100); # TextParse
@time R"fread('data.csv')"; # data.table
# Second pass: the same six calls on the same 10_000-row file, now that each
# reader has already been invoked once.
@time CSV.read("data.csv"); # CSV
@time DataFrames.DataFrame(CSVFiles.load("data.csv")); #CSVFiles
@time Pandas.read_csv("data.csv"); # Pandas
@time CSVReader.read_csv("data.csv"); # CSVReader
@time TextParse.csvread("data.csv", pooledstrings=false, type_detect_rows=100); # TextParse
@time R"fread('data.csv')"; # data.table
# Overwrite data.csv with 10_000_000 rows; only CSV, Pandas and data.table
# are timed at this size.
writedata(10_000_000)
@time CSV.read("data.csv"); # CSV
@time Pandas.read_csv("data.csv"); # Pandas
@time R"fread('data.csv')"; # data.table
[[AxisArrays]]
deps = ["Compat", "Dates", "IntervalSets", "IterTools", "Pkg", "Random", "RangeArrays", "Test"]
git-tree-sha1 = "2e2536e9e6f27c4f8d09d8442b61a7ae0b910c28"
uuid = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
version = "0.3.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BinaryProvider]]
deps = ["Libdl", "Pkg", "SHA", "Test"]
git-tree-sha1 = "48c147e63431adbcee69bc40b04c3f0fec0a4982"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.0"
[[CSV]]
deps = ["CategoricalArrays", "DataFrames", "DataStreams", "Dates", "Mmap", "Parsers", "Pkg", "Profile", "Random", "Tables", "Test", "Unicode", "WeakRefStrings"]
git-tree-sha1 = "da83cb359d838758adf057719cdbfeffb074aabe"
uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
version = "0.4.1"
[[CSVFiles]]
deps = ["DataValues", "FileIO", "HTTP", "IterableTables", "IteratorInterfaceExtensions", "TableShowUtils", "TableTraits", "TableTraitsUtils", "Test", "TextParse"]
git-tree-sha1 = "b39c9d94d944ab5c7ee9d7503509a63ea21d564c"
uuid = "5d742f6a-9f54-50ce-8119-2520741973ca"
version = "0.9.1"
[[CSVReader]]
deps = ["DataFrames", "InternedStrings", "Parsers"]
git-tree-sha1 = "2ca3695def16c1ba3f8b6cca23f4c1c8364de478"
repo-rev = "master"
repo-url = "https://github.com/tk3369/CSVReader.jl"
uuid = "6320db66-f659-5b09-9a97-e9f7ce0d36e4"
version = "0.1.0"
[[CategoricalArrays]]
deps = ["Compat", "Future", "JSON", "Missings", "Printf", "Reexport"]
git-tree-sha1 = "6362c49130b5888f5628bc197ee5f17aec7d2a88"
uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
version = "0.4.0"
[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "Pkg", "Test", "TranscodingStreams"]
git-tree-sha1 = "83cb3d65c37ea1364c2d5bf7bcea41843ba645dc"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.5.0"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "ff2595695fc4f14427358ce2593f867085c45dcb"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "1.2.0"
[[Conda]]
deps = ["Compat", "JSON", "VersionParsing"]
git-tree-sha1 = "85b5bf3ffcf4f39abe019dab1dd00a0aead8d882"
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
version = "1.0.2"
[[DataFrames]]
deps = ["CategoricalArrays", "CodecZlib", "Compat", "DataStreams", "Dates", "InteractiveUtils", "IteratorInterfaceExtensions", "LinearAlgebra", "Missings", "Pkg", "Printf", "Random", "Reexport", "SortingAlgorithms", "Statistics", "StatsBase", "TableTraits", "Tables", "Test", "TranscodingStreams", "Unicode", "WeakRefStrings"]
git-tree-sha1 = "0fcb0c9914f31e0607b1965dc5a9e15c969c4806"
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
version = "0.14.0"
[[DataStreams]]
deps = ["Dates", "Missings", "Pkg", "Test", "WeakRefStrings"]
git-tree-sha1 = "69c72a1beb4fc79490c361635664e13c8e4a9548"
uuid = "9a8bc11e-79be-5b39-94d7-1ccc349a1a85"
version = "0.4.1"
[[DataStructures]]
deps = ["InteractiveUtils", "OrderedCollections", "REPL", "Random", "Serialization", "Test"]
git-tree-sha1 = "8fc6e166e24fda04b2b648d4260cdad241788c54"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.14.0"
[[DataValues]]
deps = ["Dates", "InteractiveUtils", "LinearAlgebra", "Random", "Test"]
git-tree-sha1 = "4fedccda7e5111354c7dcc832c7da83ff7258765"
uuid = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
version = "0.4.5"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[FileIO]]
deps = ["Pkg", "Random", "Test"]
git-tree-sha1 = "b80161b7e679a1241f9441ebfa60b62d4239cf99"
uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
version = "1.0.1"
[[Future]]
deps = ["Random"]
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
[[HTTP]]
deps = ["Base64", "Dates", "Distributed", "IniFile", "MbedTLS", "Sockets", "Test"]
git-tree-sha1 = "b881f69331e85642be315c63d05ed65d6fc8a05b"
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
version = "0.7.1"
[[IniFile]]
deps = ["Test"]
git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
version = "0.5.0"
[[InteractiveUtils]]
deps = ["LinearAlgebra", "Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[InternedStrings]]
deps = ["Random", "Test"]
git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b"
uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
version = "0.7.0"
[[IntervalSets]]
deps = ["Compat"]
git-tree-sha1 = "bf1c727a12bbe0beb4888d439ee4e91b9ba7944a"
uuid = "8197267c-284f-5f27-9208-e0e47529a953"
version = "0.3.0"
[[IterTools]]
deps = ["Pkg", "SparseArrays", "Test"]
git-tree-sha1 = "ed0787e62dc46b8d8c7c3db54391d71e0da5fefd"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
version = "1.0.0"
[[IterableTables]]
deps = ["DataValues", "IteratorInterfaceExtensions", "Requires", "TableTraits", "TableTraitsUtils", "Test"]
git-tree-sha1 = "486612943cd16ebb7fcc5b0a4dc2c80e8b9c7dbc"
uuid = "1c8ee90f-4401-5389-894e-7a04a3dc0f4d"
version = "0.9.0"
[[IteratorInterfaceExtensions]]
deps = ["Test"]
git-tree-sha1 = "5484e5ede2a4137b9643f4d646e8e7b87b794415"
uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "0.1.1"
[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"]
git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.19.0"
[[Lazy]]
deps = ["Compat", "MacroTools", "Test"]
git-tree-sha1 = "1c2c5566f0eeaaad6979c156562384458f966e6a"
uuid = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0"
version = "0.13.1"
[[LibGit2]]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Compat"]
git-tree-sha1 = "c443e1c8d58a4e9f61b708ad0a88286c7042145b"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.4.4"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS]]
deps = ["BinaryProvider", "Libdl", "Pkg", "Random", "Sockets", "Test"]
git-tree-sha1 = "3775d205b09b624aa06d39012a8920ba99cb3b8b"
uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
version = "0.6.3"
[[Missings]]
deps = ["Dates", "InteractiveUtils", "SparseArrays", "Test"]
git-tree-sha1 = "adc26d2ee85a49c413464110d922cf21efc9d233"
uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
version = "0.3.1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[Nullables]]
deps = ["Compat", "Pkg"]
git-tree-sha1 = "ae1a63457e14554df2159b0b028f48536125092d"
uuid = "4d1e1d77-625e-5b40-9113-a560ec7a8ecd"
version = "0.0.8"
[[OrderedCollections]]
deps = ["Pkg", "Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.0.2"
[[Pandas]]
deps = ["Compat", "DataValues", "IteratorInterfaceExtensions", "Lazy", "Pkg", "PyCall", "Statistics", "TableTraits", "TableTraitsUtils", "Test"]
git-tree-sha1 = "a6b6f02de94029e72d8bec68c5413fa80eb1156f"
uuid = "eadc2687-ae89-51f9-a5d9-86b5a6373a9c"
version = "1.0.2"
[[Parsers]]
deps = ["Dates", "Mmap", "Pkg", "Test"]
git-tree-sha1 = "d5252e3f228a513b9947585e95b94d146b7d66e4"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "0.2.7"
[[Pkg]]
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[PooledArrays]]
deps = ["Test"]
git-tree-sha1 = "5c5ded7adc52867f599c21d3f43542fce491afda"
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
version = "0.4.1"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[Profile]]
deps = ["Printf"]
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
[[PyCall]]
deps = ["Compat", "Conda", "MacroTools", "Statistics", "VersionParsing"]
git-tree-sha1 = "f56428481fd0caf01cc8ecd2a0892fdaf8fddd50"
uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
version = "1.18.4"
[[RCall]]
deps = ["AxisArrays", "CategoricalArrays", "Conda", "DataFrames", "DataStructures", "Dates", "Libdl", "Missings", "Pkg", "REPL", "Random", "Requires", "StatsModels", "Test", "WinReg"]
git-tree-sha1 = "fe763209d3be186abfa4a8003b6678889b2ff679"
uuid = "6f49c342-dc21-5d91-9882-a32aef131414"
version = "0.12.1"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[RangeArrays]]
deps = ["Compat"]
git-tree-sha1 = "d925adfd5b01cb46fde89dc9548d167b3b136f4a"
uuid = "b3c3ace0-ae52-54e7-9d0b-2c1406fd6b9d"
version = "0.3.1"
[[Reexport]]
deps = ["Pkg"]
git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "0.2.0"
[[Requires]]
deps = ["Test"]
git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "0.5.2"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SortingAlgorithms]]
deps = ["DataStructures", "Random", "Test"]
git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd"
uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
version = "0.3.1"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[StatsBase]]
deps = ["DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "Test"]
git-tree-sha1 = "723193a13e8078cec6dcd0b8fe245c8bfd81690e"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.25.0"
[[StatsModels]]
deps = ["Compat", "DataFrames", "Pkg", "StatsBase", "Test"]
git-tree-sha1 = "8af0d8dbcdee59daa386167ee0cf0278e5c44263"
uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
version = "0.3.1"
[[TableShowUtils]]
deps = ["DataValues", "Dates", "JSON", "Markdown", "Test"]
git-tree-sha1 = "7295e0ed103c41e71e0a893685090816527350ec"
uuid = "5e66a065-1f0a-5976-b372-e0b8c017ca10"
version = "0.2.0"
[[TableTraits]]
deps = ["IteratorInterfaceExtensions", "Test"]
git-tree-sha1 = "afee1fb3bc99c28eb4533ff0f22e33f6effcec18"
uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
version = "0.3.1"
[[TableTraitsUtils]]
deps = ["DataValues", "IteratorInterfaceExtensions", "Missings", "Pkg", "TableTraits", "Test"]
git-tree-sha1 = "a355f1882d64881a11f853e64dcc353975c4df6e"
uuid = "382cd787-c1b6-5bf2-a167-d5b971a19bda"
version = "0.3.1"
[[Tables]]
deps = ["Pkg", "Requires", "Test"]
git-tree-sha1 = "277464179bc7cfb1b4d5a4f3ccde0fc75792157f"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "0.1.8"
[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TextParse]]
deps = ["CodecZlib", "Compat", "DataStructures", "Dates", "Mmap", "Nullables", "PooledArrays", "Test", "WeakRefStrings"]
git-tree-sha1 = "f33529861ce1126edb9c0160243d5e67888bc5cc"
uuid = "e0df1984-e451-5cb5-8b61-797a481e67e3"
version = "0.6.0"
[[TranscodingStreams]]
deps = ["DelimitedFiles", "Pkg", "Random", "Test"]
git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.8.1"
[[UUIDs]]
deps = ["Random"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[VersionParsing]]
deps = ["Compat"]
git-tree-sha1 = "c9d5aa108588b978bd859554660c8a5c4f2f7669"
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
version = "1.1.3"
[[WeakRefStrings]]
deps = ["Missings", "Random", "Test"]
git-tree-sha1 = "1087e8be380f2c8b96434b02bb1150fc1c511135"
uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
version = "0.5.3"
[[WinReg]]
deps = ["Test"]
git-tree-sha1 = "808380e0a0483e134081cc54150be4177959b5f4"
uuid = "1b915085-20d7-51cf-bf83-8f477d6f5128"
version = "0.3.1"
# Direct dependencies of the benchmark environment, listed as package name = UUID.
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CSVFiles = "5d742f6a-9f54-50ce-8119-2520741973ca"
CSVReader = "6320db66-f659-5b09-9a97-e9f7ce0d36e4"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Pandas = "eadc2687-ae89-51f9-a5d9-86b5a6373a9c"
RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
TextParse = "e0df1984-e451-5cb5-8b61-797a481e67e3"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment