Skip to content

Instantly share code, notes, and snippets.

@quinnj
Last active August 29, 2015 14:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quinnj/9a5dd7598e34b2bec310 to your computer and use it in GitHub Desktop.
Save quinnj/9a5dd7598e34b2bec310 to your computer and use it in GitHub Desktop.
Managing Data in Julia: Old Tricks, New Tricks Code
Pkg.clone("https://github.com/quinnj/Mmap.jl")
Pkg.clone("https://github.com/quinnj/CSV.jl")
Pkg.add("ODBC")
Pkg.checkout("SQLite","jq/remodel")
Pkg.add("SQLite")
Pkg.checkout("SQLite","jq/updates")
reload("Mmap")
reload("CSV")
reload("ODBC")
reload("SQLite")
### Mmap.jl
# Let's create a toy file
t = tempname()
open(t,"w") do f
write(f,"Hello World\n")
end
# mmap it
m = Mmap.mmap(t)
# Make a string out of it
ASCIIString(m)
# Can modify
m[5] = 0x00
# See the modification
ASCIIString(m)
# Sync the changes to disk
Mmap.sync!(m)
# Verify changes from disk
open(readall,t)
# Let's make an mmapped-array linked to a new file
t2 = tempname()
m2 = Mmap.mmap(t2,Float64,100)
# put some data in our mmapped-array
copy!(m2,randn(100))
m2
# Sync changes to disk
Mmap.sync!(m2)
# close it
m2 = nothing; gc()
# Read the array back into Julia
Mmap.mmap(t2,Float64)
# Can also get chunks of memory
m3 = Mmap.mmap(Uint8,100)
### CSV.jl
# Pretty straightforward
DIR = joinpath(Pkg.dir("CSV"),"test/test_files")
readdir(DIR)
f = CSV.File(joinpath(DIR,"baseball.csv"))
data = CSV.read(f)
f = CSV.File(joinpath(DIR,"Sacramentorealestatetransactions.csv");newline='\r')
CSV.read(f)
f = CSV.File("/Users/jacobquinn/Downloads/bids.csv")
@time data = CSV.read(f) # 12.2 seconds
@time readcsv("/Users/jacobquinn/Downloads/bids.csv")
# 42.13 seconds
### R
# > system.time(fread("/Users/jacobquinn/Downloads/bids.csv"))
# Read 7656334 rows and 9 (of 9) columns from 0.862 GB file in 00:00:34
# user system elapsed
# 32.829 0.662 33.506
### with require(bit64)
# > system.time(fread("/Users/jacobquinn/Downloads/bids.csv"))
# Read 7656334 rows and 9 (of 9) columns from 0.862 GB file in 00:00:19
# user system elapsed
# 18.424 0.569 19.002
### Pandas
# start_time = time.time()
# pandas.read_csv("/Users/jacobquinn/Downloads/bids.csv")
# print("--- %s seconds ---" % (time.time() - start_time))
# [7656334 rows x 9 columns]
# >>> print("--- %s seconds ---" % (time.time() - start_time))
# --- 11.8574368954 seconds ---
# >>>
### ODBC
reload("ODBC")
co = ODBC.advancedconnect("Driver={MySQL Unicode};Server=ensembldb.ensembl.org;User=anonymous")
ODBC.query(co,"use homo_sapiens_vega_69_37")
ODBC.query(co,"select count(*) from exon")
ODBC.query(co,"show columns from exon")
ODBC.query(co,"select phase from exon group by phase")
ODBC.query(co,"select count(*) from exon where phase = 0")
# SQLite
db = SQLite.DB(joinpath(Pkg.dir("SQLite"),"test/Chinook_Sqlite.sqlite"))
results = SQLite.query(db,"SELECT name FROM sqlite_master WHERE type='table';")
results1 = SQLite.tables(db)
results = SQLite.query(db,"SELECT * FROM Employee;")
SQLite.query(db,"SELECT * FROM Album;")
SQLite.query(db,"SELECT a.*, b.AlbumId
FROM Artist a
LEFT OUTER JOIN Album b ON b.ArtistId = a.ArtistId
ORDER BY name;")
r = SQLite.query(db,"create table temp as select * from album")
r = SQLite.query(db,"select * from temp limit 10")
SQLite.query(db,"alter table temp add column colyear int")
SQLite.query(db,"update temp set colyear = 2014")
r = SQLite.query(db,"select * from temp limit 10")
SQLite.query(db,"alter table temp add column dates blob")
stmt = SQLite.Stmt(db,"update temp set dates = ?")
SQLite.bind!(stmt,1,Date(2014,1,1))
SQLite.execute!(stmt)
r = SQLite.query(db,"select * from temp limit 10")
finalize(stmt)
SQLite.query(db,"drop table temp")
SQLite.create(db,"temp",zeros(5,5))
r = SQLite.query(db,"select * from temp")
SQLite.drop!(db,"temp")
SQLite.create(db,"temp",zeros(Int,5,5))
r = SQLite.query(db,"select * from temp")
SQLite.append!(db,"temp",ones(Int,5,5))
r = SQLite.query(db,"select * from temp")
SQLite.drop!(db,"temp")
rng = Date(2013):Date(2013,1,5)
SQLite.create(db,"temp",[i for i = rng, j = rng])
r = SQLite.query(db,"select * from temp")
SQLite.drop!(db,"temp")
triple(x) = 3x
SQLite.register(db, triple, nargs=1)
r = SQLite.query(db, "SELECT Total FROM Invoice ORDER BY InvoiceId LIMIT 5")
s = SQLite.query(db, "SELECT triple(Total) FROM Invoice ORDER BY InvoiceId LIMIT 5")
[3r[1] s[1]]
SQLite.@register db function add4(q)
q+4
end
r = SQLite.query(db, "SELECT add4(AlbumId) FROM Album")
s = SQLite.query(db, "SELECT AlbumId FROM Album")
[r[1] s[1]+4]
SQLite.@register db mult(args...) = *(args...)
r = SQLite.query(db, "SELECT Milliseconds, Bytes FROM Track")
s = SQLite.query(db, "SELECT mult(Milliseconds, Bytes) FROM Track")
t = SQLite.query(db, "SELECT mult(Milliseconds, Bytes, 3, 4) FROM Track")
SQLite.@register db sin
u = SQLite.query(db, "select sin(milliseconds) from track limit 5")
SQLite.register(db, hypot; nargs=2, name="hypotenuse")
v = SQLite.query(db, "select hypotenuse(Milliseconds,bytes) from track limit 5")
SQLite.@register db str2arr(s) = convert(Array{UInt8}, s)
r = SQLite.query(db, "SELECT str2arr(LastName) FROM Employee LIMIT 2")
SQLite.@register db big
r = SQLite.query(db, "SELECT big(5)")
@assert r[1][1] == big(5)
doublesum_step(persist, current) = persist + current
doublesum_final(persist) = 2 * persist
SQLite.register(db, 0, doublesum_step, doublesum_final, name="doublesum")
r = SQLite.query(db, "SELECT doublesum(UnitPrice) FROM Track")
r = SQLite.query(db, "SELECT doublesum(Total) FROM Invoice")
s = SQLite.query(db, "SELECT UnitPrice FROM Track")
Base.Test.@test_approx_eq r[1][1] 2*sum(s[1])
mycount(p, c) = p + 1
SQLite.register(db, 0, mycount)
r = SQLite.query(db, "SELECT mycount(TrackId) FROM PlaylistTrack")
s = SQLite.query(db, "SELECT count(TrackId) FROM PlaylistTrack")
bigsum(p, c) = p + big(c)
SQLite.register(db, big(0), bigsum)
r = SQLite.query(db, "SELECT bigsum(TrackId) FROM PlaylistTrack")
s = SQLite.query(db, "SELECT TrackId FROM PlaylistTrack")
@assert r[1][1] == big(sum(s[1]))
@time SQLite.create(db,CSV.File("/Users/jacobquinn/Downloads/bids.csv"),"temp")
@time SQLite.append!(db,"temp",CSV.File("/Users/jacobquinn/Downloads/bids.csv"))
SQLite.scalarquery(db,"select count(*) from temp")
@time SQLite.query(db,"select country from temp group by 1")
SQLite.createindex(db,"temp","ind","country";unique=false)
@time SQLite.query(db,"select country from temp group by 1")
SQLite.drop!(db,"temp")
# Tables.jl
Pkg.clone("https://github.com/quinnj/Tables.jl")
using Tables
t = Tables.Table(Int,10,10)
size(t) == (10,10)
typeof(t) === Tables.Table
Tables.columns(t)
Tables.rows(t)
Tables.types(t)
t[1,1] = 1
t
t[1,1:10] = 2
for i = 1:10
Base.Test.@test t[1,i] == 2
end
t2 = t[1,1:10]
t3 = t[1:10,1]
t4 = t[wh"Column2 > 0"]
t5 = t[wh"Column2 > 0",1:2]
unique(t3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment