bicycle1885/structure.jl

## structure.jl
import NamedTuples: NamedTuples, @NT

# Atom
# ----

# NOTE: This must be synchronized with `AtomCoords`.
struct Atom
    id::Int32
    x::Float32
    y::Float32
    z::Float32
end

function Base.convert(::Type{Atom}, atom::Tuple{Integer,AbstractFloat,AbstractFloat,AbstractFloat})
    return Atom(atom[1], atom[2], atom[3], atom[4])
end


# AtomCoords
# ----------

struct AtomCoords <: AbstractMatrix{Float32}
    data::Vector{Atom}
end

function Base.size(coords::AtomCoords)
    return (length(coords.data), 3)
end

function Base.getindex(coords::AtomCoords, i::Integer)
    return coords.data[i]
end

function Base.getindex(coords::AtomCoords, r::Range)
    return coords.data[r]
end

function Base.getindex(coords::AtomCoords, i::Integer, j::Integer)
    offset = sizeof(Atom) * (i - 1) + sizeof(Int32) * j
    return unsafe_load(Ptr{Float32}(pointer(coords.data) + offset))
end


# Structure
# ---------

struct Structure{A,R,C}
    # atom coordinates
    coords::AtomCoords

    # valid atom range
    range::UnitRange{Int}

    # hetatom?
    hetatom::BitVector

    # residue end positions
    residues::Vector{Int32}

    # chain end positions
    chains::Vector{Int32}

    # atom metadata
    atommeta::Vector{A}

    # residue metadata
    residuemeta::Vector{R}

    # chain metadata
    chainmeta::Vector{C}
end

function Structure(str::Structure, range::UnitRange{Int})
    return Structure(str.coords, range, str.hetatom, str.residues, str.chains, str.atommeta, str.residuemeta, str.chainmeta)
end

function Base.:(==)(str1::Structure, str2::Structure)
    return str1.coords === str2.coords && str1.range == str2.range
end

function Base.show(io::IO, str::Structure)
    if get(io, :compact, false)
        print(io, first(str.range), '-', last(str.range), " atoms")
    else
        prettyprint(io, str, displaysize(io)[1] - 4)
    end
end

function prettyprint(io::IO, str::Structure{A}, linemax::Integer) where A
    natoms = length(str.range)
    header = vcat(["id", "x", "y", "z"], string.(fieldnames(A)))
    lines = Vector{String}[]
    for i in str.range[1:min(linemax-2, endof(str.range))]
        atom = str.coords[i]
        line = [string(atom.id), @sprintf("%8.3f", atom.x), @sprintf("%8.3f", atom.y), @sprintf("%8.3f", atom.z)]
        meta = str.atommeta[i]
        for fname in fieldnames(A)
            push!(line, replace(string(meta[fname]), ' ', '·'))
            #push!(line, replace(string(meta[fname]), ' ', '␣'))
        end
        push!(lines, line)
    end
    if length(str.range) > linemax - 2
        push!(lines, fill("⋮", length(header)))
    end
    colwidth = map(length, header)
    for i in 1:endof(lines), j in 1:endof(lines[i])
        colwidth[j] = max(colwidth[j], length(lines[i][j]))
    end
    println(io, "Structure with $(natoms) atoms:")
    for j in 1:endof(header)
        print(io, lpad(header[j], colwidth[j]+1))
    end
    for i in 1:endof(lines)
        println(io)
        for j in 1:endof(lines[i])
            print(io, lpad(lines[i][j], colwidth[j]+1))
        end
    end
end

function Base.length(str::Structure)
    return length(str.range)
end

function Base.endof(str::Structure)
    return length(str.range)
end

function Base.getindex(str::Structure, i::Integer)
    return str.coords[str.range[i]]
end

function Base.getindex(str::Structure, r::UnitRange)
    return str.coords.data[str.range[r]]
end

function residueof(str::Structure, atom::Atom)
    pos = searchsortedfirst(str.residues, atom.id)
    return Structure(str, str.residues[pos-1]+1:str.residues[pos])
end

function chainof(str::Structure, atom::Atom)
    pos = searchsortedfirst(str.chains, atom.id)
    return Structure(str, str.chains[pos-1]+1:str.chains[pos])
end

function Base.filter(f::Function, str::Structure{A,R,C}) where {A,R,C}
    atoms = Atom[]
    hetatom = BitVector(0)
    residues = Int32[0]
    chains = Int32[0]
    atommeta = A[]
    residuemeta = R[]
    chainmeta = C[]
    cstart = searchsortedfirst(str.chains, first(str.range))
    cstop = searchsortedfirst(str.chains, last(str.range))
    for c in cstart:cstop
        crange = str.chains[c-1]+1:str.chains[c]
        rstart = searchsortedfirst(str.residues, first(crange))
        rstop = searchsortedfirst(str.residues, last(crange))
        for r in rstart:rstop
            rrange = str.residues[r-1]+1:str.residues[r]
            for ri in rrange
                atom = str.coords[ri]
                meta = str.atommeta[ri]
                # This is ~10x slower.
                #if ri ∈ str.range && f(merge_namedtuple(atom, meta))
                if ri ∈ str.range && f(atom, meta)
                    push!(atoms, atom)
                    push!(atommeta, meta)
                    push!(hetatom, str.hetatom[ri])
                end
            end
            push!(residues, endof(atoms))
            push!(residuemeta, str.residuemeta[r-1])
        end
        push!(chains, endof(atoms))
        push!(chainmeta, str.chainmeta[c-1])
    end
    return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
end

# Type-stable merge.
@generated function merge_namedtuple(atom::Atom, meta::NamedTuples.NamedTuple)
    fields = vcat(fieldnames(atom), fieldnames(meta))
    T = NamedTuples.create_namedtuple_type(fields)
    vals = [f ∈ fieldnames(meta) ? :(getfield(meta, $(Expr(:quote, f)))) : :(getfield(atom, $(Expr(:quote, f)))) for f in fields]
    return :($(T)($(vals...)))
end


# PDB Reader
# ----------

struct Reader
    file::IO
end

function read(reader::Reader)
    return read_impl(reader.file)
end

function read_impl(input::IO)
    # metadata types
    A = @NT(serial::Int32, name::String, occupancy::Float32, tempfactor::Float32, element::String)
    R = @NT(serial::Int32, name::String)
    C = @NT(name::Char)
    # create the named tuple type in advance
    NamedTuples.create_namedtuple_type(vcat(fieldnames(Atom), fieldnames(A)))
    line = 0
    atommeta = A[]
    residuemeta = R[]
    chainmeta = C[]
    try
        id = 0
        startpos = 1
        buf = Vector{UInt8}(256)
        lastchain = 0x00
        lastresidue = 0
        atoms = Atom[]
        hetatom = BitVector(0)
        residues = Int32[]
        chains = Int32[]
        while !eof(input)
            copy!(buf, readline(input))
            rng = 1:80
            line += 1
            isatom = startswith(buf, b"ATOM  ") || startswith(buf, b"HETATM")
            if isatom
                chain = buf[rng[22]]
                if chain != lastchain
                    push!(chains, id)
                    chainname = buf[rng[22]]
                    push!(chainmeta, C(chainname))
                    lastchain = chain
                end
                residue = get(tryparse_int(buf, rng[23:26]))
                if residue != lastresidue
                    push!(residues, id)
                    resserial = get(tryparse_int(buf, rng[23:26]))
                    resname = String(buf[rng[18:20]])
                    push!(residuemeta, R(resserial, resname))
                    lastresidue = residue
                end
                # read coordinates
                id += 1
                x = get(unsafe_tryparse_f32(buf, rng[31:38]))
                y = get(unsafe_tryparse_f32(buf, rng[39:46]))
                z = get(unsafe_tryparse_f32(buf, rng[47:54]))
                push!(atoms, (id, x, y, z))
                # HETATM?
                ishet = buf[1] == UInt8('H')
                push!(hetatom, ishet)
                # read other fields
                serial = get(tryparse_int(buf, rng[7:11]))
                name = String(buf[rng[13:16]])
                occupancy = get(unsafe_tryparse_f32(buf, rng[55:60]))
                tempfactor = get(unsafe_tryparse_f32(buf, rng[61:66]))
                element = String(buf[rng[77:78]])
                push!(atommeta, A(serial, name, occupancy, tempfactor, element))
            end
        end
        # push last residue and chain
        if lastchain != 0x00
            push!(chains, id)
        end
        if lastresidue != 0
            push!(residues, id)
        end
        return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
    catch ex
        # parsing error
        if isa(ex, NullException)
            throw(ArgumentError("malformed line at $(line)"))
        end
        # other error
        rethrow()
    end
end

function unsafe_tryparse_f32(data, range)
    ccall(
        :jl_try_substrtof,
        Nullable{Float32},
        (Ptr{UInt8}, Csize_t, Csize_t),
        data, first(range), length(range))
end

function tryparse_int(data, range)
    i = first(range)
    # skip left spaces
    while i ≤ last(range) && data[i] == UInt8(' ')
        i += 1
    end
    if i > last(range)
        return Nullable{Int}()
    end
    x = 0
    while i ≤ last(range)
        d = data[i]
        if d == UInt8(' ')
            break
        elseif d ∉ UInt8('0'):UInt8('9')
            return Nullable{Int}()
        end
        x = 10x + (d - UInt8('0'))
        i += 1
    end
    return Nullable(x)
end


# Atom Iterator
# -------------

struct AtomIterator
    str::Structure
end

function Base.eltype(::Type{AtomIterator})
    return Atom
end

function eachatom(str::Structure)
    return AtomIterator(str)
end

function Base.length(iter::AtomIterator)
    return length(iter.str.range)
end

function Base.start(::AtomIterator)
    return 1
end

function Base.done(iter::AtomIterator, i)
    return i > length(iter)
end

function Base.next(iter::AtomIterator, i)
    return Atom(iter.str.coords[i]), i + 1
end


# Structure Iterator
# ------------------

struct StructureIterator{unit,T}
    parent::Structure{T}
    range::UnitRange{Int}
end

function Base.eltype(::Type{StructureIterator{unit,T}}) where {unit,T}
    return Structure{T}
end

function Base.length(iter::StructureIterator)
    return length(iter.range) - 1
end

function eachresidue(str::Structure{T}) where T
    start = searchsortedfirst(str.residues, first(str.range))
    stop = searchsortedfirst(str.residues, last(str.range)+1)
    return StructureIterator{:residue,T}(str, start:stop)
end

function eachchain(str::Structure{T}) where T
    start = searchsortedfirst(str.chains, first(str.range))
    stop = searchsortedfirst(str.chains, last(str.range)+1)
    return StructureIterator{:chain,T}(str, start:stop)
end

function Base.start(iter::StructureIterator)
    return first(iter.range)
end

function Base.done(iter::StructureIterator, i)
    return i ≥ last(iter.range)
end

function Base.next(iter::StructureIterator{:residue}, i)
    return Structure(iter.parent, iter.parent.residues[i-1]+1:iter.parent.residues[i]), i + 1
end

function Base.next(iter::StructureIterator{:chain}, i)
    return Structure(iter.parent, iter.parent.chains[i-1]+1:iter.parent.chains[i]), i + 1
end


# Test
# ----

using Base.Test

str = read(Reader(open("1crn.pdb")))

@test length(str) == endof(str) == 327
@test str[1] === Atom(1, 17.047, 14.099, 3.625)
@test str[end] === Atom(327, 12.703, 4.973, 10.746)
@test isa(str[1:2], Vector{Atom})
@test length(str[1:2]) == 2

@test length(eachatom(str)) == 327
@test length(eachresidue(str)) == 46
@test length(eachchain(str)) == 1

@test residueof(str, first(eachatom(str))) == first(eachresidue(str))
@test chainof(str, first(eachatom(str))) == first(eachchain(str))

@time read(Reader(open("1crn.pdb")))
@time read(Reader(open("1htq.pdb")))
	import NamedTuples: NamedTuples, @NT

	# Atom
	# ----

	# NOTE: This must be synchronized with `AtomCoords`.
	struct Atom
	id::Int32
	x::Float32
	y::Float32
	z::Float32
	end

	function Base.convert(::Type{Atom}, atom::Tuple{Integer,AbstractFloat,AbstractFloat,AbstractFloat})
	return Atom(atom[1], atom[2], atom[3], atom[4])
	end


	# AtomCoords
	# ----------

	struct AtomCoords <: AbstractMatrix{Float32}
	data::Vector{Atom}
	end

	function Base.size(coords::AtomCoords)
	return (length(coords.data), 3)
	end

	function Base.getindex(coords::AtomCoords, i::Integer)
	return coords.data[i]
	end

	function Base.getindex(coords::AtomCoords, r::Range)
	return coords.data[r]
	end

	function Base.getindex(coords::AtomCoords, i::Integer, j::Integer)
	offset = sizeof(Atom) * (i - 1) + sizeof(Int32) * j
	return unsafe_load(Ptr{Float32}(pointer(coords.data) + offset))
	end


	# Structure
	# ---------

	struct Structure{A,R,C}
	# atom coordinates
	coords::AtomCoords

	# valid atom range
	range::UnitRange{Int}

	# hetatom?
	hetatom::BitVector

	# residue end positions
	residues::Vector{Int32}

	# chain end positions
	chains::Vector{Int32}

	# atom metadata
	atommeta::Vector{A}

	# residue metadata
	residuemeta::Vector{R}

	# chain metadata
	chainmeta::Vector{C}
	end

	function Structure(str::Structure, range::UnitRange{Int})
	return Structure(str.coords, range, str.hetatom, str.residues, str.chains, str.atommeta, str.residuemeta, str.chainmeta)
	end

	function Base.:(==)(str1::Structure, str2::Structure)
	return str1.coords === str2.coords && str1.range == str2.range
	end

	function Base.show(io::IO, str::Structure)
	if get(io, :compact, false)
	print(io, first(str.range), '-', last(str.range), " atoms")
	else
	prettyprint(io, str, displaysize(io)[1] - 4)
	end
	end

	function prettyprint(io::IO, str::Structure{A}, linemax::Integer) where A
	natoms = length(str.range)
	header = vcat(["id", "x", "y", "z"], string.(fieldnames(A)))
	lines = Vector{String}[]
	for i in str.range[1:min(linemax-2, endof(str.range))]
	atom = str.coords[i]
	line = [string(atom.id), @sprintf("%8.3f", atom.x), @sprintf("%8.3f", atom.y), @sprintf("%8.3f", atom.z)]
	meta = str.atommeta[i]
	for fname in fieldnames(A)
	push!(line, replace(string(meta[fname]), ' ', '·'))
	#push!(line, replace(string(meta[fname]), ' ', '␣'))
	end
	push!(lines, line)
	end
	if length(str.range) > linemax - 2
	push!(lines, fill("⋮", length(header)))
	end
	colwidth = map(length, header)
	for i in 1:endof(lines), j in 1:endof(lines[i])
	colwidth[j] = max(colwidth[j], length(lines[i][j]))
	end
	println(io, "Structure with $(natoms) atoms:")
	for j in 1:endof(header)
	print(io, lpad(header[j], colwidth[j]+1))
	end
	for i in 1:endof(lines)
	println(io)
	for j in 1:endof(lines[i])
	print(io, lpad(lines[i][j], colwidth[j]+1))
	end
	end
	end

	function Base.length(str::Structure)
	return length(str.range)
	end

	function Base.endof(str::Structure)
	return length(str.range)
	end

	function Base.getindex(str::Structure, i::Integer)
	return str.coords[str.range[i]]
	end

	function Base.getindex(str::Structure, r::UnitRange)
	return str.coords.data[str.range[r]]
	end

	function residueof(str::Structure, atom::Atom)
	pos = searchsortedfirst(str.residues, atom.id)
	return Structure(str, str.residues[pos-1]+1:str.residues[pos])
	end

	function chainof(str::Structure, atom::Atom)
	pos = searchsortedfirst(str.chains, atom.id)
	return Structure(str, str.chains[pos-1]+1:str.chains[pos])
	end

	function Base.filter(f::Function, str::Structure{A,R,C}) where {A,R,C}
	atoms = Atom[]
	hetatom = BitVector(0)
	residues = Int32[0]
	chains = Int32[0]
	atommeta = A[]
	residuemeta = R[]
	chainmeta = C[]
	cstart = searchsortedfirst(str.chains, first(str.range))
	cstop = searchsortedfirst(str.chains, last(str.range))
	for c in cstart:cstop
	crange = str.chains[c-1]+1:str.chains[c]
	rstart = searchsortedfirst(str.residues, first(crange))
	rstop = searchsortedfirst(str.residues, last(crange))
	for r in rstart:rstop
	rrange = str.residues[r-1]+1:str.residues[r]
	for ri in rrange
	atom = str.coords[ri]
	meta = str.atommeta[ri]
	# This is ~10x slower.
	#if ri ∈ str.range && f(merge_namedtuple(atom, meta))
	if ri ∈ str.range && f(atom, meta)
	push!(atoms, atom)
	push!(atommeta, meta)
	push!(hetatom, str.hetatom[ri])
	end
	end
	push!(residues, endof(atoms))
	push!(residuemeta, str.residuemeta[r-1])
	end
	push!(chains, endof(atoms))
	push!(chainmeta, str.chainmeta[c-1])
	end
	return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
	end

	# Type-stable merge.
	@generated function merge_namedtuple(atom::Atom, meta::NamedTuples.NamedTuple)
	fields = vcat(fieldnames(atom), fieldnames(meta))
	T = NamedTuples.create_namedtuple_type(fields)
	vals = [f ∈ fieldnames(meta) ? :(getfield(meta, $(Expr(:quote, f)))) : :(getfield(atom, $(Expr(:quote, f)))) for f in fields]
	return :($(T)($(vals...)))
	end


	# PDB Reader
	# ----------

	struct Reader
	file::IO
	end

	function read(reader::Reader)
	return read_impl(reader.file)
	end

	function read_impl(input::IO)
	# metadata types
	A = @NT(serial::Int32, name::String, occupancy::Float32, tempfactor::Float32, element::String)
	R = @NT(serial::Int32, name::String)
	C = @NT(name::Char)
	# create the named tuple type in advance
	NamedTuples.create_namedtuple_type(vcat(fieldnames(Atom), fieldnames(A)))
	line = 0
	atommeta = A[]
	residuemeta = R[]
	chainmeta = C[]
	try
	id = 0
	startpos = 1
	buf = Vector{UInt8}(256)
	lastchain = 0x00
	lastresidue = 0
	atoms = Atom[]
	hetatom = BitVector(0)
	residues = Int32[]
	chains = Int32[]
	while !eof(input)
	copy!(buf, readline(input))
	rng = 1:80
	line += 1
	isatom = startswith(buf, b"ATOM ") \|\| startswith(buf, b"HETATM")
	if isatom
	chain = buf[rng[22]]
	if chain != lastchain
	push!(chains, id)
	chainname = buf[rng[22]]
	push!(chainmeta, C(chainname))
	lastchain = chain
	end
	residue = get(tryparse_int(buf, rng[23:26]))
	if residue != lastresidue
	push!(residues, id)
	resserial = get(tryparse_int(buf, rng[23:26]))
	resname = String(buf[rng[18:20]])
	push!(residuemeta, R(resserial, resname))
	lastresidue = residue
	end
	# read coordinates
	id += 1
	x = get(unsafe_tryparse_f32(buf, rng[31:38]))
	y = get(unsafe_tryparse_f32(buf, rng[39:46]))
	z = get(unsafe_tryparse_f32(buf, rng[47:54]))
	push!(atoms, (id, x, y, z))
	# HETATM?
	ishet = buf[1] == UInt8('H')
	push!(hetatom, ishet)
	# read other fields
	serial = get(tryparse_int(buf, rng[7:11]))
	name = String(buf[rng[13:16]])
	occupancy = get(unsafe_tryparse_f32(buf, rng[55:60]))
	tempfactor = get(unsafe_tryparse_f32(buf, rng[61:66]))
	element = String(buf[rng[77:78]])
	push!(atommeta, A(serial, name, occupancy, tempfactor, element))
	end
	end
	# push last residue and chain
	if lastchain != 0x00
	push!(chains, id)
	end
	if lastresidue != 0
	push!(residues, id)
	end
	return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
	catch ex
	# parsing error
	if isa(ex, NullException)
	throw(ArgumentError("malformed line at $(line)"))
	end
	# other error
	rethrow()
	end
	end

	function unsafe_tryparse_f32(data, range)
	ccall(
	:jl_try_substrtof,
	Nullable{Float32},
	(Ptr{UInt8}, Csize_t, Csize_t),
	data, first(range), length(range))
	end

	function tryparse_int(data, range)
	i = first(range)
	# skip left spaces
	while i ≤ last(range) && data[i] == UInt8(' ')
	i += 1
	end
	if i > last(range)
	return Nullable{Int}()
	end
	x = 0
	while i ≤ last(range)
	d = data[i]
	if d == UInt8(' ')
	break
	elseif d ∉ UInt8('0'):UInt8('9')
	return Nullable{Int}()
	end
	x = 10x + (d - UInt8('0'))
	i += 1
	end
	return Nullable(x)
	end


	# Atom Iterator
	# -------------

	struct AtomIterator
	str::Structure
	end

	function Base.eltype(::Type{AtomIterator})
	return Atom
	end

	function eachatom(str::Structure)
	return AtomIterator(str)
	end

	function Base.length(iter::AtomIterator)
	return length(iter.str.range)
	end

	function Base.start(::AtomIterator)
	return 1
	end

	function Base.done(iter::AtomIterator, i)
	return i > length(iter)
	end

	function Base.next(iter::AtomIterator, i)
	return Atom(iter.str.coords[i]), i + 1
	end


	# Structure Iterator
	# ------------------

	struct StructureIterator{unit,T}
	parent::Structure{T}
	range::UnitRange{Int}
	end

	function Base.eltype(::Type{StructureIterator{unit,T}}) where {unit,T}
	return Structure{T}
	end

	function Base.length(iter::StructureIterator)
	return length(iter.range) - 1
	end

	function eachresidue(str::Structure{T}) where T
	start = searchsortedfirst(str.residues, first(str.range))
	stop = searchsortedfirst(str.residues, last(str.range)+1)
	return StructureIterator{:residue,T}(str, start:stop)
	end

	function eachchain(str::Structure{T}) where T
	start = searchsortedfirst(str.chains, first(str.range))
	stop = searchsortedfirst(str.chains, last(str.range)+1)
	return StructureIterator{:chain,T}(str, start:stop)
	end

	function Base.start(iter::StructureIterator)
	return first(iter.range)
	end

	function Base.done(iter::StructureIterator, i)
	return i ≥ last(iter.range)
	end

	function Base.next(iter::StructureIterator{:residue}, i)
	return Structure(iter.parent, iter.parent.residues[i-1]+1:iter.parent.residues[i]), i + 1
	end

	function Base.next(iter::StructureIterator{:chain}, i)
	return Structure(iter.parent, iter.parent.chains[i-1]+1:iter.parent.chains[i]), i + 1
	end


	# Test
	# ----

	using Base.Test

	str = read(Reader(open("1crn.pdb")))

	@test length(str) == endof(str) == 327
	@test str[1] === Atom(1, 17.047, 14.099, 3.625)
	@test str[end] === Atom(327, 12.703, 4.973, 10.746)
	@test isa(str[1:2], Vector{Atom})
	@test length(str[1:2]) == 2

	@test length(eachatom(str)) == 327
	@test length(eachresidue(str)) == 46
	@test length(eachchain(str)) == 1

	@test residueof(str, first(eachatom(str))) == first(eachresidue(str))
	@test chainof(str, first(eachatom(str))) == first(eachchain(str))

	@time read(Reader(open("1crn.pdb")))
	@time read(Reader(open("1htq.pdb")))