Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Created June 24, 2017 21:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bicycle1885/768298687e00aef2826a7a2ad9fa129d to your computer and use it in GitHub Desktop.
Save bicycle1885/768298687e00aef2826a7a2ad9fa129d to your computer and use it in GitHub Desktop.
Biomolecular structure
import NamedTuples: NamedTuples, @NT
# Atom
# ----
# NOTE: This must be synchronized with `AtomCoords`.
struct Atom
id::Int32
x::Float32
y::Float32
z::Float32
end
function Base.convert(::Type{Atom}, atom::Tuple{Integer,AbstractFloat,AbstractFloat,AbstractFloat})
return Atom(atom[1], atom[2], atom[3], atom[4])
end
# AtomCoords
# ----------
struct AtomCoords <: AbstractMatrix{Float32}
data::Vector{Atom}
end
function Base.size(coords::AtomCoords)
return (length(coords.data), 3)
end
function Base.getindex(coords::AtomCoords, i::Integer)
return coords.data[i]
end
function Base.getindex(coords::AtomCoords, r::Range)
return coords.data[r]
end
function Base.getindex(coords::AtomCoords, i::Integer, j::Integer)
offset = sizeof(Atom) * (i - 1) + sizeof(Int32) * j
return unsafe_load(Ptr{Float32}(pointer(coords.data) + offset))
end
# Structure
# ---------
struct Structure{A,R,C}
# atom coordinates
coords::AtomCoords
# valid atom range
range::UnitRange{Int}
# hetatom?
hetatom::BitVector
# residue end positions
residues::Vector{Int32}
# chain end positions
chains::Vector{Int32}
# atom metadata
atommeta::Vector{A}
# residue metadata
residuemeta::Vector{R}
# chain metadata
chainmeta::Vector{C}
end
function Structure(str::Structure, range::UnitRange{Int})
return Structure(str.coords, range, str.hetatom, str.residues, str.chains, str.atommeta, str.residuemeta, str.chainmeta)
end
function Base.:(==)(str1::Structure, str2::Structure)
return str1.coords === str2.coords && str1.range == str2.range
end
function Base.show(io::IO, str::Structure)
if get(io, :compact, false)
print(io, first(str.range), '-', last(str.range), " atoms")
else
prettyprint(io, str, displaysize(io)[1] - 4)
end
end
function prettyprint(io::IO, str::Structure{A}, linemax::Integer) where A
natoms = length(str.range)
header = vcat(["id", "x", "y", "z"], string.(fieldnames(A)))
lines = Vector{String}[]
for i in str.range[1:min(linemax-2, endof(str.range))]
atom = str.coords[i]
line = [string(atom.id), @sprintf("%8.3f", atom.x), @sprintf("%8.3f", atom.y), @sprintf("%8.3f", atom.z)]
meta = str.atommeta[i]
for fname in fieldnames(A)
push!(line, replace(string(meta[fname]), ' ', '·'))
#push!(line, replace(string(meta[fname]), ' ', '␣'))
end
push!(lines, line)
end
if length(str.range) > linemax - 2
push!(lines, fill("⋮", length(header)))
end
colwidth = map(length, header)
for i in 1:endof(lines), j in 1:endof(lines[i])
colwidth[j] = max(colwidth[j], length(lines[i][j]))
end
println(io, "Structure with $(natoms) atoms:")
for j in 1:endof(header)
print(io, lpad(header[j], colwidth[j]+1))
end
for i in 1:endof(lines)
println(io)
for j in 1:endof(lines[i])
print(io, lpad(lines[i][j], colwidth[j]+1))
end
end
end
function Base.length(str::Structure)
return length(str.range)
end
function Base.endof(str::Structure)
return length(str.range)
end
function Base.getindex(str::Structure, i::Integer)
return str.coords[str.range[i]]
end
function Base.getindex(str::Structure, r::UnitRange)
return str.coords.data[str.range[r]]
end
function residueof(str::Structure, atom::Atom)
pos = searchsortedfirst(str.residues, atom.id)
return Structure(str, str.residues[pos-1]+1:str.residues[pos])
end
function chainof(str::Structure, atom::Atom)
pos = searchsortedfirst(str.chains, atom.id)
return Structure(str, str.chains[pos-1]+1:str.chains[pos])
end
function Base.filter(f::Function, str::Structure{A,R,C}) where {A,R,C}
atoms = Atom[]
hetatom = BitVector(0)
residues = Int32[0]
chains = Int32[0]
atommeta = A[]
residuemeta = R[]
chainmeta = C[]
cstart = searchsortedfirst(str.chains, first(str.range))
cstop = searchsortedfirst(str.chains, last(str.range))
for c in cstart:cstop
crange = str.chains[c-1]+1:str.chains[c]
rstart = searchsortedfirst(str.residues, first(crange))
rstop = searchsortedfirst(str.residues, last(crange))
for r in rstart:rstop
rrange = str.residues[r-1]+1:str.residues[r]
for ri in rrange
atom = str.coords[ri]
meta = str.atommeta[ri]
# This is ~10x slower.
#if ri ∈ str.range && f(merge_namedtuple(atom, meta))
if ri ∈ str.range && f(atom, meta)
push!(atoms, atom)
push!(atommeta, meta)
push!(hetatom, str.hetatom[ri])
end
end
push!(residues, endof(atoms))
push!(residuemeta, str.residuemeta[r-1])
end
push!(chains, endof(atoms))
push!(chainmeta, str.chainmeta[c-1])
end
return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
end
# Type-stable merge.
@generated function merge_namedtuple(atom::Atom, meta::NamedTuples.NamedTuple)
fields = vcat(fieldnames(atom), fieldnames(meta))
T = NamedTuples.create_namedtuple_type(fields)
vals = [f ∈ fieldnames(meta) ? :(getfield(meta, $(Expr(:quote, f)))) : :(getfield(atom, $(Expr(:quote, f)))) for f in fields]
return :($(T)($(vals...)))
end
# PDB Reader
# ----------
struct Reader
file::IO
end
function read(reader::Reader)
return read_impl(reader.file)
end
function read_impl(input::IO)
# metadata types
A = @NT(serial::Int32, name::String, occupancy::Float32, tempfactor::Float32, element::String)
R = @NT(serial::Int32, name::String)
C = @NT(name::Char)
# create the named tuple type in advance
NamedTuples.create_namedtuple_type(vcat(fieldnames(Atom), fieldnames(A)))
line = 0
atommeta = A[]
residuemeta = R[]
chainmeta = C[]
try
id = 0
startpos = 1
buf = Vector{UInt8}(256)
lastchain = 0x00
lastresidue = 0
atoms = Atom[]
hetatom = BitVector(0)
residues = Int32[]
chains = Int32[]
while !eof(input)
copy!(buf, readline(input))
rng = 1:80
line += 1
isatom = startswith(buf, b"ATOM ") || startswith(buf, b"HETATM")
if isatom
chain = buf[rng[22]]
if chain != lastchain
push!(chains, id)
chainname = buf[rng[22]]
push!(chainmeta, C(chainname))
lastchain = chain
end
residue = get(tryparse_int(buf, rng[23:26]))
if residue != lastresidue
push!(residues, id)
resserial = get(tryparse_int(buf, rng[23:26]))
resname = String(buf[rng[18:20]])
push!(residuemeta, R(resserial, resname))
lastresidue = residue
end
# read coordinates
id += 1
x = get(unsafe_tryparse_f32(buf, rng[31:38]))
y = get(unsafe_tryparse_f32(buf, rng[39:46]))
z = get(unsafe_tryparse_f32(buf, rng[47:54]))
push!(atoms, (id, x, y, z))
# HETATM?
ishet = buf[1] == UInt8('H')
push!(hetatom, ishet)
# read other fields
serial = get(tryparse_int(buf, rng[7:11]))
name = String(buf[rng[13:16]])
occupancy = get(unsafe_tryparse_f32(buf, rng[55:60]))
tempfactor = get(unsafe_tryparse_f32(buf, rng[61:66]))
element = String(buf[rng[77:78]])
push!(atommeta, A(serial, name, occupancy, tempfactor, element))
end
end
# push last residue and chain
if lastchain != 0x00
push!(chains, id)
end
if lastresidue != 0
push!(residues, id)
end
return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta)
catch ex
# parsing error
if isa(ex, NullException)
throw(ArgumentError("malformed line at $(line)"))
end
# other error
rethrow()
end
end
function unsafe_tryparse_f32(data, range)
ccall(
:jl_try_substrtof,
Nullable{Float32},
(Ptr{UInt8}, Csize_t, Csize_t),
data, first(range), length(range))
end
function tryparse_int(data, range)
i = first(range)
# skip left spaces
while i ≤ last(range) && data[i] == UInt8(' ')
i += 1
end
if i > last(range)
return Nullable{Int}()
end
x = 0
while i ≤ last(range)
d = data[i]
if d == UInt8(' ')
break
elseif d ∉ UInt8('0'):UInt8('9')
return Nullable{Int}()
end
x = 10x + (d - UInt8('0'))
i += 1
end
return Nullable(x)
end
# Atom Iterator
# -------------
struct AtomIterator
str::Structure
end
function Base.eltype(::Type{AtomIterator})
return Atom
end
function eachatom(str::Structure)
return AtomIterator(str)
end
function Base.length(iter::AtomIterator)
return length(iter.str.range)
end
function Base.start(::AtomIterator)
return 1
end
function Base.done(iter::AtomIterator, i)
return i > length(iter)
end
function Base.next(iter::AtomIterator, i)
return Atom(iter.str.coords[i]), i + 1
end
# Structure Iterator
# ------------------
struct StructureIterator{unit,T}
parent::Structure{T}
range::UnitRange{Int}
end
function Base.eltype(::Type{StructureIterator{unit,T}}) where {unit,T}
return Structure{T}
end
function Base.length(iter::StructureIterator)
return length(iter.range) - 1
end
function eachresidue(str::Structure{T}) where T
start = searchsortedfirst(str.residues, first(str.range))
stop = searchsortedfirst(str.residues, last(str.range)+1)
return StructureIterator{:residue,T}(str, start:stop)
end
function eachchain(str::Structure{T}) where T
start = searchsortedfirst(str.chains, first(str.range))
stop = searchsortedfirst(str.chains, last(str.range)+1)
return StructureIterator{:chain,T}(str, start:stop)
end
function Base.start(iter::StructureIterator)
return first(iter.range)
end
function Base.done(iter::StructureIterator, i)
return i ≥ last(iter.range)
end
function Base.next(iter::StructureIterator{:residue}, i)
return Structure(iter.parent, iter.parent.residues[i-1]+1:iter.parent.residues[i]), i + 1
end
function Base.next(iter::StructureIterator{:chain}, i)
return Structure(iter.parent, iter.parent.chains[i-1]+1:iter.parent.chains[i]), i + 1
end
# Test
# ----
using Base.Test
str = read(Reader(open("1crn.pdb")))
@test length(str) == endof(str) == 327
@test str[1] === Atom(1, 17.047, 14.099, 3.625)
@test str[end] === Atom(327, 12.703, 4.973, 10.746)
@test isa(str[1:2], Vector{Atom})
@test length(str[1:2]) == 2
@test length(eachatom(str)) == 327
@test length(eachresidue(str)) == 46
@test length(eachchain(str)) == 1
@test residueof(str, first(eachatom(str))) == first(eachresidue(str))
@test chainof(str, first(eachatom(str))) == first(eachchain(str))
@time read(Reader(open("1crn.pdb")))
@time read(Reader(open("1htq.pdb")))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment