Created
June 24, 2017 21:59
-
-
Save bicycle1885/768298687e00aef2826a7a2ad9fa129d to your computer and use it in GitHub Desktop.
Biomolecular structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import NamedTuples: NamedTuples, @NT | |
# Atom | |
# ---- | |
# NOTE: This must be synchronized with `AtomCoords`. | |
struct Atom | |
id::Int32 | |
x::Float32 | |
y::Float32 | |
z::Float32 | |
end | |
function Base.convert(::Type{Atom}, atom::Tuple{Integer,AbstractFloat,AbstractFloat,AbstractFloat}) | |
return Atom(atom[1], atom[2], atom[3], atom[4]) | |
end | |
# AtomCoords | |
# ---------- | |
struct AtomCoords <: AbstractMatrix{Float32} | |
data::Vector{Atom} | |
end | |
function Base.size(coords::AtomCoords) | |
return (length(coords.data), 3) | |
end | |
function Base.getindex(coords::AtomCoords, i::Integer) | |
return coords.data[i] | |
end | |
function Base.getindex(coords::AtomCoords, r::Range) | |
return coords.data[r] | |
end | |
function Base.getindex(coords::AtomCoords, i::Integer, j::Integer) | |
offset = sizeof(Atom) * (i - 1) + sizeof(Int32) * j | |
return unsafe_load(Ptr{Float32}(pointer(coords.data) + offset)) | |
end | |
# Structure | |
# --------- | |
struct Structure{A,R,C} | |
# atom coordinates | |
coords::AtomCoords | |
# valid atom range | |
range::UnitRange{Int} | |
# hetatom? | |
hetatom::BitVector | |
# residue end positions | |
residues::Vector{Int32} | |
# chain end positions | |
chains::Vector{Int32} | |
# atom metadata | |
atommeta::Vector{A} | |
# residue metadata | |
residuemeta::Vector{R} | |
# chain metadata | |
chainmeta::Vector{C} | |
end | |
function Structure(str::Structure, range::UnitRange{Int}) | |
return Structure(str.coords, range, str.hetatom, str.residues, str.chains, str.atommeta, str.residuemeta, str.chainmeta) | |
end | |
function Base.:(==)(str1::Structure, str2::Structure) | |
return str1.coords === str2.coords && str1.range == str2.range | |
end | |
function Base.show(io::IO, str::Structure) | |
if get(io, :compact, false) | |
print(io, first(str.range), '-', last(str.range), " atoms") | |
else | |
prettyprint(io, str, displaysize(io)[1] - 4) | |
end | |
end | |
function prettyprint(io::IO, str::Structure{A}, linemax::Integer) where A | |
natoms = length(str.range) | |
header = vcat(["id", "x", "y", "z"], string.(fieldnames(A))) | |
lines = Vector{String}[] | |
for i in str.range[1:min(linemax-2, endof(str.range))] | |
atom = str.coords[i] | |
line = [string(atom.id), @sprintf("%8.3f", atom.x), @sprintf("%8.3f", atom.y), @sprintf("%8.3f", atom.z)] | |
meta = str.atommeta[i] | |
for fname in fieldnames(A) | |
push!(line, replace(string(meta[fname]), ' ', '·')) | |
#push!(line, replace(string(meta[fname]), ' ', '␣')) | |
end | |
push!(lines, line) | |
end | |
if length(str.range) > linemax - 2 | |
push!(lines, fill("⋮", length(header))) | |
end | |
colwidth = map(length, header) | |
for i in 1:endof(lines), j in 1:endof(lines[i]) | |
colwidth[j] = max(colwidth[j], length(lines[i][j])) | |
end | |
println(io, "Structure with $(natoms) atoms:") | |
for j in 1:endof(header) | |
print(io, lpad(header[j], colwidth[j]+1)) | |
end | |
for i in 1:endof(lines) | |
println(io) | |
for j in 1:endof(lines[i]) | |
print(io, lpad(lines[i][j], colwidth[j]+1)) | |
end | |
end | |
end | |
function Base.length(str::Structure) | |
return length(str.range) | |
end | |
function Base.endof(str::Structure) | |
return length(str.range) | |
end | |
function Base.getindex(str::Structure, i::Integer) | |
return str.coords[str.range[i]] | |
end | |
function Base.getindex(str::Structure, r::UnitRange) | |
return str.coords.data[str.range[r]] | |
end | |
function residueof(str::Structure, atom::Atom) | |
pos = searchsortedfirst(str.residues, atom.id) | |
return Structure(str, str.residues[pos-1]+1:str.residues[pos]) | |
end | |
function chainof(str::Structure, atom::Atom) | |
pos = searchsortedfirst(str.chains, atom.id) | |
return Structure(str, str.chains[pos-1]+1:str.chains[pos]) | |
end | |
function Base.filter(f::Function, str::Structure{A,R,C}) where {A,R,C} | |
atoms = Atom[] | |
hetatom = BitVector(0) | |
residues = Int32[0] | |
chains = Int32[0] | |
atommeta = A[] | |
residuemeta = R[] | |
chainmeta = C[] | |
cstart = searchsortedfirst(str.chains, first(str.range)) | |
cstop = searchsortedfirst(str.chains, last(str.range)) | |
for c in cstart:cstop | |
crange = str.chains[c-1]+1:str.chains[c] | |
rstart = searchsortedfirst(str.residues, first(crange)) | |
rstop = searchsortedfirst(str.residues, last(crange)) | |
for r in rstart:rstop | |
rrange = str.residues[r-1]+1:str.residues[r] | |
for ri in rrange | |
atom = str.coords[ri] | |
meta = str.atommeta[ri] | |
# This is ~10x slower. | |
#if ri ∈ str.range && f(merge_namedtuple(atom, meta)) | |
if ri ∈ str.range && f(atom, meta) | |
push!(atoms, atom) | |
push!(atommeta, meta) | |
push!(hetatom, str.hetatom[ri]) | |
end | |
end | |
push!(residues, endof(atoms)) | |
push!(residuemeta, str.residuemeta[r-1]) | |
end | |
push!(chains, endof(atoms)) | |
push!(chainmeta, str.chainmeta[c-1]) | |
end | |
return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta) | |
end | |
# Type-stable merge. | |
@generated function merge_namedtuple(atom::Atom, meta::NamedTuples.NamedTuple) | |
fields = vcat(fieldnames(atom), fieldnames(meta)) | |
T = NamedTuples.create_namedtuple_type(fields) | |
vals = [f ∈ fieldnames(meta) ? :(getfield(meta, $(Expr(:quote, f)))) : :(getfield(atom, $(Expr(:quote, f)))) for f in fields] | |
return :($(T)($(vals...))) | |
end | |
# PDB Reader | |
# ---------- | |
struct Reader | |
file::IO | |
end | |
function read(reader::Reader) | |
return read_impl(reader.file) | |
end | |
function read_impl(input::IO) | |
# metadata types | |
A = @NT(serial::Int32, name::String, occupancy::Float32, tempfactor::Float32, element::String) | |
R = @NT(serial::Int32, name::String) | |
C = @NT(name::Char) | |
# create the named tuple type in advance | |
NamedTuples.create_namedtuple_type(vcat(fieldnames(Atom), fieldnames(A))) | |
line = 0 | |
atommeta = A[] | |
residuemeta = R[] | |
chainmeta = C[] | |
try | |
id = 0 | |
startpos = 1 | |
buf = Vector{UInt8}(256) | |
lastchain = 0x00 | |
lastresidue = 0 | |
atoms = Atom[] | |
hetatom = BitVector(0) | |
residues = Int32[] | |
chains = Int32[] | |
while !eof(input) | |
copy!(buf, readline(input)) | |
rng = 1:80 | |
line += 1 | |
isatom = startswith(buf, b"ATOM ") || startswith(buf, b"HETATM") | |
if isatom | |
chain = buf[rng[22]] | |
if chain != lastchain | |
push!(chains, id) | |
chainname = buf[rng[22]] | |
push!(chainmeta, C(chainname)) | |
lastchain = chain | |
end | |
residue = get(tryparse_int(buf, rng[23:26])) | |
if residue != lastresidue | |
push!(residues, id) | |
resserial = get(tryparse_int(buf, rng[23:26])) | |
resname = String(buf[rng[18:20]]) | |
push!(residuemeta, R(resserial, resname)) | |
lastresidue = residue | |
end | |
# read coordinates | |
id += 1 | |
x = get(unsafe_tryparse_f32(buf, rng[31:38])) | |
y = get(unsafe_tryparse_f32(buf, rng[39:46])) | |
z = get(unsafe_tryparse_f32(buf, rng[47:54])) | |
push!(atoms, (id, x, y, z)) | |
# HETATM? | |
ishet = buf[1] == UInt8('H') | |
push!(hetatom, ishet) | |
# read other fields | |
serial = get(tryparse_int(buf, rng[7:11])) | |
name = String(buf[rng[13:16]]) | |
occupancy = get(unsafe_tryparse_f32(buf, rng[55:60])) | |
tempfactor = get(unsafe_tryparse_f32(buf, rng[61:66])) | |
element = String(buf[rng[77:78]]) | |
push!(atommeta, A(serial, name, occupancy, tempfactor, element)) | |
end | |
end | |
# push last residue and chain | |
if lastchain != 0x00 | |
push!(chains, id) | |
end | |
if lastresidue != 0 | |
push!(residues, id) | |
end | |
return Structure(AtomCoords(atoms), 1:endof(atoms), hetatom, residues, chains, atommeta, residuemeta, chainmeta) | |
catch ex | |
# parsing error | |
if isa(ex, NullException) | |
throw(ArgumentError("malformed line at $(line)")) | |
end | |
# other error | |
rethrow() | |
end | |
end | |
function unsafe_tryparse_f32(data, range) | |
ccall( | |
:jl_try_substrtof, | |
Nullable{Float32}, | |
(Ptr{UInt8}, Csize_t, Csize_t), | |
data, first(range), length(range)) | |
end | |
function tryparse_int(data, range) | |
i = first(range) | |
# skip left spaces | |
while i ≤ last(range) && data[i] == UInt8(' ') | |
i += 1 | |
end | |
if i > last(range) | |
return Nullable{Int}() | |
end | |
x = 0 | |
while i ≤ last(range) | |
d = data[i] | |
if d == UInt8(' ') | |
break | |
elseif d ∉ UInt8('0'):UInt8('9') | |
return Nullable{Int}() | |
end | |
x = 10x + (d - UInt8('0')) | |
i += 1 | |
end | |
return Nullable(x) | |
end | |
# Atom Iterator | |
# ------------- | |
struct AtomIterator | |
str::Structure | |
end | |
function Base.eltype(::Type{AtomIterator}) | |
return Atom | |
end | |
function eachatom(str::Structure) | |
return AtomIterator(str) | |
end | |
function Base.length(iter::AtomIterator) | |
return length(iter.str.range) | |
end | |
function Base.start(::AtomIterator) | |
return 1 | |
end | |
function Base.done(iter::AtomIterator, i) | |
return i > length(iter) | |
end | |
function Base.next(iter::AtomIterator, i) | |
return Atom(iter.str.coords[i]), i + 1 | |
end | |
# Structure Iterator | |
# ------------------ | |
struct StructureIterator{unit,T} | |
parent::Structure{T} | |
range::UnitRange{Int} | |
end | |
function Base.eltype(::Type{StructureIterator{unit,T}}) where {unit,T} | |
return Structure{T} | |
end | |
function Base.length(iter::StructureIterator) | |
return length(iter.range) - 1 | |
end | |
function eachresidue(str::Structure{T}) where T | |
start = searchsortedfirst(str.residues, first(str.range)) | |
stop = searchsortedfirst(str.residues, last(str.range)+1) | |
return StructureIterator{:residue,T}(str, start:stop) | |
end | |
function eachchain(str::Structure{T}) where T | |
start = searchsortedfirst(str.chains, first(str.range)) | |
stop = searchsortedfirst(str.chains, last(str.range)+1) | |
return StructureIterator{:chain,T}(str, start:stop) | |
end | |
function Base.start(iter::StructureIterator) | |
return first(iter.range) | |
end | |
function Base.done(iter::StructureIterator, i) | |
return i ≥ last(iter.range) | |
end | |
function Base.next(iter::StructureIterator{:residue}, i) | |
return Structure(iter.parent, iter.parent.residues[i-1]+1:iter.parent.residues[i]), i + 1 | |
end | |
function Base.next(iter::StructureIterator{:chain}, i) | |
return Structure(iter.parent, iter.parent.chains[i-1]+1:iter.parent.chains[i]), i + 1 | |
end | |
# Test | |
# ---- | |
using Base.Test | |
str = read(Reader(open("1crn.pdb"))) | |
@test length(str) == endof(str) == 327 | |
@test str[1] === Atom(1, 17.047, 14.099, 3.625) | |
@test str[end] === Atom(327, 12.703, 4.973, 10.746) | |
@test isa(str[1:2], Vector{Atom}) | |
@test length(str[1:2]) == 2 | |
@test length(eachatom(str)) == 327 | |
@test length(eachresidue(str)) == 46 | |
@test length(eachchain(str)) == 1 | |
@test residueof(str, first(eachatom(str))) == first(eachresidue(str)) | |
@test chainof(str, first(eachatom(str))) == first(eachchain(str)) | |
@time read(Reader(open("1crn.pdb"))) | |
@time read(Reader(open("1htq.pdb"))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment