Skip to content

Commit a902b4e

Browse files
authored
Add bonding data for residues (#66)
This provides deterministic bonding between atoms in peptides that follow standard naming conventions. The bonding data is taken from OpenMM's ff14SB force field. To encourage careful work, this is designed to error unless all atoms (including hydrogens) are present in the structure, and that HIS has been disambiguated as HIE, HID, or HIP. One can override this behavior by passing `strict=false` as a keyword argument to `MetaGraph`.
1 parent b18bec3 commit a902b4e

File tree

7 files changed

+557
-2
lines changed

7 files changed

+557
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ benchmark/tune.json
88
benchmark/results
99
Manifest.toml
1010
*.swp
11+
extractdata/protein.ff14SB.*

ext/BioStructuresGraphsExt.jl

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@ module BioStructuresGraphsExt
33
using BioStructures
44
using Graphs
55
using MetaGraphs
6+
using BioStructures: findatombyname
67

78
"""
89
MetaGraph(element, contact_distance)
910
10-
Construct a graph of atoms where edges are contacts.
11+
Construct a graph of atoms where edges are contacts separated by less than `contact_distance`.
1112
1213
See Graphs.jl and MetaGraphs.jl for more on how to use graphs.
1314
"""
@@ -28,4 +29,92 @@ function MetaGraphs.MetaGraph(el::StructuralElementOrList, contact_dist::Real)
2829
return mg
2930
end
3031

32+
"""
33+
MetaGraph(chain::Chain; strict::Bool = true)
34+
35+
Construct a graph of atoms where edges are determined by the known bonds of residues in the chain.
36+
37+
By default, the graph is constructed in `strict` mode, which means that:
38+
39+
- residue and atom names must be standard
40+
- all hydrogens are present
41+
- HIS must be disambiguated as HIE, HID, or HIP
42+
43+
These constraints can be relaxed by setting `strict = false`, at some risk to accuracy.
44+
45+
See Graphs.jl and MetaGraphs.jl for more on how to use graphs.
46+
"""
47+
function MetaGraphs.MetaGraph(chain::Chain; strict::Bool = true)
48+
el_list = collectatoms(chain; expand_disordered = true)
49+
mg = MetaGraph(length(el_list))
50+
for (i, el) in enumerate(el_list)
51+
set_prop!(mg, i, :element, el)
52+
end
53+
set_indexing_prop!(mg, :element)
54+
55+
prev = nothing
56+
for r in chain
57+
ishetero(r) && continue
58+
if prev !== nothing
59+
if resnumber(r) == resnumber(prev) + 1
60+
# Add the peptide bond(s) (disordered residues may need multiples)
61+
for _r in collectresidues(r; expand_disordered=true), _rp in collectresidues(prev; expand_disordered=true)
62+
for a in _r["N"], ap in _rp["C"]
63+
add_edge!(mg, mg[a, :element], mg[ap, :element])
64+
end
65+
end
66+
else
67+
prev = nothing
68+
end
69+
end
70+
# Add the residue bonds
71+
addresiduebonds!(mg, r, strict)
72+
prev = r
73+
# The "OXT" (C-terminus oxygen) is not connected
74+
for _r in collectresidues(r; expand_disordered=true)
75+
aj = findatombyname(_r, "OXT"; strict=false)
76+
if aj !== nothing
77+
ai = findatombyname(_r, "C"; strict)
78+
connect_atoms!(mg, ai, aj)
79+
end
80+
end
81+
end
82+
83+
return mg
84+
end
85+
86+
function addresiduebonds!(mg, r::Residue, strict::Bool)
87+
rname = residuekey(r, strict)
88+
strict || haskey(BioStructures.residuedata, rname) || return
89+
rd = BioStructures.residuedata[rname]
90+
for (ni, nj) in rd.bonds
91+
connect_atoms!(mg, r, ni, nj, strict)
92+
end
93+
end
94+
95+
connect_atoms!(mg, r, ni::AbstractString, nj::AbstractString, strict::Bool) =
96+
connect_atoms!(mg, findatombyname(r, ni; strict), findatombyname(r, nj; strict))
97+
98+
function connect_atoms!(mg, ai::Union{AbstractAtom,Nothing}, aj::Union{AbstractAtom,Nothing})
99+
(ai === nothing || aj === nothing) && return
100+
for _ai in ai, _aj in aj # handle disordered atoms
101+
i = mg[_ai, :element]
102+
j = mg[_aj, :element]
103+
add_edge!(mg, i, j)
104+
end
105+
end
106+
107+
addresiduebonds!(mg, dr::DisorderedResidue, strict::Bool) = for r in values(dr.names)
108+
addresiduebonds!(mg, r, strict)
109+
end
110+
111+
function residuekey(r::Residue, strict::Bool)
112+
rname = resname(r)
113+
if rname == "HIS"
114+
strict && error("HIS is not a valid residue name in strict mode; use HIE, HID, or HIP")
115+
rname = "HIE"
116+
end
117+
return rname
118+
end
119+
31120
end # BioStructuresGraphsExt

extractdata/bonding.jl

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Running this file generates src/bonding.jl, which contains the atomtypes and residuedata dictionaries.
2+
# Thanks to OpenMM for the ff14SB force field XML file.
3+
4+
using Downloads
5+
6+
if !isfile(joinpath(@__DIR__, "protein.ff14SB.xml"))
7+
Downloads.download("https://raw.githubusercontent.com/openmm/openmm/refs/heads/master/wrappers/python/openmm/app/data/amber14/protein.ff14SB.xml", "protein.ff14SB.xml")
8+
end
9+
10+
function parsexmblock(f, io::IO, key)
11+
while !eof(io)
12+
line = strip(readline(io))
13+
line == key && return nothing
14+
f(line)
15+
end
16+
end
17+
18+
function parsestring(str)
19+
@assert startswith(str, '"')
20+
@assert endswith(str, '"')
21+
return String(str[2:end-1])
22+
end
23+
24+
function parsexmlline(f, line, tag, keyname)
25+
@assert startswith(line, "<$tag ")
26+
@assert endswith(line, "/>")
27+
kv = split(strip(line[length(tag)+2:end-2]), ' ')
28+
key = ""
29+
vals = Pair{Symbol,Any}[]
30+
for kvp in kv
31+
k, v = split(kvp, '=')
32+
if k == keyname
33+
key = parsestring(v)
34+
else
35+
push!(vals, Symbol(k) => f(k, v))
36+
end
37+
end
38+
return key => (; vals...)
39+
end
40+
41+
42+
atomtypes, residues = open("protein.ff14SB.xml", "r") do io
43+
line = readline(io)
44+
@assert line == "<ForceField>"
45+
atomtypes = Dict{String, @NamedTuple{element::String, mass::Float32, name::String}}()
46+
residues = Dict{String, @NamedTuple{atoms::Dict{String, @NamedTuple{charge::Float32, type::String}}, bonds::Vector{Tuple{String,String}}, externalbonds::Vector{String}}}()
47+
parsexmblock(io, "</ForceField>") do line
48+
if line == "<AtomTypes>"
49+
parsexmblock(io, "</AtomTypes>") do line
50+
push!(atomtypes, parsexmlline(line, "Type", "class") do k, v
51+
if k == "element"
52+
return parsestring(v)
53+
elseif k == "mass"
54+
return parse(Float32, v[2:end-1]) # strip the quotes
55+
elseif k == "name"
56+
return parsestring(v)
57+
else
58+
error("Unknown AtomType key $k")
59+
end
60+
end)
61+
end
62+
elseif line == "<Residues>"
63+
parsexmblock(io, "</Residues>") do line
64+
if startswith(line, "<Residue name=")
65+
resname = parsestring(line[15:end-1])
66+
atoms = Dict{String, @NamedTuple{charge::Float32, type::String}}()
67+
bonds = Vector{Tuple{String,String}}()
68+
externalbonds = Vector{String}()
69+
parsexmblock(io, "</Residue>") do line
70+
if startswith(line, "<Atom")
71+
push!(atoms, parsexmlline(line, "Atom", "name") do k, v
72+
if k == "charge"
73+
return parse(Float32, v[2:end-1]) # strip the quotes
74+
elseif k == "type"
75+
return parsestring(v)
76+
else
77+
error("Unknown Atom key $k")
78+
end
79+
end)
80+
elseif startswith(line, "<Bond")
81+
line = line[6:end-2]
82+
a1, a2 = split(strip(line), ' ')
83+
push!(bonds, (only(match(r"atomName1=\"(.*)\"", a1).captures), only(match(r"atomName2=\"(.*)\"", a2).captures)))
84+
elseif startswith(line, "<ExternalBond")
85+
line = line[14:end-2]
86+
push!(externalbonds, only(match(r"atomName=\"(.*)\"", line).captures))
87+
else
88+
error("Unknown Residue line $line")
89+
end
90+
end
91+
residues[resname] = (; atoms, bonds, externalbonds)
92+
else
93+
error("Unknown Residues line $line")
94+
end
95+
end
96+
end
97+
end
98+
atomtypes, residues
99+
end
100+
101+
open(joinpath(dirname(@__DIR__), "src", "bonding.jl"), "w") do io
102+
println(io, "const atomtypes = Dict{String, @NamedTuple{element::String, mass::Float32, name::String}}(")
103+
at = sort!(collect(atomtypes); by=first)
104+
for pr in at
105+
println(io, " ", pr, ',')
106+
end
107+
println(io, ")\n")
108+
109+
println(io, "const RDADict = Dict{String, @NamedTuple{charge::Float32, type::String}}")
110+
111+
println(io, "const residuedata = Dict{String, @NamedTuple{atoms::RDADict, bonds::Vector{Tuple{String,String}}, externalbonds::Vector{String}}}(")
112+
rd = sort!(collect(residues); by=first)
113+
for (k, v) in rd
114+
print(io, " "*" "^(4-length(k)))
115+
show(io, k)
116+
println(io, " => (atoms = ", replace(sprint(show, v.atoms), "Dict{String, @NamedTuple{charge::Float32, type::String}}" => "RDADict"), ',')
117+
println(io, " bonds = ", v.bonds, ",")
118+
println(io, " externalbonds = ", v.externalbonds, "),")
119+
end
120+
println(io, ")")
121+
end

src/BioStructures.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ include("pdb.jl")
2626
include("mmcif.jl")
2727
include("download.jl")
2828
include("spatial.jl")
29+
include("bonding.jl")
2930

3031
@compile_workload begin
3132
let

0 commit comments

Comments
 (0)