From 3733e85370b77b5c481a01246e96c7fda21e5356 Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 8 May 2025 05:52:52 -0500
Subject: [PATCH 1/2] Improve read perf, store coords as SVector

This substantially reduces the amount of memory used to store
coordinates and parse files. While parsing a large mmCIF file, the
memory usage dropped by approximately 50%, and read time by ~15%.

The most potentially-disruptive change is that the coordinates are now
stored as SVectors instead of Vectors. This means that the coordinates
are now immutable, and you cannot change them in place by manual
indexing. The `x!`, `y!`, and `z!` functions still work, as do in-place
transformations, by making the `coords` field itself mutable.
---
 Project.toml         |  2 ++
 src/BioStructures.jl |  1 +
 src/mmcif.jl         | 75 ++++++++++++++++++++++----------------------
 src/model.jl         | 41 ++++++++++++------------
 src/pdb.jl           |  4 +--
 test/runtests.jl     |  2 +-
 6 files changed, 64 insertions(+), 61 deletions(-)

diff --git a/Project.toml b/Project.toml
index 589f8d6..746158e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
+StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [weakdeps]
@@ -50,5 +51,6 @@ MetaGraphs = "0.7, 0.8"
 PrecompileTools = "1"
 RecipesBase = "1"
 STRIDE_jll = "1"
+StaticArraysCore = "1"
 Statistics = "1.9"
 julia = "1.9"
diff --git a/src/BioStructures.jl b/src/BioStructures.jl
index 16c20c0..2b36639 100644
--- a/src/BioStructures.jl
+++ b/src/BioStructures.jl
@@ -18,6 +18,7 @@ using PrecompileTools
 using RecipesBase
 
 using LinearAlgebra
+using StaticArraysCore
 using Statistics
 
 include("model.jl")
diff --git a/src/mmcif.jl b/src/mmcif.jl
index 1ca28c2..e2b250f 100644
--- a/src/mmcif.jl
+++ b/src/mmcif.jl
@@ -104,8 +104,7 @@ end
 
 # Split a mmCIF line into tokens
 # See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax
-function splitline(s::AbstractString)
-    tokens = String[]
+function splitline!(tokens, s::AbstractString)
     in_token = false
     # Quote character of the currently open quote, or ' ' if no quote open
     quote_open_char = ' '
@@ -114,7 +113,7 @@ function splitline(s::AbstractString)
         if c in whitespacechars
             if in_token && quote_open_char == ' '
                 in_token = false
-                push!(tokens, s[start_i:(i - 1)])
+                push!(tokens, @view(s[start_i:(i - 1)]))
             end
         elseif c in quotechars
             if quote_open_char == ' '
@@ -127,7 +126,7 @@ function splitline(s::AbstractString)
             elseif c == quote_open_char && (i == length(s) || s[i + 1] in whitespacechars)
                 quote_open_char = ' '
                 in_token = false
-                push!(tokens, s[start_i:(i - 1)])
+                push!(tokens, @view(s[start_i:(i - 1)]))
             end
         elseif c == '#' && !in_token
             return tokens
@@ -137,13 +136,14 @@ function splitline(s::AbstractString)
         end
     end
     if in_token
-        push!(tokens, s[start_i:end])
+        push!(tokens, @view(s[start_i:end]))
     end
     if quote_open_char != ' '
         throw(ArgumentError("Line ended with quote open: $s"))
     end
     return tokens
 end
+splitline(s::AbstractString) = splitline!(String[], s)   # mostly for testing
 
 # Get tokens from a mmCIF file
 function tokenizecif(f::IO)
@@ -162,7 +162,7 @@ function tokenizecif(f::IO)
             end
             push!(tokens, join(token_buffer, "\n"))
         else
-            append!(tokens, splitline(line))
+            splitline!(tokens, line)
         end
     end
     return tokens
@@ -204,7 +204,7 @@ function tokenizecifstructure(f::IO)
             in_keys = true
         else
             in_keys = false
-            append!(tokens, splitline(line))
+            splitline!(tokens, line)
         end
     end
     return tokens
@@ -236,7 +236,7 @@ function MMCIFDict(f::IO; gzip::Bool=false)
 end
 
 # Add tokens to a mmCIF dictionary
-function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
+function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString})
     key = ""
     keys = String[]
     loop_flag = false
@@ -264,16 +264,8 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String})
                     continue
                 end
             else
-                try
-                    push!(mmcif_dict[keys[i % n + 1]], token)
-                catch ex
-                    # A zero division error means we have not found any keys
-                    if isa(ex, DivideError)
-                        throw(ArgumentError("Loop keys not found, token: \"$token\""))
-                    else
-                        rethrow()
-                    end
-                end
+                iszero(n) && throw(ArgumentError("Loop keys not found, token: \"$token\""))
+                push!(mmcif_dict[keys[i % n + 1]], token)
                 i += 1
                 continue
             end
@@ -384,25 +376,34 @@ function MolecularStructure(mmcif_dict::MMCIFDict;
 end
 
 # Constructor from mmCIF ATOM/HETATM line
-AtomRecord(d::MMCIFDict, i::Integer) = AtomRecord(
-    d["_atom_site.group_PDB"][i] == "HETATM",
-    parse(Int, d["_atom_site.id"][i]),
-    d["_atom_site.auth_atom_id"][i],
-    d["_atom_site.label_alt_id"][i] in missingvals ? ' ' : d["_atom_site.label_alt_id"][i][1],
-    d["_atom_site.auth_comp_id"][i],
-    d["_atom_site.auth_asym_id"][i],
-    parse(Int, d["_atom_site.auth_seq_id"][i]),
-    d["_atom_site.pdbx_PDB_ins_code"][i] in missingvals ? ' ' : d["_atom_site.pdbx_PDB_ins_code"][i][1],
-    [
-        parse(Float64, d["_atom_site.Cartn_x"][i]),
-        parse(Float64, d["_atom_site.Cartn_y"][i]),
-        parse(Float64, d["_atom_site.Cartn_z"][i])
-    ],
-    d["_atom_site.occupancy"][i] in missingvals ? 1.0 : parse(Float64, d["_atom_site.occupancy"][i]),
-    d["_atom_site.B_iso_or_equiv"][i] in missingvals ? 0.0 : parse(Float64, d["_atom_site.B_iso_or_equiv"][i]),
-    d["_atom_site.type_symbol"][i] in missingvals ? "  " : d["_atom_site.type_symbol"][i],
-    d["_atom_site.pdbx_formal_charge"][i] in missingvals ? "  " : d["_atom_site.pdbx_formal_charge"][i],
-)
+function AtomRecord(d::MMCIFDict, i::Integer)
+    alt_id = d["_atom_site.label_alt_id"][i]
+    ins_code = d["_atom_site.pdbx_PDB_ins_code"][i]
+    occupancy = d["_atom_site.occupancy"][i]
+    temp_factor = d["_atom_site.B_iso_or_equiv"][i]
+    typesym = d["_atom_site.type_symbol"][i]
+    charge = d["_atom_site.pdbx_formal_charge"][i]
+
+    return AtomRecord(
+        d["_atom_site.group_PDB"][i] == "HETATM",
+        parse(Int, d["_atom_site.id"][i]),
+        d["_atom_site.auth_atom_id"][i],
+        alt_id in missingvals ? ' ' : alt_id[1],
+        d["_atom_site.auth_comp_id"][i],
+        d["_atom_site.auth_asym_id"][i],
+        parse(Int, d["_atom_site.auth_seq_id"][i]),
+        ins_code in missingvals ? ' ' : ins_code[1],
+        SVector{3,Float64}((
+            parse(Float64, d["_atom_site.Cartn_x"][i]),
+            parse(Float64, d["_atom_site.Cartn_y"][i]),
+            parse(Float64, d["_atom_site.Cartn_z"][i]),
+        )),
+        occupancy in missingvals ? 1.0 : parse(Float64, occupancy),
+        temp_factor in missingvals ? 0.0 : parse(Float64, temp_factor),
+        typesym in missingvals ? "  " : typesym,
+        charge in missingvals ? "  " : charge,
+    )
+end
 
 # Format a mmCIF data value by enclosing with quotes or semicolon lines where
 #   appropriate. See
diff --git a/src/model.jl b/src/model.jl
index 52545e2..3c57aef 100644
--- a/src/model.jl
+++ b/src/model.jl
@@ -102,20 +102,20 @@ end
 Base.showerror(io::IO, e::PDBConsistencyError) = print(io, "PDBConsistencyError: ", e.message)
 
 "An atom that is part of a macromolecule."
-struct Atom <: AbstractAtom
-    serial::Int
-    name::String
-    alt_loc_id::Char
-    coords::Vector{Float64}
-    occupancy::Float64
-    temp_factor::Float64
-    element::String
-    charge::String
-    residue::StructuralElement
+mutable struct Atom <: AbstractAtom
+    const serial::Int
+    const name::String
+    const alt_loc_id::Char
+    coords::SVector{3,Float64}
+    const occupancy::Float64
+    const temp_factor::Float64
+    const element::String
+    const charge::String
+    const residue::StructuralElement
 end
 
 function Atom(a::Atom, r::StructuralElement)
-    return Atom(a.serial, a.name, a.alt_loc_id, copy(a.coords), a.occupancy,
+    return Atom(a.serial, a.name, a.alt_loc_id, a.coords, a.occupancy,
                 a.temp_factor, a.element, a.charge, r)
 end
 
@@ -234,7 +234,7 @@ struct AtomRecord
     chain_id::String
     res_number::Int
     ins_code::Char
-    coords::Vector{Float64}
+    coords::SVector{3,Float64}
     occupancy::Float64
     temp_factor::Float64
     element::String
@@ -483,7 +483,7 @@ Set the x coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-x!(at::Atom, x::Real) = (at.coords[1] = x; at)
+x!(at::Atom, x::Real) = (at.coords = SVector{3,Float64}((x, at.coords[2], at.coords[3])))
 x!(dis_at::DisorderedAtom, x::Real) = x!(defaultatom(dis_at), x)
 
 """
@@ -501,7 +501,7 @@ Set the y coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-y!(at::Atom, y::Real) = (at.coords[2] = y; at)
+y!(at::Atom, y::Real) = (at.coords = SVector{3,Float64}((at.coords[1], y, at.coords[3])))
 y!(dis_at::DisorderedAtom, y::Real) = y!(defaultatom(dis_at), y)
 
 """
@@ -519,13 +519,13 @@ Set the z coordinate in Å of an `AbstractAtom` to `val`.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
-z!(at::Atom, z::Real) = (at.coords[3] = z; at)
+z!(at::Atom, z::Real) = (at.coords = SVector{3,Float64}((at.coords[1], at.coords[2], z)))
 z!(dis_at::DisorderedAtom, z::Real) = z!(defaultatom(dis_at), z)
 
 """
     coords(at)
 
-Get the coordinates in Å of an `AbstractAtom` as a `Vector{Float64}`.
+Get the coordinates in Å of an `AbstractAtom` as a `SVector{3,Float64}`.
 """
 coords(at::Atom) = at.coords
 coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at))
@@ -533,7 +533,7 @@ coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at))
 """
     coords!(at, new_coords)
 
-Set the coordinates in Å of an `AbstractAtom` to a `Vector` of 3 numbers.
+Set the coordinates in Å of an `AbstractAtom` to `new_coords`, an iterable of 3 numbers.
 
 For `DisorderedAtom`s only the default atom is updated.
 """
@@ -541,9 +541,8 @@ function coords!(at::Atom, new_coords)
     if length(new_coords) != 3
         throw(ArgumentError("3 coordinates must be given"))
     end
-    x!(at, new_coords[1])
-    y!(at, new_coords[2])
-    z!(at, new_coords[3])
+    x, y, z = new_coords
+    at.coords = SVector{3,Float64}((x, y, z))
     return at
 end
 
@@ -784,7 +783,7 @@ function resid(hetatm::Bool, resnum::Int, inscode::Char)
         end
     else
         if inscode == ' '
-            return "$resnum"
+            return string(resnum)
         else
             return "$resnum$inscode"
         end
diff --git a/src/pdb.jl b/src/pdb.jl
index 3f7e6b6..eb36891 100644
--- a/src/pdb.jl
+++ b/src/pdb.jl
@@ -127,11 +127,11 @@ function AtomRecord(pdb_line::String, line_n::Integer=1)
         parsechainid(pdb_line, line_n),
         parseresnumber(pdb_line, line_n),
         parseinscode(pdb_line, line_n),
-        [
+        SVector{3,Float64}((
             parsecoordx(pdb_line, line_n),
             parsecoordy(pdb_line, line_n),
             parsecoordz(pdb_line, line_n)
-        ],
+        )),
         n >= 60 ? parseoccupancy(pdb_line) : 1.0,
         n >= 66 ? parsetempfac(pdb_line) : 0.0,
         n >= 78 ? parseelement(pdb_line) : "  ",
diff --git a/test/runtests.jl b/test/runtests.jl
index 46a794c..e7ed512 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -246,7 +246,7 @@ end
     testparent(getchildren(struc), struc)
     struc_copy = copy(struc)
     testparent(getchildren(struc_copy), struc_copy)
-    struc_copy['A'][10]["CA"].coords[2] = 100
+    y!(struc_copy['A'][10]["CA"], 100)
     @test struc_copy['A'][10]["CA"].coords[2] == 100
     @test a.coords[2] == 2
     @test struc['A'][10]["CA"].coords[2] == 2

From be81bbf7bcb9110b2a34e391dca4be58664d22fb Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Wed, 14 May 2025 03:59:28 -0500
Subject: [PATCH 2/2] Delay SubString->String conversion

This has a surprisingly large benefit for performance
---
 src/mmcif.jl     | 32 ++++++++++++++++++--------------
 test/runtests.jl |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/mmcif.jl b/src/mmcif.jl
index e2b250f..970ad8b 100644
--- a/src/mmcif.jl
+++ b/src/mmcif.jl
@@ -75,16 +75,20 @@ Call `MMCIFDict` with a filepath or stream to read the dictionary from that
 source.
 The keyword argument `gzip` (default `false`) determines if the input is gzipped.
 """
-struct MMCIFDict <: AbstractDict{String, Vector{String}}
-    dict::Dict{String, Vector{String}}
+struct MMCIFDict{K<:AbstractString} <: AbstractDict{K, Vector{K}}
+    dict::Dict{K, Vector{K}}
 end
 
-MMCIFDict() = MMCIFDict(Dict())
+MMCIFDict{K}() where K<:AbstractString = MMCIFDict{K}(Dict{K,Vector{K}}())
+MMCIFDict() = MMCIFDict{String}()
+
+MMCIFDict(d::AbstractDict{K, Vector{K}}) where K<:AbstractString = MMCIFDict{K}(d)
+MMCIFDict(d::AbstractDict) = MMCIFDict{String}(Dict(d))
 
 Base.getindex(mmcif_dict::MMCIFDict, field::AbstractString) = mmcif_dict.dict[field]
 
 function Base.setindex!(mmcif_dict::MMCIFDict,
-                    val::AbstractVector{<:String},
+                    val::AbstractVector{<:AbstractString},
                     field::AbstractString)
     mmcif_dict.dict[field] = val
     return mmcif_dict
@@ -147,7 +151,7 @@ splitline(s::AbstractString) = splitline!(String[], s)   # mostly for testing
 
 # Get tokens from a mmCIF file
 function tokenizecif(f::IO)
-    tokens = String[]
+    tokens = SubString{String}[]
     for line in eachline(f)
         if startswith(line, "#")
             continue
@@ -172,7 +176,7 @@ end
 # This will fail if there is only a single atom record in the file
 #   and it is not in the loop format
 function tokenizecifstructure(f::IO)
-    tokens = String[]
+    tokens = SubString{String}[]
     reading = false
     in_keys = true
     category_groups = ["_atom_site.", "_struct_conf."]
@@ -218,7 +222,6 @@ end
 
 # Read a mmCIF file into a MMCIFDict
 function MMCIFDict(f::IO; gzip::Bool=false)
-    mmcif_dict = MMCIFDict()
     if gzip
         gz = GzipDecompressorStream(f)
         tokens = tokenizecif(gz)
@@ -226,6 +229,7 @@ function MMCIFDict(f::IO; gzip::Bool=false)
     else
         tokens = tokenizecif(f)
     end
+    mmcif_dict = MMCIFDict{eltype(tokens)}()
     # Data label token is read first
     if length(tokens) == 0
         return mmcif_dict
@@ -236,16 +240,16 @@ function MMCIFDict(f::IO; gzip::Bool=false)
 end
 
 # Add tokens to a mmCIF dictionary
-function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString})
+function populatedict!(mmcif_dict::MMCIFDict{K}, tokens::AbstractVector{<:AbstractString}) where K<:AbstractString
     key = ""
-    keys = String[]
+    keys = K[]
     loop_flag = false
     i = 0 # Value counter
     n = 0 # Key counter
     for token in tokens
         if token == "loop_" || token == "LOOP_"
             loop_flag = true
-            keys = String[]
+            keys = K[]
             i = 0
             n = 0
             continue
@@ -258,7 +262,7 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractS
                 if i > 0
                     loop_flag = false
                 else
-                    mmcif_dict[token] = String[]
+                    mmcif_dict[token] = K[]
                     push!(keys, token)
                     n += 1
                     continue
@@ -290,7 +294,6 @@ function Base.read(input::IO,
             run_dssp::Bool=false,
             run_stride::Bool=false,
             gzip::Bool=false)
-    mmcif_dict = MMCIFDict()
     if gzip
         gz = GzipDecompressorStream(input)
         tokens = tokenizecifstructure(gz)
@@ -298,6 +301,7 @@ function Base.read(input::IO,
     else
         tokens = tokenizecifstructure(input)
     end
+    mmcif_dict = MMCIFDict{eltype(tokens)}()
     populatedict!(mmcif_dict, tokens)
     return MolecularStructure(
         mmcif_dict;
@@ -673,13 +677,13 @@ end
 Write multiple `MMCIFDict`s as a `Dict{String, MMCIFDict}` to a filepath or stream.
 The keyword argument `gzip` (default `false`) determines if the output is gzipped.
 """
-function writemultimmcif(filepath::AbstractString, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
+function writemultimmcif(filepath::AbstractString, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
     open(filepath, "w") do f
         writemultimmcif(f, cifs; gzip=gzip)
     end
 end
 
-function writemultimmcif(io::IO, cifs::Dict{String, MMCIFDict}; gzip::Bool=false)
+function writemultimmcif(io::IO, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false)
     if gzip
         io = GzipCompressorStream(io)
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index e7ed512..93abe51 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1783,7 +1783,7 @@ end
     mmcif_1ake = testfilepath("mmCIF", "1AKE.cif")
     gzip_file(mmcif_1ake, temp_filename)
     for dic in (MMCIFDict(mmcif_1ake), MMCIFDict(temp_filename; gzip=true))
-        @test isa(dic.dict, Dict{String, Vector{String}})
+        @test isa(dic.dict, Dict{K, Vector{K}} where K<:AbstractString)
         @test dic["_pdbx_database_status.recvd_initial_deposition_date"] == ["1991-11-08"]
         @test dic["_audit_author.name"] == ["Mueller, C.W.", "Schulz, G.E."]
         @test length(dic["_atom_site.group_PDB"]) == 3816