From 3733e85370b77b5c481a01246e96c7fda21e5356 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Thu, 8 May 2025 05:52:52 -0500 Subject: [PATCH 1/2] Improve read perf, store coords as SVector This substantially reduces the amount of memory used to store coordinates and parse files. While parsing a large mmCIF file, the memory usage dropped by approximately 50%, and read time by ~15%. The most potentially-disruptive change is that the coordinates are now stored as SVectors instead of Vectors. This means that the coordinates are now immutable, and you cannot change them in place by manual indexing. The `x!`, `y!`, and `z!` functions still work, as do in-place transformations, by making the `coords` field itself mutable. --- Project.toml | 2 ++ src/BioStructures.jl | 1 + src/mmcif.jl | 75 ++++++++++++++++++++++---------------------- src/model.jl | 41 ++++++++++++------------ src/pdb.jl | 4 +-- test/runtests.jl | 2 +- 6 files changed, 64 insertions(+), 61 deletions(-) diff --git a/Project.toml b/Project.toml index 589f8d6..746158e 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] @@ -50,5 +51,6 @@ MetaGraphs = "0.7, 0.8" PrecompileTools = "1" RecipesBase = "1" STRIDE_jll = "1" +StaticArraysCore = "1" Statistics = "1.9" julia = "1.9" diff --git a/src/BioStructures.jl b/src/BioStructures.jl index 16c20c0..2b36639 100644 --- a/src/BioStructures.jl +++ b/src/BioStructures.jl @@ -18,6 +18,7 @@ using PrecompileTools using RecipesBase using LinearAlgebra +using StaticArraysCore using Statistics include("model.jl") diff --git a/src/mmcif.jl b/src/mmcif.jl index 1ca28c2..e2b250f 100644 --- a/src/mmcif.jl +++ b/src/mmcif.jl @@ -104,8 +104,7 @@ end # Split a mmCIF line into tokens # See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax -function splitline(s::AbstractString) - tokens = String[] +function splitline!(tokens, s::AbstractString) in_token = false # Quote character of the currently open quote, or ' ' if no quote open quote_open_char = ' ' @@ -114,7 +113,7 @@ function splitline(s::AbstractString) if c in whitespacechars if in_token && quote_open_char == ' ' in_token = false - push!(tokens, s[start_i:(i - 1)]) + push!(tokens, @view(s[start_i:(i - 1)])) end elseif c in quotechars if quote_open_char == ' ' @@ -127,7 +126,7 @@ function splitline(s::AbstractString) elseif c == quote_open_char && (i == length(s) || s[i + 1] in whitespacechars) quote_open_char = ' ' in_token = false - push!(tokens, s[start_i:(i - 1)]) + push!(tokens, @view(s[start_i:(i - 1)])) end elseif c == '#' && !in_token return tokens @@ -137,13 +136,14 @@ function splitline(s::AbstractString) end end if in_token - push!(tokens, s[start_i:end]) + push!(tokens, @view(s[start_i:end])) end if quote_open_char != ' ' throw(ArgumentError("Line ended with quote open: $s")) end return tokens end +splitline(s::AbstractString) = splitline!(String[], s) # mostly for testing # Get tokens from a mmCIF file function tokenizecif(f::IO) @@ -162,7 +162,7 @@ function tokenizecif(f::IO) end push!(tokens, join(token_buffer, "\n")) else - append!(tokens, splitline(line)) + splitline!(tokens, line) end end return tokens @@ -204,7 +204,7 @@ function tokenizecifstructure(f::IO) in_keys = true else in_keys = false - append!(tokens, splitline(line)) + splitline!(tokens, line) end end return tokens @@ -236,7 +236,7 @@ function MMCIFDict(f::IO; gzip::Bool=false) end # Add tokens to a mmCIF dictionary -function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String}) +function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString}) key = "" keys = String[] loop_flag = false @@ -264,16 +264,8 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:String}) continue end else - try - push!(mmcif_dict[keys[i % n + 1]], token) - catch ex - # A zero division error means we have not found any keys - if isa(ex, DivideError) - throw(ArgumentError("Loop keys not found, token: \"$token\"")) - else - rethrow() - end - end + iszero(n) && throw(ArgumentError("Loop keys not found, token: \"$token\"")) + push!(mmcif_dict[keys[i % n + 1]], token) i += 1 continue end @@ -384,25 +376,34 @@ function MolecularStructure(mmcif_dict::MMCIFDict; end # Constructor from mmCIF ATOM/HETATM line -AtomRecord(d::MMCIFDict, i::Integer) = AtomRecord( - d["_atom_site.group_PDB"][i] == "HETATM", - parse(Int, d["_atom_site.id"][i]), - d["_atom_site.auth_atom_id"][i], - d["_atom_site.label_alt_id"][i] in missingvals ? ' ' : d["_atom_site.label_alt_id"][i][1], - d["_atom_site.auth_comp_id"][i], - d["_atom_site.auth_asym_id"][i], - parse(Int, d["_atom_site.auth_seq_id"][i]), - d["_atom_site.pdbx_PDB_ins_code"][i] in missingvals ? ' ' : d["_atom_site.pdbx_PDB_ins_code"][i][1], - [ - parse(Float64, d["_atom_site.Cartn_x"][i]), - parse(Float64, d["_atom_site.Cartn_y"][i]), - parse(Float64, d["_atom_site.Cartn_z"][i]) - ], - d["_atom_site.occupancy"][i] in missingvals ? 1.0 : parse(Float64, d["_atom_site.occupancy"][i]), - d["_atom_site.B_iso_or_equiv"][i] in missingvals ? 0.0 : parse(Float64, d["_atom_site.B_iso_or_equiv"][i]), - d["_atom_site.type_symbol"][i] in missingvals ? " " : d["_atom_site.type_symbol"][i], - d["_atom_site.pdbx_formal_charge"][i] in missingvals ? " " : d["_atom_site.pdbx_formal_charge"][i], -) +function AtomRecord(d::MMCIFDict, i::Integer) + alt_id = d["_atom_site.label_alt_id"][i] + ins_code = d["_atom_site.pdbx_PDB_ins_code"][i] + occupancy = d["_atom_site.occupancy"][i] + temp_factor = d["_atom_site.B_iso_or_equiv"][i] + typesym = d["_atom_site.type_symbol"][i] + charge = d["_atom_site.pdbx_formal_charge"][i] + + return AtomRecord( + d["_atom_site.group_PDB"][i] == "HETATM", + parse(Int, d["_atom_site.id"][i]), + d["_atom_site.auth_atom_id"][i], + alt_id in missingvals ? ' ' : alt_id[1], + d["_atom_site.auth_comp_id"][i], + d["_atom_site.auth_asym_id"][i], + parse(Int, d["_atom_site.auth_seq_id"][i]), + ins_code in missingvals ? ' ' : ins_code[1], + SVector{3,Float64}(( + parse(Float64, d["_atom_site.Cartn_x"][i]), + parse(Float64, d["_atom_site.Cartn_y"][i]), + parse(Float64, d["_atom_site.Cartn_z"][i]), + )), + occupancy in missingvals ? 1.0 : parse(Float64, occupancy), + temp_factor in missingvals ? 0.0 : parse(Float64, temp_factor), + typesym in missingvals ? " " : typesym, + charge in missingvals ? " " : charge, + ) +end # Format a mmCIF data value by enclosing with quotes or semicolon lines where # appropriate. See diff --git a/src/model.jl b/src/model.jl index 52545e2..3c57aef 100644 --- a/src/model.jl +++ b/src/model.jl @@ -102,20 +102,20 @@ end Base.showerror(io::IO, e::PDBConsistencyError) = print(io, "PDBConsistencyError: ", e.message) "An atom that is part of a macromolecule." -struct Atom <: AbstractAtom - serial::Int - name::String - alt_loc_id::Char - coords::Vector{Float64} - occupancy::Float64 - temp_factor::Float64 - element::String - charge::String - residue::StructuralElement +mutable struct Atom <: AbstractAtom + const serial::Int + const name::String + const alt_loc_id::Char + coords::SVector{3,Float64} + const occupancy::Float64 + const temp_factor::Float64 + const element::String + const charge::String + const residue::StructuralElement end function Atom(a::Atom, r::StructuralElement) - return Atom(a.serial, a.name, a.alt_loc_id, copy(a.coords), a.occupancy, + return Atom(a.serial, a.name, a.alt_loc_id, a.coords, a.occupancy, a.temp_factor, a.element, a.charge, r) end @@ -234,7 +234,7 @@ struct AtomRecord chain_id::String res_number::Int ins_code::Char - coords::Vector{Float64} + coords::SVector{3,Float64} occupancy::Float64 temp_factor::Float64 element::String @@ -483,7 +483,7 @@ Set the x coordinate in Å of an `AbstractAtom` to `val`. For `DisorderedAtom`s only the default atom is updated. """ -x!(at::Atom, x::Real) = (at.coords[1] = x; at) +x!(at::Atom, x::Real) = (at.coords = SVector{3,Float64}((x, at.coords[2], at.coords[3]))) x!(dis_at::DisorderedAtom, x::Real) = x!(defaultatom(dis_at), x) """ @@ -501,7 +501,7 @@ Set the y coordinate in Å of an `AbstractAtom` to `val`. For `DisorderedAtom`s only the default atom is updated. """ -y!(at::Atom, y::Real) = (at.coords[2] = y; at) +y!(at::Atom, y::Real) = (at.coords = SVector{3,Float64}((at.coords[1], y, at.coords[3]))) y!(dis_at::DisorderedAtom, y::Real) = y!(defaultatom(dis_at), y) """ @@ -519,13 +519,13 @@ Set the z coordinate in Å of an `AbstractAtom` to `val`. For `DisorderedAtom`s only the default atom is updated. """ -z!(at::Atom, z::Real) = (at.coords[3] = z; at) +z!(at::Atom, z::Real) = (at.coords = SVector{3,Float64}((at.coords[1], at.coords[2], z))) z!(dis_at::DisorderedAtom, z::Real) = z!(defaultatom(dis_at), z) """ coords(at) -Get the coordinates in Å of an `AbstractAtom` as a `Vector{Float64}`. +Get the coordinates in Å of an `AbstractAtom` as a `SVector{3,Float64}`. """ coords(at::Atom) = at.coords coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at)) @@ -533,7 +533,7 @@ coords(dis_at::DisorderedAtom) = coords(defaultatom(dis_at)) """ coords!(at, new_coords) -Set the coordinates in Å of an `AbstractAtom` to a `Vector` of 3 numbers. +Set the coordinates in Å of an `AbstractAtom` to `new_coords`, an iterable of 3 numbers. For `DisorderedAtom`s only the default atom is updated. """ @@ -541,9 +541,8 @@ function coords!(at::Atom, new_coords) if length(new_coords) != 3 throw(ArgumentError("3 coordinates must be given")) end - x!(at, new_coords[1]) - y!(at, new_coords[2]) - z!(at, new_coords[3]) + x, y, z = new_coords + at.coords = SVector{3,Float64}((x, y, z)) return at end @@ -784,7 +783,7 @@ function resid(hetatm::Bool, resnum::Int, inscode::Char) end else if inscode == ' ' - return "$resnum" + return string(resnum) else return "$resnum$inscode" end diff --git a/src/pdb.jl b/src/pdb.jl index 3f7e6b6..eb36891 100644 --- a/src/pdb.jl +++ b/src/pdb.jl @@ -127,11 +127,11 @@ function AtomRecord(pdb_line::String, line_n::Integer=1) parsechainid(pdb_line, line_n), parseresnumber(pdb_line, line_n), parseinscode(pdb_line, line_n), - [ + SVector{3,Float64}(( parsecoordx(pdb_line, line_n), parsecoordy(pdb_line, line_n), parsecoordz(pdb_line, line_n) - ], + )), n >= 60 ? parseoccupancy(pdb_line) : 1.0, n >= 66 ? parsetempfac(pdb_line) : 0.0, n >= 78 ? parseelement(pdb_line) : " ", diff --git a/test/runtests.jl b/test/runtests.jl index 46a794c..e7ed512 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -246,7 +246,7 @@ end testparent(getchildren(struc), struc) struc_copy = copy(struc) testparent(getchildren(struc_copy), struc_copy) - struc_copy['A'][10]["CA"].coords[2] = 100 + y!(struc_copy['A'][10]["CA"], 100) @test struc_copy['A'][10]["CA"].coords[2] == 100 @test a.coords[2] == 2 @test struc['A'][10]["CA"].coords[2] == 2 From be81bbf7bcb9110b2a34e391dca4be58664d22fb Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Wed, 14 May 2025 03:59:28 -0500 Subject: [PATCH 2/2] Delay SubString->String conversion This has a surprisingly large benefit for performance --- src/mmcif.jl | 32 ++++++++++++++++++-------------- test/runtests.jl | 2 +- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/mmcif.jl b/src/mmcif.jl index e2b250f..970ad8b 100644 --- a/src/mmcif.jl +++ b/src/mmcif.jl @@ -75,16 +75,20 @@ Call `MMCIFDict` with a filepath or stream to read the dictionary from that source. The keyword argument `gzip` (default `false`) determines if the input is gzipped. """ -struct MMCIFDict <: AbstractDict{String, Vector{String}} - dict::Dict{String, Vector{String}} +struct MMCIFDict{K<:AbstractString} <: AbstractDict{K, Vector{K}} + dict::Dict{K, Vector{K}} end -MMCIFDict() = MMCIFDict(Dict()) +MMCIFDict{K}() where K<:AbstractString = MMCIFDict{K}(Dict{K,Vector{K}}()) +MMCIFDict() = MMCIFDict{String}() + +MMCIFDict(d::AbstractDict{K, Vector{K}}) where K<:AbstractString = MMCIFDict{K}(d) +MMCIFDict(d::AbstractDict) = MMCIFDict{String}(Dict(d)) Base.getindex(mmcif_dict::MMCIFDict, field::AbstractString) = mmcif_dict.dict[field] function Base.setindex!(mmcif_dict::MMCIFDict, - val::AbstractVector{<:String}, + val::AbstractVector{<:AbstractString}, field::AbstractString) mmcif_dict.dict[field] = val return mmcif_dict @@ -147,7 +151,7 @@ splitline(s::AbstractString) = splitline!(String[], s) # mostly for testing # Get tokens from a mmCIF file function tokenizecif(f::IO) - tokens = String[] + tokens = SubString{String}[] for line in eachline(f) if startswith(line, "#") continue @@ -172,7 +176,7 @@ end # This will fail if there is only a single atom record in the file # and it is not in the loop format function tokenizecifstructure(f::IO) - tokens = String[] + tokens = SubString{String}[] reading = false in_keys = true category_groups = ["_atom_site.", "_struct_conf."] @@ -218,7 +222,6 @@ end # Read a mmCIF file into a MMCIFDict function MMCIFDict(f::IO; gzip::Bool=false) - mmcif_dict = MMCIFDict() if gzip gz = GzipDecompressorStream(f) tokens = tokenizecif(gz) @@ -226,6 +229,7 @@ function MMCIFDict(f::IO; gzip::Bool=false) else tokens = tokenizecif(f) end + mmcif_dict = MMCIFDict{eltype(tokens)}() # Data label token is read first if length(tokens) == 0 return mmcif_dict @@ -236,16 +240,16 @@ function MMCIFDict(f::IO; gzip::Bool=false) end # Add tokens to a mmCIF dictionary -function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractString}) +function populatedict!(mmcif_dict::MMCIFDict{K}, tokens::AbstractVector{<:AbstractString}) where K<:AbstractString key = "" - keys = String[] + keys = K[] loop_flag = false i = 0 # Value counter n = 0 # Key counter for token in tokens if token == "loop_" || token == "LOOP_" loop_flag = true - keys = String[] + keys = K[] i = 0 n = 0 continue @@ -258,7 +262,7 @@ function populatedict!(mmcif_dict::MMCIFDict, tokens::AbstractVector{<:AbstractS if i > 0 loop_flag = false else - mmcif_dict[token] = String[] + mmcif_dict[token] = K[] push!(keys, token) n += 1 continue @@ -290,7 +294,6 @@ function Base.read(input::IO, run_dssp::Bool=false, run_stride::Bool=false, gzip::Bool=false) - mmcif_dict = MMCIFDict() if gzip gz = GzipDecompressorStream(input) tokens = tokenizecifstructure(gz) @@ -298,6 +301,7 @@ function Base.read(input::IO, else tokens = tokenizecifstructure(input) end + mmcif_dict = MMCIFDict{eltype(tokens)}() populatedict!(mmcif_dict, tokens) return MolecularStructure( mmcif_dict; @@ -673,13 +677,13 @@ end Write multiple `MMCIFDict`s as a `Dict{String, MMCIFDict}` to a filepath or stream. The keyword argument `gzip` (default `false`) determines if the output is gzipped. """ -function writemultimmcif(filepath::AbstractString, cifs::Dict{String, MMCIFDict}; gzip::Bool=false) +function writemultimmcif(filepath::AbstractString, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false) open(filepath, "w") do f writemultimmcif(f, cifs; gzip=gzip) end end -function writemultimmcif(io::IO, cifs::Dict{String, MMCIFDict}; gzip::Bool=false) +function writemultimmcif(io::IO, cifs::Dict{String, <:MMCIFDict}; gzip::Bool=false) if gzip io = GzipCompressorStream(io) end diff --git a/test/runtests.jl b/test/runtests.jl index e7ed512..93abe51 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1783,7 +1783,7 @@ end mmcif_1ake = testfilepath("mmCIF", "1AKE.cif") gzip_file(mmcif_1ake, temp_filename) for dic in (MMCIFDict(mmcif_1ake), MMCIFDict(temp_filename; gzip=true)) - @test isa(dic.dict, Dict{String, Vector{String}}) + @test isa(dic.dict, Dict{K, Vector{K}} where K<:AbstractString) @test dic["_pdbx_database_status.recvd_initial_deposition_date"] == ["1991-11-08"] @test dic["_audit_author.name"] == ["Mueller, C.W.", "Schulz, G.E."] @test length(dic["_atom_site.group_PDB"]) == 3816