From 1b6acc77b75c0e3134d83c7dd6379226c9602d46 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Mon, 1 Sep 2025 12:45:51 +0200 Subject: [PATCH 01/18] add molecular_weight function --- src/molecular-weight.jl | 175 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 src/molecular-weight.jl diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl new file mode 100644 index 00000000..6824b07b --- /dev/null +++ b/src/molecular-weight.jl @@ -0,0 +1,175 @@ +# Definning const of molecular weight of canonical amino acids in g/mol +const AA_A_MW = 89.09 # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 +const AA_R_MW = 174.20 # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 +const AA_N_MW = 132.12 # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 +const AA_D_MW = 133.10 # Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 +const AA_C_MW = 121.16 # Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 +const AA_Q_MW = 146.14 # Glutamine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5961 +const AA_E_MW = 147.13 # Glutamate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/33032 +const AA_G_MW = 75.07 # Glycine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/750 +const AA_H_MW = 155.15 # Histidine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6274 +const AA_I_MW = 131.17 # Isoleucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6306 +const AA_L_MW = 131.17 # Leucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6106 +const AA_K_MW = 146.19 # Lysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5962 +const AA_M_MW = 149.21 # Methionine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6137 +const AA_F_MW = 165.19 # Phenylalanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6140 +const AA_P_MW = 115.13 # Proline -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/145742 +const AA_S_MW = 105.09 # Serine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5951 +const AA_T_MW = 119.12 # Threonine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6288 +const AA_W_MW = 204.22 # Tryptophan -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6305 +const AA_Y_MW = 181.19 # Tyrosine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6057 +const AA_V_MW = 117.15 # Valine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6287 +# Defining water molecular weight (required for calculation of water lost) +const WATER_MW = 18.02 # Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html + +# Non canonical / not defined amino acids in g/mol +const AA_O_MW_MW = 255.31 # Pyrrolysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/21873141 +const AA_U_MW_MW = 168.06 # Selenocysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/25076 +const AA_B_MW_MW = (AA_N_MW + AA_D_MW)/2 # Aspartate or Asparagine -> Average between Aspartic acid and Asparagine +const AA_J_MW_MW = 131.17 # Leucine or Isoleucine -> Both have the same molecular weight +const AA_Z_MW_MW = (AA_Q_MW + AA_D_MW)/2 # Glutamine or Glutamine -> Average between Glutamine and Glutamine +const AA_X_MW_MW = 110 # Any amino acid (average) -> Source = http://thermofisher.com/de/de/home/references/ambion-tech-support/rna-tools-and-calculators/proteins-and-amino-acids.html +const AA_O_Gap_MW = 0 # Consider gap as 0 + +# Defining the function molecular_weight, which accepts a AA sequence and units ("gmol", "KDa") as input and return molecular weight in either g/mol or kDa. If not specified it returns in g/mol +function molecular_weight(aa_seq::LongAA, unit=:"gmol") + AA_vector = Vector{Float64}(undef, 0) + i = 1 + while i <= length(aa_seq) + if aa_seq[i] == AA_A + push!(AA_vector, AA_A_MW) + elseif aa_seq[i] == AA_R + push!(AA_vector, AA_R_MW) + elseif aa_seq[i] == AA_N + push!(AA_vector, AA_N_MW) + elseif aa_seq[i] == AA_D + push!(AA_vector, AA_D_MW) + elseif aa_seq[i] == AA_C + push!(AA_vector, AA_C_MW) + elseif aa_seq[i] == AA_Q + push!(AA_vector, AA_Q_MW) + elseif aa_seq[i] == AA_E + push!(AA_vector, AA_E_MW) + elseif aa_seq[i] == AA_G + push!(AA_vector, AA_G_MW) + elseif aa_seq[i] == AA_H + push!(AA_vector, AA_H_MW) + elseif aa_seq[i] == AA_I + push!(AA_vector, AA_I_MW) + elseif aa_seq[i] == AA_L + push!(AA_vector, AA_L_MW) + elseif aa_seq[i] == AA_K + push!(AA_vector, AA_K_MW) + elseif aa_seq[i] == AA_M + push!(AA_vector, AA_M_MW) + elseif aa_seq[i] == AA_F + push!(AA_vector, AA_F_MW) + elseif aa_seq[i] == AA_P + push!(AA_vector, AA_P_MW) + elseif aa_seq[i] == AA_S + push!(AA_vector, AA_S_MW) + elseif aa_seq[i] == AA_T + push!(AA_vector, AA_T_MW) + elseif aa_seq[i] == AA_W + push!(AA_vector, AA_W_MW) + elseif aa_seq[i] == AA_Y + push!(AA_vector, AA_Y_MW) + elseif aa_seq[i] == AA_V + push!(AA_vector, AA_V_MW) + elseif aa_seq[i] == AA_O + push!(AA_vector, AA_O_MW_MW) + elseif aa_seq[i] == AA_U + push!(AA_vector, AA_U_MW_MW) + elseif aa_seq[i] == AA_B + push!(AA_vector, AA_B_MW_MW) + elseif aa_seq[i] == AA_J + push!(AA_vector, AA_J_MW_MW) + elseif aa_seq[i] == AA_Z + push!(AA_vector, AA_Z_MW_MW) + elseif aa_seq[i] == AA_X + push!(AA_vector, AA_X_MW_MW) + elseif aa_seq[i] == AA_O_Gap + push!(AA_vector, AA_O_Gap_MW) + else + println("Error due to unknown amino acid at $i position") + return nothing + end + i = i+1 + end + if unit == "gmol" + return round(sum(AA_vector) - ((length(aa_seq) - 1) * WATER_MW); digits = 3) + elseif unit == "kDa" + return round(((sum(AA_vector) - ((length(aa_seq) - 1) * WATER_MW))/1000); digits = 3) + else + return nothing + end +end + +# Defining a function, which accepts a AA sequence and print the molecular weight in g/mol and kDa from a given n_start position in the sequence until a n_end given position in the sequence +function molecular_weight(aa_seq::LongAA, n_start::Integer, n_end::Integer) + AA_vector = Vector{Float64}(undef, 0) + i = n_start + while i <= n_end + if aa_seq[i] == AA_A + push!(AA_vector, AA_A_MW) + elseif aa_seq[i] == AA_R + push!(AA_vector, AA_R_MW) + elseif aa_seq[i] == AA_N + push!(AA_vector, AA_N_MW) + elseif aa_seq[i] == AA_D + push!(AA_vector, AA_D_MW) + elseif aa_seq[i] == AA_C + push!(AA_vector, AA_C_MW) + elseif aa_seq[i] == AA_Q + push!(AA_vector, AA_Q_MW) + elseif aa_seq[i] == AA_E + push!(AA_vector, AA_E_MW) + elseif aa_seq[i] == AA_G + push!(AA_vector, AA_G_MW) + elseif aa_seq[i] == AA_H + push!(AA_vector, AA_H_MW) + elseif aa_seq[i] == AA_I + push!(AA_vector, AA_I_MW) + elseif aa_seq[i] == AA_L + push!(AA_vector, AA_L_MW) + elseif aa_seq[i] == AA_K + push!(AA_vector, AA_K_MW) + elseif aa_seq[i] == AA_M + push!(AA_vector, AA_M_MW) + elseif aa_seq[i] == AA_F + push!(AA_vector, AA_F_MW) + elseif aa_seq[i] == AA_P + push!(AA_vector, AA_P_MW) + elseif aa_seq[i] == AA_S + push!(AA_vector, AA_S_MW) + elseif aa_seq[i] == AA_T + push!(AA_vector, AA_T_MW) + elseif aa_seq[i] == AA_W + push!(AA_vector, AA_W_MW) + elseif aa_seq[i] == AA_Y + push!(AA_vector, AA_Y_MW) + elseif aa_seq[i] == AA_V + push!(AA_vector, AA_V_MW) + elseif aa_seq[i] == AA_O + push!(AA_vector, AA_O_MW_MW) + elseif aa_seq[i] == AA_U + push!(AA_vector, AA_U_MW_MW) + elseif aa_seq[i] == AA_B + push!(AA_vector, AA_B_MW_MW) + elseif aa_seq[i] == AA_J + push!(AA_vector, AA_J_MW_MW) + elseif aa_seq[i] == AA_Z + push!(AA_vector, AA_Z_MW_MW) + elseif aa_seq[i] == AA_X + push!(AA_vector, AA_X_MW_MW) + elseif aa_seq[i] == AA_O_Gap + push!(AA_vector, AA_O_Gap_MW) + else + println("Error due to unknown amino acid at $i position") + return nothing + end + i = i+1 + end + println("$(round(sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW); digits = 3)) g/mol") + println("$(round(((sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW))/1000); digits = 3)) kDa") +end \ No newline at end of file From d13ab6961fa064e3c7cdce943331a4cd37e31645 Mon Sep 17 00:00:00 2001 From: Simon Christ Date: Mon, 1 Sep 2025 13:23:52 +0200 Subject: [PATCH 02/18] include molecular weights file --- src/BioSequences.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BioSequences.jl b/src/BioSequences.jl index 9c52b20a..72c1cf5f 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -223,6 +223,7 @@ include("longsequences/randseq.jl") include("longsequences/find.jl") include("counting.jl") +include("molecular-weight.jl") include("geneticcode.jl") From 912398310d834738bc41b5b51173eb197e0f93c2 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Mon, 1 Sep 2025 15:33:17 +0200 Subject: [PATCH 03/18] edit function to accept any AA_seq --- src/molecular-weight.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 6824b07b..11c990eb 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -32,7 +32,7 @@ const AA_X_MW_MW = 110 # Any amino acid (average) -> const AA_O_Gap_MW = 0 # Consider gap as 0 # Defining the function molecular_weight, which accepts a AA sequence and units ("gmol", "KDa") as input and return molecular weight in either g/mol or kDa. If not specified it returns in g/mol -function molecular_weight(aa_seq::LongAA, unit=:"gmol") +function molecular_weight(aa_seq::AASeq, unit=:"gmol") AA_vector = Vector{Float64}(undef, 0) i = 1 while i <= length(aa_seq) @@ -106,7 +106,7 @@ function molecular_weight(aa_seq::LongAA, unit=:"gmol") end # Defining a function, which accepts a AA sequence and print the molecular weight in g/mol and kDa from a given n_start position in the sequence until a n_end given position in the sequence -function molecular_weight(aa_seq::LongAA, n_start::Integer, n_end::Integer) +function molecular_weight(aa_seq::AASeq, n_start::Integer, n_end::Integer) AA_vector = Vector{Float64}(undef, 0) i = n_start while i <= n_end From 45441a35ab0fb58520270cc33484aec4e28b2cf2 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Mon, 1 Sep 2025 20:50:32 +0200 Subject: [PATCH 04/18] Adding implementation for ssDNA and ssRNA --- src/molecular-weight.jl | 85 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 11c990eb..dae56c90 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -172,4 +172,89 @@ function molecular_weight(aa_seq::AASeq, n_start::Integer, n_end::Integer) end println("$(round(sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW); digits = 3)) g/mol") println("$(round(((sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW))/1000); digits = 3)) kDa") +end + +# Definning const of molecular weight of monophosphate in g/mol to account for the loss of water and PPi during synthesis +const DNA_A_MW = 331.22 - WATER_MW # DNA Adenine dAMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/12599 +const RNA_A_MW = 347.22 - WATER_MW # RNA Adenine AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 +const DNA_C_MW = 307.20 - WATER_MW # DNA Cytosine dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 +const RNA_C_MW = 323.20 - WATER_MW # RNA Cytosine CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 +const DNA_G_MW = 347.22 - WATER_MW # DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/1135398597 +const RNA_G_MW = 363.22 - WATER_MW # RNA Guanine GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 +const DNA_T_MW = 322.21 - WATER_MW # DNA Thymine dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 +const RNA_U_MW = 324.18 - WATER_MW # RNA Uracil UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 + +# Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf +# Defining a function, which accepts DNA sequences +function molecular_weight(aa_seq::NucSeq{4, DNAAlphabet{4}}, unit=:"gmol", five_terminal_state=:"Hydroxyl") + AA_vector = Vector{Float64}(undef, 0) + i = 1 + while i <= length(aa_seq) + if aa_seq[i] == DNA_A + push!(AA_vector, DNA_A_MW) + elseif aa_seq[i] == DNA_C + push!(AA_vector, DNA_C_MW) + elseif aa_seq[i] == DNA_G + push!(AA_vector, DNA_G_MW) + elseif aa_seq[i] == DNA_T + push!(AA_vector, DNA_T_MW) + else + println("Error due to unknown nucleotide at $i position") + return nothing + end + i = i+1 + end + if unit == "gmol" && five_terminal_state == "Phosphate" + return round(sum(AA_vector) + 79; digits = 3) + elseif unit == "gmol" && five_terminal_state == "Hydroxyl" + return round(sum(AA_vector) - 62; digits = 3) + elseif unit == "gmol" && five_terminal_state == "Triphosphate" + return round(sum(AA_vector) + 178; digits = 3) + elseif unit == "kDa" && five_terminal_state == "Phosphate" + return (round(sum(AA_vector) + 79; digits = 3))/1000 + elseif unit == "kDa" && five_terminal_state == "Hydroxyl" + return (round(sum(AA_vector) - 62; digits = 3))/1000 + elseif unit == "kDa" && five_terminal_state == "Triphosphate" + return (round(sum(AA_vector) + 178; digits = 3))/1000 + else + println("Error") + return nothing + end +end + +# Defining a function, which accepts a RNA sequences +function molecular_weight(aa_seq::NucSeq{4, RNAAlphabet{4}}, unit=:"gmol", five_terminal_state=:"Triphosphate") + AA_vector = Vector{Float64}(undef, 0) + i = 1 + while i <= length(aa_seq) + if aa_seq[i] == RNA_A + push!(AA_vector, RNA_A_MW) + elseif aa_seq[i] == RNA_C + push!(AA_vector, RNA_C_MW) + elseif aa_seq[i] == RNA_G + push!(AA_vector, RNA_G_MW) + elseif aa_seq[i] == RNA_U + push!(AA_vector, RNA_U_MW) + else + println("Error due to unknown nucleotide at $i position") + return nothing + end + i = i+1 + end + if unit == "gmol" && five_terminal_state == "Triphosphate" + return round(sum(AA_vector) + 178; digits = 3) + elseif unit == "gmol" && five_terminal_state == "Phosphate" + return round(sum(AA_vector) + 79; digits = 3) + elseif unit == "gmol" && five_terminal_state == "Hydroxyl" + return round(sum(AA_vector) - 62; digits = 3) + elseif unit == "kDa" && five_terminal_state == "Triphosphate" + return (round(sum(AA_vector) - 62; digits = 3))/1000 + elseif unit == "kDa" && five_terminal_state == "Phosphate" + return (round(sum(AA_vector) + 79; digits = 3))/1000 + elseif unit == "kDa" && five_terminal_state == "Hydroxyl" + return (round(sum(AA_vector) - 62; digits = 3))/1000 + else + println("Error") + return nothing + end end \ No newline at end of file From 86fdba3acf8cf2458be12d2044ccaf56e8d71a74 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Tue, 2 Sep 2025 13:03:29 +0200 Subject: [PATCH 05/18] Changing implemnetation for AASeq's --- src/molecular-weight.jl | 204 +++++++--------------------------------- 1 file changed, 34 insertions(+), 170 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index dae56c90..962fe462 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -1,177 +1,41 @@ -# Definning const of molecular weight of canonical amino acids in g/mol -const AA_A_MW = 89.09 # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 -const AA_R_MW = 174.20 # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 -const AA_N_MW = 132.12 # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 -const AA_D_MW = 133.10 # Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 -const AA_C_MW = 121.16 # Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 -const AA_Q_MW = 146.14 # Glutamine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5961 -const AA_E_MW = 147.13 # Glutamate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/33032 -const AA_G_MW = 75.07 # Glycine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/750 -const AA_H_MW = 155.15 # Histidine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6274 -const AA_I_MW = 131.17 # Isoleucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6306 -const AA_L_MW = 131.17 # Leucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6106 -const AA_K_MW = 146.19 # Lysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5962 -const AA_M_MW = 149.21 # Methionine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6137 -const AA_F_MW = 165.19 # Phenylalanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6140 -const AA_P_MW = 115.13 # Proline -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/145742 -const AA_S_MW = 105.09 # Serine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5951 -const AA_T_MW = 119.12 # Threonine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6288 -const AA_W_MW = 204.22 # Tryptophan -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6305 -const AA_Y_MW = 181.19 # Tyrosine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6057 -const AA_V_MW = 117.15 # Valine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6287 -# Defining water molecular weight (required for calculation of water lost) -const WATER_MW = 18.02 # Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html +# Sources of the molecular weights of each defined amino acid in g/mol +# AA_A # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 +# AA_R # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 +# AA_N # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 +# AA_D # Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 +# AA_C # Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 +# AA_Q # Glutamine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5961 +# AA_E # Glutamate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/33032 +# AA_G # Glycine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/750 +# AA_H # Histidine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6274 +# AA_I # Isoleucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6306 +# AA_L # Leucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6106 +# AA_K # Lysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5962 +# AA_M # Methionine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6137 +# AA_F # Phenylalanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6140 +# AA_P # Proline -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/145742 +# AA_S # Serine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5951 +# AA_T # Threonine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6288 +# AA_W # Tryptophan -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6305 +# AA_Y # Tyrosine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6057 +# AA_V # Valine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6287 +# AA_O # Pyrrolysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/21873141 +# AA_U # Selenocysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/25076 -# Non canonical / not defined amino acids in g/mol -const AA_O_MW_MW = 255.31 # Pyrrolysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/21873141 -const AA_U_MW_MW = 168.06 # Selenocysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/25076 -const AA_B_MW_MW = (AA_N_MW + AA_D_MW)/2 # Aspartate or Asparagine -> Average between Aspartic acid and Asparagine -const AA_J_MW_MW = 131.17 # Leucine or Isoleucine -> Both have the same molecular weight -const AA_Z_MW_MW = (AA_Q_MW + AA_D_MW)/2 # Glutamine or Glutamine -> Average between Glutamine and Glutamine -const AA_X_MW_MW = 110 # Any amino acid (average) -> Source = http://thermofisher.com/de/de/home/references/ambion-tech-support/rna-tools-and-calculators/proteins-and-amino-acids.html -const AA_O_Gap_MW = 0 # Consider gap as 0 +# Source of the used water molecular weight = 18.02 gmol for calculations +# Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html -# Defining the function molecular_weight, which accepts a AA sequence and units ("gmol", "KDa") as input and return molecular weight in either g/mol or kDa. If not specified it returns in g/mol -function molecular_weight(aa_seq::AASeq, unit=:"gmol") - AA_vector = Vector{Float64}(undef, 0) - i = 1 - while i <= length(aa_seq) - if aa_seq[i] == AA_A - push!(AA_vector, AA_A_MW) - elseif aa_seq[i] == AA_R - push!(AA_vector, AA_R_MW) - elseif aa_seq[i] == AA_N - push!(AA_vector, AA_N_MW) - elseif aa_seq[i] == AA_D - push!(AA_vector, AA_D_MW) - elseif aa_seq[i] == AA_C - push!(AA_vector, AA_C_MW) - elseif aa_seq[i] == AA_Q - push!(AA_vector, AA_Q_MW) - elseif aa_seq[i] == AA_E - push!(AA_vector, AA_E_MW) - elseif aa_seq[i] == AA_G - push!(AA_vector, AA_G_MW) - elseif aa_seq[i] == AA_H - push!(AA_vector, AA_H_MW) - elseif aa_seq[i] == AA_I - push!(AA_vector, AA_I_MW) - elseif aa_seq[i] == AA_L - push!(AA_vector, AA_L_MW) - elseif aa_seq[i] == AA_K - push!(AA_vector, AA_K_MW) - elseif aa_seq[i] == AA_M - push!(AA_vector, AA_M_MW) - elseif aa_seq[i] == AA_F - push!(AA_vector, AA_F_MW) - elseif aa_seq[i] == AA_P - push!(AA_vector, AA_P_MW) - elseif aa_seq[i] == AA_S - push!(AA_vector, AA_S_MW) - elseif aa_seq[i] == AA_T - push!(AA_vector, AA_T_MW) - elseif aa_seq[i] == AA_W - push!(AA_vector, AA_W_MW) - elseif aa_seq[i] == AA_Y - push!(AA_vector, AA_Y_MW) - elseif aa_seq[i] == AA_V - push!(AA_vector, AA_V_MW) - elseif aa_seq[i] == AA_O - push!(AA_vector, AA_O_MW_MW) - elseif aa_seq[i] == AA_U - push!(AA_vector, AA_U_MW_MW) - elseif aa_seq[i] == AA_B - push!(AA_vector, AA_B_MW_MW) - elseif aa_seq[i] == AA_J - push!(AA_vector, AA_J_MW_MW) - elseif aa_seq[i] == AA_Z - push!(AA_vector, AA_Z_MW_MW) - elseif aa_seq[i] == AA_X - push!(AA_vector, AA_X_MW_MW) - elseif aa_seq[i] == AA_O_Gap - push!(AA_vector, AA_O_Gap_MW) - else - println("Error due to unknown amino acid at $i position") - return nothing - end - i = i+1 - end - if unit == "gmol" - return round(sum(AA_vector) - ((length(aa_seq) - 1) * WATER_MW); digits = 3) - elseif unit == "kDa" - return round(((sum(AA_vector) - ((length(aa_seq) - 1) * WATER_MW))/1000); digits = 3) - else - return nothing - end -end +# Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list the string "Undefined", so if called, it returns an error +AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, "Undefined", 131.17, "Undefined", "Undefined", 0, "Undefined"] -# Defining a function, which accepts a AA sequence and print the molecular weight in g/mol and kDa from a given n_start position in the sequence until a n_end given position in the sequence -function molecular_weight(aa_seq::AASeq, n_start::Integer, n_end::Integer) - AA_vector = Vector{Float64}(undef, 0) - i = n_start - while i <= n_end - if aa_seq[i] == AA_A - push!(AA_vector, AA_A_MW) - elseif aa_seq[i] == AA_R - push!(AA_vector, AA_R_MW) - elseif aa_seq[i] == AA_N - push!(AA_vector, AA_N_MW) - elseif aa_seq[i] == AA_D - push!(AA_vector, AA_D_MW) - elseif aa_seq[i] == AA_C - push!(AA_vector, AA_C_MW) - elseif aa_seq[i] == AA_Q - push!(AA_vector, AA_Q_MW) - elseif aa_seq[i] == AA_E - push!(AA_vector, AA_E_MW) - elseif aa_seq[i] == AA_G - push!(AA_vector, AA_G_MW) - elseif aa_seq[i] == AA_H - push!(AA_vector, AA_H_MW) - elseif aa_seq[i] == AA_I - push!(AA_vector, AA_I_MW) - elseif aa_seq[i] == AA_L - push!(AA_vector, AA_L_MW) - elseif aa_seq[i] == AA_K - push!(AA_vector, AA_K_MW) - elseif aa_seq[i] == AA_M - push!(AA_vector, AA_M_MW) - elseif aa_seq[i] == AA_F - push!(AA_vector, AA_F_MW) - elseif aa_seq[i] == AA_P - push!(AA_vector, AA_P_MW) - elseif aa_seq[i] == AA_S - push!(AA_vector, AA_S_MW) - elseif aa_seq[i] == AA_T - push!(AA_vector, AA_T_MW) - elseif aa_seq[i] == AA_W - push!(AA_vector, AA_W_MW) - elseif aa_seq[i] == AA_Y - push!(AA_vector, AA_Y_MW) - elseif aa_seq[i] == AA_V - push!(AA_vector, AA_V_MW) - elseif aa_seq[i] == AA_O - push!(AA_vector, AA_O_MW_MW) - elseif aa_seq[i] == AA_U - push!(AA_vector, AA_U_MW_MW) - elseif aa_seq[i] == AA_B - push!(AA_vector, AA_B_MW_MW) - elseif aa_seq[i] == AA_J - push!(AA_vector, AA_J_MW_MW) - elseif aa_seq[i] == AA_Z - push!(AA_vector, AA_Z_MW_MW) - elseif aa_seq[i] == AA_X - push!(AA_vector, AA_X_MW_MW) - elseif aa_seq[i] == AA_O_Gap - push!(AA_vector, AA_O_Gap_MW) - else - println("Error due to unknown amino acid at $i position") - return nothing - end - i = i+1 +# Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation +function molecular_weight(aa_seq::AASeq) + weight = 0 + for aa in aa_seq + aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] + weight += aa_weight end - println("$(round(sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW); digits = 3)) g/mol") - println("$(round(((sum(AA_vector) - ((n_end - n_start - 1) * WATER_MW))/1000); digits = 3)) kDa") + return round(weight - ((length(aa_seq) - 1) * 18.02); digits = 3) end # Definning const of molecular weight of monophosphate in g/mol to account for the loss of water and PPi during synthesis From 85702bee15e13fb1acdc208f953988ff744e3e5f Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Tue, 2 Sep 2025 13:05:49 +0200 Subject: [PATCH 06/18] fixed typo --- src/molecular-weight.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 962fe462..f304315d 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -1,6 +1,6 @@ # Sources of the molecular weights of each defined amino acid in g/mol # AA_A # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 -# AA_R # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 +# AA_R # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 # AA_N # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 # AA_D # Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 # AA_C # Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 From 0085cc9459e7a4db021f661327a60ea55fa1bba9 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Tue, 2 Sep 2025 13:15:49 +0200 Subject: [PATCH 07/18] fixed typo --- src/molecular-weight.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index f304315d..d2daef9c 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -26,7 +26,7 @@ # Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html # Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list the string "Undefined", so if called, it returns an error -AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, "Undefined", 131.17, "Undefined", "Undefined", 0, "Undefined"] +AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, "Undefined", 131.17, "Undefined", "Undefined", 0, 0] # Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation function molecular_weight(aa_seq::AASeq) From 8eea91b9a2b056de75d47812902e08a8ae29ce17 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Tue, 2 Sep 2025 15:47:53 +0200 Subject: [PATCH 08/18] Changing and expanding upon the implementation for nucseq --- src/molecular-weight.jl | 139 +++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 79 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index d2daef9c..1d2e42ac 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -1,4 +1,4 @@ -# Sources of the molecular weights of each defined amino acid in g/mol +# Sources of the molecular weights of each defined amino acid # AA_A # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 # AA_R # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 # AA_N # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 @@ -38,87 +38,68 @@ function molecular_weight(aa_seq::AASeq) return round(weight - ((length(aa_seq) - 1) * 18.02); digits = 3) end -# Definning const of molecular weight of monophosphate in g/mol to account for the loss of water and PPi during synthesis -const DNA_A_MW = 331.22 - WATER_MW # DNA Adenine dAMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/12599 -const RNA_A_MW = 347.22 - WATER_MW # RNA Adenine AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 -const DNA_C_MW = 307.20 - WATER_MW # DNA Cytosine dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 -const RNA_C_MW = 323.20 - WATER_MW # RNA Cytosine CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 -const DNA_G_MW = 347.22 - WATER_MW # DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/1135398597 -const RNA_G_MW = 363.22 - WATER_MW # RNA Guanine GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 -const DNA_T_MW = 322.21 - WATER_MW # DNA Thymine dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 -const RNA_U_MW = 324.18 - WATER_MW # RNA Uracil UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 - +# Sources of the molecular weights of each defined nucleotide +# DNA Adenine dAMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/12599 +# RNA Adenine AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 +# DNA Cytosine dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 +# RNA Cytosine CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 +# DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/1135398597 +# RNA Guanine GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 +# DNA Thymine dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 # Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf -# Defining a function, which accepts DNA sequences -function molecular_weight(aa_seq::NucSeq{4, DNAAlphabet{4}}, unit=:"gmol", five_terminal_state=:"Hydroxyl") - AA_vector = Vector{Float64}(undef, 0) - i = 1 - while i <= length(aa_seq) - if aa_seq[i] == DNA_A - push!(AA_vector, DNA_A_MW) - elseif aa_seq[i] == DNA_C - push!(AA_vector, DNA_C_MW) - elseif aa_seq[i] == DNA_G - push!(AA_vector, DNA_G_MW) - elseif aa_seq[i] == DNA_T - push!(AA_vector, DNA_T_MW) - else - println("Error due to unknown nucleotide at $i position") - return nothing - end - i = i+1 - end - if unit == "gmol" && five_terminal_state == "Phosphate" - return round(sum(AA_vector) + 79; digits = 3) - elseif unit == "gmol" && five_terminal_state == "Hydroxyl" - return round(sum(AA_vector) - 62; digits = 3) - elseif unit == "gmol" && five_terminal_state == "Triphosphate" - return round(sum(AA_vector) + 178; digits = 3) - elseif unit == "kDa" && five_terminal_state == "Phosphate" - return (round(sum(AA_vector) + 79; digits = 3))/1000 - elseif unit == "kDa" && five_terminal_state == "Hydroxyl" - return (round(sum(AA_vector) - 62; digits = 3))/1000 - elseif unit == "kDa" && five_terminal_state == "Triphosphate" - return (round(sum(AA_vector) + 178; digits = 3))/1000 - else - println("Error") - return nothing - end -end -# Defining a function, which accepts a RNA sequences -function molecular_weight(aa_seq::NucSeq{4, RNAAlphabet{4}}, unit=:"gmol", five_terminal_state=:"Triphosphate") - AA_vector = Vector{Float64}(undef, 0) - i = 1 - while i <= length(aa_seq) - if aa_seq[i] == RNA_A - push!(AA_vector, RNA_A_MW) - elseif aa_seq[i] == RNA_C - push!(AA_vector, RNA_C_MW) - elseif aa_seq[i] == RNA_G - push!(AA_vector, RNA_G_MW) - elseif aa_seq[i] == RNA_U - push!(AA_vector, RNA_U_MW) - else - println("Error due to unknown nucleotide at $i position") - return nothing +# Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list the string "Undefined", so if called, it returns an error +# The values already account for loss of water = - 18.02 +DNA_WEIGHTS = [0, 313.2, 289.18, "Undefined", 329.2, "Undefined", "Undefined", "Undefined", 304.19, "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined"] + +RNA_WEIGHTS = [0, 329.2, 305.18, "Undefined", 345.2, "Undefined", "Undefined", "Undefined", 306.16, "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined"] + +# Defining a function that accepts nucleotide sequences (either DNA or RNA) and calculate the molecular weight in g/mol +# Note the additional keywords alphabet, five_terminal_state and strand_number, which have default cases, if not specified +# In case of using double strand, the five_terminal_state of the complmentary strand is assumed to be the same as the input strand +# Function only accepts double as strand_number for DNA sequences +function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hydroxyl, strand_number=:single) + weight = 0 + if alphabet == :DNA + for dnucleotide in nucseq + dna_weight = DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] + weight += dna_weight end - i = i+1 + elseif alphabet == :RNA + for nucleotide in nucseq + rna_weight = RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] + weight += rna_weight + end + else + throw(ArgumentError("Unknown Alphabet. Must be :DNA or :RNA")) end - if unit == "gmol" && five_terminal_state == "Triphosphate" - return round(sum(AA_vector) + 178; digits = 3) - elseif unit == "gmol" && five_terminal_state == "Phosphate" - return round(sum(AA_vector) + 79; digits = 3) - elseif unit == "gmol" && five_terminal_state == "Hydroxyl" - return round(sum(AA_vector) - 62; digits = 3) - elseif unit == "kDa" && five_terminal_state == "Triphosphate" - return (round(sum(AA_vector) - 62; digits = 3))/1000 - elseif unit == "kDa" && five_terminal_state == "Phosphate" - return (round(sum(AA_vector) + 79; digits = 3))/1000 - elseif unit == "kDa" && five_terminal_state == "Hydroxyl" - return (round(sum(AA_vector) - 62; digits = 3))/1000 - else - println("Error") - return nothing + if five_terminal_state == :hydroxyl + weight = weight - 62 + elseif five_terminal_state == :phosphate + weight = weight + 79 + elseif five_terminal_state == :triphosphate + weight = weight + 178 + else + throw(ArgumentError("Unknown five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) + end + if strand_number == :single + return round(weight; digits = 3) + elseif strand_number == :double && alphabet == :DNA + com_seq = complement(nucseq) + com_weight = 0 + for c_dnucleotide in com_seq + com_dna_weight = DNA_WEIGHTS[reinterpret(UInt8, c_dnucleotide) + 1] + com_weight += com_dna_weight + end + if five_terminal_state == :hydroxyl + com_weight = com_weight - 62 + elseif five_terminal_state == :phosphate + com_weight = com_weight + 79 + elseif five_terminal_state == :triphosphate + com_weight = com_weight + 178 end + return round(com_weight + weight; digits = 3) + else + throw(ArgumentError("Unknown strand_number. Must be :single or :double. Or trying to use double as strand_number with a RNA sequence")) + end end \ No newline at end of file From 31f22217894de04dfbbc58dcf85c17e51dc8909e Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Wed, 3 Sep 2025 13:52:42 +0200 Subject: [PATCH 09/18] Implementing changes to function molecular weight,in accordance to feedback --- src/molecular-weight.jl | 55 ++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 1d2e42ac..31cd8b17 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -25,17 +25,22 @@ # Source of the used water molecular weight = 18.02 gmol for calculations # Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html -# Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list the string "Undefined", so if called, it returns an error -AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, "Undefined", 131.17, "Undefined", "Undefined", 0, 0] +# Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list -1.0 +# In the function, if the weight is -1, then it will trow an error +const AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, -1.0, 131.17, -1.0, -1.0, 0, 0] # Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation function molecular_weight(aa_seq::AASeq) - weight = 0 + weight = 0.0 for aa in aa_seq - aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] - weight += aa_weight + if AA_WEIGHTS[reinterpret(UInt8, aa) + 1] == -1.0 + throw(ArgumentError("amino acid $aa weight is ambiguous")) + else + aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] + weight += aa_weight + end end - return round(weight - ((length(aa_seq) - 1) * 18.02); digits = 3) + return weight - ((length(aa_seq) - 1) * 18.02) end # Sources of the molecular weights of each defined nucleotide @@ -43,33 +48,43 @@ end # RNA Adenine AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 # DNA Cytosine dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 # RNA Cytosine CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 -# DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/1135398597 +# DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398597 # RNA Guanine GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 # DNA Thymine dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 +# RNA Uracil UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 # Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf -# Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list the string "Undefined", so if called, it returns an error -# The values already account for loss of water = - 18.02 -DNA_WEIGHTS = [0, 313.2, 289.18, "Undefined", 329.2, "Undefined", "Undefined", "Undefined", 304.19, "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined"] +# Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list -1.0, so if called +# The listed values does not account for the loss of water and it will be taken into consideration later +# In the function, if the weight is -1, then it will trow an error +const DNA_WEIGHTS = [0, 331.22, 307.20, -1.0, 347.22, -1.0, -1.0, -1.0, 322.21, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] -RNA_WEIGHTS = [0, 329.2, 305.18, "Undefined", 345.2, "Undefined", "Undefined", "Undefined", 306.16, "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined", "Undefined"] +const RNA_WEIGHTS = [0, 347.22, 323.20, -1.0, 363.22, -1.0, -1.0, -1.0, 324.18 , -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] # Defining a function that accepts nucleotide sequences (either DNA or RNA) and calculate the molecular weight in g/mol # Note the additional keywords alphabet, five_terminal_state and strand_number, which have default cases, if not specified # In case of using double strand, the five_terminal_state of the complmentary strand is assumed to be the same as the input strand # Function only accepts double as strand_number for DNA sequences function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hydroxyl, strand_number=:single) - weight = 0 + weight = 0.0 if alphabet == :DNA for dnucleotide in nucseq - dna_weight = DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] - weight += dna_weight + if DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] == -1.0 + throw(ArgumentError("nucleotide $dnucleotide weight is ambiguous")) + else + dna_weight = DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] + weight += dna_weight end + end elseif alphabet == :RNA for nucleotide in nucseq - rna_weight = RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] - weight += rna_weight - end + if RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] == -1.0 + throw(ArgumentError("nucleotide $nucleotide weight is ambiguous")) + else + rna_weight = RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] + weight += rna_weight + end + end else throw(ArgumentError("Unknown Alphabet. Must be :DNA or :RNA")) end @@ -83,10 +98,10 @@ function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hy throw(ArgumentError("Unknown five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) end if strand_number == :single - return round(weight; digits = 3) + return weight - (length(nucseq) * 18.02) elseif strand_number == :double && alphabet == :DNA com_seq = complement(nucseq) - com_weight = 0 + com_weight = 0.0 for c_dnucleotide in com_seq com_dna_weight = DNA_WEIGHTS[reinterpret(UInt8, c_dnucleotide) + 1] com_weight += com_dna_weight @@ -98,7 +113,7 @@ function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hy elseif five_terminal_state == :triphosphate com_weight = com_weight + 178 end - return round(com_weight + weight; digits = 3) + return com_weight + weight else throw(ArgumentError("Unknown strand_number. Must be :single or :double. Or trying to use double as strand_number with a RNA sequence")) end From 015fea56adadd9b4f7db1e83cb41e7d5011049f5 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Wed, 3 Sep 2025 13:55:53 +0200 Subject: [PATCH 10/18] added lost of water weight --- src/molecular-weight.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 31cd8b17..1f66a267 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -113,7 +113,7 @@ function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hy elseif five_terminal_state == :triphosphate com_weight = com_weight + 178 end - return com_weight + weight + return com_weight + weight - (2 * length(nucseq) * 18.02) else throw(ArgumentError("Unknown strand_number. Must be :single or :double. Or trying to use double as strand_number with a RNA sequence")) end From 3e221482bb181bf345958604a47b748940bffce0 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Thu, 4 Sep 2025 09:22:54 +0200 Subject: [PATCH 11/18] Changing function for DNA/RNA in accordance to feedback --- src/molecular-weight.jl | 81 ++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 1f66a267..dfab9311 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -30,6 +30,7 @@ const AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, -1.0, 131.17, -1.0, -1.0, 0, 0] # Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation +# Results are consistent with the one of of this website: https://www.bioinformatics.org/sms/prot_mw.html function molecular_weight(aa_seq::AASeq) weight = 0.0 for aa in aa_seq @@ -54,39 +55,46 @@ end # RNA Uracil UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 # Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf -# Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list -1.0, so if called -# The listed values does not account for the loss of water and it will be taken into consideration later -# In the function, if the weight is -1, then it will trow an error +# Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list -1.0, so if called it trows an error +# The listed values does not account for the loss of water and it will be taken into consideration in the _molecular weight function. The used value for water weight is the same as above const DNA_WEIGHTS = [0, 331.22, 307.20, -1.0, 347.22, -1.0, -1.0, -1.0, 322.21, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] const RNA_WEIGHTS = [0, 347.22, 323.20, -1.0, 363.22, -1.0, -1.0, -1.0, 324.18 , -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] -# Defining a function that accepts nucleotide sequences (either DNA or RNA) and calculate the molecular weight in g/mol -# Note the additional keywords alphabet, five_terminal_state and strand_number, which have default cases, if not specified -# In case of using double strand, the five_terminal_state of the complmentary strand is assumed to be the same as the input strand -# Function only accepts double as strand_number for DNA sequences -function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hydroxyl, strand_number=:single) +# Creating 3 functions which can process DNA or RNA seqeunces, with variable five_terminal_state and strand_number +# If not specified the assumption are single stranded and hydroxyl as terminal 5' functional group +# Results are consistent with the one of of this website: https://molbiotools.com/dnacalculator.php +function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) + if strand_number == :single + return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) + elseif strand_number == :double + com_dna_seq = complement(dna_seq) + return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_dna_seq, DNA_WEIGHTS, five_terminal_state) + else + throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) + end +end + +function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) + if strand_number == :single + return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) + elseif strand_number == :double + com_rna_seq = complement(rna_seq) + return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_rna_seq, RNA_WEIGHTS, five_terminal_state) + else + throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) + end +end + +function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state=:hydroxyl) weight = 0.0 - if alphabet == :DNA - for dnucleotide in nucseq - if DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] == -1.0 - throw(ArgumentError("nucleotide $dnucleotide weight is ambiguous")) - else - dna_weight = DNA_WEIGHTS[reinterpret(UInt8, dnucleotide) + 1] - weight += dna_weight - end - end - elseif alphabet == :RNA - for nucleotide in nucseq - if RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] == -1.0 + for nucleotide in nucseq + if array[reinterpret(UInt8, nucleotide) + 1] == -1.0 throw(ArgumentError("nucleotide $nucleotide weight is ambiguous")) else - rna_weight = RNA_WEIGHTS[reinterpret(UInt8, nucleotide) + 1] - weight += rna_weight + dna_weight = array[reinterpret(UInt8, nucleotide) + 1] + weight += dna_weight end - end - else - throw(ArgumentError("Unknown Alphabet. Must be :DNA or :RNA")) end if five_terminal_state == :hydroxyl weight = weight - 62 @@ -95,26 +103,7 @@ function molecular_weight(nucseq::NucSeq, alphabet=:DNA, five_terminal_state=:hy elseif five_terminal_state == :triphosphate weight = weight + 178 else - throw(ArgumentError("Unknown five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) + throw(ArgumentError("Unknown five_terminal_state $five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) end - if strand_number == :single - return weight - (length(nucseq) * 18.02) - elseif strand_number == :double && alphabet == :DNA - com_seq = complement(nucseq) - com_weight = 0.0 - for c_dnucleotide in com_seq - com_dna_weight = DNA_WEIGHTS[reinterpret(UInt8, c_dnucleotide) + 1] - com_weight += com_dna_weight - end - if five_terminal_state == :hydroxyl - com_weight = com_weight - 62 - elseif five_terminal_state == :phosphate - com_weight = com_weight + 79 - elseif five_terminal_state == :triphosphate - com_weight = com_weight + 178 - end - return com_weight + weight - (2 * length(nucseq) * 18.02) - else - throw(ArgumentError("Unknown strand_number. Must be :single or :double. Or trying to use double as strand_number with a RNA sequence")) - end + return weight - (length(nucseq) * 18.02) end \ No newline at end of file From 336f8c6203f9249648cafab95afc305d6b712f2d Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Fri, 5 Sep 2025 00:24:47 +0200 Subject: [PATCH 12/18] adding docstring,creating --- src/molecular-weight.jl | 78 ++++++++++++++++++++++++++++++++++++++++- test/runtests.jl | 1 + 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index dfab9311..cc667742 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -27,10 +27,28 @@ # Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list -1.0 # In the function, if the weight is -1, then it will trow an error -const AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, -1.0, 131.17, -1.0, -1.0, 0, 0] +# Note that for gap and terminal codon, the value is 18.02 intead of 0 to account for the substraction of water in the function +const AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, -1.0, 131.17, -1.0, -1.0, 18.02, 18.02] # Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation # Results are consistent with the one of of this website: https://www.bioinformatics.org/sms/prot_mw.html + +""" + molecular_weight(aa_seq::AASeq) -> Float64 + +Calculate molecular weight of `aa_seq`in g/mol, i.e. +Sum of the weights of all the amino acids of the sequence minus water lost per peptide bond. +Values of each amino acid were obtained from PubChem 2.1, values are listed in the vector{Float64} AA_WEIGHTS. + +# Examples +```jldoctest +julia> molecular_weight(aa"PKLEQ") +613.68 + +julia> molecular_weight(aa"ETIWS*") +634.65 +``` +""" function molecular_weight(aa_seq::AASeq) weight = 0.0 for aa in aa_seq @@ -64,6 +82,26 @@ const RNA_WEIGHTS = [0, 347.22, 323.20, -1.0, 363.22, -1.0, -1.0, -1.0, 324.18 , # Creating 3 functions which can process DNA or RNA seqeunces, with variable five_terminal_state and strand_number # If not specified the assumption are single stranded and hydroxyl as terminal 5' functional group # Results are consistent with the one of of this website: https://molbiotools.com/dnacalculator.php + +""" + molecular_weight(rna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state::Symbol, strand_number::Symbol) -> Float64 + +Calculate molecular weight of `dna_seq`in g/mol, i.e. Sum of the weights of all the nucleotides of the sequence, +minus water lost per bond formation. +Values of nucleotides were obtained from PubChem 2.1 and are listed in the vector{Float64} DNA_WEIGHTS. +Also possible to define the 5' state as (:hydroxyl, :phosphate or :triphosphate) and strand number as (:single or :double). +If not defined, the default values are :hydroxyl and :single + +# Examples +```jldoctest +julia> molecular_weight(dna"GCAGCCATAG") +3036.930000000001 + +julia> molecular_weight(dna"ACACCATTTG", :phosphate, :double) +6335.8600000000015 +``` +""" + function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) if strand_number == :single return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) @@ -75,6 +113,25 @@ function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_s end end +""" + molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state::Symbol, strand_number::Symbol) -> Float64 + +Calculate molecular weight of `rna_seq`in g/mol, i.e. Sum of the weights of all the nucleotides of the sequence, +minus water lost per bond formation. +Values of nucleotides were obtained from PubChem 2.1 and are listed in the vector{Float64} RNA_WEIGHTS. +Also possible to define the 5' state as (:hydroxyl, :phosphate or :triphosphate) and strand number as (:single or :double). +If not defined, the default values are :hydroxyl and :single + +# Examples +```jldoctest +julia> molecular_weight(rna"GGGCGGACCU") +3214.9 + +julia> molecular_weight(rna"CGAUUUUCGG", :triphosphate, :double) +6784.700000000001 +``` +""" + function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) if strand_number == :single return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) @@ -86,6 +143,25 @@ function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_s end end +""" + _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state::Symbol) -> Float64 + +Calculate molecular weight of `rnucseq`in g/mol, i.e. Sum of the weights of all the nucleotides of the sequence, +minus water lost per bond. Values of nucleotides were obtained from PubChem 2.1. +Values are listed in the vectors{Float64} RNA_WEIGHTS and DNA_WEIGHTS. +Also possible to define the 5' state as (:hydroxyl, :phosphate or :triphosphate), If not defined, +hydroxyl is the default option. Not possible to calculate weights for double stranded sequences. + +# Examples +```jldoctest +julia> _molecular_weight(dna"GTTGCCCGGC",DNA_WEIGHTS) +3019.9000000000005 + +julia> _molecular_weight(rna"GUCUGACGCG",RNA_WEIGHTS, :triphosphate) +3415.8600000000006 +``` +""" + function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state=:hydroxyl) weight = 0.0 for nucleotide in nucseq diff --git a/test/runtests.jl b/test/runtests.jl index dbd69aee..5f3715ec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -42,6 +42,7 @@ end end include("translation.jl") include("counting.jl") +include("weights.jl") @testset "Search" begin include("search/ExactSearchQuery.jl") From a0ffd5f184bb0cc750b73d66368dd15853cdd3b9 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Fri, 5 Sep 2025 00:25:48 +0200 Subject: [PATCH 13/18] creating weights.jl --- test/weights.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/weights.jl diff --git a/test/weights.jl b/test/weights.jl new file mode 100644 index 00000000..e69de29b From 6b94455eb04612a0f9c854e77d89a4f0a0969627 Mon Sep 17 00:00:00 2001 From: Alessio_Agustini Date: Fri, 5 Sep 2025 15:47:50 +0200 Subject: [PATCH 14/18] Adding basic tests to weight.jl + corrections in fuction for phosphate compensation value --- src/molecular-weight.jl | 6 +++--- test/weights.jl | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index cc667742..ffdfb4ca 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -97,8 +97,8 @@ If not defined, the default values are :hydroxyl and :single julia> molecular_weight(dna"GCAGCCATAG") 3036.930000000001 -julia> molecular_weight(dna"ACACCATTTG", :phosphate, :double) -6335.8600000000015 +julia> mmolecular_weight(dna"TCCCAGACTG", :phosphate, :double) +6215.960000000001 ``` """ @@ -175,7 +175,7 @@ function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal if five_terminal_state == :hydroxyl weight = weight - 62 elseif five_terminal_state == :phosphate - weight = weight + 79 + weight = weight + 18.06 elseif five_terminal_state == :triphosphate weight = weight + 178 else diff --git a/test/weights.jl b/test/weights.jl index e69de29b..ea6cb267 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -0,0 +1,12 @@ +@test molecular_weight(aa"PKLEQ") == 613.68 +@test molecular_weight(aa"ETIWS*") == 634.65 + +@test molecular_weight(dna"GCAGCCATAG") == 3036.930000000001 +@test molecular_weight(dna"TCCCAGACTG", :phosphate, :double) == 6215.960000000001 + +@test molecular_weight(rna"GGGCGGACCU") == 3214.9 +@test molecular_weight(rna"CGAUUUUCGG", :triphosphate, :double) == 6784.700000000001 + +@test _molecular_weight(dna"GTTGCCCGGC",DNA_WEIGHTS) == 3019.9000000000005 +@test _molecular_weight(rna"GUCUGACGCG",RNA_WEIGHTS, :triphosphate) == 3415.8600000000006 + From 9204c96812ba8876fcdf15e2ec98c4306fef8892 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 5 Sep 2025 16:24:34 +0200 Subject: [PATCH 15/18] Change formatting * Place the source comment of each molecule where it is defined * Use Runic.jl to automatically format this file --- src/molecular-weight.jl | 134 ++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 52 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index ffdfb4ca..a480a2a5 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -1,36 +1,41 @@ -# Sources of the molecular weights of each defined amino acid -# AA_A # Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 -# AA_R # Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 -# AA_N # Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 -# AA_D # Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 -# AA_C # Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 -# AA_Q # Glutamine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5961 -# AA_E # Glutamate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/33032 -# AA_G # Glycine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/750 -# AA_H # Histidine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6274 -# AA_I # Isoleucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6306 -# AA_L # Leucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6106 -# AA_K # Lysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5962 -# AA_M # Methionine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6137 -# AA_F # Phenylalanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6140 -# AA_P # Proline -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/145742 -# AA_S # Serine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5951 -# AA_T # Threonine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6288 -# AA_W # Tryptophan -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6305 -# AA_Y # Tyrosine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6057 -# AA_V # Valine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6287 -# AA_O # Pyrrolysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/21873141 -# AA_U # Selenocysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/25076 - # Source of the used water molecular weight = 18.02 gmol for calculations -# Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html +# Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html -# Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list -1.0 +# Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list -1.0 # In the function, if the weight is -1, then it will trow an error # Note that for gap and terminal codon, the value is 18.02 intead of 0 to account for the substraction of water in the function -const AA_WEIGHTS = [89.09, 174.20, 132.12, 133.10, 121.16, 146.14, 147.13, 75.07, 155.15, 131.17, 131.17, 146.19, 149.21, 165.19, 115.13, 105.09, 119.12, 204.22, 181.19, 117.15, 255.31, 168.06, -1.0, 131.17, -1.0, -1.0, 18.02, 18.02] - -# Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation +const AA_WEIGHTS = [ + 89.09, # AA_A : Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 + 174.2, # AA_R : Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 + 132.12, # AA_N : Asparagine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6267 + 133.1, # AA_D : Aspartate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5960 + 121.16, # AA_C : Cysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5862 + 146.14, # AA_Q : Glutamine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5961 + 147.13, # AA_E : Glutamate -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/33032 + 75.07, # AA_G : Glycine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/750 + 155.15, # AA_H : Histidine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6274 + 131.17, # AA_I : Isoleucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6306 + 131.17, # AA_L : Leucine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6106 + 146.19, # AA_K : Lysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5962 + 149.21, # AA_M : Methionine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6137 + 165.19, # AA_F : Phenylalanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6140 + 115.13, # AA_P : Proline -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/145742 + 105.09, # AA_S : Serine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5951 + 119.12, # AA_T : Threonine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6288 + 204.22, # AA_W : Tryptophan -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6305 + 181.19, # AA_Y : Tyrosine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6057 + 117.15, # AA_V : Valine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6287 + 255.31, # AA_O : Pyrrolysine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/21873141 + 168.06, # AA_U : Selenocysteine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/25076 + -1.0, # AA_B : Ambiguous + 131.17, # AA_J : Ambiguous, but weight is known (same as AA_I and AA_L) + -1.0, # AA_Z : Ambiguous + -1.0, # AA_X : Ambiguous + 18.02, # AA_Term + 18.02, # AA_Gap +] + +# Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation # Results are consistent with the one of of this website: https://www.bioinformatics.org/sms/prot_mw.html """ @@ -54,7 +59,7 @@ function molecular_weight(aa_seq::AASeq) for aa in aa_seq if AA_WEIGHTS[reinterpret(UInt8, aa) + 1] == -1.0 throw(ArgumentError("amino acid $aa weight is ambiguous")) - else + else aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] weight += aa_weight end @@ -62,22 +67,47 @@ function molecular_weight(aa_seq::AASeq) return weight - ((length(aa_seq) - 1) * 18.02) end -# Sources of the molecular weights of each defined nucleotide -# DNA Adenine dAMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/12599 -# RNA Adenine AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 -# DNA Cytosine dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 -# RNA Cytosine CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 -# DNA Guanine dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398597 -# RNA Guanine GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 -# DNA Thymine dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 -# RNA Uracil UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 # Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf # Creating the arrays DNA_WEIGHTS and RNA_WEIGHTS to list all weights. In ambigous cases, let's list -1.0, so if called it trows an error # The listed values does not account for the loss of water and it will be taken into consideration in the _molecular weight function. The used value for water weight is the same as above -const DNA_WEIGHTS = [0, 331.22, 307.20, -1.0, 347.22, -1.0, -1.0, -1.0, 322.21, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] - -const RNA_WEIGHTS = [0, 347.22, 323.20, -1.0, 363.22, -1.0, -1.0, -1.0, 324.18 , -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0] +const DNA_WEIGHTS = [ + 0, + 331.22, # dAMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/12599 + 307.2, # dCMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/13945 + -1.0, + 347.22, # dGMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398597 + -1.0, + -1.0, + -1.0, + 322.21, # dTMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/9700 + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, +] + +const RNA_WEIGHTS = [ + 0, + 347.22, # AMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6083 + 323.2, # CMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6131 + -1.0, + 363.22, # GMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/135398631 + -1.0, + -1.0, + -1.0, + 324.18, # UMP -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6030 + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, +] # Creating 3 functions which can process DNA or RNA seqeunces, with variable five_terminal_state and strand_number # If not specified the assumption are single stranded and hydroxyl as terminal 5' functional group @@ -102,12 +132,12 @@ julia> mmolecular_weight(dna"TCCCAGACTG", :phosphate, :double) ``` """ -function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) +function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state = :hydroxyl, strand_number = :single) if strand_number == :single return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) elseif strand_number == :double com_dna_seq = complement(dna_seq) - return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_dna_seq, DNA_WEIGHTS, five_terminal_state) + return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_dna_seq, DNA_WEIGHTS, five_terminal_state) else throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) end @@ -132,12 +162,12 @@ julia> molecular_weight(rna"CGAUUUUCGG", :triphosphate, :double) ``` """ -function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state=:hydroxyl, strand_number=:single) +function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state = :hydroxyl, strand_number = :single) if strand_number == :single return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) elseif strand_number == :double com_rna_seq = complement(rna_seq) - return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_rna_seq, RNA_WEIGHTS, five_terminal_state) + return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_rna_seq, RNA_WEIGHTS, five_terminal_state) else throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) end @@ -162,7 +192,7 @@ julia> _molecular_weight(rna"GUCUGACGCG",RNA_WEIGHTS, :triphosphate) ``` """ -function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state=:hydroxyl) +function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state = :hydroxyl) weight = 0.0 for nucleotide in nucseq if array[reinterpret(UInt8, nucleotide) + 1] == -1.0 @@ -173,13 +203,13 @@ function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal end end if five_terminal_state == :hydroxyl - weight = weight - 62 + weight = weight - 62 elseif five_terminal_state == :phosphate weight = weight + 18.06 - elseif five_terminal_state == :triphosphate + elseif five_terminal_state == :triphosphate weight = weight + 178 else - throw(ArgumentError("Unknown five_terminal_state $five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) + throw(ArgumentError("Unknown five_terminal_state $five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) end - return weight - (length(nucseq) * 18.02) -end \ No newline at end of file + return weight - (length(nucseq) * 18.02) +end From d7a6641d1f50433fd5e3388afcc278a0251ac02e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 5 Sep 2025 16:29:14 +0200 Subject: [PATCH 16/18] Make water weight a constant This is more readable than using the literal "18.02" --- src/molecular-weight.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index a480a2a5..157a5a41 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -1,9 +1,8 @@ -# Source of the used water molecular weight = 18.02 gmol for calculations -# Water -> Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html +# Source = https://www.cmu.edu/gelfand/k12-educational-resources/polymers/what-is-polymer/molecular-weight-calculation.html +const WATER_WEIGHT = 18.02 # Creating the array AA_WEIGHTS of length 28 to list all weights. In ambigous cases, let's list -1.0 # In the function, if the weight is -1, then it will trow an error -# Note that for gap and terminal codon, the value is 18.02 intead of 0 to account for the substraction of water in the function const AA_WEIGHTS = [ 89.09, # AA_A : Alanine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/5950 174.2, # AA_R : Arginine -> Source = https://pubchem.ncbi.nlm.nih.gov/compound/6322 @@ -31,8 +30,9 @@ const AA_WEIGHTS = [ 131.17, # AA_J : Ambiguous, but weight is known (same as AA_I and AA_L) -1.0, # AA_Z : Ambiguous -1.0, # AA_X : Ambiguous - 18.02, # AA_Term - 18.02, # AA_Gap + -1.0, # AA_Term + # The calculations below subtract 1 water per AA. Since a gap weighs nothing, this nets to zero. + WATER_WEIGHT, # AA_Gap ] # Defining the function molecular_weight, which accepts an amino acid sequence and return molecular weight in g/mol. Note that the function also account for lost of water for each peptide bond formation @@ -64,7 +64,7 @@ function molecular_weight(aa_seq::AASeq) weight += aa_weight end end - return weight - ((length(aa_seq) - 1) * 18.02) + return weight - ((length(aa_seq) - 1) * WATER_WEIGHT) end # Calculations following the following guidelines: https://ymc.eu/files/imported/publications/556/documents/YMC-Expert-Tip---How-to-calculate-the-MW-of-nucleic-acids.pdf @@ -211,5 +211,5 @@ function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal else throw(ArgumentError("Unknown five_terminal_state $five_terminal_state. Must be :hydroxyl, :phosphate or :triphosphate")) end - return weight - (length(nucseq) * 18.02) + return weight - (length(nucseq) * WATER_WEIGHT) end From 04a84777276c3ba06fd9f085d6a8ca7cfce7f7f0 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 5 Sep 2025 16:32:01 +0200 Subject: [PATCH 17/18] Move weight to its own variable This avoids loading twice from the array. Maybe the compiler would optimise this, but it's safer to do this optimisation manually. --- src/molecular-weight.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 157a5a41..8f551a5b 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -57,10 +57,10 @@ julia> molecular_weight(aa"ETIWS*") function molecular_weight(aa_seq::AASeq) weight = 0.0 for aa in aa_seq - if AA_WEIGHTS[reinterpret(UInt8, aa) + 1] == -1.0 + aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] + if aa_weight == -1.0 throw(ArgumentError("amino acid $aa weight is ambiguous")) else - aa_weight = AA_WEIGHTS[reinterpret(UInt8, aa) + 1] weight += aa_weight end end @@ -195,11 +195,11 @@ julia> _molecular_weight(rna"GUCUGACGCG",RNA_WEIGHTS, :triphosphate) function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state = :hydroxyl) weight = 0.0 for nucleotide in nucseq - if array[reinterpret(UInt8, nucleotide) + 1] == -1.0 + nuc_weight = array[reinterpret(UInt8, nucleotide) + 1] + if nuc_weight == -1.0 throw(ArgumentError("nucleotide $nucleotide weight is ambiguous")) else - dna_weight = array[reinterpret(UInt8, nucleotide) + 1] - weight += dna_weight + weight += nuc_weight end end if five_terminal_state == :hydroxyl From 2ecf3bc0237e57c291013153be83aa7e02e77677 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 5 Sep 2025 16:44:15 +0200 Subject: [PATCH 18/18] Change signatures of functions * Switch from LongSequence to BioSequence to broaden the types of sequences that this signature accepts * Switch from DNAAlphabet{4} to <:DNAAlphabet (and similar for RNA alphabets) to also allow 2-bit alphabet sequences * Switch strandedness kwarg to a bool, since we only want to covert two cases * Restrict five_terminal_state to Symbol * The internal _molecular_weight function does not need default values --- src/molecular-weight.jl | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/molecular-weight.jl b/src/molecular-weight.jl index 8f551a5b..1dc1a4d9 100644 --- a/src/molecular-weight.jl +++ b/src/molecular-weight.jl @@ -132,14 +132,16 @@ julia> mmolecular_weight(dna"TCCCAGACTG", :phosphate, :double) ``` """ -function molecular_weight(dna_seq::LongSequence{DNAAlphabet{4}}, five_terminal_state = :hydroxyl, strand_number = :single) - if strand_number == :single +function molecular_weight( + dna_seq::BioSequence{<:DNAAlphabet}, + five_terminal_state::Symbol = :hydroxyl, + double_stranded::Bool = false + ) + if !double_stranded return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) - elseif strand_number == :double + else com_dna_seq = complement(dna_seq) return _molecular_weight(dna_seq, DNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_dna_seq, DNA_WEIGHTS, five_terminal_state) - else - throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) end end @@ -162,14 +164,16 @@ julia> molecular_weight(rna"CGAUUUUCGG", :triphosphate, :double) ``` """ -function molecular_weight(rna_seq::LongSequence{RNAAlphabet{4}}, five_terminal_state = :hydroxyl, strand_number = :single) - if strand_number == :single +function molecular_weight( + rna_seq::BioSequence{<:RNAAlphabet}, + five_terminal_state::Symbol = :hydroxyl, + double_stranded::Bool = false + ) + if !double_stranded == :single return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) - elseif strand_number == :double + else com_rna_seq = complement(rna_seq) return _molecular_weight(rna_seq, RNA_WEIGHTS, five_terminal_state) + _molecular_weight(com_rna_seq, RNA_WEIGHTS, five_terminal_state) - else - throw(ArgumentError("Unknown strand_number $strand_number. Must be :single or :double")) end end @@ -192,7 +196,7 @@ julia> _molecular_weight(rna"GUCUGACGCG",RNA_WEIGHTS, :triphosphate) ``` """ -function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state = :hydroxyl) +function _molecular_weight(nucseq::NucSeq, array::Vector{Float64}, five_terminal_state::Symbol) weight = 0.0 for nucleotide in nucseq nuc_weight = array[reinterpret(UInt8, nucleotide) + 1]