From a4328cbb7ebd60d30e5070322e551c11946c7333 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Thu, 18 Sep 2025 10:27:33 +0100 Subject: [PATCH 01/36] updated seq class: use base class slicing, added full_sequence property --- master | 0 src/pydna/seq.py | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) create mode 100644 master diff --git a/master b/master new file mode 100644 index 00000000..e69de29b diff --git a/src/pydna/seq.py b/src/pydna/seq.py index c595aeef..2ad0f43f 100644 --- a/src/pydna/seq.py +++ b/src/pydna/seq.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -A subclass of the Biopython SeqRecord class. +A subclass of Biopython Bio.Seq.Seq Has a number of extra methods and uses the :class:`pydna._pretty_str.pretty_str` class instread of str for a @@ -34,6 +34,10 @@ class Seq(_Seq): """docstring.""" + @property + def full_sequence(self): + return self + def translate( self, *args, @@ -154,15 +158,17 @@ def seguid(self) -> str: ---------- .. [#] http://wiki.christophchamp.com/index.php/SEGUID """ - return _lsseguid(self._data.decode("utf8").upper(), alphabet="{DNA-extended}") + return _lsseguid( + self._data.decode("ascii").upper(), alphabet="{DNA-extended},AU" + ) - def __getitem__(self, key): - result = super().__getitem__(key) - try: - result.__class__ = self.__class__ - except TypeError: - pass - return result + # def __getitem__(self, key): + # result = super().__getitem__(key) + # try: + # result.__class__ = self.__class__ + # except TypeError: + # pass + # return result def reverse_complement(self): return self.__class__(_rc(self._data)) From c32cabb8a46efb0fbd1cc524e79bd28cb47fbea2 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Sun, 21 Sep 2025 09:09:38 +0100 Subject: [PATCH 02/36] Added override of Bio.Restriction.FormattedSeq._table --- src/pydna/__init__.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/pydna/__init__.py b/src/pydna/__init__.py index 35be864a..2b6bdda8 100644 --- a/src/pydna/__init__.py +++ b/src/pydna/__init__.py @@ -5,6 +5,7 @@ # license. Please see the LICENSE.txt file that should have been included # as part of this package. + """ :copyright: Copyright 2013-2023 by Björn Johansson. All rights reserved. :license: This code is part of the pydna package, governed by the @@ -142,7 +143,7 @@ # import configparser as _configparser # import tempfile as _tempfile from pydna._pretty import PrettyTable as _PrettyTable - +from Bio.Restriction import FormattedSeq __author__ = "Björn Johansson" __copyright__ = "Copyright 2013 - 2023 Björn Johansson" @@ -396,3 +397,18 @@ def logo(): f = Figlet() message = f.renderText(message) return _pretty_str(message) + + +## Override Bio.Restriction.FormattedSeq._table + + +def _make_FormattedSeq_table() -> bytes: + table = bytearray(256) + upper_to_lower = ord("A") - ord("a") + for c in b"ABCDEFGHIJKLMNOPQRSTUVWXYZ": # Only allow IUPAC letters + table[c] = c # map uppercase to uppercase + table[c - upper_to_lower] = c # map lowercase to uppercase + return bytes(table) + + +FormattedSeq._table = _make_FormattedSeq_table() From 00501bab04539bbd0b7b1e6a12b29f5bdecce17b Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Sun, 21 Sep 2025 09:20:45 +0100 Subject: [PATCH 03/36] Added dicts and tables _ambiguous_dna_complement, _keys, _values, _complement_table, _complement_table, to_watson_table, to_crick_table, to_N, to_5tail_table, to_3tail_table, to_full_sequence, bp_dict --- src/pydna/utils.py | 355 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 354 insertions(+), 1 deletion(-) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index 21a226f6..debf5703 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -41,9 +41,362 @@ StrOrBytes = _TypeVar("StrOrBytes", str, bytes) # _module_logger = _logging.getLogger("pydna." + __name__) -_ambiguous_dna_complement.update({"U": "A"}) +# _ambiguous_dna_complement.update({"U": "A"}) +_ambiguous_dna_complement.update( + { # dsIUPAC + "P": "J", # G in top strand, complementary strand empty. + "E": "Z", # A " + "X": "F", # T " + "I": "Q", # C " + "U": "O", # U in top strand, A in complementary strand. + "O": "U", # A in top strand, U in complementary strand. + "J": "P", # top strand empty, G in complementary strand. + "Z": "E", # " A " + "F": "X", # " T " + "Q": "I", # " C " + } +) + +_keys = "".join(_ambiguous_dna_complement.keys()).encode("ASCII") +_values = "".join(_ambiguous_dna_complement.values()).encode("ASCII") +_complement_table = bytes.maketrans(_keys + _keys.lower(), _values + _values.lower()) _complement_table = _maketrans(_ambiguous_dna_complement) +to_watson_table = bytes.maketrans( + b"PEXIQFZJpexiqfzj" b"12" b"UuOoGATCgatc", b"GATC gatc " b" ." b"UuAaGATCgatc" +) + +to_crick_table = bytes.maketrans( + _keys + _keys.lower() + b"PEXIQFZJpexiqfzj" b"12" b"UuOoGATCgatc", + _values + _values.lower() + b" CTAG ctag" b". " b"AaUuCTAGctag", +) + +to_N = rd = bytes.maketrans(b"PEXIpexiQFZJqfzjUuOo", b"NNNNNNNNNNNNNNNNNNNN") +to_5tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj") +to_3tail_table = bytes.maketrans(b"GATCgatc", b"PEXIpexi") +to_full_sequence = bytes.maketrans(b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc") + +bp_dict = { + (b"P", b"Q"): b"G", # P / Q >>---> G + (b"E", b"F"): b"A", # E / F >>---> A + (b"X", b"Z"): b"T", # X / Z >>---> T + (b"I", b"J"): b"C", # I / J >>---> C + (b"p", b"q"): b"g", # p / q >>---> g + (b"e", b"f"): b"a", # e / f >>---> a + (b"x", b"z"): b"t", # x / z >>---> t + (b"i", b"j"): b"c", # i / j >>---> c + (b"p", b"Q"): b"g", # p / Q >>---> g + (b"e", b"F"): b"a", # e / F >>---> a + (b"x", b"Z"): b"t", # x / Z >>---> t + (b"i", b"J"): b"c", # i / J >>---> c + (b"P", b"q"): b"G", # P / q >>---> G + (b"E", b"f"): b"A", # E / f >>---> A + (b"X", b"z"): b"T", # X / z >>---> T + (b"I", b"j"): b"C", # I / j >>---> C + # (b"P", b" "): b"P", # P / q >>---> G + # (b"E", b" "): b"E", # E / f >>---> A + # (b"X", b" "): b"X", # X / z >>---> T + # (b"I", b" "): b"I", # I / j >>---> C + (b"Q", b"P"): b"G", # Q / P >>---> G + (b"F", b"E"): b"A", # F / E >>---> A + (b"Z", b"X"): b"T", # Z / X >>---> T + (b"J", b"I"): b"C", # J / I >>---> C + (b"q", b"p"): b"g", # q / p >>---> g + (b"f", b"e"): b"a", # f / e >>---> a + (b"z", b"x"): b"t", # z / x >>---> t + (b"j", b"i"): b"c", # j / i >>---> c + (b"q", b"P"): b"G", # Q / P >>---> G + (b"f", b"E"): b"A", # F / E >>---> A + (b"z", b"X"): b"T", # Z / X >>---> T + (b"j", b"I"): b"C", # J / I >>---> C + (b"Q", b"p"): b"g", # q / p >>---> g + (b"F", b"e"): b"a", # f / e >>---> a + (b"Z", b"x"): b"t", # z / x >>---> t + (b"J", b"i"): b"c", # j / i >>---> c + # (b" ", b"Q"): b"Q", # Q / P >>---> G + # (b" ", b"F"): b"F", # F / E >>---> A + # (b" ", b"Z"): b"Z", # Z / X >>---> T + # (b" ", b"J"): b"J", # J / I >>---> C + (b"G", b" "): b"P", + (b"A", b" "): b"E", + (b"T", b" "): b"X", + (b"C", b" "): b"I", + (b"g", b" "): b"p", + (b"a", b" "): b"e", + (b"t", b" "): b"x", + (b"c", b" "): b"i", + (b" ", b"G"): b"J", + (b" ", b"A"): b"Z", + (b" ", b"T"): b"F", + (b" ", b"C"): b"Q", + (b" ", b"g"): b"j", + (b" ", b"a"): b"z", + (b" ", b"t"): b"f", + (b" ", b"c"): b"q", + (b"G", b"C"): b"G", + (b"A", b"T"): b"A", + (b"T", b"A"): b"T", + (b"C", b"G"): b"C", + (b"g", b"c"): b"g", + (b"a", b"t"): b"a", + (b"t", b"a"): b"t", + (b"c", b"g"): b"c", + (b"G", b"c"): b"G", + (b"A", b"t"): b"A", + (b"T", b"a"): b"T", + (b"C", b"g"): b"C", + (b"g", b"C"): b"g", + (b"a", b"T"): b"a", + (b"t", b"A"): b"t", + (b"c", b"G"): b"c", + (b"U", b"O"): b"U", + (b"O", b"U"): b"A", + (b"u", b"o"): b"u", + (b"o", b"u"): b"a", + (b"u", b"O"): b"u", + (b"O", b"u"): b"A", + (b"U", b"o"): b"U", + (b"o", b"U"): b"a", + (b"U", b"A"): b"U", + (b"A", b"U"): b"O", + (b"u", b"a"): b"u", + (b"a", b"u"): b"o", + (b"u", b"A"): b"u", + (b"A", b"u"): b"O", + (b"U", b"a"): b"U", + (b"a", b"U"): b"o", + (b"M", b"T"): b"A", + (b"m", b"t"): b"a", + (b"M", b"t"): b"A", + (b"m", b"T"): b"a", + (b"T", b"M"): b"K", + (b"t", b"m"): b"k", + (b"T", b"m"): b"K", + (b"t", b"M"): b"k", + (b"M", b"G"): b"C", + (b"m", b"g"): b"c", + (b"M", b"g"): b"C", + (b"m", b"G"): b"c", + (b"G", b"M"): b"K", + (b"g", b"m"): b"k", + (b"G", b"m"): b"K", + (b"g", b"M"): b"k", + (b"R", b"T"): b"A", + (b"r", b"t"): b"a", + (b"R", b"t"): b"A", + (b"r", b"T"): b"a", + (b"T", b"R"): b"Y", + (b"t", b"r"): b"y", + (b"T", b"r"): b"Y", + (b"t", b"R"): b"y", + (b"R", b"C"): b"G", + (b"r", b"c"): b"g", + (b"R", b"c"): b"G", + (b"r", b"C"): b"g", + (b"C", b"R"): b"Y", + (b"c", b"r"): b"y", + (b"C", b"r"): b"Y", + (b"c", b"R"): b"y", + (b"W", b"T"): b"A", + (b"w", b"t"): b"a", + (b"W", b"t"): b"A", + (b"w", b"T"): b"a", + (b"T", b"W"): b"W", + (b"t", b"w"): b"w", + (b"T", b"w"): b"W", + (b"t", b"W"): b"w", + (b"W", b"A"): b"T", + (b"w", b"a"): b"t", + (b"W", b"a"): b"T", + (b"w", b"A"): b"t", + (b"A", b"W"): b"W", + (b"a", b"w"): b"w", + (b"A", b"w"): b"W", + (b"a", b"W"): b"w", + (b"S", b"G"): b"C", + (b"s", b"g"): b"c", + (b"S", b"g"): b"C", + (b"s", b"G"): b"c", + (b"G", b"S"): b"S", + (b"g", b"s"): b"s", + (b"G", b"s"): b"S", + (b"g", b"S"): b"s", + (b"S", b"C"): b"G", + (b"s", b"c"): b"g", + (b"S", b"c"): b"G", + (b"s", b"C"): b"g", + (b"C", b"S"): b"S", + (b"c", b"s"): b"s", + (b"C", b"s"): b"S", + (b"c", b"S"): b"s", + (b"Y", b"G"): b"C", + (b"y", b"g"): b"c", + (b"Y", b"g"): b"C", + (b"y", b"G"): b"c", + (b"G", b"Y"): b"R", + (b"g", b"y"): b"r", + (b"G", b"y"): b"R", + (b"g", b"Y"): b"r", + (b"Y", b"A"): b"T", + (b"y", b"a"): b"t", + (b"Y", b"a"): b"T", + (b"y", b"A"): b"t", + (b"A", b"Y"): b"R", + (b"a", b"y"): b"r", + (b"A", b"y"): b"R", + (b"a", b"Y"): b"r", + (b"K", b"C"): b"G", + (b"k", b"c"): b"g", + (b"K", b"c"): b"G", + (b"k", b"C"): b"g", + (b"C", b"K"): b"M", + (b"c", b"k"): b"m", + (b"C", b"k"): b"M", + (b"c", b"K"): b"m", + (b"K", b"A"): b"T", + (b"k", b"a"): b"t", + (b"K", b"a"): b"T", + (b"k", b"A"): b"t", + (b"A", b"K"): b"M", + (b"a", b"k"): b"m", + (b"A", b"k"): b"M", + (b"a", b"K"): b"m", + (b"V", b"T"): b"A", + (b"v", b"t"): b"a", + (b"V", b"t"): b"A", + (b"v", b"T"): b"a", + (b"T", b"V"): b"B", + (b"t", b"v"): b"b", + (b"T", b"v"): b"B", + (b"t", b"V"): b"b", + (b"V", b"G"): b"C", + (b"v", b"g"): b"c", + (b"V", b"g"): b"C", + (b"v", b"G"): b"c", + (b"G", b"V"): b"B", + (b"g", b"v"): b"b", + (b"G", b"v"): b"B", + (b"g", b"V"): b"b", + (b"V", b"C"): b"G", + (b"v", b"c"): b"g", + (b"V", b"c"): b"G", + (b"v", b"C"): b"g", + (b"C", b"V"): b"B", + (b"c", b"v"): b"b", + (b"C", b"v"): b"B", + (b"c", b"V"): b"b", + (b"H", b"T"): b"A", + (b"h", b"t"): b"a", + (b"H", b"t"): b"A", + (b"h", b"T"): b"a", + (b"T", b"H"): b"D", + (b"t", b"h"): b"d", + (b"T", b"h"): b"D", + (b"t", b"H"): b"d", + (b"H", b"G"): b"C", + (b"h", b"g"): b"c", + (b"H", b"g"): b"C", + (b"h", b"G"): b"c", + (b"G", b"H"): b"D", + (b"g", b"h"): b"d", + (b"G", b"h"): b"D", + (b"g", b"H"): b"d", + (b"H", b"A"): b"T", + (b"h", b"a"): b"t", + (b"H", b"a"): b"T", + (b"h", b"A"): b"t", + (b"A", b"H"): b"D", + (b"a", b"h"): b"d", + (b"A", b"h"): b"D", + (b"a", b"H"): b"d", + (b"D", b"T"): b"A", + (b"d", b"t"): b"a", + (b"D", b"t"): b"A", + (b"d", b"T"): b"a", + (b"T", b"D"): b"H", + (b"t", b"d"): b"h", + (b"T", b"d"): b"H", + (b"t", b"D"): b"h", + (b"D", b"C"): b"G", + (b"d", b"c"): b"g", + (b"D", b"c"): b"G", + (b"d", b"C"): b"g", + (b"C", b"D"): b"H", + (b"c", b"d"): b"h", + (b"C", b"d"): b"H", + (b"c", b"D"): b"h", + (b"D", b"A"): b"T", + (b"d", b"a"): b"t", + (b"D", b"a"): b"T", + (b"d", b"A"): b"t", + (b"A", b"D"): b"H", + (b"a", b"d"): b"h", + (b"A", b"d"): b"H", + (b"a", b"D"): b"h", + (b"B", b"G"): b"C", + (b"b", b"g"): b"c", + (b"B", b"g"): b"C", + (b"b", b"G"): b"c", + (b"G", b"B"): b"V", + (b"g", b"b"): b"v", + (b"G", b"b"): b"V", + (b"g", b"B"): b"v", + (b"B", b"C"): b"G", + (b"b", b"c"): b"g", + (b"B", b"c"): b"G", + (b"b", b"C"): b"g", + (b"C", b"B"): b"V", + (b"c", b"b"): b"v", + (b"C", b"b"): b"V", + (b"c", b"B"): b"v", + (b"B", b"A"): b"T", + (b"b", b"a"): b"t", + (b"B", b"a"): b"T", + (b"b", b"A"): b"t", + (b"A", b"B"): b"V", + (b"a", b"b"): b"v", + (b"A", b"b"): b"V", + (b"a", b"B"): b"v", + # (b"N", b"C"): b"G", + # (b"n", b"c"): b"g", + # (b"N", b"c"): b"G", + # (b"n", b"C"): b"g", + # (b"C", b"N"): b"N", + # (b"c", b"n"): b"n", + # (b"C", b"n"): b"N", + # (b"c", b"N"): b"n", + # (b"N", b"T"): b"A", + # (b"n", b"t"): b"a", + # (b"N", b"t"): b"A", + # (b"n", b"T"): b"a", + # (b"T", b"N"): b"N", + # (b"t", b"n"): b"n", + # (b"T", b"n"): b"N", + # (b"t", b"N"): b"n", + # (b"N", b"A"): b"T", + # (b"n", b"a"): b"t", + # (b"N", b"a"): b"T", + # (b"n", b"A"): b"t", + # (b"A", b"N"): b"N", + # (b"a", b"n"): b"n", + # (b"A", b"n"): b"N", + # (b"a", b"N"): b"n", + # (b"N", b"G"): b"C", + # (b"n", b"g"): b"c", + # (b"N", b"g"): b"C", + # (b"n", b"G"): b"c", + # (b"G", b"N"): b"N", + # (b"g", b"n"): b"n", + # (b"G", b"n"): b"N", + # (b"g", b"N"): b"n", + (b"N", b"N"): b"N", + (b"n", b"N"): b"n", + (b"N", b"n"): b"N", + (b"n", b"n"): b"n", + # (b" ", b"N"): b"N", + # (b" ", b"n"): b"n", +} + def three_frame_orfs( dna: str, From 1b58d3794f831705af346ce28529e69edf8bb873 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Sun, 21 Sep 2025 11:06:06 +0100 Subject: [PATCH 04/36] 1. import inspect module, tables and dicts from utils module 2. new Dseq.__init__ w same arguments as before, but data is now stored in Bio.Seq.Seq._data 3. altered Dseq.quick classmethod 4. watson, crick and ovhg are methods decorated with @property 5. New method to_blunt_string with returns a the string of the watson strand if the underlying Dseq object was blunt. 6. Old __getitem__ replaced 7. New __repr__ method 8. new looped method 9. new __add__ method --- src/pydna/dseq.py | 652 +++++++++++++++++++++------------------------- 1 file changed, 294 insertions(+), 358 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 6412a176..b4be49ec 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -16,10 +16,12 @@ import copy as _copy -import itertools as _itertools + +# import itertools as _itertools import re as _re import sys as _sys import math as _math +import inspect as _inspect from pydna.seq import Seq as _Seq from Bio.Seq import _translate_str, _SeqAbstractBaseClass @@ -31,13 +33,21 @@ from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap +from pydna.utils import bp_dict +from pydna.utils import to_watson_table +from pydna.utils import to_crick_table +from pydna.utils import to_full_sequence + +# from pydna.utils import to_5tail_table +# from pydna.utils import to_3tail_table from pydna.common_sub_strings import common_sub_strings as _common_sub_strings +from pydna.common_sub_strings import terminal_overlap as _terminal_overlap from Bio.Restriction import RestrictionBatch as _RestrictionBatch from Bio.Restriction import CommOnly -from .types import DseqType, EnzymesType, CutSiteType +from pydna.types import DseqType, EnzymesType, CutSiteType from typing import List as _List, Tuple as _Tuple, Union as _Union @@ -313,28 +323,29 @@ def __init__( if crick is None: if ovhg is not None: raise ValueError("ovhg defined without crick strand!") - crick = _rc(watson) - ovhg = 0 + # Giving only the watson string implies a blunt sequence self._data = bytes(watson, encoding="ASCII") else: # crick strand given - if ovhg is None: # ovhg not given + if ovhg is None: # ovhg not given, try to guess from sequences olaps = _common_sub_strings( str(watson).lower(), str(_rc(crick).lower()), int(_math.log(len(watson)) / _math.log(4)), ) - if len(olaps) == 0: + if len(olaps) == 0: # No overlaps found, strands do not anneal raise ValueError( "Could not anneal the two strands." " Please provide ovhg value" ) - # We extract the positions and length of the first (longest) overlap, since - # common_sub_strings sorts the overlaps by length. - pos_watson, pos_crick, longest_olap_length = olaps[0] + # We extract the positions and length of the first (longest) overlap, + # since common_sub_strings sorts the overlaps by length, longest first. + (pos_watson, pos_crick, longest_olap_length), *rest = olaps # We see if there is another overlap of the same length - if any(olap[2] >= longest_olap_length for olap in olaps[1:]): + # This means that annealing is ambigous. USer should provide + # and ovhg value. + if any(olap[2] >= longest_olap_length for olap in rest): raise ValueError( "More than one way of annealing the" " strands. Please provide ovhg value" @@ -342,79 +353,48 @@ def __init__( ovhg = pos_crick - pos_watson - sns = (ovhg * " ") + _pretty_str(watson) - asn = (-ovhg * " ") + _pretty_str(_rc(crick)) - - self._data = bytes( - "".join( - [ - a.strip() or b.strip() - for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ") - ] - ), - encoding="ASCII", - ) + sense = f"{watson:>{len(watson) + ovhg}}" # pad on left side ovhg spaces + antisense = ( + f"{crick[::-1]:>{len(crick) - ovhg}}" # pad on left side -ovhg spaces + ) - else: # ovhg given - if ovhg == 0: - if len(watson) >= len(crick): - self._data = bytes(watson, encoding="ASCII") - else: - self._data = bytes( - watson + _rc(crick[: len(crick) - len(watson)]), - encoding="ASCII", - ) - elif ovhg > 0: - if ovhg + len(watson) > len(crick): - self._data = bytes( - _rc(crick[-ovhg:]) + watson, encoding="ASCII" - ) - else: - self._data = bytes( - _rc(crick[-ovhg:]) - + watson - + _rc(crick[: len(crick) - ovhg - len(watson)]), - encoding="ASCII", - ) - else: # ovhg < 0 - if -ovhg + len(crick) > len(watson): - self._data = bytes( - watson + _rc(crick[: -ovhg + len(crick) - len(watson)]), - encoding="ASCII", - ) - else: - self._data = bytes(watson, encoding="ASCII") + assert sense == (ovhg * " ") + watson + assert antisense == (-ovhg * " ") + crick[::-1] + + max_len = max(len(sense), len(antisense)) # find out which is longer + + sense = f"{sense:<{max_len}}" # pad on right side to max_len + antisense = f"{antisense:<{max_len}}" # pad on right side to max_len + + data = bytearray() + + for w, c in zip(sense, antisense): + try: + data.extend(bp_dict[w.encode("ascii"), c.encode("ascii")]) + except KeyError as err: + print(f"Base mismatch in representation {err}") + raise + self._data = bytes(data) self.circular = circular - self.watson = _pretty_str(watson) - self.crick = _pretty_str(crick) + # self.watson = _pretty_str(watson) + # self.crick = _pretty_str(crick) self.length = len(self._data) - self.ovhg = ovhg + # self.ovhg = ovhg self.pos = pos @classmethod - def quick( - cls, - watson: str, - crick: str, - ovhg=0, - circular=False, - pos=0, - ): - obj = cls.__new__(cls) # Does not call __init__ - obj.watson = _pretty_str(watson) - obj.crick = _pretty_str(crick) - obj.ovhg = ovhg + def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs): + """Fastest way to instantiate an object of the Dseq class. + + No checks of parameters are made. + Does not call Bio.Seq.Seq.__init__() which has lots of time consuming checks. + """ + obj = cls.__new__(cls) obj.circular = circular - obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) + obj.length = len(data) obj.pos = pos - wb = bytes(watson, encoding="ASCII") - cb = bytes(crick, encoding="ASCII") - obj._data = ( - _rc(cb[-max(0, ovhg) or len(cb) :]) - + wb - + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) - ) + obj._data = data return obj @classmethod @@ -427,34 +407,33 @@ def from_string( **kwargs, ): obj = cls.__new__(cls) # Does not call __init__ - obj.watson = _pretty_str(dna) - obj.crick = _pretty_str(_rc(dna)) - obj.ovhg = 0 obj.circular = circular - # obj._linear = linear obj.length = len(dna) obj.pos = 0 - obj._data = bytes(dna, encoding="ASCII") + obj._data = dna.encode("ASCII") return obj @classmethod def from_representation(cls, dsdna: str, *args, **kwargs): - obj = cls.__new__(cls) # Does not call __init__ - w, c, *r = [ln for ln in dsdna.splitlines() if ln] - ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip())) - watson = obj.watson = _pretty_str(w.strip()) - crick = obj.crick = _pretty_str(c.strip()[::-1]) + obj = cls.__new__(cls) obj.circular = False - # obj._linear = True - obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) obj.pos = 0 - wb = bytes(watson, encoding="ASCII") - cb = bytes(crick, encoding="ASCII") - obj._data = ( - _rc(cb[-max(0, ovhg) or len(cb) :]) - + wb - + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) - ) + clean = _inspect.cleandoc("\n" + dsdna) + watson, crick = [ + ln + for ln in clean.splitlines() + if ln.strip() and not ln.strip().startswith("Dseq(") + ] + watson = f"{watson:<{len(crick)}}" + crick = f"{crick:<{len(watson)}}" + data = bytearray() + for w, c in zip(watson, crick): + try: + data.extend(bp_dict[w.encode("ascii"), c.encode("ascii")]) + except KeyError as err: + print(f"Base mismatch in representation {err}") + raise + obj._data = bytes(data) return obj @classmethod @@ -522,107 +501,79 @@ def from_full_sequence_and_overhangs( return Dseq(watson, crick=crick, ovhg=crick_ovhg) - # @property - # def ovhg(self): - # """The ovhg property. This cannot be set directly, but is a - # consequence of how the watson and crick strands anneal to - # each other""" - # return self._ovhg - - # @property - # def linear(self): - # """The linear property can not be set directly. - # Use an empty slice [:] to create a linear object.""" - # return self._linear - - # @property - # def circular(self): - # """The circular property can not be set directly. - # Use :meth:`looped` to create a circular Dseq object""" - # return self._circular + @property + def watson(self): + """ + The watson (upper) strand of the double stranded fragment 5'-3'. - def mw(self) -> float: - """This method returns the molecular weight of the DNA molecule - in g/mol. The following formula is used:: + Returns + ------- + TYPE + DESCRIPTION. - MW = (A x 313.2) + (T x 304.2) + - (C x 289.2) + (G x 329.2) + - (N x 308.9) + 79.0 """ - nts = (self.watson + self.crick).lower() + return self._data.translate(to_watson_table).strip().decode("ascii") - return ( - 313.2 * nts.count("a") - + 304.2 * nts.count("t") - + 289.2 * nts.count("c") - + 329.2 * nts.count("g") - + 308.9 * nts.count("n") - + 79.0 - ) + @property + def crick(self): + """ + The crick (lower) strand of the double stranded fragment 5'-3'. - def upper(self: DseqType) -> DseqType: - """Return an upper case copy of the sequence. + Returns + ------- + TYPE + DESCRIPTION. - >>> from pydna.dseq import Dseq - >>> my_seq = Dseq("aAa") - >>> my_seq - Dseq(-3) - aAa - tTt - >>> my_seq.upper() - Dseq(-3) - AAA - TTT + """ + return self._data.translate(to_crick_table).strip().decode("ascii")[::-1] + + @property + def ovhg(self): + """ + The 5' overhang of the lower strand compared the the upper. + + See module docstring for more information. Returns ------- - Dseq - Dseq object in uppercase - - See also - -------- - pydna.dseq.Dseq.lower + TYPE + DESCRIPTION. """ - return self.quick( - self.watson.upper(), - self.crick.upper(), - ovhg=self.ovhg, - # linear=self.linear, - circular=self.circular, - pos=self.pos, - ) + ohw = len(_re.match(b"^[PEXIpexi]*", self._data).group(0)) + ohc = len(_re.match(b"^[QFZJqfzj]*", self._data).group(0)) + return -ohw or ohc - def lower(self: DseqType) -> DseqType: - """Return a lower case copy of the sequence. + def to_blunt_string(self): + """ + The sequence as a blunt ended string. - >>> from pydna.dseq import Dseq - >>> my_seq = Dseq("aAa") - >>> my_seq - Dseq(-3) - aAa - tTt - >>> my_seq.lower() - Dseq(-3) - aaa - ttt Returns ------- - Dseq - Dseq object in lowercase + TYPE + DESCRIPTION. - See also - -------- - pydna.dseq.Dseq.upper """ - return self.quick( - self.watson.lower(), - self.crick.lower(), - ovhg=self.ovhg, - # linear=self.linear, - circular=self.circular, - pos=self.pos, + return str(self).translate(to_full_sequence) + + def mw(self) -> float: + """This method returns the molecular weight of the DNA molecule + in g/mol. The following formula is used:: + + MW = (A x 313.2) + (T x 304.2) + + (C x 289.2) + (G x 329.2) + + (N x 308.9) + 79.0 + """ + nts = (self.watson + self.crick).lower() + + return ( + 313.2 * nts.count("a") + + 304.2 * nts.count("t") + + 289.2 * nts.count("c") + + 329.2 * nts.count("g") + + 308.9 * nts.count("n") + + 79.0 ) def find( @@ -671,54 +622,28 @@ def find( return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end) - def __getitem__(self, sl: slice) -> "Dseq": - """Returns a subsequence. This method is used by the slice notation""" - - if not self.circular: - x = len(self.crick) - self.ovhg - len(self.watson) - - sns = (self.ovhg * " " + self.watson + x * " ")[sl] - asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl] - - ovhg = max( - (len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs - ) - - return Dseq( - sns.strip(), - asn[::-1].strip(), - ovhg=ovhg, - # linear=True - ) - else: - sl = slice(sl.start or 0, sl.stop or len(self), sl.step) - if sl.start > len(self) or sl.stop > len(self): - return Dseq("") - if sl.start < sl.stop: - return Dseq( - self.watson[sl], - self.crick[::-1][sl][::-1], - ovhg=0, - # linear=True - ) - else: - try: - stp = abs(sl.step) - except TypeError: - stp = 1 - start = sl.start - stop = sl.stop - - w = ( - self.watson[(start or len(self)) :: stp] - + self.watson[: (stop or 0) : stp] - ) - c = ( - self.crick[len(self) - stop :: stp] - + self.crick[: len(self) - start : stp] - ) - - return Dseq(w, c, ovhg=0) # , linear=True) + def __getitem__(self, sl: [slice, int]) -> "Dseq": + """Method is used by the slice notation""" + if isinstance(sl, int): + # slice(start, stop, step) where stop = start+1 + sl = slice(sl, sl + 1, 1) + sl = slice(sl.start or 0, sl.stop or len(self), sl.step) + if self.circular: + if sl.start is None and sl.stop is None: + # Returns a linear copy using slice obj[:] + return self.quick(self._data[sl]) + elif sl.start == sl.stop: + # Returns a shifted object + shift = sl.start % len(self) + return self.quick(self._data[shift:] + self._data[:shift]) + elif ( + sl.start > sl.stop + and 0 <= sl.start <= len(self) + and 0 <= sl.stop <= len(self) + ): + # Returns the circular slice spanning the origin + return self.quick(self._data[sl.start :] + self._data[: sl.stop]) + return super().__getitem__(sl) def __eq__(self, other: DseqType) -> bool: """Compare to another Dseq object OR an object that implements @@ -739,84 +664,54 @@ def __eq__(self, other: DseqType) -> bool: return same def __repr__(self): - """Returns a representation of the sequence, truncated if - longer than 30 bp""" - - if len(self) > Dseq.trunc: - if self.ovhg > 0: - d = self.crick[-self.ovhg :][::-1] - hej = len(d) - if len(d) > 10: - d = "{}..{}".format(d[:4], d[-4:]) - a = len(d) * " " - - elif self.ovhg < 0: - a = self.watson[: max(0, -self.ovhg)] - hej = len(a) - if len(a) > 10: - a = "{}..{}".format(a[:4], a[-4:]) - d = len(a) * " " - else: - a = "" - d = "" - hej = 0 - - x = self.ovhg + len(self.watson) - len(self.crick) - - if x > 0: - c = self.watson[len(self.crick) - self.ovhg :] - y = len(c) - if len(c) > 10: - c = "{}..{}".format(c[:4], c[-4:]) - f = len(c) * " " - elif x < 0: - f = self.crick[:-x][::-1] - y = len(f) - if len(f) > 10: - f = "{}..{}".format(f[:4], f[-4:]) - c = len(f) * " " - else: - c = "" - f = "" - y = 0 - - L = len(self) - hej - y - x1 = -min(0, self.ovhg) - x2 = x1 + L - x3 = -min(0, x) - x4 = x3 + L - - b = self.watson[x1:x2] - e = self.crick[x3:x4][::-1] - - if len(b) > 10: - b = "{}..{}".format(b[:4], b[-4:]) - e = "{}..{}".format(e[:4], e[-4:]) - - return _pretty_str( - "{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}" - ).format( - klass=self.__class__.__name__, - top={False: "-", True: "o"}[self.circular], - size=len(self), - a=a, - b=b, - c=c, - d=d, - e=e, - f=f, - ) - else: - return _pretty_str( - "{}({}{})\n{}\n{}".format( - self.__class__.__name__, - {False: "-", True: "o"}[self.circular], - len(self), - self.ovhg * " " + self.watson, - -self.ovhg * " " + self.crick[::-1], - ) + header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})" + # m = _re.match( + # b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUOgatcuo])(.*)(?<=[GATCUOgatcuo])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", + # self._data, + # ) + m = _re.match( + b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUONgatcuon])(.*)(?<=[GATCUONgatcuon])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", + self._data, + ) + result = m.groups() if m else (b"",) * 6 + sticky_left5, sticky_left3, middle, sticky_right5, sticky_right3, single = ( + result + ) + if len(self) > self.trunc: + sticky_left5 = ( + sticky_left5[:4] + b"22" + sticky_left5[-4:] + if sticky_left5 and len(sticky_left5) > 10 + else sticky_left5 + ) + sticky_left3 = ( + sticky_left3[:4] + b"11" + sticky_left3[-4:] + if sticky_left3 and len(sticky_left3) > 10 + else sticky_left3 + ) + middle = ( + middle[:4] + b".." + middle[-4:] + if middle and len(middle) > 10 + else middle ) + sticky_right5 = ( + sticky_right5[:4] + b"22" + sticky_right5[-4:] + if sticky_right5 and len(sticky_right5) > 10 + else sticky_right5 + ) + sticky_right3 = ( + sticky_right3[:4] + b"11" + sticky_right3[-4:] + if sticky_right3 and len(sticky_right3) > 10 + else sticky_right3 + ) + r = ( + (sticky_left5 or sticky_left3 or b"") + + (middle or b"") + + (sticky_right5 or sticky_right3 or single or b"") + ) + return _pretty_str( + f"{header}\n{r.translate(to_watson_table).decode().rstrip()}\n{r.translate(to_crick_table).decode().rstrip()}" + ) def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. @@ -839,12 +734,7 @@ def reverse_complement(self) -> "Dseq": >>> """ - return Dseq.quick( - self.crick, - self.watson, - ovhg=len(self.watson) - len(self.crick) + self.ovhg, - circular=self.circular, - ) + return Dseq(_rc(self._data), circular=self.circular) rc = reverse_complement # alias for reverse_complement @@ -876,19 +766,21 @@ def looped(self: DseqType) -> DseqType: Dseq(o8) catcgatc gtagctag - >>> a.T4("t") + >>> b = Dseq("iatcgatj") + >>> b Dseq(-8) catcgat tagctag - >>> a.T4("t").looped() + >>> b.looped() Dseq(o7) catcgat gtagcta - >>> a.T4("a") + >>> c = Dseq("ietcgazj") + >>> c Dseq(-8) catcga agctag - >>> a.T4("a").looped() + >>> c.looped() Traceback (most recent call last): File "", line 1, in File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped @@ -900,23 +792,38 @@ def looped(self: DseqType) -> DseqType: """ if self.circular: return _copy.deepcopy(self) - type5, sticky5 = self.five_prime_end() - type3, sticky3 = self.three_prime_end() - if type5 == type3 and str(sticky5) == str(_rc(sticky3)): - nseq = self.__class__.quick( - self.watson, - self.crick[-self.ovhg :] + self.crick[: -self.ovhg], - ovhg=0, - # linear=False, - circular=True, - ) - # assert len(nseq.crick) == len(nseq.watson) - return nseq - else: + + ms = _re.fullmatch(b"(?:.*)([GATCgatc])([QFZJqfzj]+|[PEXIpexi]+)", self._data) + mo = _re.fullmatch(b"([PEXIpexi]+|[QFZJqfzj]+)([GATCgatc])(?:.*)", self._data) + + sticky_left = ms.group(2) if ms else b"" + sticky_right = mo.group(1) if mo else b"" + + sticky_left_just = bytes(f"{sticky_left.decode():<{len(sticky_right)}}", "utf8") + sticky_right_just = bytes( + f"{sticky_right.decode():>{len(sticky_left)}}", "utf8" + ) + + assert len(sticky_left_just) == len(sticky_right_just) + + junction = b"".join( + [ + bp_dict.get((bytes([w]), bytes([c])), b"-") + for w, c in zip(sticky_left_just, sticky_right_just) + ] + ) + + if b"-" in junction: raise TypeError( "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!" ) + return self.__class__( + junction + + self._data[len(sticky_left) or None : -len(sticky_right) or None], + circular=True, + ) + def tolinear(self: DseqType) -> DseqType: # pragma: no cover """Returns a blunt, linear copy of a circular Dseq object. This can only be done if the Dseq object is circular, otherwise a @@ -1056,25 +963,8 @@ def watson_ovhg(self) -> int: """Returns the overhang of the watson strand at the three prime.""" return len(self.watson) - len(self.crick) + self.ovhg - def __add__(self: DseqType, other: DseqType) -> DseqType: - """Simulates ligation between two DNA fragments. - - Add other Dseq object at the end of the sequence. - Type error is raised if any of the points below are fulfilled: - - * one or more objects are circular - * if three prime sticky end of self is not the same type - (5' or 3') as the sticky end of other - * three prime sticky end of self complementary with five - prime sticky end of other. + def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: - Phosphorylation and dephosphorylation is not considered. - - DNA is allways presumed to have the necessary 5' phospate - group necessary for ligation. - - """ - # test for circular DNA if self.circular: raise TypeError("circular DNA cannot be ligated!") try: @@ -1083,20 +973,68 @@ def __add__(self: DseqType, other: DseqType) -> DseqType: except AttributeError: pass - self_type, self_tail = self.three_prime_end() - other_type, other_tail = other.five_prime_end() + if not other: + return _copy.deepcopy(self) - if self_type == other_type and str(self_tail) == str(_rc(other_tail)): - answer = Dseq.quick( - self.watson + other.watson, other.crick + self.crick, self.ovhg - ) elif not self: - answer = _copy.deepcopy(other) - elif not other: - answer = _copy.deepcopy(self) + return _copy.deepcopy(other) + + ms = _re.search(b"([PEXIpexiQFZJqfzj]+)$", self._data) + mo = _re.match(b"^([PEXIpexiQFZJqfzj]+)", other._data) + + sticky_self = ms.group() if ms else b"" + sticky_other = mo.group() if mo else b"" + + sticky_self_just = bytes(f"{sticky_self.decode():<{len(sticky_other)}}", "utf8") + sticky_other_just = bytes( + f"{sticky_other.decode():>{len(sticky_self)}}", "utf8" + ) + + assert len(sticky_self_just) == len(sticky_other_just) + + if perfectmatch: + + junction = b"".join( + [ + bp_dict.get((bytes([w]), bytes([c])), b"-") + for w, c in zip(sticky_self_just, sticky_other_just) + ] + ) + + if b"-" in junction: + raise TypeError("sticky ends not compatible!") + else: - raise TypeError("sticky ends not compatible!") - return answer + + result = _terminal_overlap( + sticky_self.translate(to_full_sequence).decode("ascii"), + sticky_other.translate(to_full_sequence).decode("ascii") + "@", + limit=1, + ) + + if not result: + return None + + (startx, starty, length), *_ = result + + sticky_self = sticky_self[startx : startx + length] + sticky_other = sticky_other[starty : starty + length] + + junction = b"".join( + [ + bp_dict.get((bytes([w]), bytes([c])), b"-") + for w, c in zip(sticky_self, sticky_other) + ] + ) + + return self.__class__( + self._data[: -len(sticky_self) or None] + + junction + + other._data[len(sticky_other) or None :] + ) + + def __add__(self: DseqType, other: DseqType) -> DseqType: + return self._add(other, perfectmatch=True) def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): @@ -1766,13 +1704,11 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) return Dseq( - str(self[left_watson:right_watson]), + self[left_watson:right_watson].to_blunt_string(), # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case - str( - self.reverse_complement()[ - len(self) - right_crick : len(self) - left_crick - ] - ), + self.rc()[ + len(self) - right_crick : len(self) - left_crick + ].to_blunt_string(), ovhg=ovhg_left, ) From ddf6acc412fcec46ed4287a63d98e1bfc2f632dd Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Sun, 21 Sep 2025 11:30:10 +0100 Subject: [PATCH 05/36] Dseq + empty string now returns the Dseq obj unchanged. collected all imports at the top. Some tests involved strands that did not anneal prefectly, these have been corrected. --- tests/test_module_dseq.py | 196 ++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 95 deletions(-) diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index 770fe9bf..c2d56f0a 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -2,10 +2,21 @@ # -*- coding: utf-8 -*- import pytest +import textwrap +from pydna.dseq import Dseq +from pydna.utils import eq + +from Bio.Restriction import ( + Acc65I, ApaI, BamHI, BglII, BsaI, Bsp120I, EcoRI, EcoRV, + KpnI, MaeII, NmeDI, NotI, PacI, PstI, RestrictionBatch, TaiI +) + +from seguid import ldseguid +from seguid import cdseguid def test_cut1(): - from pydna.dseq import Dseq + """ Acc65I.search(_Seq("GGTACC")) @@ -21,7 +32,7 @@ def test_cut1(): Out [12]: [6] """ - from Bio.Restriction import Acc65I, Bsp120I, KpnI, ApaI, TaiI, MaeII + """ @@ -76,7 +87,7 @@ def test_cut1(): def test_cas9(): - from pydna.dseq import Dseq + s = Dseq("gattcatgcatgtagcttacgtagtct") @@ -86,8 +97,8 @@ def test_cas9(): def test_initialization(): - import pytest - from pydna.dseq import Dseq + + obj = Dseq(b"aaa") assert obj.crick == "ttt" @@ -156,9 +167,6 @@ def test_initialization(): def test_cut_around_and_religate(): - from pydna.dseq import Dseq - from pydna.utils import eq - from Bio.Restriction import KpnI, BamHI, Acc65I def cut_and_religate_Dseq(seq_string, enz, top): ds = Dseq(seq_string, circular=not top) @@ -190,17 +198,13 @@ def cut_and_religate_Dseq(seq_string, enz, top): ("aaaGGTACCcccGGATCCCgggGGTACCttt", [BamHI, KpnI], True), ] - for s in seqs: - print(s) - sek, enz, lin = s + for sek, enz, lin in seqs: for i in range(len(sek)): zek = sek[i:] + sek[:i] cut_and_religate_Dseq(zek, enz, lin) def test_Dseq_cutting_adding(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI, PstI, EcoRI a = Dseq( "GGATCCtcatctactatcatcgtagcgtactgatctattctgctgctcatcatcggtactctctataattatatatatatgcgcgtGGATCC", @@ -236,8 +240,6 @@ def test_Dseq_cutting_adding(): def test_dseq(): - import textwrap - from pydna.dseq import Dseq obj1 = Dseq("a", "t", circular=True) obj2 = Dseq("a", "t") @@ -251,8 +253,12 @@ def test_dseq(): with pytest.raises(TypeError): obj1 + "" - with pytest.raises(AttributeError): - obj2 + "" + + # with pytest.raises(AttributeError): + # obj2 + "" + + assert obj2 + "" == obj2 + assert "" + obj2 == obj2 obj1 = Dseq("at", "t") obj2 = Dseq("a", "t") @@ -289,15 +295,18 @@ def test_dseq(): assert obj.five_prime_end() == ("3'", "c") obj = Dseq("ccGGATCC", "aaggatcc", -2) - assert obj._data == b"ccGGATCCtt" + # assert obj._data == b"ccGGATCCtt" + assert obj._data == b"iiGGATCCzz" assert str(obj.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) ccGGATCC - cctaggaa + CCTAGGaa """ ).strip() + + assert repr(obj) == rpr assert obj[3] == Dseq("G", "c", 0) @@ -343,53 +352,54 @@ def test_dseq(): assert Dseq("GGATCC", "GGATCC").t4("gat") == Dseq("ggat", "ggat", ovhg=-2) a2 = Dseq("ccGGATCCaa", "ggatcc", -2) - assert a2._data == b"ccGGATCCaa" - assert a2._data == b"ccGGATCCaa" + # assert a2._data == b"ccGGATCCaa" + assert a2._data == b"iiGGATCCee" assert str(a2.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) ccGGATCCaa - cctagg + CCTAGG """ ).strip() + assert repr(a2) == rpr a3 = Dseq("ccGGATCC", "ggatcc", -2) - assert a3._data == b"ccGGATCC" - assert a3._data == b"ccGGATCC" + # assert a3._data == b"ccGGATCC" + assert a3._data == b"iiGGATCC" assert str(a3.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-8) ccGGATCC - cctagg + CCTAGG """ ).strip() assert repr(a3) == rpr b = Dseq("GGATCC", "aaggatcccc", 2) - assert b._data == b"ggGGATCCtt" - assert b._data == b"ggGGATCCtt" + # assert b._data == b"ggGGATCCtt" + assert b._data == b"qqGGATCCzz" assert str(b.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) GGATCC - cccctaggaa + ccCCTAGGaa """ ).strip() assert repr(b) == rpr b2 = Dseq("GGATCCaa", "ggatcccc", 2) - assert b2._data == b"ggGGATCCaa" - assert b2._data == b"ggGGATCCaa" + # assert b2._data == b"ggGGATCCaa" + assert b2._data == b"qqGGATCCee" assert str(b2.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-10) GGATCCaa - cccctagg + ccCCTAGG """ ).strip() assert repr(b2) == rpr @@ -397,46 +407,46 @@ def test_dseq(): assert b2.rc().seguid() == "ldseguid=F0z-LxHZqAK3HvqQiqjM7A28daE" b3 = Dseq("GGATCC", "ggatcccc", 2) - assert b3._data == b"ggGGATCC" - assert b3._data == b"ggGGATCC" + # assert b3._data == b"ggGGATCC" + assert b3._data == b"qqGGATCC" assert str(b3.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-8) GGATCC - cccctagg + ccCCTAGG """ ).strip() assert repr(b3) == rpr c = Dseq("GGATCCaaa", "ggatcc", 0) - assert c._data == b"GGATCCaaa" - assert c._data == b"GGATCCaaa" + # assert c._data == b"GGATCCaaa" + assert c._data == b"GGATCCeee" assert str(c.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-9) GGATCCaaa - cctagg + CCTAGG """ ).strip() assert repr(c) == rpr d = Dseq("GGATCC", "aaaggatcc", 0) - assert d._data == b"GGATCCttt" - assert d._data == b"GGATCCttt" + # assert d._data == b"GGATCCttt" + assert d._data == b"GGATCCzzz" assert str(d.mung()) == "GGATCC" rpr = textwrap.dedent( """ Dseq(-9) GGATCC - cctaggaaa + CCTAGGaaa """ ).strip() assert repr(d) == rpr obj = Dseq("GGATCCaaa", "ggatcc", 0) - from Bio.Restriction import BamHI + frag1 = Dseq("G", "gatcc", 0) frag2 = Dseq("GATCCaaa", "g", -4) @@ -470,11 +480,11 @@ def test_dseq(): obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "CCCttagaccagatcacatactacagctacgcta") - assert repr(obj) == "Dseq(-34)\ntagc..ctaa \natcg..gattCCC" + assert repr(obj) == "Dseq(-34)\ntagc..ctaa\natcg..gattCCC" obj = Dseq("tagcgtagctgtagtatgtgatctggtctaaCCC", "ttagaccagatcacatactacagctacgcta") - assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt " + assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt" obj = Dseq("agcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta") assert repr(obj) == "Dseq(-31)\n agcg..ctaa\natcgc..gatt" @@ -512,7 +522,7 @@ def test_dseq(): assert Dseq("tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta").looped() == obj1 - from Bio.Restriction import BglII, BamHI + obj = Dseq("ggatcc") @@ -529,7 +539,7 @@ def test_dseq(): assert BamHI in obj.n_cutters(1) assert BamHI in obj.cutters() - from Bio.Restriction import RestrictionBatch + rb = RestrictionBatch((BamHI, BglII)) @@ -553,8 +563,8 @@ def test_dseq(): def test_Dseq_slicing(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI + + a = Dseq("ggatcc", "ggatcc", 0) @@ -569,8 +579,8 @@ def test_Dseq_slicing(): def test_Dseq_slicing2(): - from pydna.dseq import Dseq - from Bio.Restriction import BamHI, EcoRI, KpnI + + a = Dseq("aaGGATCCnnnnnnnnnGAATTCccc", circular=True) # TODO: address this test change Related to https://github.com/pydna-group/pydna/issues/78 @@ -588,7 +598,6 @@ def test_Dseq_slicing2(): def test_Dseq___getitem__(): # test the slicing - from pydna.dseq import Dseq s = Dseq("GGATCC", circular=False) assert s[1:-1] == Dseq("GATC", circular=False) @@ -603,6 +612,8 @@ def test_Dseq___getitem__(): assert s[9:1] == Dseq("") assert t[9:1] == Dseq("") + s[-1:9] + # Indexing of full circular molecule (https://github.com/pydna-group/pydna/issues/161) s = Dseq("GGATCC", circular=True) str_seq = str(s) @@ -611,8 +622,8 @@ def test_Dseq___getitem__(): def test_cut_circular(): - from pydna.dseq import Dseq - from Bio.Restriction import BsaI, KpnI, Acc65I, NotI + + test = "aaaaaaGGTACCggtctcaaaa" @@ -637,35 +648,35 @@ def test_cut_circular(): def test_repr(): - from pydna.dseq import Dseq + a = Dseq("gattcgtatgctgatcgtacgtactgaaaac") assert repr(a) == "Dseq(-31)\ngatt..aaac\nctaa..tttg" - b = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "gactagcatgcatgacttttc"[::-1]) + b = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "gactagcatgcatgacttttg"[::-1]) - assert repr(b) == "Dseq(-31)\ngattcgtatgctga..aaac\n gact..tttc" + assert repr(b) == "Dseq(-31)\ngattcgtatgctga..aaac\n gact..tttg" - c = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "actagcatgcatgacttttc"[::-1]) + c = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "actagcatgcatgacttttg"[::-1]) - assert repr(c) == "Dseq(-31)\ngatt..atgctgat..aaac\n acta..tttc" + assert repr(c) == "Dseq(-31)\ngatt..atgctgat..aaac\n acta..tttg" d = Dseq("gattcgtatgctgatcgtacg", "gactagcatgc"[::-1]) assert repr(d) == "Dseq(-21)\ngattcgtatgctgatcgtacg\n gactagcatgc" - e = Dseq("gactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaac"[::-1]) + e = Dseq("gactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaag"[::-1]) - assert repr(e) == "Dseq(-31)\n gact..tttc\ngattcgtatgctga..aaac" + assert repr(e) == "Dseq(-31)\n gact..tttc\ngattcgtatgctga..aaag" - f = Dseq("Ggactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaac"[::-1]) + f = Dseq("Cgactagcatgcatgacttttc", "gattcgtatgctgatcgtacgtactgaaaag"[::-1]) - assert repr(f) == "Dseq(-31)\n Ggac..tttc\ngattcgtatgctg..aaac" + assert repr(f) == "Dseq(-31)\n Cgac..tttc\ngattcgtatGctg..aaag" g = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "ctaagcatacgactagc"[::-1]) - assert repr(g) == "Dseq(-31)\ngatt..atcgtacg..aaac\nctaa..tagc " + assert repr(g) == "Dseq(-31)\ngatt..atcgtacg..aaac\nctaa..tagc" h = Dseq("cgtatgctgatcgtacgtactgaaaac", "gcatacgactagc"[::-1]) @@ -673,15 +684,15 @@ def test_repr(): i = Dseq("cgtatgctgatcgtacgtactgaaaacagact", "gcatacgactagc"[::-1]) - assert repr(i) == "Dseq(-32)\ncgta..atcgtacg..gact\ngcat..tagc " + assert repr(i) == "Dseq(-32)\ncgta..atcgtacg..gact\ngcat..tagc" - j = Dseq("gattcgtatgctgatcgtacgtactgaaaac", "acAAGGAGAGAtg", ovhg=11) + j = Dseq("gtttcgtatgctgatcgtacgtactgaaaac", "acAAGGAGAGAtg", ovhg=11) - assert repr(j) == "Dseq(-42)\n gattcg..aaac\ngtAG..GGAAca " + assert repr(j) == "Dseq(-42)\n gtttcg..aaac\ngtAG..GGAAca" k = Dseq("g", "gattcgtatgctgatcgtacgtactgaaaac", ovhg=0) - assert repr(k) == "Dseq(-31)\ng \ncaaaa..ttag" + assert repr(k) == "Dseq(-31)\ng\ncaaaa..ttag" x = Dseq("gattcgtatgctgatcgtacgtactgaaaa") @@ -697,7 +708,7 @@ def test_repr(): def test_shifted(): - from pydna.dseq import Dseq + a = Dseq("gatc", circular=True) @@ -719,7 +730,7 @@ def test_looped(): # Looping a circular sequence should return a copy of the sequence # not the same sequence - from pydna.dseq import Dseq + a = Dseq("gatc", circular=True) @@ -728,11 +739,11 @@ def test_looped(): def test_misc(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg", circular=True) - from Bio.Restriction import NotI + a, b = x.cut(NotI) @@ -742,11 +753,11 @@ def test_misc(): def test_cut_missing_enzyme(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg") - from Bio.Restriction import PstI + assert x.cut(PstI) == () @@ -756,7 +767,7 @@ def test_cut_missing_enzyme(): def test_cut_with_no_enzymes(): - from pydna.dseq import Dseq + x = Dseq("ctcgGCGGCCGCcagcggccg") @@ -768,7 +779,7 @@ def test_cut_with_no_enzymes(): def test_transcribe(): - from pydna.dseq import Dseq + x = Dseq("ATGAAATAA") @@ -778,7 +789,7 @@ def test_transcribe(): def test_translate(): - from pydna.dseq import Dseq + x = Dseq("ATGAAATAA") @@ -788,7 +799,7 @@ def test_translate(): def test_from_full_sequence_and_overhangs(): - from pydna.dseq import Dseq + test_cases = [ (2, 2, "AAAA", "TTTT"), @@ -807,7 +818,7 @@ def test_from_full_sequence_and_overhangs(): def test_right_end_position(): - from pydna.dseq import Dseq + test_cases = [ ("AAA", "TT", (3, 2)), @@ -821,12 +832,12 @@ def test_right_end_position(): def test_left_end_position(): - from pydna.dseq import Dseq + test_cases = [ ("AAA", "TT", (0, 1), -1), ("AA", "TTT", (1, 0), 1), - ("AAT", "TTT", (0, 0), 0), + ("AAA", "TTT", (0, 0), 0), ] for watson, crick, expected, ovhg in test_cases: dseq = Dseq(watson, crick, ovhg=ovhg, circular=False) @@ -834,8 +845,8 @@ def test_left_end_position(): def test_apply_cut(): - from pydna.dseq import Dseq - from Bio.Restriction import EcoRI, BamHI + + seq = Dseq("aaGAATTCaa", circular=False) @@ -917,33 +928,32 @@ def test_apply_cut(): # Rotating the sequence, apply the same cut seq = Dseq("acgtATGaatt", circular=True) for shift in range(len(seq)): + seq_shifted = seq.shifted(shift) + start = 4 - shift if start < 0: start += len(seq) # Cut with negative ovhg new_cut = ((start, -3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgtATG" + assert out.to_blunt_string() == "ATGaattacgtATG" # Cut with positive ovhg start = (start + 3) % len(seq) new_cut = ((start, 3), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgtATG" + assert out.to_blunt_string() == "ATGaattacgtATG" # A blunt cut start = 4 - shift new_cut = ((start, 0), None) out = seq_shifted.apply_cut(new_cut, new_cut) - assert str(out) == "ATGaattacgt" + assert out.to_blunt_string() == "ATGaattacgt" def test_cutsite_is_valid(): - from pydna.dseq import Dseq - from Bio.Restriction import EcoRI, PacI, NmeDI, EcoRV - # Works for circular case seqs = ["GAATTC", "TTAATTAAC", "GATATC"] enzs = [EcoRI, PacI, EcoRV] @@ -1007,7 +1017,7 @@ def test_cutsite_is_valid(): def test_get_cutsite_pairs(): - from pydna.dseq import Dseq + # in the test, we replace cuts by integers for clarity. @@ -1036,7 +1046,7 @@ def test_get_cutsite_pairs(): def test_get_cut_parameters(): - from pydna.dseq import Dseq + dseq = Dseq.from_full_sequence_and_overhangs("aaaACGTaaa", 3, 3) assert dseq.get_cut_parameters(None, True) == (*dseq.left_end_position(), dseq.ovhg) @@ -1079,8 +1089,8 @@ def test_get_cut_parameters(): def test_checksums(): - from seguid import ldseguid, cdseguid - from pydna.dseq import Dseq + + # AT # TA @@ -1119,7 +1129,3 @@ def test_checksums(): truth = "cdseguid=5fHMG19IbYxn7Yr7_sOCkvaaw7U" assert cdseguid("ACGTT", "AACGT") == truth == Dseq("ACGTT", "AACGT", circular=True).seguid() assert cdseguid("AACGT", "ACGTT") == truth == Dseq("AACGT", "ACGTT", circular=True).seguid() - - -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) From 11883207d26b7e9c200aeb600800dd4ff2e02cce Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:38:38 +0100 Subject: [PATCH 06/36] 1. A new CircularString class. Work in progress, should be natively bytestrings 2. user method that removes U and leaves an empty site. 3. cast_to_ds_right, cast_to_ds_left methods, these are *not* fill_in methods as they do not rely on a polymerase. 4. New melt method, useful for USER cloning etc.. 5. reimplemented apply_cut method --- src/pydna/dseq.py | 371 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 354 insertions(+), 17 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index b4be49ec..15dc6550 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -16,8 +16,6 @@ import copy as _copy - -# import itertools as _itertools import re as _re import sys as _sys import math as _math @@ -30,6 +28,7 @@ from seguid import ldseguid as _ldseguid from seguid import cdseguid as _cdseguid +# from pydna.utils import complement as _complement from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap @@ -37,10 +36,8 @@ from pydna.utils import to_watson_table from pydna.utils import to_crick_table from pydna.utils import to_full_sequence - -# from pydna.utils import to_5tail_table -# from pydna.utils import to_3tail_table - +from pydna.utils import to_5tail_table +from pydna.utils import to_3tail_table from pydna.common_sub_strings import common_sub_strings as _common_sub_strings from pydna.common_sub_strings import terminal_overlap as _terminal_overlap from Bio.Restriction import RestrictionBatch as _RestrictionBatch @@ -52,6 +49,88 @@ from typing import List as _List, Tuple as _Tuple, Union as _Union +class CircularString(str): + """ + A circular string: indexing and slicing wrap around the origin (index 0). + + Examples: + s = CircularString("ABCDE") + + # Integer indexing (wraps) + assert s[7] == "C" # 7 % 5 == 2 + assert s[-1] == "E" + + # Forward circular slices (wrap when stop <= start) + assert s[3:1] == "DEA" # 3,4,0 + assert s[1:1] == "BCDE" # full turn starting at 1, excluding 1 on next lap + assert s[:] == "ABCDE" # full string + assert s[::2] == "ACE" # every 2nd char, no wrap needed + assert s[4:2:2] == "EA" # 4,0 + + # Reverse circular slices + assert s[1:1:-1] == "BAEDC" # full reverse circle starting at 1 + assert s[2:4:-1] == "CBA" # 2,1,0 + + # Steps > 1 and negatives work with wrap as expected + assert s[0:0:2] == "ACE" + assert s[0:0:-2] == "AECBD" + """ + + def __new__(cls, value): + return super().__new__(cls, value) + + def __getitem__(self, key): + n = len(self) + if n == 0: + # Behave like str: indexing raises, slicing returns empty + if isinstance(key, slice): + return self.__class__("") + raise IndexError("CircularString index out of range (empty string)") + + if isinstance(key, int): + # Wrap single index + return super().__getitem__(key % n) + + if isinstance(key, slice): + start, stop, step = key.start, key.stop, key.step + step = 1 if step is None else step + if step == 0: + raise ValueError("slice step cannot be zero") + + # Defaults that mimic normal slicing but on an infinite tiling + if step > 0: + start = 0 if start is None else start + stop = n if stop is None else stop + # Ensure we move forward; if stop <= start, wrap once. + while stop <= start: + stop += n + rng = range(start, stop, step) + else: + # step < 0 + start = (n - 1) if start is None else start + stop = -1 if stop is None else stop + # Ensure we move backward; if stop >= start, wrap once (backwards). + while stop >= start: + stop -= n + rng = range(start, stop, step) + + # Map to modulo-n and collect + # Cap at one full lap for safety (no infinite loops) + limit = n if step % n == 0 else n * 2 # generous cap for large steps + out = [] + count = 0 + for i in rng: + out.append(super().__getitem__(i % n)) + count += 1 + if count > limit: + break # should never happen with the above normalization + + return self.__class__("".join(out)) + + # Fallback (shouldn't be reached) + return super().__getitem__(key) + + class Dseq(_Seq): """Dseq holds information for a double stranded DNA fragment. @@ -434,6 +513,7 @@ def from_representation(cls, dsdna: str, *args, **kwargs): print(f"Base mismatch in representation {err}") raise obj._data = bytes(data) + obj.length = len(data) return obj @classmethod @@ -555,7 +635,9 @@ def to_blunt_string(self): DESCRIPTION. """ - return str(self).translate(to_full_sequence) + return self._data.translate(to_full_sequence).decode("ascii") + + __str__ = to_blunt_string def mw(self) -> float: """This method returns the molecular weight of the DNA molecule @@ -622,6 +704,11 @@ def find( return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end) + def __contains__(self, item: [str, bytes]): + if isinstance(item, bytes): + item = item.decode("ascii") + return item in self.to_blunt_string() + def __getitem__(self, sl: [slice, int]) -> "Dseq": """Method is used by the slice notation""" if isinstance(sl, int): @@ -1409,6 +1496,20 @@ def terminal_transferase(self, nucleotides="a") -> "Dseq": ovhg += len(nucleotides) return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg) + def user(self): + """ + USER Enzyme is a mixture of Uracil DNA glycosylase (UDG) and the + DNA glycosylase-lyase Endonuclease VIII. UDG catalyses the excision + of an uracil base, forming an abasic (apyrimidinic) site while leaving + the phosphodiester backbone intact (2,3). + + Returns + ------- + None. + + """ + return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"FfEe"))) + def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. If there are no cuts, an empty list is returned. @@ -1614,6 +1715,154 @@ def right_end_position(self) -> _Tuple[int, int]: return len(self) + self.watson_ovhg(), len(self) return len(self), len(self) - self.watson_ovhg() + def get_ss_meltsites(self: DseqType, length) -> _List[CutSiteType]: + + if length < 1: + return () + + regex = _re.compile( + ( + f"(?P((?<=[PEXIpexi]))([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])))|" + f"(?P((?<=[QFZJqfzj]))([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])))" + ).encode("ascii") + ) + + if self.circular: + + spacer = length + + cutfrom = self._data[-length:] + self._data + self._data[:length] + + else: + + spacer = 0 + + cutfrom = self._data + + watsoncuts = [] + crickcuts = [] + + for m in regex.finditer(cutfrom): + + if m.lastgroup == "watson": + cut1 = m.start() + spacer + cut2 = m.end() + spacer + watsoncuts.append((cut1, cut2)) + else: + assert m.lastgroup == "crick" + cut1 = m.start() + spacer + cut2 = m.end() + spacer + crickcuts.append((cut1, cut2)) + + return watsoncuts, crickcuts + + def get_ds_meltsites(self: DseqType, length) -> _List[CutSiteType]: + """Double stranded DNA melt sites + + Returns a sorted list (by distance from the left) of tuples. Each tuple (`((cut_watson, ovhg), enz)`) + contains a tuple containing the cut position and the overhang value for the enzyme followed by the + an enzyme object. + + """ + + if length < 1: + return () + + regex = _re.compile( + ( + f"(?P((?<=[PEXIpexi])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])|$))|" + f"(?P((?<=[QFZJqfzj])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])|$))" + ).encode("ascii") + ) + + if self.circular: + + spacer = length + + cutfrom = self._data[-length:] + self._data + self._data[:length] + + else: + + spacer = 0 + + cutfrom = self._data + + cuts = [] + + for m in regex.finditer(cutfrom): + + if m.lastgroup == "watson": + cut = (m.end() - spacer, m.end() - m.start()), None + else: + assert m.lastgroup == "crick" + cut = (m.start() - spacer, m.start() - m.end()), None + + cuts.append(cut) + + return cuts + + def cast_to_ds_right(self): + """ + NNNNQFZJ + + NNNN---- + NNNNCTAG + + NNNNGATC + NNNNCTAG + + + + NNNNPEXI + + NNNNGATC + NNNN---- + + NNNNGATC + NNNNCTAG + + """ + + def replace(m): + return m.group(1).translate(to_full_sequence) + + # Not using f-strings below to avoid bytes/string conversion + return self.__class__( + _re.sub(b"(?<=[GATCgatc])([PEXIpexiQFZJqfzj]+)$", replace, self._data), + circular=False, + ) + + def cast_to_ds_left(self): + """ + PEXINNNN + + GATCNNNN + NNNN + + GATCNNNN + CTAGNNNN + + + + QFZJNNNN + + NNNN + CTAGNNNN + + GATCNNNN + CTAGNNNN + + """ + + def replace(m): + return m.group(1).translate(to_full_sequence) + + # Not using f-strings below to avoid bytes/string conversion + return self.__class__( + _re.sub(b"^([PEXIpexiQFZJqfzj]+)(?=[GATCgatc])", replace, self._data), + circular=False, + ) + def get_cut_parameters( self, cut: _Union[CutSiteType, None], is_left: bool ) -> _Tuple[int, int, int]: @@ -1643,6 +1892,64 @@ def get_cut_parameters( # In the right end, the overhang does not matter return *self.right_end_position(), self.watson_ovhg() + def melt(self, length): + if not length or length < 1: + return tuple() + + new, strands = self.shed_ss_dna(length) + + cutsites = new.get_ds_meltsites(length) + + cutsite_pairs = self.get_cutsite_pairs(cutsites) + + result = tuple(new.apply_cut(*cutsite_pair) for cutsite_pair in cutsite_pairs) + + result = tuple([new]) if strands and not result else result + + return strands + result + + def shed_ss_dna(self, length): + + new, strands, intervals = self._shed_ss_dna(length) + + return new, strands + + def _shed_ss_dna(self, length): + + watsonnicks, cricknicks = self.get_ss_meltsites(length) + + watsonstrands = [] + crickstrands = [] + + new = self._data + + for x, y in watsonnicks: + stuffer = new[x:y] + ss = Dseq.quick(stuffer.translate(to_5tail_table)) + new = new[:x] + stuffer.translate(to_3tail_table) + new[y:] + watsonstrands.append((x, y, ss)) + + for x, y in cricknicks: + stuffer = new[x:y] + ss = Dseq.quick(stuffer.translate(to_3tail_table)) + new = new[:x] + stuffer.translate(to_5tail_table) + new[y:] + crickstrands.append((x, y, ss)) + + ordered_strands = sorted(watsonstrands + crickstrands) + + strands = [] + + for x, y, ss in ordered_strands: + seq = ( + ss._data[::-1].translate(to_watson_table).strip() + or ss._data.translate(to_crick_table).strip() + ) + strands.append(_Seq(seq)) + + intervals = tuple((x, y) for x, y, ss in ordered_strands) + + return Dseq(new), tuple(strands), intervals + def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": """Extracts a subfragment of the sequence between two cuts. @@ -1698,19 +2005,49 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": GttCTTAA """ - if _cuts_overlap(left_cut, right_cut, len(self)): + if _cuts_overlap(left_cut, right_cut, self.length): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) - left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) - right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) - return Dseq( - self[left_watson:right_watson].to_blunt_string(), - # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case - self.rc()[ - len(self) - right_crick : len(self) - left_crick - ].to_blunt_string(), - ovhg=ovhg_left, + if left_cut: + (left_watson_cut, left_overhang), _ = left_cut + else: + (left_watson_cut, left_overhang), _ = ((0, 0), None) + + if right_cut: + (right_watson_cut, right_overhang), _ = right_cut + else: + (right_watson_cut, right_overhang), _ = ( + (self.length, 0), + None, + ) + + table1 = to_5tail_table if left_overhang > 0 else to_3tail_table + table2 = to_5tail_table if right_overhang < 0 else to_3tail_table + + left_stck_begin = min(left_watson_cut, left_watson_cut - left_overhang) + left_stck_end = left_stck_begin + abs(left_overhang) + + right_stck_begin = min(right_watson_cut, right_watson_cut - right_overhang) + right_stck_end = right_stck_begin + abs(right_overhang) + + if self.circular: + cutfrom = CircularString(self.to_blunt_string()) + else: + cutfrom = self._data.decode("ascii") + + left_sticky_end = ( + cutfrom[left_stck_begin:left_stck_end] + .translate(table1) + .encode("ascii")[0 : abs(left_overhang)] ) + ds_middle_part = cutfrom[left_stck_end:right_stck_begin].encode("ascii") + right_sticky_end = ( + cutfrom[right_stck_begin:right_stck_end] + .translate(table2) + .encode("ascii")[0 : abs(right_overhang)] + ) + + return Dseq.quick(left_sticky_end + ds_middle_part + right_sticky_end) def get_cutsite_pairs( self, cutsites: _List[CutSiteType] From d6b9eda16453c01fcd7796db06db5ba5e27d2530 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:43:29 +0100 Subject: [PATCH 07/36] _annealing_positions new implementation using _iupac_compl_regex from utils. This should fix U in primers --- src/pydna/amplify.py | 74 ++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/src/pydna/amplify.py b/src/pydna/amplify.py index f2946b4c..1c4fd553 100644 --- a/src/pydna/amplify.py +++ b/src/pydna/amplify.py @@ -15,9 +15,7 @@ from pydna._pretty import pretty_str as _pretty_str from pydna.utils import flatten as _flatten - -# from pydna.utils import memorize as _memorize -from pydna.utils import rc as _rc, shift_feature as _shift_feature +from pydna.utils import shift_feature as _shift_feature from pydna.amplicon import Amplicon as _Amplicon from pydna.primer import Primer as _Primer from pydna.seqrecord import SeqRecord as _SeqRecord @@ -26,10 +24,11 @@ from Bio.SeqFeature import SimpleLocation as _SimpleLocation from Bio.SeqFeature import CompoundLocation as _CompoundLocation from pydna.seq import Seq as _Seq -import itertools as _itertools import re as _re import copy as _copy import operator as _operator +from pydna.utils import iupac_compl_regex as _iupac_compl_regex +from pydna.utils import anneal_from_left as _anneal_from_left # import os as _os @@ -37,25 +36,6 @@ # _module_logger = _logging.getLogger("pydna." + __name__) -_table = { # IUPAC Ambiguity Codes for Nucleotide Degeneracy and U for Uracile - "A": "A", - "C": "C", - "G": "G", - "T": "T", - "U": "A", # XXX - "R": "(A|G)", - "Y": "(C|T)", - "S": "(G|C)", - "W": "(A|T)", - "K": "(G|T)", - "M": "(A|C)", - "B": "(C|G|T)", - "D": "(A|G|T)", - "H": "(A|C|T)", - "V": "(A|C|G)", - "N": "(A|G|C|T)", -} - def _annealing_positions(primer, template, limit): """Finds the annealing position(s) for a primer on a template where the @@ -70,13 +50,14 @@ def _annealing_positions(primer, template, limit): <- - - - - - - - - - template - - - - - - - - - - - - - > - <------- start (int) ------> - 5'-...gctactacacacgtactgactgcctccaagatagagtcagtaaccacactcgat...3' + < ----- start = 26 ------> + 5'- gctactacacacgtactgactgcctccaagatagagtcagtaaccacactcgatag...3' |||||||||||||||||||||||||||||||||||||||||||||||| 3'-gttctatctcagtcattggtgtATAGTG-5' <-footprint length --> + Parameters ---------- primer : string @@ -85,7 +66,7 @@ def _annealing_positions(primer, template, limit): template : string The template sequence 5'-3' - limit : int = 15, optional + limit : int footprint needs to be at least of length limit. Returns @@ -94,32 +75,37 @@ def _annealing_positions(primer, template, limit): [ (start1, footprint1), (start2, footprint2) ,..., ] """ + # under_tail + # anchor AACCACACTCGAT + # CAAGATAGAGTCAGT + # ||||||||||||||| + # gttctatctcagtca + # ttggtgtATAGTG revprimer + # tail + # + # | <- limit -> | + # return empty list if primer too short if len(primer) < limit: return [] - prc = _rc(primer) + revprimer = primer[::-1] # head is minimum part of primer that must anneal - head = prc[:limit].upper() + head = revprimer[:limit].upper() + tail = revprimer[limit:].upper() # Make regex pattern that reflects extended IUPAC DNA code - head = "".join(_table[key] for key in head) - - positions = [m.start() for m in _re.finditer(f"(?={head})", template, _re.I)] - - if positions: - tail = prc[limit:].lower() - length = len(tail) - results = [] - for match_start in positions: - tm = template[match_start + limit : match_start + limit + length].lower() - footprint = len( - list(_itertools.takewhile(lambda x: x[0] == x[1], zip(tail, tm))) - ) - results.append((match_start, footprint + limit)) - return results - return [] + head_regex = "".join(_iupac_compl_regex[key] for key in head) + primer_regex = f"(?:({head_regex})(.{{0,{len(primer) - limit}}}))" + + results = [] + for m in _re.finditer(primer_regex, template.upper()): + anchor, under_tail = m.groups() + match_start = m.start() + match_extension = _anneal_from_left(tail, under_tail[::-1]) + results.append((match_start, limit + match_extension)) + return results # class _Memoize(type): From b06e843871558575e6fad2d9626755abca3d03d3 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:52:54 +0100 Subject: [PATCH 08/36] fixed fill_left and fill_right and a FIXME --- src/pydna/assembly2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index 2e3fdea3..e3e47cdc 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -616,7 +616,7 @@ def fill_left(seq: _Dseq) -> _Dseq: elif seq.ovhg > 0: new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson - return _Dseq(new_watson, new_crick, 0) + return seq.cast_to_ds_left() # _Dseq(new_watson, new_crick, 0) def fill_right(seq: _Dseq) -> _Dseq: @@ -633,7 +633,7 @@ def fill_right(seq: _Dseq) -> _Dseq: elif watson_ovhg > 0: new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick - return _Dseq(new_watson, new_crick, seq.ovhg) + return seq.cast_to_ds_right() # _Dseq(new_watson, new_crick, seq.ovhg) def fill_dseq(seq: _Dseq) -> _Dseq: @@ -797,8 +797,12 @@ def assemble( ] # Join the left sequence including the overlap with the right sequence without the overlap # we use fill_right / fill_left so that it works for ligation of sticky ends + # breakpoint() out_dseqrecord = _Dseqrecord( - fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[overlap:], + fill_right(out_dseqrecord.seq) + + fill_left(fragment.seq)[ + overlap: + ], # FIXME: This is wrong for PCR. Both primers get incorporated into the product. features=out_dseqrecord.features + new_features, ) From a8cb12e7f656e29e6d59503f0439dc04ad9ecabc Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:56:03 +0100 Subject: [PATCH 09/36] fixed initiation --- src/pydna/dseqrecord.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py index fadd9a44..9ed5aa5f 100644 --- a/src/pydna/dseqrecord.py +++ b/src/pydna/dseqrecord.py @@ -18,7 +18,7 @@ from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries # from pydna.utils import memorize as _memorize -from pydna.utils import rc as _rc +# from pydna.utils import rc as _rc from pydna.utils import shift_location as _shift_location from pydna.utils import shift_feature as _shift_feature from pydna.common_sub_strings import common_sub_strings as _common_sub_strings @@ -219,9 +219,7 @@ def from_string( obj = cls.__new__(cls) # Does not call __init__ obj._per_letter_annotations = {} obj.seq = _Dseq.quick( - record, - _rc(record), - ovhg=0, + record.encode("ascii"), # linear=linear, circular=circular, ) @@ -258,9 +256,7 @@ def from_SeqRecord( obj.n = n if circular is None: circular = record.annotations.get("topology") == "circular" - obj.seq = _Dseq.quick( - str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular - ) + obj.seq = _Dseq.quick(record.seq._data, ovhg=0, circular=circular) return obj @property From 78c6ea374e42e3eacfa0f0a841c7b2b294eec66c Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:56:38 +0100 Subject: [PATCH 10/36] deleted --- src/pydna/user_cloning.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 src/pydna/user_cloning.py diff --git a/src/pydna/user_cloning.py b/src/pydna/user_cloning.py deleted file mode 100644 index 1402c65a..00000000 --- a/src/pydna/user_cloning.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright 2013-2023 by Björn Johansson. All rights reserved. -# This code is part of the Python-dna distribution and governed by its -# license. Please see the LICENSE.txt file that should have been included -# as part of this package. - - -# from abc import ABC, abstractmethod -# import re -# from pydna.utils import rc - - -""" -Nicking & USER cloning - - -CGAuGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGATC -GCTACAGCTGAATCTAGAGTGTCCGAAAAAAGTTCTGCCGGAACTTAAGTCAGTAAACCTAGGCCGGCuAG - -1. Digest both strands -2. Collect all linear ssDNA -3. Anneal all combinations -4. Keep ones present in original molecule -5. Rank by stability -6. - -""" From 07838f5fc6fad4fe1f9a3c3d98f31ac7637e6db3 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 23 Sep 2025 18:58:23 +0100 Subject: [PATCH 11/36] anneal_from_left function and more regexes and tables --- src/pydna/utils.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index debf5703..780d23e1 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -76,6 +76,46 @@ to_3tail_table = bytes.maketrans(b"GATCgatc", b"PEXIpexi") to_full_sequence = bytes.maketrans(b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc") + +iupac_regex = { # IUPAC Ambiguity Codes for Nucleotide Degeneracy and U for Uracile + "A": "(?:A)", + "C": "(?:C)", + "G": "(?:G)", + "T": "(?:T|U)", + "U": "(?:U|T|U)", + "R": "(?:A|G|R)", + "Y": "(?:C|T|Y)", + "S": "(?:G|C|S)", + "W": "(?:A|T|W)", + "K": "(?:G|T|K)", + "M": "(?:A|C|M)", + "B": "(?:C|G|T|B)", + "D": "(?:A|G|T|D)", + "H": "(?:A|C|T|H)", + "V": "(?:A|C|G|V)", + "N": "(?:A|G|C|T|N)", +} + +iupac_compl_regex = { # IUPAC Ambiguity Code complements + "A": "(?:T|U)", + "C": "(?:G)", + "G": "(?:C)", + "T": "(?:A)", + "U": "(?:A)", + "R": "(?:T|C|Y)", + "Y": "(?:G|A|R)", + "S": "(?:G|C|S)", + "W": "(?:A|T|W)", + "K": "(?:C|AM)", + "M": "(?:T|G|K)", + "B": "(?:C|G|A|V)", + "D": "(?:A|C|T|H)", + "H": "(?:A|G|T|D)", + "V": "(?:T|C|G|B)", + "N": "(?:A|G|C|T|N)", +} + + bp_dict = { (b"P", b"Q"): b"G", # P / Q >>---> G (b"E", b"F"): b"A", # E / F >>---> A @@ -397,6 +437,11 @@ # (b" ", b"n"): b"n", } +bp_dict_str = { + (x.decode("ascii"), y.decode("ascii")): z.decode("ascii") + for (x, y), z in bp_dict.items() +} + def three_frame_orfs( dna: str, @@ -563,6 +608,29 @@ def smallest_rotation(s): return s[k:] + s[:k] +def anneal_from_left(watson: str, crick: str) -> int: + """ + The length of the common prefix shared by two strings. + + Args: + str1 (str): The first string. + str2 (str): The second string. + + Returns: + int: The length of the common prefix. + """ + + result = len( + list( + _itertools.takewhile( + lambda x: bp_dict_str.get((x[0], x[1])), zip(watson, crick[::-1]) + ) + ) + ) + + return result + + def cai(seq: str, organism: str = "sce", weights: dict = _weights): """docstring.""" from cai2 import CAI as _CAI From 0977340f5969864a83c8b3a1573fbd0dc884f012 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 24 Sep 2025 05:44:56 +0100 Subject: [PATCH 12/36] updated test for USER cloning --- tests/test_USERcloning.py | 76 +++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/tests/test_USERcloning.py b/tests/test_USERcloning.py index ad389a92..1d635c85 100644 --- a/tests/test_USERcloning.py +++ b/tests/test_USERcloning.py @@ -2,72 +2,62 @@ # -*- coding: utf-8 -*- import pytest +from pydna.dseq import Dseq from pydna.parsers import parse, parse_primers from pydna.amplify import pcr - +from textwrap import dedent def test_USER_cloning(): + # >a + # GUGGATT + primers = """ - >3CYC1clon - CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAG + >a + GUGGATT - >5CYC1clone - GAUCGGCCGGATCCAAATGACTGAATTCAAGGCC + >b + cUCGCCG """ template = """ - >templ - CgATGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGAtC + >temp + gtGGATTaaaCGGCGag """ fp, rp = parse_primers(primers) te, *rest = parse(template) te.add_feature() - p = pcr((fp, rp, te)) + p = pcr((fp, rp, te), limit=5) + assert p.seq._data == b"GUGGATTaaaCGGCGOg" figure = p.figure() - correct_figure = """\ -5CgATGTCGACTTAGATCTCACAGGCTTTTTTCAAG...GGCCTTGAATTCAGTCATTTGGATCCGGCCGAtC3 - |||||||||||||||||||||||||||||||||| - 3CCGGAACTTAAGTCAGTAAACCTAGGCCGGCUAG5 -5CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAG3 - ||||||||||||||||||||||||||||||||||| -3GcTACAGCTGAATCTAGAGTGTCCGAAAAAAGTTC...CCGGAACTTAAGTCAGTAAACCTAGGCCGGCTaG5""" - + correct_figure = dedent( + """\ + 5gtGGATT...CGGCGag3 + ||||||| + 3GCCGCUc5 + 5GUGGATT3 + ||||||| + 3caCCTAA...GCCGCtc5""" + ) assert figure == correct_figure - assert p.seq.watson == "CGAUGTCGACTTAGATCTCACAGGCTTTTTTCAAGaCGGCCTTGAATTCAGTCATTTGGATCCGGCCGATC" - assert p.seq.crick == "GAUCGGCCGGATCCAAATGACTGAATTCAAGGCCGtCTTGAAAAAAGCCTGTGAGATCTAAGTCGACATCG" - - -# hej = p.seq - -# from Bio.SeqFeature import SeqFeature -# import re + w = "GUGGATTaaaCGGCGAg" + c = "CACCTAAtttGCCGCUc" -# wpos = [0] + [m.start() for m in re.finditer('U', hej.watson)] + [len(hej.watson)] -# cpos = [0] + [m.start() for m in re.finditer('U', hej.crick)] + [len(hej.crick)] + assert p.seq.watson == w + assert p.seq.crick == c[::-1] -# from itertools import tee + assert te.features[0] == p.features[0] + USERprocessed = p.seq.user() -# def pairwise(iterable): -# "s -> (s0,s1), (s1,s2), (s2, s3), ..." -# a, b = tee(iterable) -# next(b, None) -# return list(slice(x, y, 1) for x, y in zip(a, b)) + assert USERprocessed.melt(1) == USERprocessed.melt(12) + stuffer, insert, stuffer = USERprocessed.melt(1) -# wslices = pairwise(wpos) -# cslices = pairwise(cpos) + plasmid = Dseq("FqaaaPE") -# ln = len(hej) -# for ws in wslices: -# for cs in cslices: -# if ws.stop >= ln - cs.stop: -# pass -# # print(ws, cs) -# print(ws, cs) + plasmid_insert = (plasmid + insert).looped() -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) + assert plasmid_insert == Dseq("AgaaaGAGGATTaaaCGGCG", circular=True) From 033b40696d5df8704c994460e9d6fd895a7801ec Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 24 Sep 2025 05:49:52 +0100 Subject: [PATCH 13/36] moved all imports to the beginning. Changed some tests. There is a FIXME indicating a large change in behaviour. --- tests/test_module_dseqrecord.py | 442 ++++++++++++++++++-------------- 1 file changed, 247 insertions(+), 195 deletions(-) diff --git a/tests/test_module_dseqrecord.py b/tests/test_module_dseqrecord.py index b3676650..cf31e1a3 100644 --- a/tests/test_module_dseqrecord.py +++ b/tests/test_module_dseqrecord.py @@ -2,10 +2,38 @@ # -*- coding: utf-8 -*- import pytest +import IPython +import sys +import copy +import warnings +import glob + +from importlib import reload +from unittest.mock import patch, mock_open, MagicMock + +from pydna import dseqrecord +from pydna.dseq import Dseq +from pydna.dseqrecord import Dseqrecord +from pydna.readers import read +from pydna.utils import eq +from pydna.utils import location_boundaries as _location_boundaries +from pydna.amplify import pcr + +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqIO import read as abiread +from Bio.SeqFeature import SeqFeature, FeatureLocation, SimpleLocation +from Bio.Restriction import ( + Acc65I, ApaI, BamHI, BglII, BsaI, Bsp120I, + Bsu36I, BstAPI, EcoRI, EcoRV, KpnI, MaeII, + NlaIV, NdeI, NotI, PacI, PstI, SmaI, + RestrictionBatch +) + def test_orfs(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("AatgaaataaT") @@ -22,7 +50,7 @@ def test_orfs(): def test_cas9(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("gattcatgcatgtagcttacgtagtct") @@ -36,9 +64,9 @@ def test_cas9(): def test_FadiBakoura(): - from Bio.SeqFeature import SeqFeature, FeatureLocation - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + + dseq_record = Dseqrecord(Dseq("ACTCTTTCTCTCTCT", circular=True)) dseq_record.features = [SeqFeature(FeatureLocation(start=2, end=4))] @@ -47,41 +75,41 @@ def test_FadiBakoura(): def test_IPython_missing(monkeypatch): - import IPython - from pydna import dseqrecord + + # assert dseqrecord._display is IPython.display.display assert dseqrecord._display_html == IPython.display.display_html - import sys - import copy + + fakesysmodules = copy.copy(sys.modules) fakesysmodules["IPython.display"] = None monkeypatch.delitem(sys.modules, "IPython.display") monkeypatch.setattr("sys.modules", fakesysmodules) - from importlib import reload + reload(dseqrecord) - from pydna import dseqrecord + assert dseqrecord._display_html("item") == "item" # assert dseqrecord._HTML("item") == "item" def test_initialization(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from Bio.Seq import Seq - from Bio.SeqRecord import SeqRecord as Srec + + + + + a = [] a.append(Dseqrecord("attt", id="1")) a.append(Dseqrecord(Dseq("attt"), id="2")) a.append(Dseqrecord(Seq("attt"), id="3")) - a.append(Dseqrecord(Srec(Seq("attt"), id="4"))) + a.append(Dseqrecord(SeqRecord(Seq("attt"), id="4"))) a.append(Dseqrecord(Dseqrecord("attt"), id="5")) a.append(Dseqrecord(Dseqrecord(Dseq("attt", circular=True)), id="6", circular=False)) @@ -98,7 +126,7 @@ def test_initialization(): a.append(Dseqrecord("attt", circular=True)) a.append(Dseqrecord(Dseq("attt"), circular=True)) a.append(Dseqrecord(Seq("attt"), circular=True)) - a.append(Dseqrecord(Srec(Seq("attt")), circular=True)) + a.append(Dseqrecord(SeqRecord(Seq("attt")), circular=True)) a.append(Dseqrecord(Dseqrecord("attt"), circular=True)) for b in a: @@ -144,6 +172,8 @@ def test_initialization(): assert dsr.circular is False assert dsr.seq.circular is False assert str(dsr.seq) == "attta" + assert dsr.seq._data == b'etttf' + assert str(dsr.seq.to_blunt_string()) == "attta" dsr = Dseqrecord(ds, circular=True) @@ -238,7 +268,7 @@ def test_initialization(): def test_linear_circular(): - from pydna.dseqrecord import Dseqrecord + """ test Dseqrecord linear & circular property""" a = Dseqrecord("attt") @@ -268,7 +298,7 @@ def test_linear_circular(): def test_stamp(): - from pydna.dseqrecord import Dseqrecord + lin = Dseqrecord("attt") lin.stamp() @@ -285,7 +315,7 @@ def test_stamp(): assert first[:42] == crc.annotations["comment"][:42] assert len(first) == len(crc.annotations["comment"]) - from pydna.dseq import Dseq + blunt = Dseqrecord(Dseq("aa")) @@ -302,7 +332,7 @@ def test_stamp(): def test_revcomp(): - from pydna.dseqrecord import Dseqrecord + # ---- # attcccgggg @@ -341,14 +371,14 @@ def test_revcomp(): def test_m(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("A" * 5000) assert f"{s.m():.3e}" == "1.544e-07" def test_extract_feature(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") s.add_feature(3, 9) @@ -356,7 +386,7 @@ def test_extract_feature(): def test_find(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert s.find(Dseqrecord("ggatcc")) == 3 @@ -366,7 +396,7 @@ def test_find(): def test_find_aa(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert s.find_aa("FGSK") == slice(0, 12, None) @@ -374,7 +404,7 @@ def test_find_aa(): def test_str(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") s.annotations = {"date": "03-JAN-2018"} @@ -390,14 +420,14 @@ def test_str(): def test___contains__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("tttGGATCCaaa") assert "ggatcc" in s def test_seguid(): - from pydna.dseqrecord import Dseqrecord + l = Dseqrecord("tttGGATCCaaa") assert l.seguid() == "ldseguid=jbGRr-Jhpl0tVyt0Bx5nmY9_G6E" @@ -406,7 +436,7 @@ def test_seguid(): def test_format(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=True) s.format("gb") @@ -431,9 +461,9 @@ def test_format(): def test_write(): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("GGATCC", circular=True) m = mock_open() @@ -457,9 +487,9 @@ def test_write(): def test_write_same_seq_to_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("Ggatcc", circular=True) @@ -471,9 +501,9 @@ def test_write_same_seq_to_existing_file(monkeypatch): def test_write_different_file_to_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + s = Dseqrecord("Ggatcc", circular=True) d = Dseqrecord("GgatcA", circular=True) @@ -487,9 +517,9 @@ def test_write_different_file_to_existing_file(monkeypatch): def test_write_different_file_to_stamped_existing_file(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + new = Dseqrecord("Ggatcc", circular=True) new.stamp() @@ -529,9 +559,9 @@ def test_write_different_file_to_stamped_existing_file(monkeypatch): def test_write_different_file_to_stamped_existing_file2(monkeypatch): - from unittest.mock import patch - from unittest.mock import mock_open - from pydna.dseqrecord import Dseqrecord + + + new = Dseqrecord("Ggatcc", circular=True) new.stamp() @@ -582,10 +612,10 @@ def test_write_different_file_to_stamped_existing_file2(monkeypatch): # monkeypatch.setitem(readers.__builtins__, 'open', open) # def test_write_to_existing_file(): -# from unittest.mock import patch -# from unittest.mock import mock_open -# from pydna.dseqrecord import Dseqrecord -# from pydna.readers import read +# +# +# +# # # s = Dseqrecord("GGATCC", circular=True) # m = mock_open() @@ -610,10 +640,10 @@ def test_write_different_file_to_stamped_existing_file2(monkeypatch): def test_cut_args(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC") - from Bio.Restriction import BamHI, BglII, RestrictionBatch + rb = RestrictionBatch((BamHI, BglII)) assert s.cut(BamHI)[0].seq == s.cut(BamHI + BglII)[0].seq == s.cut(rb)[0].seq @@ -621,8 +651,8 @@ def test_cut_args(): def test_cut_circular(): - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import BsaI, KpnI, Acc65I, NotI + + test = "aaaaaaGGTACCggtctcaaaa" @@ -646,10 +676,10 @@ def test_cut_circular(): def test_cut_add(): - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from pydna.utils import eq - from Bio.Restriction import BamHI, EcoRI, PstI, SmaI + + + + a = Dseqrecord("GGATCCtcatctactatcatcgtagcgtactgatctattctgctgctcatcatcggtactctctataattatatatatatgcgcgtGGATCC").seq b = a.cut(BamHI)[1] @@ -746,7 +776,7 @@ def test_cut_add(): assert eq(pUC19_EcoRI_PstI_d, read("pUC19-EcoRI_PstI-d-rc.gb")) assert eq(pUC19_EcoRI_PstI_d.rc(), read("pUC19-EcoRI_PstI-d-rc.gb")) - from Bio.Restriction import Bsu36I, BstAPI + pCAPs = read("pCAPs.gb") a, b = pCAPs.cut(Bsu36I, BstAPI) @@ -755,8 +785,8 @@ def test_cut_add(): def test_Dseqrecord_cutting_adding_2(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + a = ( Dseqrecord( @@ -776,7 +806,7 @@ def test_Dseqrecord_cutting_adding_2(): ), ) - from Bio.Restriction import KpnI, Acc65I, NlaIV + enzymes = [Acc65I, NlaIV, KpnI] @@ -791,8 +821,8 @@ def test_Dseqrecord_cutting_adding_2(): def test_Dseqrecord_cutting_adding_3(): - from pydna.readers import read - from Bio.Restriction import Acc65I + + a = read( """ @@ -895,8 +925,8 @@ def test_Dseqrecord_cutting_adding_3(): def test_Dseqrecord_cutting_adding_4(): - from pydna.readers import read - from Bio.Restriction import KpnI, Acc65I, NlaIV, EcoRI, EcoRV + + a = read( """ @@ -1054,10 +1084,10 @@ def test_Dseqrecord_cutting_adding_4(): def test_features_on_slice(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature - from Bio.SeqFeature import SimpleLocation + + + + dseq_record = Dseqrecord(Dseq("ACTCTTTCTCTCTCT", circular=True)) dseq_record.features = [SeqFeature(SimpleLocation(start=2, end=4))] @@ -1072,10 +1102,10 @@ def test_features_on_slice(): def test_features_change_ori(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read - from pydna.utils import eq + + + + # Shifted a sequence by zero returns a copy s = Dseqrecord("GGATCC", circular=True) @@ -1141,10 +1171,10 @@ def test_features_change_ori(): # for x in b.features[0].location.parts: # print(x.extract(b.seq)) - # from pydna.utils import shift_location - # from Bio.SeqFeature import SimpleLocation - # from Bio.SeqFeature import CompoundLocation - # from Bio.SeqFeature import ExactPosition + # + # + # + # # ml6 = s2.shifted(6).features[0].location # ml7 = s2.shifted(7).features[0].location @@ -1197,10 +1227,9 @@ def test_features_change_ori(): assert [str(f.extract(b).seq) for f in b.features if f.qualifiers["label"][0] == "ins"][0] == "GTACCTTTGGATC" assert [str(f.extract(b).seq) for f in b.features if f.qualifiers["label"][0] == "bb"][0] == "CGGGAAAG" - from Bio.Restriction import Acc65I, BamHI - inseq = Dseq.from_representation("GTACCTTTG\n" " GAAACCTAG") + inseq = Dseq.from_representation("GTACCTTTG\n" " GAAACCTAG") bbseq = Dseq.from_representation("GATCCGGGAAAG\n" " GCCCTTTCCATG") assert s3.seq.cut(Acc65I, BamHI) == (inseq, bbseq) @@ -1208,6 +1237,7 @@ def test_features_change_ori(): bb1, ins1 = sorted(s3.cut(Acc65I, BamHI), key=len, reverse=True) assert str(bbfeat).upper() in bbseq + assert str(insfeat).upper() in inseq for i in range(0, len(s3)): @@ -1236,12 +1266,12 @@ def test_features_change_ori(): assert eq(ins1, ins) assert bb.features[0].extract(bb).seq == bbfeat - assert str(ins.features[0].extract(ins).seq) == str(insfeat) + assert ins.features[0].extract(ins).seq.to_blunt_string() == str(insfeat) def test_amijalis(): # Thanks to https://github.com/amijalis - from pydna.dseqrecord import Dseqrecord + test_seq = "ATCGATCGATCGATCGATCGATCGATCGATCGATCG" @@ -1264,11 +1294,11 @@ def test_amijalis(): def test_figure(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import Acc65I, KpnI, ApaI, Bsp120I - from Bio.SeqFeature import SimpleLocation - from Bio.SeqFeature import SeqFeature + + + + + # broken feature linear @@ -1596,10 +1626,10 @@ def test_figure(): # @pytest.mark.xfail(reason="issue #78") def test_jan_glx(): # Thanks to https://github.com/jan-glx - from Bio.Restriction import NdeI, BamHI - from pydna.readers import read - # from pydna.genbank import Genbank + + + # # gb = Genbank("bjornjobb@gmail.com") # puc19 = gb.nucleotide("M77789.2") # assert puc19.seguid() == "n-NZfWfjHgA7wKoEBU6zfoXib_0" @@ -1619,11 +1649,11 @@ def test_jan_glx(): def test_synced(): - from Bio.Seq import Seq - from Bio.SeqRecord import SeqRecord - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.readers import read + + + + + bb_ins = Dseqrecord("tcgcgcgtttcgATGTAgtgatgacggtgaA", circular=True) @@ -1664,7 +1694,7 @@ def test_synced(): def test_map_pCR_MCT1_HA46(): - from pydna.readers import read + pCR_MCT1_HA46 = read("pCR_MCT1_HA46.gb") @@ -1704,7 +1734,7 @@ def test_map_pCR_MCT1_HA46(): def test_map_short(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord("AAGTTAAAATAAGGCTAGTCCGTTAT") t.map_target = slice(0, 26) @@ -1713,7 +1743,7 @@ def test_map_short(): def test_map_too_short(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord("AAGTTAAAATAAGGCTAGTCCGTT") # t.map_target = slice(0,23) @@ -1722,7 +1752,7 @@ def test_map_too_short(): def test_map_no_match(): - from pydna.dseqrecord import Dseqrecord + t = Dseqrecord( "AGGAGGGGCCCACACCGAGGAAGTAGACTGTTATACGTCGGCGATGGTGGTAGCTAACTATGTTGCCTGCCACTACAACAGTATCTAAGCCGTGTAAAGG" @@ -1732,8 +1762,8 @@ def test_map_no_match(): def test_slicing2(): - from pydna.dseqrecord import Dseqrecord - from Bio.Restriction import BamHI, EcoRI, KpnI + + a = Dseqrecord("aaGGATCCnnnnnnnnnGAATTCccc", circular=True) assert ( @@ -1751,11 +1781,11 @@ def test_slicing2(): def test_rogerstager(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from pydna.utils import eq - from Bio.Seq import Seq - from Bio.Restriction import BsaI + + + + + answ = [] answ.append(Dseq("aaaaaaaaaaaaggtctca", "ttttttttccagagttttt"[::-1])) @@ -1774,7 +1804,7 @@ def test_rogerstager(): def test___mul__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert s * 3 == Dseqrecord("GGATCCGGATCCGGATCC", circular=False) @@ -1787,7 +1817,7 @@ def test___mul__(): def test___repr__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert repr(s) == "Dseqrecord(-6)" @@ -1796,8 +1826,8 @@ def test___repr__(): def test__repr_pretty_(): - from unittest.mock import MagicMock - from pydna.dseqrecord import Dseqrecord + + s = Dseqrecord("GGATCC", circular=False) pp = MagicMock() @@ -1809,8 +1839,8 @@ def test__repr_pretty_(): def test___getitem__(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation + + s = Dseqrecord("GGATCC", circular=False) assert s[1:-1].seq == Dseqrecord("GATC", circular=False).seq @@ -1825,7 +1855,7 @@ def test___getitem__(): assert t[1:1].seq == Dseqrecord("GATCCG").seq assert t[5:1].seq == Dseqrecord("CG", circular=False).seq assert t[9:1].seq == Dseqrecord("").seq - assert t[1:9].seq == Dseqrecord("").seq + assert t[1:9].seq == Dseqrecord("GATCC").seq # FIXME: Significant change assert t[9:10].seq == Dseqrecord("").seq assert t[10:9].seq == Dseqrecord("").seq @@ -1851,7 +1881,7 @@ def test___getitem__(): def test___eq__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) t = Dseqrecord("GGATCC", circular=False) @@ -1870,7 +1900,7 @@ def test___ne__(): def test___hash__(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) t = Dseqrecord("GGATCC", circular=False) @@ -1883,9 +1913,9 @@ def test___hash__(): def test_linearize(): - from Bio.Restriction import BamHI, BglII - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + + + u = Dseqrecord("GGATCC", circular=True) frag = u.linearize(BamHI) @@ -1911,10 +1941,10 @@ def test_linearize(): def test_cutters(): - from pydna.dseqrecord import Dseqrecord + obj = Dseqrecord("ggatcc") - from Bio.Restriction import BglII, BamHI + assert BglII in obj.no_cutters() assert BamHI not in obj.no_cutters() @@ -1931,14 +1961,14 @@ def test_cutters(): def test_number_of_cuts(): - from Bio.Restriction import EcoRI, BamHI, BglII - from pydna.dseqrecord import Dseqrecord + + s = Dseqrecord("GGATCCgaattc", circular=False) s.cut(BamHI) s.cut(EcoRI) s.cut(BglII) - from Bio.Restriction import RestrictionBatch + rb = RestrictionBatch((EcoRI, BamHI, BglII)) assert s.number_of_cuts(BamHI) == 1 @@ -1949,14 +1979,14 @@ def test_number_of_cuts(): def test_reverse_complement(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCC", circular=False) assert s.reverse_complement() == s.rc() assert s.rc().seq == s.seq assert s.reverse_complement().seq == s.seq - from pydna.readers import read + s = read( """ @@ -1982,7 +2012,7 @@ def test_reverse_complement(): def test_shifted(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("GGATCCgaattc", circular=False) with pytest.raises(TypeError): @@ -1990,9 +2020,9 @@ def test_shifted(): def test_looped(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - import warnings + + + warnings.simplefilter("always") @@ -2070,7 +2100,7 @@ def test_looped(): def test_upper(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("Gc") s.annotations["sample"] = ["sample"] @@ -2083,7 +2113,7 @@ def test_upper(): def test_lower(): - from pydna.dseqrecord import Dseqrecord + s = Dseqrecord("Gc") s.annotations["sample"] = ["sample"] @@ -2095,10 +2125,73 @@ def test_lower(): assert l.__dict__ == s.__dict__ +# def test_map(): + + + + +# traces = [] + + + +# for name in glob.glob("*.ab1"): +# traces.append(abiread(name, "abi")) + +# for t in traces: +# d = Dseqrecord(t.seq) + +# if "ITVFFKEYPYDVPDYAIEGIFHAT" in d: +# tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = "AAA tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = " ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + +# tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" + +# tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# tag, trc = trc, tag + +# s = Dseqrecord(Dseq(tag, trc, 0)) +# sl = s.rc().find_aa("YPYDVPDYA") + +# assert str(s.rc()[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s.rc() + +# tag = "aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "") +# trc = "ttt ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + +# s = Dseqrecord(Dseq(tag, trc, 0, circular=True)) +# sl = s.find_aa("YPYDVPDYA") +# assert str(s[sl].seq.translate()) == "YPYDVPDYA" +# assert "YPYDVPDYA" in s + def test_map(): - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord - from Bio.SeqIO import read as abiread traces = [] @@ -2111,52 +2204,29 @@ def test_map(): d = Dseqrecord(t.seq) if "ITVFFKEYPYDVPDYAIEGIFHAT" in d: - tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + # Y P Y D V P D Y A + s = Dseqrecord(Dseq.quick("tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = "AAA tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = " ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + s = Dseqrecord(Dseq.quick("EEE tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0)) + s = Dseqrecord(Dseq.quick("FFF tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"))) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s - tag = " tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "AAA ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") + s = Dseqrecord(Dseq.quick("FFF tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii")).rc()) + assert s.find_aa("YPYDVPDYA") is None - s = Dseqrecord(Dseq(tag, trc, 0)) - sl = s.find_aa("YPYDVPDYA") - assert str(s[sl].seq.translate()) == "YPYDVPDYA" - - tag = "tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - tag, trc = trc, tag - - s = Dseqrecord(Dseq(tag, trc, 0)) - sl = s.rc().find_aa("YPYDVPDYA") - - assert str(s.rc()[sl].seq.translate()) == "YPYDVPDYA" + assert str(s.rc()[sl].seq.translate(to_stop=False)) == "YPYDVPDYA" assert "YPYDVPDYA" in s.rc() - tag = "aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "") - trc = "ttt ata ggt ata ctg caa ggt ctg ata cgt"[::-1].replace(" ", "") - - s = Dseqrecord(Dseq(tag, trc, 0, circular=True)) + s = Dseqrecord(Dseq.quick("aaa tat cca tat gac gtt cca gac tat gca".replace(" ", "").encode("ascii"), circular=True)) sl = s.find_aa("YPYDVPDYA") assert str(s[sl].seq.translate()) == "YPYDVPDYA" assert "YPYDVPDYA" in s @@ -2167,9 +2237,9 @@ def test_assemble_YEp24PGK_XK(): test YEp24PGK_XK """ - from pydna.readers import read - from pydna.utils import eq - from pydna.amplify import pcr + + + """ test YEp24PGK_XK""" p1 = read("primer1.txt", ds=False) @@ -2179,11 +2249,11 @@ def test_assemble_YEp24PGK_XK(): PCR_prod = pcr(p1, p3, XKS1) - from Bio.Restriction import BamHI + stuffer1, insert, stuffer2 = PCR_prod.cut(BamHI) - from Bio.Restriction import BglII + YEp24PGK_BglII = YEp24PGK.cut(BglII)[0] @@ -2207,9 +2277,9 @@ def test_assemble_YEp24PGK_XK(): def test_apply_cut(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation - from pydna.utils import location_boundaries as _location_boundaries + + + def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: return next(f for f in f.features if f.id == id) @@ -2230,7 +2300,7 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: open_seq = seq_shifted.apply_cut(dummy_cut, dummy_cut) assert len(open_seq.features) == 4 new_locs = sorted(str(f.location) for f in open_seq.features) - assert str(open_seq.seq) == "ATGaattacgtATG" + assert open_seq.seq.to_blunt_string() == "ATGaattacgtATG" if strand == 1: assert new_locs == sorted(["[0:3](+)", "[0:4](+)", "[11:14](+)", "[10:14](+)"]) elif strand == -1: @@ -2241,9 +2311,9 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: def test_apply_cut2(): - from pydna.dseqrecord import Dseqrecord - from Bio.SeqFeature import SeqFeature, SimpleLocation - from pydna.utils import location_boundaries as _location_boundaries + + + def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: return next(f for f in f.features if f.id == id) @@ -2264,28 +2334,10 @@ def find_feature_by_id(f: Dseqrecord, id: str) -> SeqFeature: open_seq = seq_shifted.apply_cut(dummy_cut, dummy_cut) assert len(open_seq.features) == 4 new_locs = sorted(str(f.location) for f in open_seq.features) - assert str(open_seq.seq) == "ATGaattacgtATG" + assert open_seq.seq.to_blunt_string() == "ATGaattacgtATG" if strand == 1: assert new_locs == sorted(["[0:3](+)", "[0:4](+)", "[11:14](+)", "[10:14](+)"]) elif strand == -1: assert new_locs == sorted(["[0:3](-)", "[0:4](-)", "[11:14](-)", "[10:14](-)"]) if strand is None: assert new_locs == sorted(["[0:3]", "[0:4]", "[11:14]", "[10:14]"]) - - -if __name__ == "__main__": - args = [ - __file__, - "--cov=pydna", - "--cov-append", - "--cov-report=html:../htmlcov", - "--cov-report=xml", - "--capture=no", - "--durations=10", - "--import-mode=importlib", - "--nbval", - "--current-env", - "--doctest-modules", - "--capture=no", - ] - pytest.main(args) From aa12fcb5afeca7331121dc0b957bce0887b667c3 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 24 Sep 2025 05:51:06 +0100 Subject: [PATCH 14/36] removed if __name__ == __main__ --- tests/test_module_design.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_module_design.py b/tests/test_module_design.py index 704fcf9e..1857de17 100755 --- a/tests/test_module_design.py +++ b/tests/test_module_design.py @@ -368,5 +368,5 @@ def test_primer_design_correct_value(): assert str(rvs.seq) == str(correct_rvs.seq) -if __name__ == "__main__": - pytest.cmdline.main([__file__, "-v", "-s"]) +# if __name__ == "__main__": +# pytest.cmdline.main([__file__, "-v", "-s"]) From e816d98c32fa0d489c6afb8b03bbd300f8d54185 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 24 Sep 2025 05:54:18 +0100 Subject: [PATCH 15/36] removed main test and moved imports to the top --- tests/test_module_assembly.py | 100 ++++++++++++---------------------- 1 file changed, 34 insertions(+), 66 deletions(-) diff --git a/tests/test_module_assembly.py b/tests/test_module_assembly.py index 946bf190..b120aba8 100644 --- a/tests/test_module_assembly.py +++ b/tests/test_module_assembly.py @@ -2,14 +2,25 @@ # -*- coding: utf-8 -*- import pytest +# from importlib import reload +from pydna import assembly +from pydna.dseqrecord import Dseqrecord +from pydna.parsers import parse +from pydna.utils import eq +from Bio.SeqFeature import SeqFeature +from Bio.SeqFeature import FeatureLocation +from Bio.SeqFeature import ExactPosition +from pydna.amplify import pcr +from pydna.readers import read +from Bio.Restriction import EcoRV, ZraI +from Bio.Restriction import AjiI, AgeI +from Bio.Restriction import SalI +from pydna.assembly import Assembly +from Bio.Restriction import AatII +from pydna._pretty import pretty_str + +def test_built(): - -def test_built(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from importlib import reload - from pydna import assembly - - reload(assembly) asm = assembly.Assembly(assembly.example_fragments, limit=5) lin = asm.assemble_linear() crc = asm.assemble_circular() @@ -18,18 +29,10 @@ def test_built(monkeypatch): assert [c.seq for c in crc] == [c.seq for c in assembly.circular_results] -def test_new_assembly(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna.dseqrecord import Dseqrecord - from pydna import assembly - from pydna.parsers import parse - from pydna.utils import eq - from importlib import reload - from Bio.SeqFeature import SeqFeature - from Bio.SeqFeature import FeatureLocation - from Bio.SeqFeature import ExactPosition +def test_new_assembly(): + + - reload(assembly) # 0000000000111111111222222222233333333334444444444555555555566 # 0123456780123456789012345678901234567890123456789012345678901 @@ -214,8 +217,6 @@ def test_new_assembly(monkeypatch): c3 = assembly.Assembly((a, b, c), limit=14) assert str(c3.assemble_circular()[0].seq) == "acgatgctatactggCCCCCtgtgctgtgctctaTTTTTtattctggctgtatctGGGGGT" - from pydna.parsers import parse - from pydna.utils import eq text1 = """ >A_AgTEFp_b_631 NP+geg/4Ykv2pIwEqiLylYKPYOE @@ -248,14 +249,10 @@ def test_new_assembly(monkeypatch): # [821] [713] [132] [51] [38] -def test_assembly(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna import assembly - from pydna.parsers import parse - from pydna.utils import eq - from importlib import reload +def test_assembly(): + + - reload(assembly) text1 = """ >A_AgTEFp_b_631 NP+geg/4Ykv2pIwEqiLylYKPYOE @@ -482,16 +479,10 @@ def test_assembly(monkeypatch): assert candidate.seguid() == "cdseguid=-mVwekticpAYIT9C4JcXmOGFkRo" -def test_MXblaster1(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") - from pydna import assembly - from pydna.parsers import parse - from pydna.amplify import pcr - from pydna.readers import read - from pydna.utils import eq - from importlib import reload +def test_MXblaster1(): + + - reload(assembly) """ test MXblaster1""" @@ -517,7 +508,6 @@ def test_MXblaster1(monkeypatch): pCAPs_pSU0 = read("pCAPs-pSU0.gb") # cut the pCAPs vectors for cloning - from Bio.Restriction import EcoRV, ZraI pCAPs_ZraI = pCAPs.linearize(ZraI) pCAPs_PCR_prod = pcr(primer[492], primer[493], pCAPs) @@ -585,7 +575,6 @@ def test_MXblaster1(monkeypatch): assert pCAPs_MX4blaster1.seguid() == "cdseguid=bUl04KTp5LpAulZX3UHdejwnuIQ" - from Bio.Restriction import AjiI, AgeI AX023560 = read("AX023560.gb") @@ -639,16 +628,11 @@ def test_MXblaster1(monkeypatch): assert pCAPs_MX4blaster2.seguid() == "cdseguid=c48cBUb3wF-Sdhzh0Tlprp-0CEg" -def test_assemble_pGUP1(monkeypatch): - monkeypatch.setenv("pydna_cached_funcs", "") +def test_assemble_pGUP1(): + + - from pydna.readers import read - from pydna import assembly - from pydna.utils import eq - from pydna.amplify import pcr - from importlib import reload - reload(assembly) GUP1rec1sens = read("GUP1rec1sens.txt") GUP1rec2AS = read("GUP1rec2AS.txt") @@ -657,7 +641,6 @@ def test_assemble_pGUP1(monkeypatch): insert = pcr(GUP1rec1sens, GUP1rec2AS, GUP1_locus) - from Bio.Restriction import SalI his3, lin_vect = pGREG505.cut(SalI) @@ -676,7 +659,7 @@ def test_assemble_pGUP1(monkeypatch): assert pGUP1.seguid() == "cdseguid=QiK2pH9yioTPfSobUTLz4CPiNzY" -# def test_35_36(monkeypatch): +# def test_35_36(): # import sys # from pydna.assembly import _od # if sys.version_info < (3, 6): @@ -686,15 +669,12 @@ def test_assemble_pGUP1(monkeypatch): # assert _od==dict -def test_pYPK7_TDH3_GAL2_PGI1(monkeypatch): - from pydna.readers import read - from pydna.assembly import Assembly +def test_pYPK7_TDH3_GAL2_PGI1(): pMEC1142 = read("pYPK0_TDH3_GAL2_PGI1.gb") pYPKp7 = read("pYPKp7.gb") - from Bio.Restriction import AatII pYPKp7_AatII = pYPKp7.linearize(AatII) @@ -703,9 +683,7 @@ def test_pYPK7_TDH3_GAL2_PGI1(monkeypatch): assert z.assemble_circular()[1].seguid() == "cdseguid=DeflrptvvS6m532WogvxQSgVKpk" -def test_marker_replacement_on_plasmid(monkeypatch): - from pydna.assembly import Assembly - from pydna.parsers import parse +def test_marker_replacement_on_plasmid(): f, r, _, _ = parse( """ @@ -724,12 +702,10 @@ def test_marker_replacement_on_plasmid(monkeypatch): """ ) - from pydna.readers import read pAG32 = read("pAG32.gb") pMEC1135 = read("pMEC1135.gb") - from pydna.amplify import pcr hygromycin_product = pcr(f, r, pAG32) @@ -741,12 +717,9 @@ def test_marker_replacement_on_plasmid(monkeypatch): assert pMEC1135.features[-1].extract(pMEC1135).seq == candidate.features[-1].extract(candidate).seq -def test_linear_with_annotations2(monkeypatch): +def test_linear_with_annotations2(): # Thanks to James Bagley for finding this bug # https://github.com/JamesBagley - from pydna._pretty import pretty_str - from pydna.assembly import Assembly - from pydna.dseqrecord import Dseqrecord a = Dseqrecord("acgatgctatactgtgCCNCCtgtgctgtgctcta") a.add_feature(0, 10, label="a_feat") @@ -790,8 +763,3 @@ def test_linear_with_annotations2(monkeypatch): # tgtgctgtgctctaTTTTTTTtattctggctgtatcCCCCCC # TATTCTGGCTGTATC # GtattctggctgtatcGGGGGtacgatgctatactgtg - - -if __name__ == "__main__": - pytest.main([__file__, "-x", "-vv", "-s"]) - # pytest.main([__file__, "-x", "-vvv", "-s", "--profile"]) From e12bf12b98fc6e28c675e7985ae9aa2de2cc7fb1 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 24 Sep 2025 06:01:51 +0100 Subject: [PATCH 16/36] 1. Exchanged xxxx and yyyy to 1111 and 2222 respectively. This because x and y has meaning in the new Dseq implementation. (line 1074) 2. The expected result in test_pcr_assembly_uracil should be AUUAggccggTTOO. 3. Removed numbers at start and end of some sequenses. This could be discussed. 4. Four instances of FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. --- tests/test_module_assembly2.py | 51 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/tests/test_module_assembly2.py b/tests/test_module_assembly2.py index 3b05a548..eba500df 100644 --- a/tests/test_module_assembly2.py +++ b/tests/test_module_assembly2.py @@ -1062,7 +1062,7 @@ def test_pcr_assembly_normal(): assert str(prods[0].seq) == "TTTACGTACGTAAAAAAGCGCGCGCTTT" -@pytest.mark.xfail(reason="U in primers not handled") +# @pytest.mark.xfail(reason="U in primers not handled") def test_pcr_assembly_uracil(): primer1 = Primer("AUUA") @@ -1071,7 +1071,7 @@ def test_pcr_assembly_uracil(): seq = Dseqrecord(Dseq("aaATTAggccggTTAAaa")) asm = assembly.PCRAssembly([primer1, seq, primer2], limit=4) - assert str(asm.assemble_linear()[0].seq) == "AUUAggccggTTAA" + assert str(asm.assemble_linear()[0].seq) == "AUUAggccggTTOO" # FIXME: This is the expected result, see def in utils assert asm.assemble_linear()[0].seq.crick.startswith("UUAA") primer1 = Primer("ATAUUA") @@ -1603,19 +1603,19 @@ def test_insertion_assembly(): # Insertion of linear sequence into linear sequence (like # homologous recombination of PCR product with homology arms in genome) - a = Dseqrecord("1CGTACGCACAxxxxCGTACGCACAC2") - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4") + a = Dseqrecord("CGTACGCACA1111CGTACGCACAC") + b = Dseqrecord("CGTACGCACA2222CGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) # All possibilities, including the single insertions results = [ - "1CGTACGCACAyyyyCGTACGCACAxxxxCGTACGCACAC2", - "1CGTACGCACAyyyyCGTACGCACAC2", - "3CGTACGCACAxxxxCGTACGCACAyyyyCGTACGCACAT4", - "1CGTACGCACAxxxxCGTACGCACAyyyyCGTACGCACAC2", - "3CGTACGCACAxxxxCGTACGCACAT4", - "3CGTACGCACAyyyyCGTACGCACAxxxxCGTACGCACAT4", + "CGTACGCACA2222CGTACGCACA1111CGTACGCACAC", + "CGTACGCACA2222CGTACGCACAC", + "CGTACGCACA1111CGTACGCACA2222CGTACGCACAT", + "CGTACGCACA1111CGTACGCACA2222CGTACGCACAC", + "CGTACGCACA1111CGTACGCACAT", + "CGTACGCACA2222CGTACGCACA1111CGTACGCACAT", ] assembly_products = [ @@ -1627,21 +1627,21 @@ def test_insertion_assembly(): # TODO: debatable whether this kind of homologous recombination should happen, or how # the overlap restrictions should be applied. - a = Dseqrecord("1CGTACGCACAxxxxC2") - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4") + a = Dseqrecord("CGTACGCACA1111C") + b = Dseqrecord("CGTACGCACA2222CGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) - results = ["1CGTACGCACAyyyyCGTACGCACAxxxxC2"] + results = ["CGTACGCACA2222CGTACGCACA1111C"] for assem, result in zip(f.get_insertion_assemblies(), results): assert result == str(assembly.assemble([a, b], assem).seq) - a = Dseqrecord("1CGTACGCACAxxxxC2") - b = Dseqrecord("3CGTACGCACAyyyyT4") + a = Dseqrecord("CGTACGCACA1111C") + b = Dseqrecord("CGTACGCACA2222T") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) assert len(f.get_insertion_assemblies()) == 0 # Does not work for circular molecules - a = Dseqrecord("1CGTACGCACAxxxxCGTACGCACAC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4", circular=True) + a = Dseqrecord("CGTACGCACA1111CGTACGCACAC", circular=True) + b = Dseqrecord("CGTACGCACA2222CGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1649,8 +1649,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("1CGTACGCACAxxxxC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyCGTACGCACAT4", circular=True) + a = Dseqrecord("CGTACGCACA1111C", circular=True) + b = Dseqrecord("CGTACGCACA2222CGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1658,8 +1658,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("1CGTACGCACAxxxxC2", circular=True) - b = Dseqrecord("3CGTACGCACAyyyyT4", circular=True) + a = Dseqrecord("CGTACGCACA1111C", circular=True) + b = Dseqrecord("CGTACGCACA2222T", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1752,6 +1752,7 @@ def test_assemble_function(): f2.features = [f2_feat1, f2_feat2] for shift in range(len(f1)): + f1_shifted = f1.shifted(shift) # Re-order the features so that TTT is first @@ -1839,6 +1840,7 @@ def test_assemble_function(): assembly_plan = [ (1, 2, loc_end, loc_start), ] + # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert (fragments[0] + fragments[1]).seq == assembly.assemble( fragments, assembly_plan ).seq @@ -1848,6 +1850,7 @@ def test_assemble_function(): (1, 2, loc_end, loc_start), (2, 1, loc_end, loc_start), ] + # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert (fragments[0] + fragments[1]).looped().seq == assembly.assemble( fragments, assembly_plan ).seq @@ -2125,7 +2128,7 @@ def test_ligation_assembly(): # Blunt ligation combined with sticky end fragments = Dseqrecord("AAAGAATTCAAA").cut(EcoRI) - result = assembly.ligation_assembly(fragments, allow_blunt=True) + result = assembly.ligation_assembly(fragments, allow_blunt=True) # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. result_str = [str(x.seq) for x in result] assert sorted(result_str) == sorted(["AAAGAATTCAAA"]) assert result[0].circular @@ -2147,7 +2150,7 @@ def test_blunt_assembly(): use_fragment_order=False, ) - assert dseqrecord_list_to_dseq_list(asm.assemble_linear()) == [(b + a).seq] + assert dseqrecord_list_to_dseq_list(asm.assemble_linear()) == [(b + a).seq] # FIXME: The assert below fails in the Sanity check on line 770 in assembly2, but gives the expected result. assert asm.assemble_circular() == [] # Circular assembly @@ -2572,3 +2575,5 @@ def test_common_handle_insertion_fragments(): assert [genome] + inserts == assembly.common_handle_insertion_fragments( genome, inserts ) + +# pytest.main([__file__, "-vvv", "-s"]) From 56814d903bd2e3a1923581e855b03a02dce7683d Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Thu, 25 Sep 2025 07:51:31 +0100 Subject: [PATCH 17/36] Updated docstrings in Dseq class for clarity, work in progress --- src/pydna/dseq.py | 180 +++++++++++++++++++++++++++------------------- 1 file changed, 107 insertions(+), 73 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 15dc6550..499f0bf0 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -1,10 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2013-2023 by Björn Johansson. All rights reserved. -# This code is part of the Python-dna distribution and governed by its -# license. Please see the LICENSE.txt file that should have been included -# as part of this package. """Provides the Dseq class for handling double stranded DNA sequences. Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class @@ -20,15 +16,17 @@ import sys as _sys import math as _math import inspect as _inspect +from typing import List as _List, Tuple as _Tuple, Union as _Union -from pydna.seq import Seq as _Seq -from Bio.Seq import _translate_str, _SeqAbstractBaseClass +from Bio.Restriction import RestrictionBatch as _RestrictionBatch +from Bio.Restriction import CommOnly -from pydna._pretty import pretty_str as _pretty_str from seguid import ldseguid as _ldseguid from seguid import cdseguid as _cdseguid -# from pydna.utils import complement as _complement +from pydna.seq import Seq as _Seq +from Bio.Seq import _translate_str, _SeqAbstractBaseClass +from pydna._pretty import pretty_str as _pretty_str from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap @@ -40,58 +38,80 @@ from pydna.utils import to_3tail_table from pydna.common_sub_strings import common_sub_strings as _common_sub_strings from pydna.common_sub_strings import terminal_overlap as _terminal_overlap -from Bio.Restriction import RestrictionBatch as _RestrictionBatch -from Bio.Restriction import CommOnly - - from pydna.types import DseqType, EnzymesType, CutSiteType -from typing import List as _List, Tuple as _Tuple, Union as _Union - class CircularString(str): """ A circular string: indexing and slicing wrap around the origin (index 0). - Examples: - s = CircularString("ABCDE") - - # Integer indexing (wraps) - assert s[7] == "C" # 7 % 5 == 2 - assert s[-1] == "E" - - # Forward circular slices (wrap when stop <= start) - assert s[3:1] == "DEA" # 3,4,0 - assert s[1:1] == "BCDE" # full turn starting at 1, excluding 1 on next lap - assert s[:] == "ABCDE" # full string - assert s[::2] == "ACE" # every 2nd char, no wrap needed - assert s[4:2:2] == "EA" # 4,0 - - # Reverse circular slices - assert s[1:1:-1] == "BAEDC" # full reverse circle starting at 1 - assert s[2:4:-1] == "CBA" # 2,1,0 - - # Steps > 1 and negatives work with wrap as expected - assert s[0:0:2] == "ACE" - assert s[0:0:-2] == "AECBD" + Examples + -------- + s = CircularString("ABCDE") + + # Integer indexing (wraps) + assert s[7] == "C" # 7 % 5 == 2 + assert s[-1] == "E" + + # Forward circular slices (wrap when stop <= start) + assert s[3:1] == "DEA" # 3,4,0 + assert s[1:1] == "BCDE" # full turn starting at 1, excluding 1 on next lap + assert s[:] == "ABCDE" # full string + assert s[::2] == "ACE" # every 2nd char, no wrap needed + assert s[4:2:2] == "EA" # 4,0 + + # Reverse circular slices + assert s[1:1:-1] == "BAEDC" # full reverse circle starting at 1 + assert s[2:4:-1] == "CBA" # 2,1,0 + + # Steps > 1 and negatives work with wrap as expected + assert s[0:0:2] == "ACE" + assert s[0:0:-2] == "AECBD" """ - def __new__(cls, value): + def __new__(cls, value: str): + """ + Create a new instance of CircularString. + + For immutable built-in types like str, object initialization must be + performed in __new__ rather than __init__. The __init__ method is called + *after* the immutable value has already been created, which means it + cannot alter the underlying string content. By overriding __new__, we + ensure that the desired value is passed directly into the construction + of the string object before it becomes immutable. + """ return super().__new__(cls, value) def __getitem__(self, key): + """ + Return a portion of the sequence, supporting both indexing and slicing. + + This method is called whenever the object is accessed with square brackets, + e.g. obj[i] or obj[i:j]. If `key` is an integer, the method returns the + element at that position. If `key` is a slice, it returns a new instance of + the subclass containing the corresponding subsequence. + + This method has been modified to return subsequences assuming that the + string is circular. + """ n = len(self) if n == 0: + # If the circular string is empty. # Behave like str: indexing raises, slicing returns empty if isinstance(key, slice): return self.__class__("") raise IndexError("CircularString index out of range (empty string)") if isinstance(key, int): - # Wrap single index + # If obj[i] is called, return the character at that position. + # The difference from the normal string class is that the index + # wraps around, so it is never out of range. return super().__getitem__(key % n) if isinstance(key, slice): + # A slice object has start, stop and step properties, where step + # indicates taking for example every 3rd character if step == 3. + # str accepts None for each property. start, stop, step = key.start, key.stop, key.step step = 1 if step is None else step if step == 0: @@ -101,7 +121,10 @@ def __getitem__(self, key): if step > 0: start = 0 if start is None else start stop = n if stop is None else stop - # Ensure we move forward; if stop <= start, wrap once. + # A special case is when stop <= start. + # A normal string returns "", but we return the slice + # going forward around the end of the string to start. + # or if stop <= start, go forward wrap once. while stop <= start: stop += n rng = range(start, stop, step) @@ -116,6 +139,7 @@ def __getitem__(self, key): # Map to modulo-n and collect # Cap at one full lap for safety (no infinite loops) + # This part could probably be simplified if step was not supported. limit = n if step % n == 0 else n * 2 # generous cap for large steps out = [] count = 0 @@ -132,58 +156,70 @@ def __getitem__(self, key): class Dseq(_Seq): - """Dseq holds information for a double stranded DNA fragment. + """Dseq describes a double stranded DNA fragment, linear or circular. + + Dseq can be initiated in two ways, uisng two strings, each representing the + Watson (upper, sense) strand, the Crick (lower, antisense) strand and an + optional value describing the stagger betwen the strands on the left side (ovhg). + + Alternatively, a single string represenation using dsIUPAC codes can be used. + If a single string is used, the letters of that string are interpreted as base + pairs rather than single bases. For example "A" would indicate the basepair + "A/T". An expanded IUPAC code is used where the letters PEXI have been assigned + to GATC on the Watson strand with no paring base on the Crick strand G/"", A/"", + T/"" and C/"". The letters QFZJ have been assigned the opposite base pairs with + an empty Watson strand ""/G, ""/A, ""/T, and ""/C. + + :: + + PEXIGATCQFZJ would indicate the linear double-stranded fragment: + + GATCGATC + CTAGCTAG + - Dseq also holds information describing the topology of - the DNA fragment (linear or circular). Parameters ---------- watson : str - a string representing the watson (sense) DNA strand. + a string representing the Watson (sense) DNA strand or a basepair + represenation. crick : str, optional - a string representing the crick (antisense) DNA strand. + a string representing the Crick (antisense) DNA strand. ovhg : int, optional A positive or negative number to describe the stagger between the - watson and crick strands. + Watson and Crick strands. see below for a detailed explanation. - linear : bool, optional - True indicates that sequence is linear, False that it is circular. - circular : bool, optional True indicates that sequence is circular, False that it is linear. Examples -------- - Dseq is a subclass of the Biopython Seq object. It stores two - strings representing the watson (sense) and crick(antisense) strands. - two properties called linear and circular, and a numeric value ovhg - (overhang) describing the stagger for the watson and crick strand - in the 5' end of the fragment. + Dseq is a subclass of the Biopython Bio.Seq.Seq class. The constructor + can accept two strings representing the Watson (sense) and Crick(antisense) + DNA strands. These are interpreted as single stranded DNA. There is a check + for complementarity between the strands. - The most common usage is probably to create a Dseq object as a - part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). + If the DNA molecule is staggered on the left side, an integer ovhg + (overhang) must be given, describing the stagger between the Watson and Crick strand + in the 5' end of the fragment. - There are three ways of creating a Dseq object directly listed below, but you can also - use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: + Additionally, the optional boolean parameter circular can be given to indicate if the + DNA molecule is circular. - Only one argument (string): + The most common usage of the Dseq class is probably not to use it directly, but to + create it as part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). + This works in the same way as for the relationship between the :class:`Bio.Seq.Seq` and + :class:`Bio.SeqRecord.SeqRecord` classes in Biopython. - >>> from pydna.dseq import Dseq - >>> Dseq("aaa") - Dseq(-3) - aaa - ttt - - The given string will be interpreted as the watson strand of a - blunt, linear double stranded sequence object. The crick strand - is created automatically from the watson strand. + There are multiple ways of creating a Dseq object directly listed below, but you can also + use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: - Two arguments (string, string): + Two arguments (string, string), no overhang provided: >>> from pydna.dseq import Dseq >>> Dseq("gggaaat","ttt") @@ -191,16 +227,14 @@ class Dseq(_Seq): gggaaat ttt - If both watson and crick are given, but not ovhg an attempt - will be made to find the best annealing between the strands. - There are limitations to this. For long fragments it is quite - slow. The length of the annealing sequences have to be at least - half the length of the shortest of the strands. + If Watson and Crick are given, but not ovhg, an attempt will be made to find the best annealing + between the strands. There are important limitations to this. If there are several ways to + anneal the strands, this will fail. For long fragments it is quite slow. Three arguments (string, string, ovhg=int): The ovhg parameter is an integer describing the length of the - crick strand overhang in the 5' end of the molecule. + Crick strand overhang in the 5' end of the molecule. The ovhg parameter controls the stagger at the five prime end:: From 24969cd1f4602e6972f30021f636a424e3d67634 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Thu, 25 Sep 2025 17:17:26 +0100 Subject: [PATCH 18/36] fix doctests --- src/pydna/dseq.py | 59 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 499f0bf0..700de4ef 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -158,7 +158,7 @@ def __getitem__(self, key): class Dseq(_Seq): """Dseq describes a double stranded DNA fragment, linear or circular. - Dseq can be initiated in two ways, uisng two strings, each representing the + Dseq can be initiated in two ways, using two strings, each representing the Watson (upper, sense) strand, the Crick (lower, antisense) strand and an optional value describing the stagger betwen the strands on the left side (ovhg). @@ -233,8 +233,8 @@ class Dseq(_Seq): Three arguments (string, string, ovhg=int): - The ovhg parameter is an integer describing the length of the - Crick strand overhang in the 5' end of the molecule. + The ovhg parameter is an integer describing the length of the Crick strand overhang on the + left side (the 5' end of Watson strand). The ovhg parameter controls the stagger at the five prime end:: @@ -257,29 +257,29 @@ class Dseq(_Seq): Example of creating Dseq objects with different amounts of stagger: - >>> Dseq(watson="agt", crick="actta", ovhg=-2) + >>> Dseq(watson="att", crick="acata", ovhg=-2) Dseq(-7) - agt - attca - >>> Dseq(watson="agt",crick="actta",ovhg=-1) + att + ataca + >>> Dseq(watson="ata",crick="acata",ovhg=-1) Dseq(-6) - agt - attca - >>> Dseq(watson="agt",crick="actta",ovhg=0) + ata + ataca + >>> Dseq(watson="taa",crick="actta",ovhg=0) Dseq(-5) - agt + taa attca - >>> Dseq(watson="agt",crick="actta",ovhg=1) + >>> Dseq(watson="aag",crick="actta",ovhg=1) Dseq(-5) - agt + aag attca >>> Dseq(watson="agt",crick="actta",ovhg=2) Dseq(-5) agt attca - If the ovhg parameter is specified a crick strand also - needs to be supplied, otherwise an exception is raised. + If the ovhg parameter is specified a Crick strand also needs to be supplied, or + an exception is raised. >>> Dseq(watson="agt", ovhg=2) Traceback (most recent call last): @@ -288,22 +288,21 @@ class Dseq(_Seq): else: ValueError: ovhg defined without crick strand! + The shape or topology of the fragment is set by the circular parameter, True or False (default). - The shape of the fragment is set by circular = True, False - - Note that both ends of the DNA fragment has to be compatible to set - circular = True. - - - >>> Dseq("aaa","ttt") + >>> Dseq("aaa", "ttt", ovhg = 0) # A linear sequence by default Dseq(-3) aaa ttt - >>> Dseq("aaa","ttt",ovhg=0) + >>> Dseq("aaa", "ttt", ovhg = 0, circular = False) # A linear sequence if circular is False Dseq(-3) aaa ttt - >>> Dseq("aaa","ttt",ovhg=1) + >>> Dseq("aaa", "ttt", ovhg = 0, circular = True) # A circular sequence + Dseq(o3) + aaa + ttt + >>> Dseq("aaa", "ttt", ovhg=1, circular = True) Dseq(-4) aaa ttt @@ -333,6 +332,18 @@ class Dseq(_Seq): -4 >>> + + dsIUPAC [#]_ is an nn extension to the IUPAC alphabet used to describe ss regions: + + :: + + aaaGATC GATCccc ad-hoc representations + CTAGttt gggCTAG + + QFZJaaaPEXI PEXIcccQFZJ dsIUPAC + + + Coercing to string >>> str(a) From 7473bd88a68a33a6d62ee5ff7887cdc99592a912 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:19:35 +0100 Subject: [PATCH 19/36] removed main chunk --- tests/test_module_fusionpcr.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_module_fusionpcr.py b/tests/test_module_fusionpcr.py index 08df0207..20d11450 100644 --- a/tests/test_module_fusionpcr.py +++ b/tests/test_module_fusionpcr.py @@ -7,6 +7,7 @@ from Bio.SeqFeature import SeqFeature, SimpleLocation from pydna.fusionpcr import fuse_by_pcr + x = Dseqrecord( Dseq("tctggtcaagctgaagggtattc"), features=[SeqFeature(SimpleLocation(5, 10, strand=1), type="misc_feature")] ) @@ -78,7 +79,7 @@ def test_fusionpcr3(): assert eq(result, r) -from Bio.SeqFeature import SeqFeature, SimpleLocation + x = Dseqrecord( Dseq("tctggtcaagctgaagggtattc"), features=[SeqFeature(SimpleLocation(5, 10, strand=1), type="misc_feature")] @@ -90,7 +91,3 @@ def test_fusionpcr3(): seqtuples = [(x, y, z)] for arg in seqtuples: result = fuse_by_pcr(arg, limit=5).pop() - - -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) From 7d7be27a1022d4836675e0f0647b520471bc13a9 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:20:22 +0100 Subject: [PATCH 20/36] moved import --- src/pydna/amplify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pydna/amplify.py b/src/pydna/amplify.py index 1c4fd553..0b097d60 100644 --- a/src/pydna/amplify.py +++ b/src/pydna/amplify.py @@ -27,7 +27,7 @@ import re as _re import copy as _copy import operator as _operator -from pydna.utils import iupac_compl_regex as _iupac_compl_regex +from pydna.alphabet import iupac_compl_regex as _iupac_compl_regex from pydna.utils import anneal_from_left as _anneal_from_left # import os as _os From 561ab90ed46a382d6397c340cf52357c8a18e47f Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:21:23 +0100 Subject: [PATCH 21/36] removed reference to .length property --- src/pydna/seqrecord.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pydna/seqrecord.py b/src/pydna/seqrecord.py index 5c727744..86054ce8 100755 --- a/src/pydna/seqrecord.py +++ b/src/pydna/seqrecord.py @@ -301,7 +301,7 @@ def add_feature( if self.circular: location = _CompoundLocation( ( - _SimpleLocation(x, self.seq.length, strand=strand), + _SimpleLocation(x, len(self.seq), strand=strand), _SimpleLocation(0, y, strand=strand), ) ) From 3b215b03bbd42a72cdb2158e0748ca8e969d693f Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:22:07 +0100 Subject: [PATCH 22/36] removed reference to .length property --- src/pydna/dseqrecord.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pydna/dseqrecord.py b/src/pydna/dseqrecord.py index 9ed5aa5f..4bd195e2 100644 --- a/src/pydna/dseqrecord.py +++ b/src/pydna/dseqrecord.py @@ -328,7 +328,7 @@ def add_feature( location = _CompoundLocation( ( - _SimpleLocation(x, self.seq.length, strand=strand), + _SimpleLocation(x, len(self.seq), strand=strand), _SimpleLocation(0, y, strand=strand), ) ) @@ -846,7 +846,7 @@ def __getitem__(self, sl): f for f in answer.features if ( - _location_boundaries(f.location)[1] <= answer.seq.length + _location_boundaries(f.location)[1] <= len(answer.seq) and _location_boundaries(f.location)[0] < _location_boundaries(f.location)[1] ) From 5a646794fa2bef947e11620a3c7daf596c62dde2 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:23:49 +0100 Subject: [PATCH 23/36] removed code regarding the alphabet, not in the alphabet module --- src/pydna/utils.py | 419 +-------------------------------------------- 1 file changed, 4 insertions(+), 415 deletions(-) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index 780d23e1..5c615e5e 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -6,17 +6,7 @@ # as part of this package. """Miscellaneous functions.""" -from Bio.Data.IUPACData import ambiguous_dna_complement as _ambiguous_dna_complement -from Bio.Seq import _maketrans - -# import shelve as _shelve -# import os as _os import re as _re - -# import logging as _logging -# import base64 as _base64 -# import pickle as _pickle -# import hashlib as _hashlib import keyword as _keyword import collections as _collections import itertools as _itertools @@ -30,7 +20,8 @@ from pydna.codon import weights as _weights from pydna.codon import rare_codons as _rare_codons - +from pydna.alphabet import bp_dict_str +from pydna.alphabet import complement_table_dscode from Bio.SeqFeature import SimpleLocation as _sl from Bio.SeqFeature import CompoundLocation as _cl from Bio.SeqFeature import Location as _Location @@ -40,408 +31,6 @@ # For functions that take str or bytes as input and return str or bytes as output, matching the input type StrOrBytes = _TypeVar("StrOrBytes", str, bytes) -# _module_logger = _logging.getLogger("pydna." + __name__) -# _ambiguous_dna_complement.update({"U": "A"}) -_ambiguous_dna_complement.update( - { # dsIUPAC - "P": "J", # G in top strand, complementary strand empty. - "E": "Z", # A " - "X": "F", # T " - "I": "Q", # C " - "U": "O", # U in top strand, A in complementary strand. - "O": "U", # A in top strand, U in complementary strand. - "J": "P", # top strand empty, G in complementary strand. - "Z": "E", # " A " - "F": "X", # " T " - "Q": "I", # " C " - } -) - -_keys = "".join(_ambiguous_dna_complement.keys()).encode("ASCII") -_values = "".join(_ambiguous_dna_complement.values()).encode("ASCII") -_complement_table = bytes.maketrans(_keys + _keys.lower(), _values + _values.lower()) -_complement_table = _maketrans(_ambiguous_dna_complement) - -to_watson_table = bytes.maketrans( - b"PEXIQFZJpexiqfzj" b"12" b"UuOoGATCgatc", b"GATC gatc " b" ." b"UuAaGATCgatc" -) - -to_crick_table = bytes.maketrans( - _keys + _keys.lower() + b"PEXIQFZJpexiqfzj" b"12" b"UuOoGATCgatc", - _values + _values.lower() + b" CTAG ctag" b". " b"AaUuCTAGctag", -) - -to_N = rd = bytes.maketrans(b"PEXIpexiQFZJqfzjUuOo", b"NNNNNNNNNNNNNNNNNNNN") -to_5tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj") -to_3tail_table = bytes.maketrans(b"GATCgatc", b"PEXIpexi") -to_full_sequence = bytes.maketrans(b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc") - - -iupac_regex = { # IUPAC Ambiguity Codes for Nucleotide Degeneracy and U for Uracile - "A": "(?:A)", - "C": "(?:C)", - "G": "(?:G)", - "T": "(?:T|U)", - "U": "(?:U|T|U)", - "R": "(?:A|G|R)", - "Y": "(?:C|T|Y)", - "S": "(?:G|C|S)", - "W": "(?:A|T|W)", - "K": "(?:G|T|K)", - "M": "(?:A|C|M)", - "B": "(?:C|G|T|B)", - "D": "(?:A|G|T|D)", - "H": "(?:A|C|T|H)", - "V": "(?:A|C|G|V)", - "N": "(?:A|G|C|T|N)", -} - -iupac_compl_regex = { # IUPAC Ambiguity Code complements - "A": "(?:T|U)", - "C": "(?:G)", - "G": "(?:C)", - "T": "(?:A)", - "U": "(?:A)", - "R": "(?:T|C|Y)", - "Y": "(?:G|A|R)", - "S": "(?:G|C|S)", - "W": "(?:A|T|W)", - "K": "(?:C|AM)", - "M": "(?:T|G|K)", - "B": "(?:C|G|A|V)", - "D": "(?:A|C|T|H)", - "H": "(?:A|G|T|D)", - "V": "(?:T|C|G|B)", - "N": "(?:A|G|C|T|N)", -} - - -bp_dict = { - (b"P", b"Q"): b"G", # P / Q >>---> G - (b"E", b"F"): b"A", # E / F >>---> A - (b"X", b"Z"): b"T", # X / Z >>---> T - (b"I", b"J"): b"C", # I / J >>---> C - (b"p", b"q"): b"g", # p / q >>---> g - (b"e", b"f"): b"a", # e / f >>---> a - (b"x", b"z"): b"t", # x / z >>---> t - (b"i", b"j"): b"c", # i / j >>---> c - (b"p", b"Q"): b"g", # p / Q >>---> g - (b"e", b"F"): b"a", # e / F >>---> a - (b"x", b"Z"): b"t", # x / Z >>---> t - (b"i", b"J"): b"c", # i / J >>---> c - (b"P", b"q"): b"G", # P / q >>---> G - (b"E", b"f"): b"A", # E / f >>---> A - (b"X", b"z"): b"T", # X / z >>---> T - (b"I", b"j"): b"C", # I / j >>---> C - # (b"P", b" "): b"P", # P / q >>---> G - # (b"E", b" "): b"E", # E / f >>---> A - # (b"X", b" "): b"X", # X / z >>---> T - # (b"I", b" "): b"I", # I / j >>---> C - (b"Q", b"P"): b"G", # Q / P >>---> G - (b"F", b"E"): b"A", # F / E >>---> A - (b"Z", b"X"): b"T", # Z / X >>---> T - (b"J", b"I"): b"C", # J / I >>---> C - (b"q", b"p"): b"g", # q / p >>---> g - (b"f", b"e"): b"a", # f / e >>---> a - (b"z", b"x"): b"t", # z / x >>---> t - (b"j", b"i"): b"c", # j / i >>---> c - (b"q", b"P"): b"G", # Q / P >>---> G - (b"f", b"E"): b"A", # F / E >>---> A - (b"z", b"X"): b"T", # Z / X >>---> T - (b"j", b"I"): b"C", # J / I >>---> C - (b"Q", b"p"): b"g", # q / p >>---> g - (b"F", b"e"): b"a", # f / e >>---> a - (b"Z", b"x"): b"t", # z / x >>---> t - (b"J", b"i"): b"c", # j / i >>---> c - # (b" ", b"Q"): b"Q", # Q / P >>---> G - # (b" ", b"F"): b"F", # F / E >>---> A - # (b" ", b"Z"): b"Z", # Z / X >>---> T - # (b" ", b"J"): b"J", # J / I >>---> C - (b"G", b" "): b"P", - (b"A", b" "): b"E", - (b"T", b" "): b"X", - (b"C", b" "): b"I", - (b"g", b" "): b"p", - (b"a", b" "): b"e", - (b"t", b" "): b"x", - (b"c", b" "): b"i", - (b" ", b"G"): b"J", - (b" ", b"A"): b"Z", - (b" ", b"T"): b"F", - (b" ", b"C"): b"Q", - (b" ", b"g"): b"j", - (b" ", b"a"): b"z", - (b" ", b"t"): b"f", - (b" ", b"c"): b"q", - (b"G", b"C"): b"G", - (b"A", b"T"): b"A", - (b"T", b"A"): b"T", - (b"C", b"G"): b"C", - (b"g", b"c"): b"g", - (b"a", b"t"): b"a", - (b"t", b"a"): b"t", - (b"c", b"g"): b"c", - (b"G", b"c"): b"G", - (b"A", b"t"): b"A", - (b"T", b"a"): b"T", - (b"C", b"g"): b"C", - (b"g", b"C"): b"g", - (b"a", b"T"): b"a", - (b"t", b"A"): b"t", - (b"c", b"G"): b"c", - (b"U", b"O"): b"U", - (b"O", b"U"): b"A", - (b"u", b"o"): b"u", - (b"o", b"u"): b"a", - (b"u", b"O"): b"u", - (b"O", b"u"): b"A", - (b"U", b"o"): b"U", - (b"o", b"U"): b"a", - (b"U", b"A"): b"U", - (b"A", b"U"): b"O", - (b"u", b"a"): b"u", - (b"a", b"u"): b"o", - (b"u", b"A"): b"u", - (b"A", b"u"): b"O", - (b"U", b"a"): b"U", - (b"a", b"U"): b"o", - (b"M", b"T"): b"A", - (b"m", b"t"): b"a", - (b"M", b"t"): b"A", - (b"m", b"T"): b"a", - (b"T", b"M"): b"K", - (b"t", b"m"): b"k", - (b"T", b"m"): b"K", - (b"t", b"M"): b"k", - (b"M", b"G"): b"C", - (b"m", b"g"): b"c", - (b"M", b"g"): b"C", - (b"m", b"G"): b"c", - (b"G", b"M"): b"K", - (b"g", b"m"): b"k", - (b"G", b"m"): b"K", - (b"g", b"M"): b"k", - (b"R", b"T"): b"A", - (b"r", b"t"): b"a", - (b"R", b"t"): b"A", - (b"r", b"T"): b"a", - (b"T", b"R"): b"Y", - (b"t", b"r"): b"y", - (b"T", b"r"): b"Y", - (b"t", b"R"): b"y", - (b"R", b"C"): b"G", - (b"r", b"c"): b"g", - (b"R", b"c"): b"G", - (b"r", b"C"): b"g", - (b"C", b"R"): b"Y", - (b"c", b"r"): b"y", - (b"C", b"r"): b"Y", - (b"c", b"R"): b"y", - (b"W", b"T"): b"A", - (b"w", b"t"): b"a", - (b"W", b"t"): b"A", - (b"w", b"T"): b"a", - (b"T", b"W"): b"W", - (b"t", b"w"): b"w", - (b"T", b"w"): b"W", - (b"t", b"W"): b"w", - (b"W", b"A"): b"T", - (b"w", b"a"): b"t", - (b"W", b"a"): b"T", - (b"w", b"A"): b"t", - (b"A", b"W"): b"W", - (b"a", b"w"): b"w", - (b"A", b"w"): b"W", - (b"a", b"W"): b"w", - (b"S", b"G"): b"C", - (b"s", b"g"): b"c", - (b"S", b"g"): b"C", - (b"s", b"G"): b"c", - (b"G", b"S"): b"S", - (b"g", b"s"): b"s", - (b"G", b"s"): b"S", - (b"g", b"S"): b"s", - (b"S", b"C"): b"G", - (b"s", b"c"): b"g", - (b"S", b"c"): b"G", - (b"s", b"C"): b"g", - (b"C", b"S"): b"S", - (b"c", b"s"): b"s", - (b"C", b"s"): b"S", - (b"c", b"S"): b"s", - (b"Y", b"G"): b"C", - (b"y", b"g"): b"c", - (b"Y", b"g"): b"C", - (b"y", b"G"): b"c", - (b"G", b"Y"): b"R", - (b"g", b"y"): b"r", - (b"G", b"y"): b"R", - (b"g", b"Y"): b"r", - (b"Y", b"A"): b"T", - (b"y", b"a"): b"t", - (b"Y", b"a"): b"T", - (b"y", b"A"): b"t", - (b"A", b"Y"): b"R", - (b"a", b"y"): b"r", - (b"A", b"y"): b"R", - (b"a", b"Y"): b"r", - (b"K", b"C"): b"G", - (b"k", b"c"): b"g", - (b"K", b"c"): b"G", - (b"k", b"C"): b"g", - (b"C", b"K"): b"M", - (b"c", b"k"): b"m", - (b"C", b"k"): b"M", - (b"c", b"K"): b"m", - (b"K", b"A"): b"T", - (b"k", b"a"): b"t", - (b"K", b"a"): b"T", - (b"k", b"A"): b"t", - (b"A", b"K"): b"M", - (b"a", b"k"): b"m", - (b"A", b"k"): b"M", - (b"a", b"K"): b"m", - (b"V", b"T"): b"A", - (b"v", b"t"): b"a", - (b"V", b"t"): b"A", - (b"v", b"T"): b"a", - (b"T", b"V"): b"B", - (b"t", b"v"): b"b", - (b"T", b"v"): b"B", - (b"t", b"V"): b"b", - (b"V", b"G"): b"C", - (b"v", b"g"): b"c", - (b"V", b"g"): b"C", - (b"v", b"G"): b"c", - (b"G", b"V"): b"B", - (b"g", b"v"): b"b", - (b"G", b"v"): b"B", - (b"g", b"V"): b"b", - (b"V", b"C"): b"G", - (b"v", b"c"): b"g", - (b"V", b"c"): b"G", - (b"v", b"C"): b"g", - (b"C", b"V"): b"B", - (b"c", b"v"): b"b", - (b"C", b"v"): b"B", - (b"c", b"V"): b"b", - (b"H", b"T"): b"A", - (b"h", b"t"): b"a", - (b"H", b"t"): b"A", - (b"h", b"T"): b"a", - (b"T", b"H"): b"D", - (b"t", b"h"): b"d", - (b"T", b"h"): b"D", - (b"t", b"H"): b"d", - (b"H", b"G"): b"C", - (b"h", b"g"): b"c", - (b"H", b"g"): b"C", - (b"h", b"G"): b"c", - (b"G", b"H"): b"D", - (b"g", b"h"): b"d", - (b"G", b"h"): b"D", - (b"g", b"H"): b"d", - (b"H", b"A"): b"T", - (b"h", b"a"): b"t", - (b"H", b"a"): b"T", - (b"h", b"A"): b"t", - (b"A", b"H"): b"D", - (b"a", b"h"): b"d", - (b"A", b"h"): b"D", - (b"a", b"H"): b"d", - (b"D", b"T"): b"A", - (b"d", b"t"): b"a", - (b"D", b"t"): b"A", - (b"d", b"T"): b"a", - (b"T", b"D"): b"H", - (b"t", b"d"): b"h", - (b"T", b"d"): b"H", - (b"t", b"D"): b"h", - (b"D", b"C"): b"G", - (b"d", b"c"): b"g", - (b"D", b"c"): b"G", - (b"d", b"C"): b"g", - (b"C", b"D"): b"H", - (b"c", b"d"): b"h", - (b"C", b"d"): b"H", - (b"c", b"D"): b"h", - (b"D", b"A"): b"T", - (b"d", b"a"): b"t", - (b"D", b"a"): b"T", - (b"d", b"A"): b"t", - (b"A", b"D"): b"H", - (b"a", b"d"): b"h", - (b"A", b"d"): b"H", - (b"a", b"D"): b"h", - (b"B", b"G"): b"C", - (b"b", b"g"): b"c", - (b"B", b"g"): b"C", - (b"b", b"G"): b"c", - (b"G", b"B"): b"V", - (b"g", b"b"): b"v", - (b"G", b"b"): b"V", - (b"g", b"B"): b"v", - (b"B", b"C"): b"G", - (b"b", b"c"): b"g", - (b"B", b"c"): b"G", - (b"b", b"C"): b"g", - (b"C", b"B"): b"V", - (b"c", b"b"): b"v", - (b"C", b"b"): b"V", - (b"c", b"B"): b"v", - (b"B", b"A"): b"T", - (b"b", b"a"): b"t", - (b"B", b"a"): b"T", - (b"b", b"A"): b"t", - (b"A", b"B"): b"V", - (b"a", b"b"): b"v", - (b"A", b"b"): b"V", - (b"a", b"B"): b"v", - # (b"N", b"C"): b"G", - # (b"n", b"c"): b"g", - # (b"N", b"c"): b"G", - # (b"n", b"C"): b"g", - # (b"C", b"N"): b"N", - # (b"c", b"n"): b"n", - # (b"C", b"n"): b"N", - # (b"c", b"N"): b"n", - # (b"N", b"T"): b"A", - # (b"n", b"t"): b"a", - # (b"N", b"t"): b"A", - # (b"n", b"T"): b"a", - # (b"T", b"N"): b"N", - # (b"t", b"n"): b"n", - # (b"T", b"n"): b"N", - # (b"t", b"N"): b"n", - # (b"N", b"A"): b"T", - # (b"n", b"a"): b"t", - # (b"N", b"a"): b"T", - # (b"n", b"A"): b"t", - # (b"A", b"N"): b"N", - # (b"a", b"n"): b"n", - # (b"A", b"n"): b"N", - # (b"a", b"N"): b"n", - # (b"N", b"G"): b"C", - # (b"n", b"g"): b"c", - # (b"N", b"g"): b"C", - # (b"n", b"G"): b"c", - # (b"G", b"N"): b"N", - # (b"g", b"n"): b"n", - # (b"G", b"n"): b"N", - # (b"g", b"N"): b"n", - (b"N", b"N"): b"N", - (b"n", b"N"): b"n", - (b"N", b"n"): b"N", - (b"n", b"n"): b"n", - # (b" ", b"N"): b"N", - # (b" ", b"n"): b"n", -} - -bp_dict_str = { - (x.decode("ascii"), y.decode("ascii")): z.decode("ascii") - for (x, y), z in bp_dict.items() -} - def three_frame_orfs( dna: str, @@ -697,7 +286,7 @@ def rc(sequence: StrOrBytes) -> StrOrBytes: accepts mixed DNA/RNA """ - return sequence.translate(_complement_table)[::-1] + return sequence.translate(complement_table_dscode)[::-1] def complement(sequence: str): @@ -705,7 +294,7 @@ def complement(sequence: str): accepts mixed DNA/RNA """ - return sequence.translate(_complement_table) + return sequence.translate(complement_table_dscode) # def memorize(filename): From 5e60e2629762991d9c9115abb7f55054b01bb188 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:27:21 +0100 Subject: [PATCH 24/36] broke out the __repr__ code to a function for clarity, reintroduced the check for internal splits in init --- src/pydna/dseq.py | 215 ++++++++++++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 85 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 700de4ef..824d0c7a 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -30,16 +30,87 @@ from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap -from pydna.utils import bp_dict -from pydna.utils import to_watson_table -from pydna.utils import to_crick_table -from pydna.utils import to_full_sequence -from pydna.utils import to_5tail_table -from pydna.utils import to_3tail_table +from pydna.alphabet import bp_dict +from pydna.alphabet import annealing_dict + +# from pydna.utils import bp_dict +# from pydna.utils import annealing_dict +from pydna.alphabet import dscode_to_watson_table +from pydna.alphabet import dscode_to_crick_table +from pydna.alphabet import dscode_to_to_full_sequence_table +from pydna.alphabet import dscode_to_watson_tail_table +from pydna.alphabet import dscode_to_crick_tail_table +from pydna.alphabet import placeholder1 +from pydna.alphabet import placeholder2 +from pydna.alphabet import interval from pydna.common_sub_strings import common_sub_strings as _common_sub_strings from pydna.common_sub_strings import terminal_overlap as _terminal_overlap from pydna.types import DseqType, EnzymesType, CutSiteType +length_limit_for_repr = ( + 30 # Sequences larger than this gets a truncated representation. +) + + +def representation(data=b"", length_limit_for_repr=30): + """ + Two line string representation of a sequence of symbols. + + + Parameters + ---------- + data : TYPE, optional + DESCRIPTION. The default is b"". + + Returns + ------- + TYPE + DESCRIPTION. + + """ + m = _re.match( + b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUONgatcuon])(.*)(?<=[GATCUONgatcuon])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", + data, + ) + result = m.groups() if m else (b"",) * 6 + sticky_left5, sticky_left3, middle, sticky_right5, sticky_right3, single = result + if len(data) > length_limit_for_repr: + sticky_left5 = ( + sticky_left5[:4] + placeholder2 * 2 + sticky_left5[-4:] + if sticky_left5 and len(sticky_left5) > 10 + else sticky_left5 + ) + sticky_left3 = ( + sticky_left3[:4] + placeholder1 * 2 + sticky_left3[-4:] + if sticky_left3 and len(sticky_left3) > 10 + else sticky_left3 + ) + middle = ( + middle[:4] + interval * 2 + middle[-4:] + if middle and len(middle) > 10 + else middle + ) + sticky_right5 = ( + sticky_right5[:4] + placeholder2 * 2 + sticky_right5[-4:] + if sticky_right5 and len(sticky_right5) > 10 + else sticky_right5 + ) + sticky_right3 = ( + sticky_right3[:4] + placeholder1 * 2 + sticky_right3[-4:] + if sticky_right3 and len(sticky_right3) > 10 + else sticky_right3 + ) + r = ( + (sticky_left5 or sticky_left3 or b"") + + (middle or b"") + + (sticky_right5 or sticky_right3 or single or b"") + ) + + return _pretty_str( + f"{r.translate(dscode_to_watson_table).decode().rstrip()}\n" + f"{r.translate(dscode_to_crick_table).decode().rstrip()}" + ) + class CircularString(str): """ @@ -429,8 +500,6 @@ class Dseq(_Seq): """ - trunc = 30 - def __init__( self, watson: _Union[str, bytes], @@ -490,6 +559,8 @@ def __init__( sense = f"{sense:<{max_len}}" # pad on right side to max_len antisense = f"{antisense:<{max_len}}" # pad on right side to max_len + assert len(sense) == len(antisense) + data = bytearray() for w, c in zip(sense, antisense): @@ -501,11 +572,22 @@ def __init__( self._data = bytes(data) self.circular = circular - # self.watson = _pretty_str(watson) - # self.crick = _pretty_str(crick) - self.length = len(self._data) - # self.ovhg = ovhg self.pos = pos + test_data = self._data + if circular: + test_data += self._data[0:1] + msg = "" + counter = 0 + for mobj in _re.finditer( + b"(.{0,3})([PEXIpexi][QFZJqfzj]|[QFZJqfzj][PEXIpexi])(.{0,3})", test_data + ): + chunk = mobj.group() + msg += f"[{mobj.start()}:{mobj.end()}]\n{representation(chunk)}\n\n" + counter += 1 + if counter: + raise ValueError( + f"Molecule is internally split in {counter} location(s):\n\n{msg}".strip() + ) @classmethod def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs): @@ -516,7 +598,6 @@ def quick(cls, data: bytes, *args, circular=False, pos=0, **kwargs): """ obj = cls.__new__(cls) obj.circular = circular - obj.length = len(data) obj.pos = pos obj._data = data return obj @@ -526,13 +607,11 @@ def from_string( cls, dna: str, *args, - # linear=True, circular=False, **kwargs, ): - obj = cls.__new__(cls) # Does not call __init__ + obj = cls.__new__(cls) obj.circular = circular - obj.length = len(dna) obj.pos = 0 obj._data = dna.encode("ASCII") return obj @@ -558,7 +637,6 @@ def from_representation(cls, dsdna: str, *args, **kwargs): print(f"Base mismatch in representation {err}") raise obj._data = bytes(data) - obj.length = len(data) return obj @classmethod @@ -637,7 +715,7 @@ def watson(self): DESCRIPTION. """ - return self._data.translate(to_watson_table).strip().decode("ascii") + return self._data.translate(dscode_to_watson_table).strip().decode("ascii") @property def crick(self): @@ -650,7 +728,7 @@ def crick(self): DESCRIPTION. """ - return self._data.translate(to_crick_table).strip().decode("ascii")[::-1] + return self._data.translate(dscode_to_crick_table).strip().decode("ascii")[::-1] @property def ovhg(self): @@ -680,7 +758,7 @@ def to_blunt_string(self): DESCRIPTION. """ - return self._data.translate(to_full_sequence).decode("ascii") + return self._data.translate(dscode_to_to_full_sequence_table).decode("ascii") __str__ = to_blunt_string @@ -798,52 +876,10 @@ def __eq__(self, other: DseqType) -> bool: def __repr__(self): header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})" - # m = _re.match( - # b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUOgatcuo])(.*)(?<=[GATCUOgatcuo])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", - # self._data, - # ) - m = _re.match( - b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUONgatcuon])(.*)(?<=[GATCUONgatcuon])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", - self._data, - ) - result = m.groups() if m else (b"",) * 6 - sticky_left5, sticky_left3, middle, sticky_right5, sticky_right3, single = ( - result - ) - if len(self) > self.trunc: - sticky_left5 = ( - sticky_left5[:4] + b"22" + sticky_left5[-4:] - if sticky_left5 and len(sticky_left5) > 10 - else sticky_left5 - ) - sticky_left3 = ( - sticky_left3[:4] + b"11" + sticky_left3[-4:] - if sticky_left3 and len(sticky_left3) > 10 - else sticky_left3 - ) - middle = ( - middle[:4] + b".." + middle[-4:] - if middle and len(middle) > 10 - else middle - ) - sticky_right5 = ( - sticky_right5[:4] + b"22" + sticky_right5[-4:] - if sticky_right5 and len(sticky_right5) > 10 - else sticky_right5 - ) - sticky_right3 = ( - sticky_right3[:4] + b"11" + sticky_right3[-4:] - if sticky_right3 and len(sticky_right3) > 10 - else sticky_right3 - ) - r = ( - (sticky_left5 or sticky_left3 or b"") - + (middle or b"") - + (sticky_right5 or sticky_right3 or single or b"") - ) - return _pretty_str( - f"{header}\n{r.translate(to_watson_table).decode().rstrip()}\n{r.translate(to_crick_table).decode().rstrip()}" - ) + + rpr = representation(self._data) + + return _pretty_str(f"{header}\n{rpr}") def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. @@ -940,7 +976,7 @@ def looped(self: DseqType) -> DseqType: junction = b"".join( [ - bp_dict.get((bytes([w]), bytes([c])), b"-") + annealing_dict.get((bytes([w]), bytes([c])), b"-") for w, c in zip(sticky_left_just, sticky_right_just) ] ) @@ -1125,10 +1161,10 @@ def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: assert len(sticky_self_just) == len(sticky_other_just) if perfectmatch: - + # breakpoint() junction = b"".join( [ - bp_dict.get((bytes([w]), bytes([c])), b"-") + annealing_dict.get((bytes([w]), bytes([c])), b"-") for w, c in zip(sticky_self_just, sticky_other_just) ] ) @@ -1139,8 +1175,9 @@ def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: else: result = _terminal_overlap( - sticky_self.translate(to_full_sequence).decode("ascii"), - sticky_other.translate(to_full_sequence).decode("ascii") + "@", + sticky_self.translate(dscode_to_to_full_sequence_table).decode("ascii"), + sticky_other.translate(dscode_to_to_full_sequence_table).decode("ascii") + + "@", limit=1, ) @@ -1154,7 +1191,7 @@ def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: junction = b"".join( [ - bp_dict.get((bytes([w]), bytes([c])), b"-") + annealing_dict.get((bytes([w]), bytes([c])), b"-") for w, c in zip(sticky_self, sticky_other) ] ) @@ -1530,7 +1567,7 @@ def cas9(self, RNA: str) -> _Tuple[slice, ...]: cuts = [0] for m in _re.finditer(bRNA, self._data): cuts.append(m.start() + 17) - cuts.append(self.length) + cuts.append(len(self)) slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:])) return slices @@ -1869,7 +1906,7 @@ def cast_to_ds_right(self): """ def replace(m): - return m.group(1).translate(to_full_sequence) + return m.group(1).translate(dscode_to_to_full_sequence_table) # Not using f-strings below to avoid bytes/string conversion return self.__class__( @@ -1900,7 +1937,7 @@ def cast_to_ds_left(self): """ def replace(m): - return m.group(1).translate(to_full_sequence) + return m.group(1).translate(dscode_to_to_full_sequence_table) # Not using f-strings below to avoid bytes/string conversion return self.__class__( @@ -1970,14 +2007,14 @@ def _shed_ss_dna(self, length): for x, y in watsonnicks: stuffer = new[x:y] - ss = Dseq.quick(stuffer.translate(to_5tail_table)) - new = new[:x] + stuffer.translate(to_3tail_table) + new[y:] + ss = Dseq.quick(stuffer.translate(dscode_to_watson_tail_table)) + new = new[:x] + stuffer.translate(dscode_to_crick_tail_table) + new[y:] watsonstrands.append((x, y, ss)) for x, y in cricknicks: stuffer = new[x:y] - ss = Dseq.quick(stuffer.translate(to_3tail_table)) - new = new[:x] + stuffer.translate(to_5tail_table) + new[y:] + ss = Dseq.quick(stuffer.translate(dscode_to_crick_tail_table)) + new = new[:x] + stuffer.translate(dscode_to_watson_tail_table) + new[y:] crickstrands.append((x, y, ss)) ordered_strands = sorted(watsonstrands + crickstrands) @@ -1986,8 +2023,8 @@ def _shed_ss_dna(self, length): for x, y, ss in ordered_strands: seq = ( - ss._data[::-1].translate(to_watson_table).strip() - or ss._data.translate(to_crick_table).strip() + ss._data[::-1].translate(dscode_to_watson_table).strip() + or ss._data.translate(dscode_to_crick_table).strip() ) strands.append(_Seq(seq)) @@ -2050,7 +2087,7 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": GttCTTAA """ - if _cuts_overlap(left_cut, right_cut, self.length): + if _cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) if left_cut: @@ -2062,12 +2099,20 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": (right_watson_cut, right_overhang), _ = right_cut else: (right_watson_cut, right_overhang), _ = ( - (self.length, 0), + (len(self), 0), None, ) - table1 = to_5tail_table if left_overhang > 0 else to_3tail_table - table2 = to_5tail_table if right_overhang < 0 else to_3tail_table + table1 = ( + dscode_to_watson_tail_table + if left_overhang > 0 + else dscode_to_crick_tail_table + ) + table2 = ( + dscode_to_watson_tail_table + if right_overhang < 0 + else dscode_to_crick_tail_table + ) left_stck_begin = min(left_watson_cut, left_watson_cut - left_overhang) left_stck_end = left_stck_begin + abs(left_overhang) From 3a536b1e01b0f71c6ee27d629ff319a9fe641ba0 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 1 Oct 2025 08:29:07 +0100 Subject: [PATCH 25/36] alphabet related code in src/pydna/alphabet.py --- src/pydna/alphabet.py | 329 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 src/pydna/alphabet.py diff --git a/src/pydna/alphabet.py b/src/pydna/alphabet.py new file mode 100644 index 00000000..60b8eef2 --- /dev/null +++ b/src/pydna/alphabet.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Six multiline strings are defined in this file. + +codestrings["un_ambiguous_ds_dna"] +codestrings["ambiguous_ds_dna"] +codestrings["ds_rna"] +codestrings["single_stranded_dna_rna"] +codestrings["mismatched_dna_rna"] +codestrings["loops_dna_rna"] + +Each string has five lines and describe the DNA alphabet +used in Pydna in this form: + +W 1 +| 2 +C 3 + 4 +S 5 + +W (line 1) and C (line 2) are complementary bases in a double stranded DNA molecule and S (line 5) are +the symbols of the alphabet used to describe the base pair above the symbol. + +""" + +emptyspace = chr(32) + +codestrings = dict() + + +codestrings[ + "un_ambiguous_ds_dna" +] = """\ +GATC +|||| +CTAG + +GATC +""" + +codestrings[ + "ambiguous_ds_dna" +] = """\ +RYMKSWHBVDN +||||||||||| +YRKMSWDVBHN + +RYMKSWHBVDN +""" + +codestrings[ + "ds_rna" +] = """\ +UA +|| +AU + +UO +""" + +codestrings["single_stranded_dna_rna"] = ( + """\ +GATC....U. +|||||||||| +....CTAG.U + +PEXIQFZJ$% +""".replace( + ".", emptyspace + ) +) + +codestrings[ + "mismatched_dna_rna" +] = """\ +AAACCCGGGTTTUUUGCT +|||||||||||||||||| +ACGACTAGTCGTGCTUUU + +!#{}&*()<>@:?[]=_; +""" + +codestrings[ + "loops_dna_rna" +] = """\ +-----AGCTU +|||||||||| +AGCTU----- + +0123456789 +""" + +keys = set( + ( + "un_ambiguous_ds_dna", + "ambiguous_ds_dna", + "ds_rna", + "single_stranded_dna_rna", + "mismatched_dna_rna", + "loops_dna_rna", + ) +) + +assert set(codestrings.keys()) == keys + +not_dscode = "lL\"',-./\\^`|+~" + +for name, codestring in codestrings.items(): + + # This loops all codestrings and checks for consistency of format. + lines = codestring.splitlines() + + assert len(lines) == 5 + + # We want the Watson, Crick and Symbol lines only + # Second line has to be pipes ("|") and fourth has to be empty + + watsn, pipes, crick, empty, symbl = lines + + assert all(ln.isascii() for ln in (watsn, crick, symbl)) + + assert all(ln.isupper() for ln in (watsn, crick, symbl) if ln.isalpha()) + + # check so that pipes contain only "|" + assert set(pipes) == set("|") + + # check so strings are the same length + assert all(len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl)) + + # These characters are not used. + assert not any([letter in not_dscode for letter in symbl]) + + +codes = dict() + +for name, codestring in codestrings.items(): + + lines = codestring.splitlines() + + watsons, _, cricks, _, symbols = lines + + codes[name] = dct = dict() + + for watson, crick, symbol in zip(watsons, cricks, symbols): + if watson == emptyspace: + dct[watson, crick.lower()] = symbol.lower() + dct[watson, crick.upper()] = symbol.upper() + else: + dct[watson.upper(), crick.upper()] = symbol.upper() + dct[watson.upper(), crick.lower()] = symbol.upper() + dct[watson.lower(), crick.upper()] = symbol.lower() + dct[watson.lower(), crick.lower()] = symbol.lower() + + +bp_dict_str = ( + codes["un_ambiguous_ds_dna"] + | codes["ambiguous_ds_dna"] + | codes["ds_rna"] + | codes["single_stranded_dna_rna"] + # | codes["mismatched_dna_rna"] + # | codes["loops_dna_rna"] +) + +bp_dict = { + (w.encode("ascii"), c.encode("ascii")): s.encode("ascii") + for (w, c), s in bp_dict_str.items() +} + +temp = codes["un_ambiguous_ds_dna"] | codes["ds_rna"] + + +annealing_dict_str = dict() + +# The annealing_dict_str is constructed below. This dict contains the information needed +# to tell if two DNA fragments (like a and b below) can anneal. The dict has the form (x, y): s +# Where x and y are bases in a and b and the symbol s is the resulting symbol for the base pair +# that is formed. One element in the dict is ('P', 'Q'): 'G' which matches the first +# of the four new base pairings formed between a and b in the example below. +# +# +# (a) +# gggPEXI (dscode for a) +# +# gggGATC +# ccc +# aaa (b) +# CTAGttt +# +# QFZJaaa (dscode for b) +# +# +# gggGATCaaa (annealing product between a and b) +# cccCTAGttt +# +# This loops through the base pairs where the upper or lower +# positions are empty. (w, c), s would be ("G", " "), "P" +# in the first iteration. +# + +d = codes["single_stranded_dna_rna"] # Alias to make the code below more readable. + +for (x, y), symbol in d.items(): + if y == emptyspace: + other = next(b for a, b in temp if a == x) + symbol_other = d[emptyspace, other] + annealing_dict_str[symbol, symbol_other] = temp[x, other] + annealing_dict_str[symbol_other, symbol] = temp[x, other] + elif x == emptyspace: + other = next(a for a, b in temp if b == y) + symbol_other = d[other, emptyspace] + annealing_dict_str[symbol, symbol_other] = temp[other, y] + annealing_dict_str[symbol_other, symbol] = temp[other, y] + else: + raise ValueError("This should not happen") + +del d + +mixed_case_dict = ( + dict() +) # This dict will contain upper and lower case symbols annealing_dict_str + +for (x, y), symbol in annealing_dict_str.items(): + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + +annealing_dict_str = ( + annealing_dict_str | mixed_case_dict +) # Add mixed case entries to the dict + +# A bytestr version of the annealing_dict_str +annealing_dict = { + (x.encode("ascii"), y.encode("ascii")): s.encode("ascii") + for (x, y), s in annealing_dict_str.items() +} + +dscode_sense = [] +dscode_compl = [] +watson = [] +crick = [] + +for (w, c), s in bp_dict.items(): + + if w.isupper() and c.islower() or w.islower() and c.isupper(): + continue + + dscode_sense.append(s) + dscode_compl.append(bp_dict[c, w]) + watson.append(w) + crick.append(c) + +complement_table_dscode = bytes.maketrans( + b"".join(dscode_sense), b"".join(dscode_compl) +) + +placeholder1, placeholder2, interval, empty_bs = ( + b"~", + b"+", + b".", + emptyspace.encode("ascii"), +) + +for bstring in placeholder1, placeholder2, interval: + assert all(letter in not_dscode.encode("ascii") for letter in bstring) + +dscode_to_watson_table = bytes.maketrans( + b"".join(dscode_sense) + placeholder1 + placeholder2, + b"".join(watson) + empty_bs + interval, +) + +dscode_to_crick_table = bytes.maketrans( + b"".join(dscode_sense) + placeholder1 + placeholder2, + b"".join(crick) + interval + empty_bs, +) + + +watson_tail_letter_dict = { + (w.encode("ascii")): s.encode("ascii") + for (w, c), s in codes["single_stranded_dna_rna"].items() + if c.isspace() +} + +from_letters = b"".join(watson_tail_letter_dict.keys()) + +to_letters = b"".join(watson_tail_letter_dict.values()) + +dscode_to_crick_tail_table = bytes.maketrans(from_letters, to_letters) +# dscode_to_crick_tail_table = bytes.maketrans(b"GATCgatc", b"PEXIpexi") + + +crick_tail_letter_dict = { + (c.encode("ascii")): s.encode("ascii") + for (w, c), s in codes["single_stranded_dna_rna"].items() + if w.isspace() +} + +from_letters = b"".join(crick_tail_letter_dict.keys()) + +to_letters = b"".join(crick_tail_letter_dict.values()) + +dscode_to_watson_tail_table = bytes.maketrans(from_letters, to_letters) +dscode_to_watson_tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj") + + +dscode_to_to_full_sequence_table = bytes.maketrans( + b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc" +) + + +iupac_compl_regex = { # IUPAC Ambiguity Code complements + "A": "(?:T|U)", + "C": "(?:G)", + "G": "(?:C)", + "T": "(?:A)", + "U": "(?:A)", + "R": "(?:T|C|Y)", + "Y": "(?:G|A|R)", + "S": "(?:G|C|S)", + "W": "(?:A|T|W)", + "K": "(?:C|AM)", + "M": "(?:T|G|K)", + "B": "(?:C|G|A|V)", + "D": "(?:A|C|T|H)", + "H": "(?:A|G|T|D)", + "V": "(?:T|C|G|B)", + "N": "(?:A|G|C|T|N)", +} From d9acc6006672d67eecfdb27660f1523618440f83 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 14:20:49 +0100 Subject: [PATCH 26/36] mostly comments --- src/pydna/alphabet.py | 589 ++++++++++++++++++++++++++++++------------ 1 file changed, 425 insertions(+), 164 deletions(-) diff --git a/src/pydna/alphabet.py b/src/pydna/alphabet.py index 60b8eef2..6c634d96 100644 --- a/src/pydna/alphabet.py +++ b/src/pydna/alphabet.py @@ -1,18 +1,27 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +from collections import namedtuple +import re as _re + """ -Six multiline strings are defined in this file. +Nucleic acid alphabet used in pydna + +This file serves to define the DNA aplhabet used in pydna. Each symbol usually represents a basepair +(two opposing bases in the two antiparalell DNA strands). The alphabet is defined in six literal strings +in this file. These strings serve as the single source of thruth for the alphabet and +a series of dictionaries and translation tebles are constructed from their content. -codestrings["un_ambiguous_ds_dna"] -codestrings["ambiguous_ds_dna"] -codestrings["ds_rna"] -codestrings["single_stranded_dna_rna"] -codestrings["mismatched_dna_rna"] -codestrings["loops_dna_rna"] +The strings have the following names: -Each string has five lines and describe the DNA alphabet -used in Pydna in this form: +- un_ambiguous_ds_dna +- ambiguous_ds_dna +- ds_rna +- single_stranded_dna_rna +- mismatched_dna_rna +- loops_dna_rna + +Each string has five lines following this form: W 1 | 2 @@ -23,16 +32,27 @@ W (line 1) and C (line 2) are complementary bases in a double stranded DNA molecule and S (line 5) are the symbols of the alphabet used to describe the base pair above the symbol. -""" +Line 2 must contain only the pipe character, indicating basepairing and line 4 is empty. +The lines must be of equal length and a series ot tests are performed to ensure the integrity of +the alphabet. -emptyspace = chr(32) +D R IUPAC Single Gaps DNA / RNA +N N extended Strand - = bond mismatches +A A DNA / RNA + • = empty + +GATC UA RYMKSWHBVDN GATC••••U• -----AGCTU AAACCCGGGTTTUUUGCT +|||| || ||||||||||| |||||||||| |||||||||| |||||||||||||||||| +CTAG AU YRKMSWDVBHN ••••CTAG•U AGCTU----- ACGACTAGTCGTGCTUUU -codestrings = dict() +GATC UO RYMKSWHBVDN PEXIQFZJ$% 0123456789 !#{}&*()<>@:?[]=_; + +""" +# An alias for whitespace +emptyspace = chr(32) -codestrings[ - "un_ambiguous_ds_dna" -] = """\ +un_ambiguous_ds_dna = """\ GATC |||| CTAG @@ -40,19 +60,7 @@ GATC """ -codestrings[ - "ambiguous_ds_dna" -] = """\ -RYMKSWHBVDN -||||||||||| -YRKMSWDVBHN - -RYMKSWHBVDN -""" - -codestrings[ - "ds_rna" -] = """\ +ds_rna = """\ UA || AU @@ -60,21 +68,27 @@ UO """ -codestrings["single_stranded_dna_rna"] = ( - """\ -GATC....U. +ambiguous_ds_dna = """\ +RYMKSWHBVDN +||||||||||| +YRKMSWDVBHN + +RYMKSWHBVDN +""" + +# The dots in the string below are replaced by emptyspace +single_stranded_dna_rna = """\ +GATC••••U• |||||||||| -....CTAG.U +••••CTAG•U PEXIQFZJ$% """.replace( - ".", emptyspace - ) + "•", emptyspace ) -codestrings[ - "mismatched_dna_rna" -] = """\ + +mismatched_dna_rna = """\ AAACCCGGGTTTUUUGCT |||||||||||||||||| ACGACTAGTCGTGCTUUU @@ -82,9 +96,7 @@ !#{}&*()<>@:?[]=_; """ -codestrings[ - "loops_dna_rna" -] = """\ +loops_dna_rna = """\ -----AGCTU |||||||||| AGCTU----- @@ -92,46 +104,57 @@ 0123456789 """ -keys = set( - ( - "un_ambiguous_ds_dna", - "ambiguous_ds_dna", - "ds_rna", - "single_stranded_dna_rna", - "mismatched_dna_rna", - "loops_dna_rna", - ) -) -assert set(codestrings.keys()) == keys +codestrings = { + "un_ambiguous_ds_dna": un_ambiguous_ds_dna, + "ambiguous_ds_dna": ambiguous_ds_dna, + "ds_rna": ds_rna, + "single_stranded_dna_rna": single_stranded_dna_rna, + "mismatched_dna_rna": mismatched_dna_rna, + "loops_dna_rna": loops_dna_rna, +} -not_dscode = "lL\"',-./\\^`|+~" +# This string contains ascii letters not used in the alphabet +letters_not_in_dscode = "lL\"',-./\\^`|+~" for name, codestring in codestrings.items(): # This loops all codestrings and checks for consistency of format. lines = codestring.splitlines() - assert len(lines) == 5 + assert len(lines) == 5, f"{name} does not have 5 lines" # We want the Watson, Crick and Symbol lines only # Second line has to be pipes ("|") and fourth has to be empty watsn, pipes, crick, empty, symbl = lines - assert all(ln.isascii() for ln in (watsn, crick, symbl)) + assert all( + ln.isascii() for ln in (watsn, crick, symbl) + ), f"{name} has non-ascii letters" - assert all(ln.isupper() for ln in (watsn, crick, symbl) if ln.isalpha()) + assert all( + ln.isupper() for ln in (watsn, crick, symbl) if ln.isalpha() + ), f"{name} has non-uppercase letters" # check so that pipes contain only "|" - assert set(pipes) == set("|") + assert set(pipes) == set("|"), f"{name} has non-pipe character(s) in line 2" # check so strings are the same length - assert all(len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl)) + assert all( + len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl) + ), f"{name} has lines of unequal length" # These characters are not used. - assert not any([letter in not_dscode for letter in symbl]) + assert not any( + [letter in letters_not_in_dscode for letter in symbl] + ), f"{name} has chars outside alphabet" +""" +The `codes` dictionary is a dict of dicts containing the information of the +code strings in the form if a dict with string names as keys, each containing a +dict wit this structure: (Watson letter, Crick letter): dscode symbol +""" codes = dict() @@ -141,20 +164,15 @@ watsons, _, cricks, _, symbols = lines - codes[name] = dct = dict() + # d is an alias of codes[name] used in this loop for code clarity. + codes[name] = d = dict() for watson, crick, symbol in zip(watsons, cricks, symbols): - if watson == emptyspace: - dct[watson, crick.lower()] = symbol.lower() - dct[watson, crick.upper()] = symbol.upper() - else: - dct[watson.upper(), crick.upper()] = symbol.upper() - dct[watson.upper(), crick.lower()] = symbol.upper() - dct[watson.lower(), crick.upper()] = symbol.lower() - dct[watson.lower(), crick.lower()] = symbol.lower() + d[watson, crick] = symbol +del d -bp_dict_str = ( +basepair_dict = ( codes["un_ambiguous_ds_dna"] | codes["ambiguous_ds_dna"] | codes["ds_rna"] @@ -163,153 +181,242 @@ # | codes["loops_dna_rna"] ) -bp_dict = { - (w.encode("ascii"), c.encode("ascii")): s.encode("ascii") - for (w, c), s in bp_dict_str.items() -} +annealing_dict = dict() -temp = codes["un_ambiguous_ds_dna"] | codes["ds_rna"] +""" +The annealing_dict_of_str is constructed below. It contains the information needed +to tell if two DNA fragments (like a and b below) can anneal. This of cource only concerns +single stranded regions. + +The dict has the form (x, y): s +Where x and y are bases in a and b and the symbol s is the resulting symbol for the base pair +that is formed. The letters x and y are from the values of the codes["single_stranded_dna_rna"] +dictionary. + +For, example: One key-value pair is ('P', 'Q'): 'G' which matches the first +of the four new base pairings formed between a and b in the example below. + + +(a) +gggPEXI (dscode for a) +gggGATC +ccc + aaa (b) + CTAGttt + + QFZJaaa (dscode for b) + + +gggGATCaaa (annealing product between a and b) +cccCTAGttt + +This loops through the base pairs where the upper or lower +positions are empty. (w, c), s would be ("G", " "), "P" +in the first iteration. +""" + +temp = codes["un_ambiguous_ds_dna"] | codes["ds_rna"] -annealing_dict_str = dict() - -# The annealing_dict_str is constructed below. This dict contains the information needed -# to tell if two DNA fragments (like a and b below) can anneal. The dict has the form (x, y): s -# Where x and y are bases in a and b and the symbol s is the resulting symbol for the base pair -# that is formed. One element in the dict is ('P', 'Q'): 'G' which matches the first -# of the four new base pairings formed between a and b in the example below. -# -# -# (a) -# gggPEXI (dscode for a) -# -# gggGATC -# ccc -# aaa (b) -# CTAGttt -# -# QFZJaaa (dscode for b) -# -# -# gggGATCaaa (annealing product between a and b) -# cccCTAGttt -# -# This loops through the base pairs where the upper or lower -# positions are empty. (w, c), s would be ("G", " "), "P" -# in the first iteration. -# - -d = codes["single_stranded_dna_rna"] # Alias to make the code below more readable. +# Alias to make the code below more readable. +d = codes["single_stranded_dna_rna"] for (x, y), symbol in d.items(): if y == emptyspace: other = next(b for a, b in temp if a == x) symbol_other = d[emptyspace, other] - annealing_dict_str[symbol, symbol_other] = temp[x, other] - annealing_dict_str[symbol_other, symbol] = temp[x, other] + annealing_dict[symbol, symbol_other] = temp[x, other] + annealing_dict[symbol_other, symbol] = temp[x, other] elif x == emptyspace: other = next(a for a, b in temp if b == y) symbol_other = d[other, emptyspace] - annealing_dict_str[symbol, symbol_other] = temp[other, y] - annealing_dict_str[symbol_other, symbol] = temp[other, y] + annealing_dict[symbol, symbol_other] = temp[other, y] + annealing_dict[symbol_other, symbol] = temp[other, y] else: raise ValueError("This should not happen") -del d +del d, temp -mixed_case_dict = ( - dict() -) # This dict will contain upper and lower case symbols annealing_dict_str +temp = {} -for (x, y), symbol in annealing_dict_str.items(): - mixed_case_dict[x.upper(), y.lower()] = symbol.upper() - mixed_case_dict[x.lower(), y.upper()] = symbol.lower() - mixed_case_dict[x.lower(), y.lower()] = symbol.lower() +for (x, y), symbol in annealing_dict.items(): -annealing_dict_str = ( - annealing_dict_str | mixed_case_dict -) # Add mixed case entries to the dict + temp[x, emptyspace] = x + temp[emptyspace, y] = y -# A bytestr version of the annealing_dict_str -annealing_dict = { - (x.encode("ascii"), y.encode("ascii")): s.encode("ascii") - for (x, y), s in annealing_dict_str.items() -} +annealing_dict_w_holes = annealing_dict | temp -dscode_sense = [] -dscode_compl = [] -watson = [] -crick = [] +del temp -for (w, c), s in bp_dict.items(): +""" +A collection of translation tables are a practical way to obtain Watson and Crick +from dscode or the reverse complement strands when needed. - if w.isupper() and c.islower() or w.islower() and c.isupper(): - continue +These are meant to be used by the bytes.translate method. - dscode_sense.append(s) - dscode_compl.append(bp_dict[c, w]) - watson.append(w) - crick.append(c) -complement_table_dscode = bytes.maketrans( - b"".join(dscode_sense), b"".join(dscode_compl) -) +The translation table "complement_table_for_dscode" is used to obtain the +complement of a DNA sequence in dscode format. +""" -placeholder1, placeholder2, interval, empty_bs = ( - b"~", - b"+", - b".", - emptyspace.encode("ascii"), +complement_dict_for_dscode = { + s: basepair_dict[c, w] for (w, c), s in basepair_dict.items() +} + +from_letters = "".join(complement_dict_for_dscode.keys()) +to_letters = "".join(complement_dict_for_dscode.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() + +complement_table_for_dscode = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") ) -for bstring in placeholder1, placeholder2, interval: - assert all(letter in not_dscode.encode("ascii") for letter in bstring) +""" +dscode_to_watson_table and dscode_to_crick_table are used to obtain the Watson +and (reverse) Crick strands from dscode. Four extra letters are added to the +table and used in the pydna.dseq.representation function. These are used +to add range indicators ("..") in the watson or crick strings for +representation of long sequences. + +The four letters are placeholder1, placeholder2, interval, empty_bs +""" + +dscode_sense = "" +dscode_compl = "" +watson = "" +crick = "" + +for (w, c), s in basepair_dict.items(): + dscode_sense += s + dscode_compl += basepair_dict[c, w] + watson += w + crick += c + +dscode_sense += dscode_sense.lower() +dscode_compl += dscode_compl.lower() +watson += watson.lower() +crick += crick.lower() + +placeholder1 = "~" +placeholder2 = "+" +interval = "." + +assert placeholder1 in letters_not_in_dscode +assert placeholder2 in letters_not_in_dscode +assert interval in letters_not_in_dscode dscode_to_watson_table = bytes.maketrans( - b"".join(dscode_sense) + placeholder1 + placeholder2, - b"".join(watson) + empty_bs + interval, + (dscode_sense + placeholder1 + placeholder2).encode("ascii"), + (watson + emptyspace + interval).encode("ascii"), ) dscode_to_crick_table = bytes.maketrans( - b"".join(dscode_sense) + placeholder1 + placeholder2, - b"".join(crick) + interval + empty_bs, + (dscode_sense + placeholder1 + placeholder2).encode("ascii"), + (crick + interval + emptyspace).encode("ascii"), ) watson_tail_letter_dict = { - (w.encode("ascii")): s.encode("ascii") - for (w, c), s in codes["single_stranded_dna_rna"].items() - if c.isspace() + w: s for (w, c), s in codes["single_stranded_dna_rna"].items() if c.isspace() } -from_letters = b"".join(watson_tail_letter_dict.keys()) +from_letters = "".join(watson_tail_letter_dict.keys()) +to_letters = "".join(watson_tail_letter_dict.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() + +dscode_to_crick_tail_table = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") +) +# crick_tail_to_dscode_table = bytes.maketrans(to_letters.encode("ascii"),from_letters.encode("ascii")) + -to_letters = b"".join(watson_tail_letter_dict.values()) +from_letters_full = five_prime_ss_letters = to_letters +to_letters_full = from_letters -dscode_to_crick_tail_table = bytes.maketrans(from_letters, to_letters) -# dscode_to_crick_tail_table = bytes.maketrans(b"GATCgatc", b"PEXIpexi") +d = codes["single_stranded_dna_rna"] crick_tail_letter_dict = { - (c.encode("ascii")): s.encode("ascii") - for (w, c), s in codes["single_stranded_dna_rna"].items() - if w.isspace() + complement_dict_for_dscode[c]: s for (w, c), s in d.items() if w.isspace() } -from_letters = b"".join(crick_tail_letter_dict.keys()) +del d + +from_letters = "".join(crick_tail_letter_dict.keys()) +to_letters = "".join(crick_tail_letter_dict.values()) + +from_letters += from_letters.lower() +to_letters += to_letters.lower() -to_letters = b"".join(crick_tail_letter_dict.values()) +dscode_to_watson_tail_table = bytes.maketrans( + from_letters.encode("ascii"), to_letters.encode("ascii") +) +# watson_tail_to_dscode_table = bytes.maketrans(to_letters.encode("ascii"), from_letters.encode("ascii")) + +three_prime_ss_letters = to_letters +from_letters_full += to_letters +to_letters_full += from_letters + +dscode_to_full_sequence_table = bytes.maketrans( + from_letters_full.encode("ascii"), to_letters_full.encode("ascii") +) -dscode_to_watson_tail_table = bytes.maketrans(from_letters, to_letters) -dscode_to_watson_tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj") +# This loop adds upper and lower case symbols +mixed_case_dict = {} -dscode_to_to_full_sequence_table = bytes.maketrans( - b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc" +for (x, y), symbol in basepair_dict.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() + + if x == emptyspace: + mixed_case_dict[x, y.lower()] = symbol.lower() + mixed_case_dict[x, y.upper()] = symbol.upper() + if y == emptyspace: + mixed_case_dict[x.lower(), y] = symbol.lower() + mixed_case_dict[x.upper(), y] = symbol.upper() + +# Add mixed case entries to the dict +basepair_dict.update(mixed_case_dict) + +# This loop adds upper and lower case symbols +mixed_case_dict = {} + +for (x, y), symbol in annealing_dict.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() +# Add mixed case entries to the dict +annealing_dict.update(mixed_case_dict) + +ds_letters = ( + "".join(codes["un_ambiguous_ds_dna"].values()) + + "".join(codes["ds_rna"].values()) + + "".join(codes["ambiguous_ds_dna"].values()) +) + +ss_letters_watson = "".join( + s for (w, c), s in codes["single_stranded_dna_rna"].items() if c == emptyspace +) +ss_letters_crick = "".join( + s for (w, c), s in codes["single_stranded_dna_rna"].items() if w == emptyspace ) +ds_letters += ds_letters.lower() +ss_letters_watson += ss_letters_watson.lower() +ss_letters_crick += ss_letters_crick.lower() -iupac_compl_regex = { # IUPAC Ambiguity Code complements + +""" +The dict of regexes below cover IUPAC Ambiguity Code complements +and is used in the amplify module. +""" +iupac_compl_regex = { "A": "(?:T|U)", "C": "(?:G)", "G": "(?:C)", @@ -327,3 +434,157 @@ "V": "(?:T|C|G|B)", "N": "(?:A|G|C|T|N)", } + + +# This loop adds upper and lower case symbols +# mixed_case_dict = {} + +for (x, y), symbol in annealing_dict_w_holes.items(): + mixed_case_dict[x.lower(), y.lower()] = symbol.lower() + mixed_case_dict[x.lower(), y.upper()] = symbol.lower() + mixed_case_dict[x.upper(), y.lower()] = symbol.upper() +# Add mixed case entries to the dict +annealing_dict_w_holes.update(mixed_case_dict) + + +def get_parts(datastring: str): + + m = _re.match( + f"([{ss_letters_watson}]*)" # capture group 0 ssDNA in watson strand + f"([{ss_letters_crick}]*)" # capture group 1 ssDNA in crick strand + f"(?=[{ds_letters}])" # no capture, positive lookahead for dsDNA + "(.*)" # captures group 2 everything in the middle + f"(?<=[{ds_letters}])" # no capture,positive look behind for dsDNA + f"([{ss_letters_watson}]*)" # capture group 3 ssDNA in watson strand + f"([{ss_letters_crick}]*)|" # capture group 4 ssDNA in crick strand + f"([{ss_letters_watson}]+)|" # capture group 5 if data contains only upper strand + f"([{ss_letters_crick}]+)", # capture group 6 if data contains only lower strand + datastring, + ) + + result = m.groups() if m else (None, None, None, None, None, None, None) + + result = ["" if e is None else e for e in result] + + field_names = ( + "sticky_left5", + "sticky_left3", + "middle", + "sticky_right3", + "sticky_right5", + "single_watson", + "single_crick", + ) + + fragment = namedtuple("fragment", field_names) + + return fragment(*result) + + +def dsbreaks(data: str): + + wl = _re.escape(five_prime_ss_letters) + cl = _re.escape(three_prime_ss_letters) + + breaks = [] + regex = ( + "(.{0,3})" # context if present + f"([{wl}][{cl}]|[{cl}][{wl}])" # ss chars next to each other + "(.{0,3})" # context if present + ) + for mobj in _re.finditer(regex, data): + chunk = mobj.group() + w, c = representation_tuple(chunk) + breaks.append(f"[{mobj.start()}:{mobj.end()}]\n{w}\n{c}\n") + return breaks + + +def representation_tuple( + datastring: str = "", length_limit_for_repr: int = 30, chunk: int = 4 +): + """ + Two line string representation of a sequence of dscode symbols. + + See pydna.alphabet module for the definition of the pydna dscode + alphabet. The dscode has a symbol (ascii) character for base pairs + and single stranded DNA. + + This function is used by the Dseq.__repr__() method. + + Parameters + ---------- + data : TYPE, optional + DESCRIPTION. The default is "". + + Returns + ------- + str + A two line string containing The Watson and Crick strands. + + """ + + ( + sticky_left5, + sticky_left3, + middle, + sticky_right5, + sticky_right3, + single_watson, + single_crick, + ) = get_parts(datastring) + + if len(datastring) > length_limit_for_repr: + """ + We need to shorten the repr if the sequence is longer than + limit imposed by length_limit_for_repr. + + The representation has three parts, so we divide by three for each part. + + Long DNA strands are interrupted by interval notation, like agc..att + where the two dots indicate intervening hidden sequence. + + + Dseq(-71) + GAAA..AATCaaaa..aaaa + tttt..ttttCTAA..AAAG + + placeholder1, placeholder2 are two letters that are replaced by + interval characters in the upper or lower strands by the translation + """ + + part_limit = length_limit_for_repr // 3 + + if len(sticky_left5) > part_limit: + sticky_left5 = ( + sticky_left5[:chunk] + placeholder2 * 2 + sticky_left5[-chunk:] + ) + + if len(sticky_left3) > part_limit: + sticky_left3 = ( + sticky_left3[:chunk] + placeholder1 * 2 + sticky_left3[-chunk:] + ) + + if len(middle) > part_limit: + middle = middle[:4] + interval * 2 + middle[-4:] + + if len(sticky_right5) > part_limit: + sticky_right5 = ( + sticky_right5[:chunk] + placeholder2 * 2 + sticky_right5[-chunk:] + ) + + if len(sticky_right3) > part_limit: + sticky_right3 = ( + sticky_right3[:chunk] + placeholder1 * 2 + sticky_right3[-chunk:] + ) + + """ + The processed string contains + """ + processed_dscode = (sticky_left5 or sticky_left3) + middle + ( + sticky_right5 or sticky_right3 + ) or single_watson + single_crick + + watson = processed_dscode.translate(dscode_to_watson_table).rstrip() + crick = processed_dscode.translate(dscode_to_crick_table).rstrip() + + return watson, crick From faec78b63644ee00722809dafdeee0daec997fd6 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 14:24:11 +0100 Subject: [PATCH 27/36] Commented out code to be removed. --- src/pydna/assembly2.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index e3e47cdc..3c791961 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -7,7 +7,8 @@ import networkx as _nx import itertools as _itertools from Bio.SeqFeature import SimpleLocation, Location -from Bio.Seq import reverse_complement + +# from Bio.Seq import reverse_complement from Bio.Restriction.Restriction import RestrictionBatch import regex import copy @@ -606,32 +607,33 @@ def primer_template_overlap( def fill_left(seq: _Dseq) -> _Dseq: """Fill the left overhang of a sequence with the complementary sequence.""" - new_watson = seq.watson - new_crick = seq.crick + # new_watson = seq.watson + # new_crick = seq.crick - # Watson 5' overhang - if seq.ovhg < 0: - new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg]) - # Crick 5' overhang - elif seq.ovhg > 0: - new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson + # # Watson 5' overhang + # if seq.ovhg < 0: + # new_crick = new_crick + reverse_complement(seq.watson[: -seq.ovhg]) + # # Crick 5' overhang + # elif seq.ovhg > 0: + # new_watson = reverse_complement(seq.crick[-seq.ovhg :]) + new_watson + # if _Dseq(new_watson, new_crick, 0) != seq.cast_to_ds_left(): - return seq.cast_to_ds_left() # _Dseq(new_watson, new_crick, 0) + return seq.cast_to_ds_left() def fill_right(seq: _Dseq) -> _Dseq: """Fill the right overhang of a sequence with the complementary sequence.""" - new_watson = seq.watson - new_crick = seq.crick + # new_watson = seq.watson + # new_crick = seq.crick - # Watson 3' overhang - watson_ovhg = seq.watson_ovhg() - if watson_ovhg < 0: - new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg]) + # # Watson 3' overhang + # watson_ovhg = seq.watson_ovhg() + # if watson_ovhg < 0: + # new_watson = new_watson + reverse_complement(seq.crick[:-watson_ovhg]) - # Crick 3' overhang - elif watson_ovhg > 0: - new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick + # # Crick 3' overhang + # elif watson_ovhg > 0: + # new_crick = reverse_complement(seq.watson[-watson_ovhg:]) + new_crick return seq.cast_to_ds_right() # _Dseq(new_watson, new_crick, seq.ovhg) @@ -797,7 +799,7 @@ def assemble( ] # Join the left sequence including the overlap with the right sequence without the overlap # we use fill_right / fill_left so that it works for ligation of sticky ends - # breakpoint() + out_dseqrecord = _Dseqrecord( fill_right(out_dseqrecord.seq) + fill_left(fragment.seq)[ From 6bfd6e60a2d13a32fba7750187cd41c260fc309b Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:09:33 +0100 Subject: [PATCH 28/36] CircularString -> CircularBytes (a byte string version). fuction dsbreaks is called from pydna.alphabet in __init__ simplified code overall, fuction get_parts from pydna.alphabet used in several places simpler looped method using get_parts and __add__ improved error message from __add__ --- src/pydna/dseq.py | 727 ++++++++++++++++++++++------------------------ 1 file changed, 344 insertions(+), 383 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 824d0c7a..dae373aa 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -30,201 +30,101 @@ from pydna.utils import rc as _rc from pydna.utils import flatten as _flatten from pydna.utils import cuts_overlap as _cuts_overlap -from pydna.alphabet import bp_dict -from pydna.alphabet import annealing_dict -# from pydna.utils import bp_dict -# from pydna.utils import annealing_dict +from pydna.alphabet import basepair_dict +from pydna.alphabet import annealing_dict from pydna.alphabet import dscode_to_watson_table from pydna.alphabet import dscode_to_crick_table -from pydna.alphabet import dscode_to_to_full_sequence_table -from pydna.alphabet import dscode_to_watson_tail_table -from pydna.alphabet import dscode_to_crick_tail_table -from pydna.alphabet import placeholder1 -from pydna.alphabet import placeholder2 -from pydna.alphabet import interval -from pydna.common_sub_strings import common_sub_strings as _common_sub_strings -from pydna.common_sub_strings import terminal_overlap as _terminal_overlap -from pydna.types import DseqType, EnzymesType, CutSiteType - -length_limit_for_repr = ( - 30 # Sequences larger than this gets a truncated representation. -) +# from pydna.alphabet import watson_tail_to_dscode_table +# from pydna.alphabet import crick_tail_to_dscode_table -def representation(data=b"", length_limit_for_repr=30): - """ - Two line string representation of a sequence of symbols. - +from pydna.alphabet import dscode_to_full_sequence_table +from pydna.alphabet import dscode_to_watson_tail_table +from pydna.alphabet import dscode_to_crick_tail_table - Parameters - ---------- - data : TYPE, optional - DESCRIPTION. The default is b"". +from pydna.alphabet import get_parts +from pydna.alphabet import representation_tuple +from pydna.alphabet import dsbreaks - Returns - ------- - TYPE - DESCRIPTION. +from pydna.common_sub_strings import common_sub_strings as _common_sub_strings +from pydna.types import DseqType, EnzymesType, CutSiteType - """ - m = _re.match( - b"([PEXIpexi]*)([QFZJqfzj]*)(?=[GATCUONgatcuon])(.*)(?<=[GATCUONgatcuon])([PEXIpexi]*)([QFZJqfzj]*)|([PEXIpexiQFZJqfzj]+)", - data, - ) - result = m.groups() if m else (b"",) * 6 - sticky_left5, sticky_left3, middle, sticky_right5, sticky_right3, single = result - if len(data) > length_limit_for_repr: - sticky_left5 = ( - sticky_left5[:4] + placeholder2 * 2 + sticky_left5[-4:] - if sticky_left5 and len(sticky_left5) > 10 - else sticky_left5 - ) - sticky_left3 = ( - sticky_left3[:4] + placeholder1 * 2 + sticky_left3[-4:] - if sticky_left3 and len(sticky_left3) > 10 - else sticky_left3 - ) - middle = ( - middle[:4] + interval * 2 + middle[-4:] - if middle and len(middle) > 10 - else middle - ) - sticky_right5 = ( - sticky_right5[:4] + placeholder2 * 2 + sticky_right5[-4:] - if sticky_right5 and len(sticky_right5) > 10 - else sticky_right5 - ) - sticky_right3 = ( - sticky_right3[:4] + placeholder1 * 2 + sticky_right3[-4:] - if sticky_right3 and len(sticky_right3) > 10 - else sticky_right3 - ) - r = ( - (sticky_left5 or sticky_left3 or b"") - + (middle or b"") - + (sticky_right5 or sticky_right3 or single or b"") - ) - return _pretty_str( - f"{r.translate(dscode_to_watson_table).decode().rstrip()}\n" - f"{r.translate(dscode_to_crick_table).decode().rstrip()}" - ) +# Sequences larger than this gets a truncated representation. +length_limit_for_repr = 30 -class CircularString(str): +class CircularBytes(bytes): """ - A circular string: indexing and slicing wrap around the origin (index 0). - - Examples - -------- - s = CircularString("ABCDE") - - # Integer indexing (wraps) - assert s[7] == "C" # 7 % 5 == 2 - assert s[-1] == "E" - - # Forward circular slices (wrap when stop <= start) - assert s[3:1] == "DEA" # 3,4,0 - assert s[1:1] == "BCDE" # full turn starting at 1, excluding 1 on next lap - assert s[:] == "ABCDE" # full string - assert s[::2] == "ACE" # every 2nd char, no wrap needed - assert s[4:2:2] == "EA" # 4,0 - - # Reverse circular slices - assert s[1:1:-1] == "BAEDC" # full reverse circle starting at 1 - assert s[2:4:-1] == "CBA" # 2,1,0 - - # Steps > 1 and negatives work with wrap as expected - assert s[0:0:2] == "ACE" - assert s[0:0:-2] == "AECBD" + A circular bytes sequence: indexing and slicing wrap around index 0. """ - def __new__(cls, value: str): - """ - Create a new instance of CircularString. - - For immutable built-in types like str, object initialization must be - performed in __new__ rather than __init__. The __init__ method is called - *after* the immutable value has already been created, which means it - cannot alter the underlying string content. By overriding __new__, we - ensure that the desired value is passed directly into the construction - of the string object before it becomes immutable. - """ - return super().__new__(cls, value) + def __new__(cls, value: bytes | bytearray | memoryview): + return super().__new__(cls, bytes(value)) def __getitem__(self, key): - """ - Return a portion of the sequence, supporting both indexing and slicing. - - This method is called whenever the object is accessed with square brackets, - e.g. obj[i] or obj[i:j]. If `key` is an integer, the method returns the - element at that position. If `key` is a slice, it returns a new instance of - the subclass containing the corresponding subsequence. - - This method has been modified to return subsequences assuming that the - string is circular. - """ n = len(self) if n == 0: - # If the circular string is empty. - # Behave like str: indexing raises, slicing returns empty if isinstance(key, slice): - return self.__class__("") - raise IndexError("CircularString index out of range (empty string)") + return self.__class__(b"") + raise IndexError("CircularBytes index out of range (empty bytes)") if isinstance(key, int): - # If obj[i] is called, return the character at that position. - # The difference from the normal string class is that the index - # wraps around, so it is never out of range. return super().__getitem__(key % n) if isinstance(key, slice): - # A slice object has start, stop and step properties, where step - # indicates taking for example every 3rd character if step == 3. - # str accepts None for each property. start, stop, step = key.start, key.stop, key.step step = 1 if step is None else step if step == 0: raise ValueError("slice step cannot be zero") - # Defaults that mimic normal slicing but on an infinite tiling if step > 0: start = 0 if start is None else start stop = n if stop is None else stop - # A special case is when stop <= start. - # A normal string returns "", but we return the slice - # going forward around the end of the string to start. - # or if stop <= start, go forward wrap once. while stop <= start: stop += n rng = range(start, stop, step) else: - # step < 0 start = (n - 1) if start is None else start stop = -1 if stop is None else stop - # Ensure we move backward; if stop >= start, wrap once (backwards). while stop >= start: stop -= n rng = range(start, stop, step) - # Map to modulo-n and collect - # Cap at one full lap for safety (no infinite loops) - # This part could probably be simplified if step was not supported. - limit = n if step % n == 0 else n * 2 # generous cap for large steps - out = [] + limit = n if step % n == 0 else n * 2 + out = bytearray() count = 0 for i in rng: out.append(super().__getitem__(i % n)) count += 1 if count > limit: - break # should never happen with the above normalization - - return self.__class__("".join(out)) + break + return self.__class__(bytes(out)) - # Fallback (shouldn't be reached) return super().__getitem__(key) + def cutaround(self, start: int, length: int) -> bytes: + """ + Return a circular slice of given length starting at index `start`. + Can exceed len(self), wrapping around as needed. + + Examples + -------- + s = CircularBytes(b"ABCDE") + assert s.cutaround(3, 7) == b"DEABCDE" + assert s.cutaround(-1, 4) == b"EABC" + """ + n = len(self) + if n == 0 or length <= 0: + return self.__class__(b"") + + start %= n + out = bytearray() + for i in range(length): + out.append(self[(start + i) % n]) + return self.__class__(bytes(out)) + class Dseq(_Seq): """Dseq describes a double stranded DNA fragment, linear or circular. @@ -509,36 +409,67 @@ def __init__( pos=0, ): if isinstance(watson, bytes): - watson = watson.decode("ASCII") + # watson is decoded to a string if needed. + watson = watson.decode("ascii") if isinstance(crick, bytes): - crick = crick.decode("ASCII") + # crick is decoded to a string if needed. + crick = crick.decode("ascii") if crick is None: if ovhg is not None: - raise ValueError("ovhg defined without crick strand!") - # Giving only the watson string implies a blunt sequence - self._data = bytes(watson, encoding="ASCII") + raise ValueError("ovhg (overhang) defined without a crick strand.") + """ + Giving only the watson string implies inferring the Crick complementary strand + from the Watson sequence. The watson string can contain dscode letters wich will + be interpreted as outlined in the pydna.alphabet module. + + The _data property must be a byte string for compatibility with + Biopython Bio.Seq.Seq + """ + data = watson + self._data = data.encode("ascii") - else: # crick strand given + else: + """ + Crick strand given, ovhg is optional. An important consequence is that the + watson and crick strands are interpreted as single stranded DNA that is + supposed to anneal. + + If ovhg was not given, we try to guess the value below. This will fail + if there are two or more ways to anneal with equal length of the double + stranded part. + """ if ovhg is None: # ovhg not given, try to guess from sequences + limit = int(_math.log(len(watson)) / _math.log(4)) olaps = _common_sub_strings( str(watson).lower(), str(_rc(crick).lower()), - int(_math.log(len(watson)) / _math.log(4)), + limit, ) - if len(olaps) == 0: # No overlaps found, strands do not anneal + + """No overlaps found, strands do not anneal""" + if len(olaps) == 0: raise ValueError( - "Could not anneal the two strands." " Please provide ovhg value" + "Could not anneal the two strands." + f" looked for annealing with at least {limit} basepairs" + " Please provide and overhang value (ovhg parameter)" ) - # We extract the positions and length of the first (longest) overlap, - # since common_sub_strings sorts the overlaps by length, longest first. + """ + We extract the positions and length of the first (longest) overlap, + since common_sub_strings sorts the overlaps by length, longest first. + """ + (pos_watson, pos_crick, longest_olap_length), *rest = olaps - # We see if there is another overlap of the same length - # This means that annealing is ambigous. USer should provide - # and ovhg value. - if any(olap[2] >= longest_olap_length for olap in rest): + """ + We see if there is another overlap of the same length + This means that annealing is ambigous. USer should provide + and ovhg value. + """ + if any( + olap_length >= longest_olap_length for _, _, olap_length in rest + ): raise ValueError( "More than one way of annealing the" " strands. Please provide ovhg value" @@ -546,47 +477,44 @@ def __init__( ovhg = pos_crick - pos_watson - sense = f"{watson:>{len(watson) + ovhg}}" # pad on left side ovhg spaces - antisense = ( - f"{crick[::-1]:>{len(crick) - ovhg}}" # pad on left side -ovhg spaces - ) - - assert sense == (ovhg * " ") + watson - assert antisense == (-ovhg * " ") + crick[::-1] - - max_len = max(len(sense), len(antisense)) # find out which is longer + """ + Pad both strands on left side ovhg spaces + a negative number gives no padding, + """ + sense = ovhg * " " + watson + antisense = -ovhg * " " + crick[::-1] - sense = f"{sense:<{max_len}}" # pad on right side to max_len - antisense = f"{antisense:<{max_len}}" # pad on right side to max_len + max_len = max(len(sense), len(antisense)) + """pad both strands on right side to same size.""" + sense = sense.ljust(max_len) + antisense = antisense.ljust(max_len) + """both strands padded so that bsepairs align""" assert len(sense) == len(antisense) - data = bytearray() + data = [] for w, c in zip(sense, antisense): try: - data.extend(bp_dict[w.encode("ascii"), c.encode("ascii")]) + data.append(basepair_dict[w, c]) except KeyError as err: print(f"Base mismatch in representation {err}") - raise - self._data = bytes(data) + raise ValueError() + data = "".join(data) + self._data = data.encode("ascii") self.circular = circular self.pos = pos - test_data = self._data + if circular: - test_data += self._data[0:1] - msg = "" - counter = 0 - for mobj in _re.finditer( - b"(.{0,3})([PEXIpexi][QFZJqfzj]|[QFZJqfzj][PEXIpexi])(.{0,3})", test_data - ): - chunk = mobj.group() - msg += f"[{mobj.start()}:{mobj.end()}]\n{representation(chunk)}\n\n" - counter += 1 - if counter: + data += data[0:1] + + dsb = dsbreaks(data) + + if dsb: + msg = "".join(dsb) raise ValueError( - f"Molecule is internally split in {counter} location(s):\n\n{msg}".strip() + f"Molecule is internally split in {len(dsb)} location(s):\n\n{msg}".strip() ) @classmethod @@ -613,7 +541,7 @@ def from_string( obj = cls.__new__(cls) obj.circular = circular obj.pos = 0 - obj._data = dna.encode("ASCII") + obj._data = dna.encode("ascii") return obj @classmethod @@ -627,17 +555,27 @@ def from_representation(cls, dsdna: str, *args, **kwargs): for ln in clean.splitlines() if ln.strip() and not ln.strip().startswith("Dseq(") ] - watson = f"{watson:<{len(crick)}}" - crick = f"{crick:<{len(watson)}}" - data = bytearray() - for w, c in zip(watson, crick): - try: - data.extend(bp_dict[w.encode("ascii"), c.encode("ascii")]) - except KeyError as err: - print(f"Base mismatch in representation {err}") - raise - obj._data = bytes(data) - return obj + ovhgw = len(watson) - len(watson.lstrip()) + ovhgc = -(len(crick) - len(crick.lstrip())) + + ovhg = ovhgw or ovhgc + + watson = watson.strip() + crick = crick.strip()[::-1] + + return Dseq(watson, crick, ovhg) + + # watson = f"{watson:<{len(crick)}}" + # crick = f"{crick:<{len(watson)}}" + # data = bytearray() + # for w, c in zip(watson, crick): + # try: + # data.extend(basepair_dict[w, c]) + # except KeyError as err: + # print(f"Base mismatch in representation {err}") + # raise + # obj._data = bytes(data) + # return obj @classmethod def from_full_sequence_and_overhangs( @@ -705,7 +643,7 @@ def from_full_sequence_and_overhangs( return Dseq(watson, crick=crick, ovhg=crick_ovhg) @property - def watson(self): + def watson(self) -> str: """ The watson (upper) strand of the double stranded fragment 5'-3'. @@ -715,10 +653,10 @@ def watson(self): DESCRIPTION. """ - return self._data.translate(dscode_to_watson_table).strip().decode("ascii") + return self._data.decode("ascii").translate(dscode_to_watson_table).strip() @property - def crick(self): + def crick(self) -> str: """ The crick (lower) strand of the double stranded fragment 5'-3'. @@ -728,10 +666,10 @@ def crick(self): DESCRIPTION. """ - return self._data.translate(dscode_to_crick_table).strip().decode("ascii")[::-1] + return self._data.decode("ascii").translate(dscode_to_crick_table).strip()[::-1] @property - def ovhg(self): + def ovhg(self) -> int: """ The 5' overhang of the lower strand compared the the upper. @@ -743,9 +681,8 @@ def ovhg(self): DESCRIPTION. """ - ohw = len(_re.match(b"^[PEXIpexi]*", self._data).group(0)) - ohc = len(_re.match(b"^[QFZJqfzj]*", self._data).group(0)) - return -ohw or ohc + parts = get_parts(self._data.decode("ascii")) + return -len(parts.sticky_left5) or len(parts.sticky_left3) def to_blunt_string(self): """ @@ -758,7 +695,7 @@ def to_blunt_string(self): DESCRIPTION. """ - return self._data.translate(dscode_to_to_full_sequence_table).decode("ascii") + return self._data.decode("ascii").translate(dscode_to_full_sequence_table) __str__ = to_blunt_string @@ -873,13 +810,15 @@ def __eq__(self, other: DseqType) -> bool: same = False return same - def __repr__(self): + def __repr__(self, lim: int = length_limit_for_repr): header = f"{self.__class__.__name__}({({False: '-', True: 'o'}[self.circular])}{len(self)})" - rpr = representation(self._data) + w, c = representation_tuple( + self._data.decode("ascii"), length_limit_for_repr=length_limit_for_repr + ) - return _pretty_str(f"{header}\n{rpr}") + return _pretty_str(header + "\n" + w + "\n" + c) def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. @@ -902,7 +841,7 @@ def reverse_complement(self) -> "Dseq": >>> """ - return Dseq(_rc(self._data), circular=self.circular) + return Dseq.quick(_rc(self._data), circular=self.circular) rc = reverse_complement # alias for reverse_complement @@ -961,36 +900,21 @@ def looped(self: DseqType) -> DseqType: if self.circular: return _copy.deepcopy(self) - ms = _re.fullmatch(b"(?:.*)([GATCgatc])([QFZJqfzj]+|[PEXIpexi]+)", self._data) - mo = _re.fullmatch(b"([PEXIpexi]+|[QFZJqfzj]+)([GATCgatc])(?:.*)", self._data) - - sticky_left = ms.group(2) if ms else b"" - sticky_right = mo.group(1) if mo else b"" - - sticky_left_just = bytes(f"{sticky_left.decode():<{len(sticky_right)}}", "utf8") - sticky_right_just = bytes( - f"{sticky_right.decode():>{len(sticky_left)}}", "utf8" - ) - - assert len(sticky_left_just) == len(sticky_right_just) + parts = get_parts(self._data.decode("ascii")) - junction = b"".join( - [ - annealing_dict.get((bytes([w]), bytes([c])), b"-") - for w, c in zip(sticky_left_just, sticky_right_just) - ] - ) + end = Dseq(parts.sticky_right5 or parts.sticky_right3) - if b"-" in junction: + try: + result = end + self + except TypeError: raise TypeError( "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!" ) - return self.__class__( - junction - + self._data[len(sticky_left) or None : -len(sticky_right) or None], - circular=True, - ) + new_ds_part = get_parts(result._data.decode("ascii")).middle + result = Dseq(new_ds_part, circular=True) + + return result def tolinear(self: DseqType) -> DseqType: # pragma: no cover """Returns a blunt, linear copy of a circular Dseq object. This can @@ -1028,7 +952,7 @@ def tolinear(self: DseqType) -> DseqType: # pragma: no cover raise TypeError("DNA is not circular.\n") selfcopy = _copy.deepcopy(self) selfcopy.circular = False - return selfcopy # self.__class__(self.watson, linear=True) + return selfcopy def five_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of @@ -1065,20 +989,24 @@ def five_prime_end(self) -> _Tuple[str, str]: pydna.dseq.Dseq.three_prime_end """ - if self.watson and not self.crick: - return "5'", self.watson.lower() - if not self.watson and self.crick: - return "3'", self.crick.lower() - if self.ovhg < 0: - sticky = self.watson[: -self.ovhg].lower() + parts = get_parts(self._data.decode("ascii")) + + sticky5 = parts.sticky_left5.translate(dscode_to_watson_table) + + sticky3 = parts.sticky_left3.translate(dscode_to_crick_table)[::-1] + + single_watson = parts.single_watson.translate(dscode_to_watson_table) + + single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] + + if sticky5 == sticky3 == "" and not (single_watson or single_crick): + type_, sticky = "blunt", "" + elif sticky := sticky5 or single_watson: type_ = "5'" - elif self.ovhg > 0: - sticky = self.crick[-self.ovhg :].lower() + elif sticky := sticky3 or single_crick: type_ = "3'" - else: - sticky = "" - type_ = "blunt" - return type_, sticky + + return type_, sticky.lower() def three_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of @@ -1114,24 +1042,34 @@ def three_prime_end(self) -> _Tuple[str, str]: """ - ovhg = len(self.watson) - len(self.crick) + self.ovhg + parts = get_parts(self._data.decode("ascii")) + + sticky5 = parts.sticky_right5.translate(dscode_to_crick_table)[::-1] + + sticky3 = parts.sticky_right3.translate(dscode_to_watson_table) - if ovhg < 0: - sticky = self.crick[:-ovhg].lower() + single_watson = parts.single_watson.translate(dscode_to_watson_table)[::-1] + + single_crick = parts.single_crick.translate(dscode_to_crick_table) + + if sticky5 == sticky3 == "" and not (single_watson or single_crick): + type_, sticky = "blunt", "" + elif sticky := sticky5 or single_watson: type_ = "5'" - elif ovhg > 0: - sticky = self.watson[-ovhg:].lower() + elif sticky := sticky3 or single_crick: type_ = "3'" - else: - sticky = "" - type_ = "blunt" - return type_, sticky + + return type_, sticky.lower() def watson_ovhg(self) -> int: - """Returns the overhang of the watson strand at the three prime.""" - return len(self.watson) - len(self.crick) + self.ovhg + """Overhang of the watson strand at the right side.""" + type_, sticky = self.three_prime_end() - def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: + watson_ovhg = {"5'": -len(sticky), "3'": len(sticky), "blunt": 0}[type_] + + return watson_ovhg + + def __add__(self: DseqType, other: DseqType) -> DseqType: if self.circular: raise TypeError("circular DNA cannot be ligated!") @@ -1147,63 +1085,52 @@ def _add(self: DseqType, other: DseqType, perfectmatch) -> DseqType: elif not self: return _copy.deepcopy(other) - ms = _re.search(b"([PEXIpexiQFZJqfzj]+)$", self._data) - mo = _re.match(b"^([PEXIpexiQFZJqfzj]+)", other._data) + parts = get_parts(self._data.decode("ascii")) + other_parts = get_parts(other._data.decode("ascii")) - sticky_self = ms.group() if ms else b"" - sticky_other = mo.group() if mo else b"" - - sticky_self_just = bytes(f"{sticky_self.decode():<{len(sticky_other)}}", "utf8") - sticky_other_just = bytes( - f"{sticky_other.decode():>{len(sticky_self)}}", "utf8" + this_sticky = ( + parts.sticky_right5 + or parts.sticky_right3 + or parts.single_watson + or parts.single_crick + ) + other_sticky = ( + other_parts.sticky_left5 + or other_parts.sticky_left3 + or other_parts.single_watson + or other_parts.single_crick ) - assert len(sticky_self_just) == len(sticky_other_just) + max_len = max(len(this_sticky), len(other_sticky)) - if perfectmatch: - # breakpoint() - junction = b"".join( - [ - annealing_dict.get((bytes([w]), bytes([c])), b"-") - for w, c in zip(sticky_self_just, sticky_other_just) - ] - ) + this_sticky_just = this_sticky.ljust(max_len) + other_sticky_just = other_sticky.rjust(max_len) - if b"-" in junction: - raise TypeError("sticky ends not compatible!") + junction = "" - else: + for x, y in zip(this_sticky_just, other_sticky_just): + try: + letter = annealing_dict[x, y] + except KeyError: - result = _terminal_overlap( - sticky_self.translate(dscode_to_to_full_sequence_table).decode("ascii"), - sticky_other.translate(dscode_to_to_full_sequence_table).decode("ascii") - + "@", - limit=1, - ) + before_cut = parts.middle[-1] if parts.middle else "" + after_cut = other_parts.middle[0] if other_parts.middle else "" - if not result: - return None + w, c = representation_tuple(before_cut + this_sticky) + ow, oc = representation_tuple(other_sticky + after_cut) - (startx, starty, length), *_ = result + maxlen = max(len(w), len(c)) + w, c = w.ljust(maxlen), c.ljust(maxlen) - sticky_self = sticky_self[startx : startx + length] - sticky_other = sticky_other[starty : starty + length] + raise TypeError( + "sticky ends not compatible!\n" f"-{w} {ow}-\n" f"-{c} {oc}-" + ) - junction = b"".join( - [ - annealing_dict.get((bytes([w]), bytes([c])), b"-") - for w, c in zip(sticky_self, sticky_other) - ] - ) + junction += letter - return self.__class__( - self._data[: -len(sticky_self) or None] - + junction - + other._data[len(sticky_other) or None :] - ) + result = Dseq("".join(parts[0:3] + (junction,) + other_parts[2:5])) - def __add__(self: DseqType, other: DseqType) -> DseqType: - return self._add(other, perfectmatch=True) + return result def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): @@ -1556,13 +1483,21 @@ def isblunt(self) -> bool: >>> a.isblunt() False """ - return ( - self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular + parts = get_parts(self._data.decode("ascii")) + + return not any( + ( + parts.sticky_right5, + parts.sticky_right3, + parts.sticky_left3, + parts.sticky_left5, + self.circular, + ) ) def cas9(self, RNA: str) -> _Tuple[slice, ...]: """docstring.""" - bRNA = bytes(RNA, "ASCII") + bRNA = RNA.encode("ascii") slices = [] cuts = [0] for m in _re.finditer(bRNA, self._data): @@ -1590,6 +1525,7 @@ def user(self): None. """ + return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"FfEe"))) def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: @@ -1802,13 +1738,13 @@ def get_ss_meltsites(self: DseqType, length) -> _List[CutSiteType]: if length < 1: return () - regex = _re.compile( - ( - f"(?P((?<=[PEXIpexi]))([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])))|" - f"(?P((?<=[QFZJqfzj]))([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])))" - ).encode("ascii") + regex = ( + f"(?P((?<=[PEXIpexi]))([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])))|" + f"(?P((?<=[QFZJqfzj]))([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])))" ) + regex = _re.compile(regex.encode("ascii")) + if self.circular: spacer = length @@ -1850,13 +1786,13 @@ def get_ds_meltsites(self: DseqType, length) -> _List[CutSiteType]: if length < 1: return () - regex = _re.compile( - ( - f"(?P((?<=[PEXIpexi])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])|$))|" - f"(?P((?<=[QFZJqfzj])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])|$))" - ).encode("ascii") + regex = ( + f"(?P((?<=[PEXIpexi])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^PEXIpexiGATCgatcUuOo])|$))|" + f"(?P((?<=[QFZJqfzj])|^)([GATCgatcUuOo]{{1,{length}}})((?=[^QFZJqfzjGATCgatcUuOo])|$))" ) + regex = _re.compile(regex.encode("ascii")) + if self.circular: spacer = length @@ -1885,18 +1821,12 @@ def get_ds_meltsites(self: DseqType, length) -> _List[CutSiteType]: def cast_to_ds_right(self): """ - NNNNQFZJ - NNNN---- NNNNCTAG NNNNGATC NNNNCTAG - - - NNNNPEXI - NNNNGATC NNNN---- @@ -1905,29 +1835,24 @@ def cast_to_ds_right(self): """ - def replace(m): - return m.group(1).translate(dscode_to_to_full_sequence_table) + p = get_parts(self._data.decode("ascii")) - # Not using f-strings below to avoid bytes/string conversion - return self.__class__( - _re.sub(b"(?<=[GATCgatc])([PEXIpexiQFZJqfzj]+)$", replace, self._data), - circular=False, + ds_stuffer = (p.sticky_right5 or p.sticky_right3).translate( + dscode_to_full_sequence_table ) + result = (p.sticky_left5 or p.sticky_left3) + p.middle + ds_stuffer + + return self.__class__(result, circular=False) + def cast_to_ds_left(self): """ - PEXINNNN - GATCNNNN NNNN GATCNNNN CTAGNNNN - - - QFZJNNNN - NNNN CTAGNNNN @@ -1936,15 +1861,16 @@ def cast_to_ds_left(self): """ - def replace(m): - return m.group(1).translate(dscode_to_to_full_sequence_table) + p = get_parts(self._data.decode("ascii")) - # Not using f-strings below to avoid bytes/string conversion - return self.__class__( - _re.sub(b"^([PEXIpexiQFZJqfzj]+)(?=[GATCgatc])", replace, self._data), - circular=False, + ds_stuffer = (p.sticky_left5 or p.sticky_left3).translate( + dscode_to_full_sequence_table ) + result = ds_stuffer + p.middle + (p.sticky_right5 or p.sticky_right3) + + return self.__class__(result, circular=False) + def get_cut_parameters( self, cut: _Union[CutSiteType, None], is_left: bool ) -> _Tuple[int, int, int]: @@ -2090,54 +2016,89 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": if _cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) - if left_cut: - (left_watson_cut, left_overhang), _ = left_cut - else: - (left_watson_cut, left_overhang), _ = ((0, 0), None) + """left_cut is None if start of the sequence""" + left_cut = left_cut or ((0, 0), None) + """right_cut is None if end of the sequence""" + right_cut = right_cut or ((len(self), 0), None) - if right_cut: - (right_watson_cut, right_overhang), _ = right_cut - else: - (right_watson_cut, right_overhang), _ = ( - (len(self), 0), - None, - ) + """extract (position, overhang), enzyme). Enzyme is not used""" + (x, xovhg), _ = left_cut + (y, yovhg), _ = right_cut - table1 = ( - dscode_to_watson_tail_table - if left_overhang > 0 - else dscode_to_crick_tail_table - ) - table2 = ( - dscode_to_watson_tail_table - if right_overhang < 0 - else dscode_to_crick_tail_table - ) - - left_stck_begin = min(left_watson_cut, left_watson_cut - left_overhang) - left_stck_end = left_stck_begin + abs(left_overhang) - - right_stck_begin = min(right_watson_cut, right_watson_cut - right_overhang) - right_stck_end = right_stck_begin + abs(right_overhang) + """ + Fragment slice begin:end that contains the sticky ends + The length of the overhangs are factored in. + """ + begin = min(x, x - xovhg) + end = max(y, y - yovhg) + """A circular byte string class (CircularBytes) is used to + handle slices across the origin of the sequence when + begin > end + """ if self.circular: - cutfrom = CircularString(self.to_blunt_string()) + cutfrom = CircularBytes(self._data) + if left_cut == right_cut and begin < end: + """ + The edge case below happens when a circular fragment + is cut in the same place, but the length of the sticky + ends leads to begin < end although we want a slice + over the origin. + In the example below, begin is 1 and end is + 4 which would give us "GATC", to solve this + the fragment length is added to the end + which gives us 1 and 4 + 6 = 10 which gives us + "GATCCGGATC" + + Dseq(o6) + GGATCC + CCTAGG + + Dseq(o6) + GATCCG + GCCTAG + """ + end += len(self) else: - cutfrom = self._data.decode("ascii") + cutfrom = self._data + """ + dschunk contains the desired slice including sticky ends + bytarrays are mutable sequences, so we can change slices by + assignment. + """ + dschunk = bytearray(cutfrom[begin:end]) - left_sticky_end = ( - cutfrom[left_stck_begin:left_stck_end] - .translate(table1) - .encode("ascii")[0 : abs(left_overhang)] - ) - ds_middle_part = cutfrom[left_stck_end:right_stck_begin].encode("ascii") - right_sticky_end = ( - cutfrom[right_stck_begin:right_stck_end] - .translate(table2) - .encode("ascii")[0 : abs(right_overhang)] - ) + """A translation table is chosen to process the sticky end""" + if xovhg > 0: + tb = dscode_to_watson_tail_table + else: + tb = dscode_to_crick_tail_table + """ + The slice containing the left sticky end is replaced by + translation to single strand dscode + """ + dschunk[: abs(xovhg)] = dschunk[: abs(xovhg)].translate(tb) + + """ + The processed chunk is reversed, avoiding complicated slicing + edge cases. + """ + dschunk_rev = dschunk[::-1] + + if yovhg < 0: + tb = dscode_to_watson_tail_table + else: + tb = dscode_to_crick_tail_table + """ + The right sticky end is processed + """ + dschunk_rev[: abs(yovhg)] = dschunk_rev[: abs(yovhg)].translate(tb) + """ + Finally, sequence is revesed to correct order. + """ + result = Dseq(dschunk_rev[::-1].decode("ascii")) - return Dseq.quick(left_sticky_end + ds_middle_part + right_sticky_end) + return result def get_cutsite_pairs( self, cutsites: _List[CutSiteType] From fa88fcb6223f7ec2bdc3d4da650ce2c9b612b68a Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:11:12 +0100 Subject: [PATCH 29/36] Only check for start of error message. --- src/pydna/ligate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pydna/ligate.py b/src/pydna/ligate.py index 280bb5ba..50d1e0c0 100644 --- a/src/pydna/ligate.py +++ b/src/pydna/ligate.py @@ -31,7 +31,7 @@ def ligate(fragments: list): try: seq1 + seq2 except TypeError as err: - if str(err) != "sticky ends not compatible!": + if not str(err).startswith("sticky ends not compatible"): raise else: if seq1.seq.three_prime_end() != ( From e8d3627f39390495af2bb6a4f9be9182536c7a46 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:12:46 +0100 Subject: [PATCH 30/36] Clearer names for some dicts --- src/pydna/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pydna/utils.py b/src/pydna/utils.py index 5c615e5e..e7fe94f3 100644 --- a/src/pydna/utils.py +++ b/src/pydna/utils.py @@ -20,8 +20,8 @@ from pydna.codon import weights as _weights from pydna.codon import rare_codons as _rare_codons -from pydna.alphabet import bp_dict_str -from pydna.alphabet import complement_table_dscode +from pydna.alphabet import basepair_dict +from pydna.alphabet import complement_table_for_dscode from Bio.SeqFeature import SimpleLocation as _sl from Bio.SeqFeature import CompoundLocation as _cl from Bio.SeqFeature import Location as _Location @@ -212,7 +212,7 @@ def anneal_from_left(watson: str, crick: str) -> int: result = len( list( _itertools.takewhile( - lambda x: bp_dict_str.get((x[0], x[1])), zip(watson, crick[::-1]) + lambda x: basepair_dict.get((x[0], x[1])), zip(watson, crick[::-1]) ) ) ) @@ -286,15 +286,15 @@ def rc(sequence: StrOrBytes) -> StrOrBytes: accepts mixed DNA/RNA """ - return sequence.translate(complement_table_dscode)[::-1] + return complement(sequence)[::-1] -def complement(sequence: str): +def complement(sequence: StrOrBytes) -> StrOrBytes: """Complement. accepts mixed DNA/RNA """ - return sequence.translate(complement_table_dscode) + return sequence.translate(complement_table_for_dscode) # def memorize(filename): From 9a886cf4975798ce09438c2c40347f0374ab4f82 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:14:52 +0100 Subject: [PATCH 31/36] Replaced numbers 1111 and 2222 with bbbb and rrrr, respectively. Numbers do not work with new Dseq implementation. --- tests/test_module_assembly2.py | 42 ++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/test_module_assembly2.py b/tests/test_module_assembly2.py index eba500df..f5454594 100644 --- a/tests/test_module_assembly2.py +++ b/tests/test_module_assembly2.py @@ -1603,19 +1603,19 @@ def test_insertion_assembly(): # Insertion of linear sequence into linear sequence (like # homologous recombination of PCR product with homology arms in genome) - a = Dseqrecord("CGTACGCACA1111CGTACGCACAC") - b = Dseqrecord("CGTACGCACA2222CGTACGCACAT") + a = Dseqrecord("CGTACGCACAbbbbCGTACGCACAC") + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) # All possibilities, including the single insertions results = [ - "CGTACGCACA2222CGTACGCACA1111CGTACGCACAC", - "CGTACGCACA2222CGTACGCACAC", - "CGTACGCACA1111CGTACGCACA2222CGTACGCACAT", - "CGTACGCACA1111CGTACGCACA2222CGTACGCACAC", - "CGTACGCACA1111CGTACGCACAT", - "CGTACGCACA2222CGTACGCACA1111CGTACGCACAT", + "CGTACGCACArrrrCGTACGCACAbbbbCGTACGCACAC", + "CGTACGCACArrrrCGTACGCACAC", + "CGTACGCACAbbbbCGTACGCACArrrrCGTACGCACAT", + "CGTACGCACAbbbbCGTACGCACArrrrCGTACGCACAC", + "CGTACGCACAbbbbCGTACGCACAT", + "CGTACGCACArrrrCGTACGCACAbbbbCGTACGCACAT", ] assembly_products = [ @@ -1627,21 +1627,21 @@ def test_insertion_assembly(): # TODO: debatable whether this kind of homologous recombination should happen, or how # the overlap restrictions should be applied. - a = Dseqrecord("CGTACGCACA1111C") - b = Dseqrecord("CGTACGCACA2222CGTACGCACAT") + a = Dseqrecord("CGTACGCACAbbbbC") + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) - results = ["CGTACGCACA2222CGTACGCACA1111C"] + results = ["CGTACGCACArrrrCGTACGCACAbbbbC"] for assem, result in zip(f.get_insertion_assemblies(), results): assert result == str(assembly.assemble([a, b], assem).seq) - a = Dseqrecord("CGTACGCACA1111C") - b = Dseqrecord("CGTACGCACA2222T") + a = Dseqrecord("CGTACGCACAbbbbC") + b = Dseqrecord("CGTACGCACArrrrT") f = assembly.Assembly([a, b], use_fragment_order=False, limit=10) assert len(f.get_insertion_assemblies()) == 0 # Does not work for circular molecules - a = Dseqrecord("CGTACGCACA1111CGTACGCACAC", circular=True) - b = Dseqrecord("CGTACGCACA2222CGTACGCACAT", circular=True) + a = Dseqrecord("CGTACGCACAbbbbCGTACGCACAC", circular=True) + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1649,8 +1649,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("CGTACGCACA1111C", circular=True) - b = Dseqrecord("CGTACGCACA2222CGTACGCACAT", circular=True) + a = Dseqrecord("CGTACGCACAbbbbC", circular=True) + b = Dseqrecord("CGTACGCACArrrrCGTACGCACAT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -1658,8 +1658,8 @@ def test_insertion_assembly(): == [] ) - a = Dseqrecord("CGTACGCACA1111C", circular=True) - b = Dseqrecord("CGTACGCACA2222T", circular=True) + a = Dseqrecord("CGTACGCACAbbbbC", circular=True) + b = Dseqrecord("CGTACGCACArrrrT", circular=True) assert ( assembly.Assembly( [a, b], use_fragment_order=False, limit=10 @@ -2451,7 +2451,7 @@ def test_insertion_edge_case(): def test_common_sub_strings(): - a = Dseqrecord("012345", circular=True) + a = Dseqrecord("RYBDKM", circular=True) for shift_1 in range(len(a)): a_shifted = a.shifted(shift_1) for shift_2 in range(len(a)): @@ -2575,5 +2575,3 @@ def test_common_handle_insertion_fragments(): assert [genome] + inserts == assembly.common_handle_insertion_fragments( genome, inserts ) - -# pytest.main([__file__, "-vvv", "-s"]) From 2c08dd15c7557c58a225b5a90cdd33fc8bf39432 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:17:19 +0100 Subject: [PATCH 32/36] Added tests. --- tests/test_module_dseq.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index c2d56f0a..252772ca 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -291,8 +291,13 @@ def test_dseq(): obj = Dseq("G", "", 0) assert obj.five_prime_end() == ("5'", "g") + assert obj.three_prime_end() == ("5'", "g") obj = Dseq("", "C", 0) assert obj.five_prime_end() == ("3'", "c") + assert obj.three_prime_end() == ("3'", "c") + + + obj = Dseq("ccGGATCC", "aaggatcc", -2) # assert obj._data == b"ccGGATCCtt" @@ -564,8 +569,6 @@ def test_dseq(): def test_Dseq_slicing(): - - a = Dseq("ggatcc", "ggatcc", 0) assert a[:].watson == a.watson @@ -623,8 +626,6 @@ def test_Dseq___getitem__(): def test_cut_circular(): - - test = "aaaaaaGGTACCggtctcaaaa" for i in range(len(test)): From 8a0a544eaf9028caa8a8f77bc6d8b60bfd20336a Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Tue, 7 Oct 2025 15:18:05 +0100 Subject: [PATCH 33/36] Moved imports to top. removed main block --- tests/test_module_ligate.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_module_ligate.py b/tests/test_module_ligate.py index 14cde155..bc037be3 100755 --- a/tests/test_module_ligate.py +++ b/tests/test_module_ligate.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- import pytest - +from pydna.ligate import ligate +from pydna.dseq import Dseq +from pydna.dseqrecord import Dseqrecord def test_ligate(): - from pydna.ligate import ligate - from pydna.dseq import Dseq - from pydna.dseqrecord import Dseqrecord + a = Dseqrecord( Dseq.from_representation( @@ -73,7 +73,3 @@ def list_combinations(a, b): for ss in lsequences: assert ss.seguid() == "ldseguid=vBQxmszgfR4b84O0wI7d_ya9uDA" assert len(ss) == 12 - - -if __name__ == "__main__": - pytest.main([__file__, "-vv", "-s"]) From 3c5256233eeeea26ab9653d6cd9c16a2d21c2eb7 Mon Sep 17 00:00:00 2001 From: BjornFJohansson Date: Wed, 8 Oct 2025 18:54:34 +0100 Subject: [PATCH 34/36] bugfix for USER cloning, USER design example in design module --- src/pydna/design.py | 78 +++++++++++++++++++++++++++++++++++++++ src/pydna/dseq.py | 2 +- tests/test_USERcloning.py | 9 +++-- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/src/pydna/design.py b/src/pydna/design.py index 446daa05..f7a4544b 100755 --- a/src/pydna/design.py +++ b/src/pydna/design.py @@ -806,3 +806,81 @@ def circular_assembly_fragments(f, overlap=35, maxlink=40): stacklevel=2, ) return assembly_fragments(f, overlap=overlap, maxlink=maxlink, circular=True) + + +""" + + + + + overlap + _____15______ + | | +TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG | +AATCTTCATCCCTATCAGGGTTTGGAGTGAATGGTGACGGTTATTCCCC | + | aagccgaactgggccagacaacccggcgctaacgcactca + 44 ttcggcttgacccggtctgttgggccgcgattgcgtgagt + | + 9 + + + +TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG 9 + |||||||||||||||| | + GTGACGGTTATUCCCCTTCGGCTTGA + +AATCTTCATCCCTATCAGGGTTTGGAGTGAATGGTGACGGTTATTCCCC + + aagccgaactgggccagacaacccggcgctaacgcactca +AGGGGAAGCCGAACUGGGC + AGGGGAAGCCGAACUGGGC + | |||||||||||||| + 44 ttcggcttgacccggtctgttgggccgcgattgcgtgagt + +""" + + +def user_assembly_fragments(f, overlap=35, maxlink=40, circular=False): + from itertools import pairwise + from pydna.dseqrecord import Dseqrecord + import re + from pydna.primer import Primer + from pydna.amplify import pcr + + a1 = Dseqrecord("TTAGAAGTAGGGATAGTCCCAAACCTCACTTACCACTGCCAATAAGGGG") + b1 = Dseqrecord("AAGCCGAACTGGGCCAGACAACCCGGCGCTAACGCACTCA") + fragments = [a1, b1] + amplicons = [] + + for fragment in fragments: + amplicons.append(primer_design(fragment)) + + for ths, nxt in pairwise(amplicons): + + A_positions = [m.start() for m in re.finditer("A|a", str(ths.seq))] + T_positions = [m.start() for m in re.finditer("T|t", str(nxt.seq))] + + for x, y in zip(A_positions[::-1], T_positions): + + sticky_length = y + len(ths) - x + + rp = bytearray(nxt.seq[: y + 1].rc()._data + ths.reverse_primer.seq._data) + fp = bytearray(ths.seq[x:]._data + nxt.forward_primer.seq._data) + + fp[sticky_length] = ord(b"U") + rp[sticky_length] = ord(b"U") + + ths.reverse_primer = Primer(rp) + nxt.forward_primer = Primer(fp) + + break + + a2 = pcr(ths).seq + a2_user = a2.user() + a2_sticky, stuffer = a2_user.melt(14) + + b2 = pcr(nxt).seq + b2_user = b2.user() + stuffer, b2_sticky = b2_user.melt(14) + + assert (a1 + b1).seq == a2_sticky + b2_sticky diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index dae373aa..97823052 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -1526,7 +1526,7 @@ def user(self): """ - return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"FfEe"))) + return Dseq(self._data.translate(bytes.maketrans(b"UuOo", b"ZzEe"))) def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. diff --git a/tests/test_USERcloning.py b/tests/test_USERcloning.py index 1d635c85..a5f35708 100644 --- a/tests/test_USERcloning.py +++ b/tests/test_USERcloning.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import pytest from pydna.dseq import Dseq from pydna.parsers import parse, parse_primers from pydna.amplify import pcr @@ -56,8 +55,12 @@ def test_USER_cloning(): stuffer, insert, stuffer = USERprocessed.melt(1) - plasmid = Dseq("FqaaaPE") + plasmid = Dseq.from_representation(""" + Dseq(-7) + aaaGT + Tcttt + """) plasmid_insert = (plasmid + insert).looped() - assert plasmid_insert == Dseq("AgaaaGAGGATTaaaCGGCG", circular=True) + assert plasmid_insert == Dseq("AgaaaGTGGATTaaaCGGCG", circular=True) From fe3f38e293526f72e8cd4586a7b5e0990751ccca Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 8 Oct 2025 08:55:45 +0100 Subject: [PATCH 35/36] remove empty file `master` --- master | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 master diff --git a/master b/master deleted file mode 100644 index e69de29b..00000000 From 307b14fad63695ba42db277f7c9c1e4ef16509e2 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 8 Oct 2025 12:52:06 +0100 Subject: [PATCH 36/36] refactor to simplify + address #452 --- src/pydna/dseq.py | 200 +++++++++----------------------------- tests/test_module_dseq.py | 54 +++++++++- 2 files changed, 96 insertions(+), 158 deletions(-) diff --git a/src/pydna/dseq.py b/src/pydna/dseq.py index 97823052..6dfd60ae 100644 --- a/src/pydna/dseq.py +++ b/src/pydna/dseq.py @@ -32,7 +32,6 @@ from pydna.utils import cuts_overlap as _cuts_overlap from pydna.alphabet import basepair_dict -from pydna.alphabet import annealing_dict from pydna.alphabet import dscode_to_watson_table from pydna.alphabet import dscode_to_crick_table @@ -682,6 +681,8 @@ def ovhg(self) -> int: """ parts = get_parts(self._data.decode("ascii")) + if parts.single_watson or parts.single_crick: + return None return -len(parts.sticky_left5) or len(parts.sticky_left3) def to_blunt_string(self): @@ -900,22 +901,15 @@ def looped(self: DseqType) -> DseqType: if self.circular: return _copy.deepcopy(self) - parts = get_parts(self._data.decode("ascii")) - - end = Dseq(parts.sticky_right5 or parts.sticky_right3) - - try: - result = end + self - except TypeError: + type5, sticky5 = self.five_prime_end() + type3, sticky3 = self.three_prime_end() + if type5 == type3 and str(sticky5) == str(_rc(sticky3)): + return self.__class__(self.watson, circular=True) + else: raise TypeError( "DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!" ) - new_ds_part = get_parts(result._data.decode("ascii")).middle - result = Dseq(new_ds_part, circular=True) - - return result - def tolinear(self: DseqType) -> DseqType: # pragma: no cover """Returns a blunt, linear copy of a circular Dseq object. This can only be done if the Dseq object is circular, otherwise a @@ -999,11 +993,15 @@ def five_prime_end(self) -> _Tuple[str, str]: single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] - if sticky5 == sticky3 == "" and not (single_watson or single_crick): + if sticky := single_watson: + type_ = "single" + elif sticky := single_crick: + type_ = "single" + elif sticky5 == sticky3 == "": type_, sticky = "blunt", "" - elif sticky := sticky5 or single_watson: + elif sticky := sticky5: type_ = "5'" - elif sticky := sticky3 or single_crick: + elif sticky := sticky3: type_ = "3'" return type_, sticky.lower() @@ -1048,26 +1046,29 @@ def three_prime_end(self) -> _Tuple[str, str]: sticky3 = parts.sticky_right3.translate(dscode_to_watson_table) - single_watson = parts.single_watson.translate(dscode_to_watson_table)[::-1] + single_watson = parts.single_watson.translate(dscode_to_watson_table) - single_crick = parts.single_crick.translate(dscode_to_crick_table) + single_crick = parts.single_crick.translate(dscode_to_crick_table)[::-1] - if sticky5 == sticky3 == "" and not (single_watson or single_crick): + if sticky := single_watson: + type_ = "single" + elif sticky := single_crick: + type_ = "single" + elif sticky5 == sticky3 == "": type_, sticky = "blunt", "" - elif sticky := sticky5 or single_watson: + elif sticky := sticky5: type_ = "5'" - elif sticky := sticky3 or single_crick: + elif sticky := sticky3: type_ = "3'" return type_, sticky.lower() def watson_ovhg(self) -> int: """Overhang of the watson strand at the right side.""" - type_, sticky = self.three_prime_end() - - watson_ovhg = {"5'": -len(sticky), "3'": len(sticky), "blunt": 0}[type_] - - return watson_ovhg + parts = get_parts(self._data.decode("ascii")) + if parts.single_watson or parts.single_crick: + return None + return -len(parts.sticky_right5) or len(parts.sticky_right3) def __add__(self: DseqType, other: DseqType) -> DseqType: @@ -1085,52 +1086,14 @@ def __add__(self: DseqType, other: DseqType) -> DseqType: elif not self: return _copy.deepcopy(other) - parts = get_parts(self._data.decode("ascii")) - other_parts = get_parts(other._data.decode("ascii")) + self_type, self_tail = self.three_prime_end() + other_type, other_tail = other.five_prime_end() - this_sticky = ( - parts.sticky_right5 - or parts.sticky_right3 - or parts.single_watson - or parts.single_crick - ) - other_sticky = ( - other_parts.sticky_left5 - or other_parts.sticky_left3 - or other_parts.single_watson - or other_parts.single_crick - ) - - max_len = max(len(this_sticky), len(other_sticky)) - - this_sticky_just = this_sticky.ljust(max_len) - other_sticky_just = other_sticky.rjust(max_len) - - junction = "" - - for x, y in zip(this_sticky_just, other_sticky_just): - try: - letter = annealing_dict[x, y] - except KeyError: - - before_cut = parts.middle[-1] if parts.middle else "" - after_cut = other_parts.middle[0] if other_parts.middle else "" - - w, c = representation_tuple(before_cut + this_sticky) - ow, oc = representation_tuple(other_sticky + after_cut) - - maxlen = max(len(w), len(c)) - w, c = w.ljust(maxlen), c.ljust(maxlen) - - raise TypeError( - "sticky ends not compatible!\n" f"-{w} {ow}-\n" f"-{c} {oc}-" - ) - - junction += letter - - result = Dseq("".join(parts[0:3] + (junction,) + other_parts[2:5])) - - return result + if self_type == other_type and str(self_tail) == str(_rc(other_tail)): + return self.__class__( + self.watson + other.watson, other.crick + self.crick, self.ovhg + ) + raise TypeError("sticky ends not compatible!") def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): @@ -2016,89 +1979,18 @@ def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": if _cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) - """left_cut is None if start of the sequence""" - left_cut = left_cut or ((0, 0), None) - """right_cut is None if end of the sequence""" - right_cut = right_cut or ((len(self), 0), None) - - """extract (position, overhang), enzyme). Enzyme is not used""" - (x, xovhg), _ = left_cut - (y, yovhg), _ = right_cut - - """ - Fragment slice begin:end that contains the sticky ends - The length of the overhangs are factored in. - """ - begin = min(x, x - xovhg) - end = max(y, y - yovhg) - - """A circular byte string class (CircularBytes) is used to - handle slices across the origin of the sequence when - begin > end - """ - if self.circular: - cutfrom = CircularBytes(self._data) - if left_cut == right_cut and begin < end: - """ - The edge case below happens when a circular fragment - is cut in the same place, but the length of the sticky - ends leads to begin < end although we want a slice - over the origin. - In the example below, begin is 1 and end is - 4 which would give us "GATC", to solve this - the fragment length is added to the end - which gives us 1 and 4 + 6 = 10 which gives us - "GATCCGGATC" - - Dseq(o6) - GGATCC - CCTAGG - - Dseq(o6) - GATCCG - GCCTAG - """ - end += len(self) - else: - cutfrom = self._data - """ - dschunk contains the desired slice including sticky ends - bytarrays are mutable sequences, so we can change slices by - assignment. - """ - dschunk = bytearray(cutfrom[begin:end]) - - """A translation table is chosen to process the sticky end""" - if xovhg > 0: - tb = dscode_to_watson_tail_table - else: - tb = dscode_to_crick_tail_table - """ - The slice containing the left sticky end is replaced by - translation to single strand dscode - """ - dschunk[: abs(xovhg)] = dschunk[: abs(xovhg)].translate(tb) - - """ - The processed chunk is reversed, avoiding complicated slicing - edge cases. - """ - dschunk_rev = dschunk[::-1] - - if yovhg < 0: - tb = dscode_to_watson_tail_table - else: - tb = dscode_to_crick_tail_table - """ - The right sticky end is processed - """ - dschunk_rev[: abs(yovhg)] = dschunk_rev[: abs(yovhg)].translate(tb) - """ - Finally, sequence is revesed to correct order. - """ - result = Dseq(dschunk_rev[::-1].decode("ascii")) - - return result + left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) + right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) + return Dseq( + str(self[left_watson:right_watson]), + # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case + str( + self.reverse_complement()[ + len(self) - right_crick : len(self) - left_crick + ] + ), + ovhg=ovhg_left, + ) def get_cutsite_pairs( self, cutsites: _List[CutSiteType] diff --git a/tests/test_module_dseq.py b/tests/test_module_dseq.py index 252772ca..d1e40dd9 100644 --- a/tests/test_module_dseq.py +++ b/tests/test_module_dseq.py @@ -290,11 +290,11 @@ def test_dseq(): assert obj[2:1]._data == b"ga" obj = Dseq("G", "", 0) - assert obj.five_prime_end() == ("5'", "g") - assert obj.three_prime_end() == ("5'", "g") + assert obj.five_prime_end() == ("single", "g") + assert obj.three_prime_end() == ("single", "g") obj = Dseq("", "C", 0) - assert obj.five_prime_end() == ("3'", "c") - assert obj.three_prime_end() == ("3'", "c") + assert obj.five_prime_end() == ("single", "c") + assert obj.three_prime_end() == ("single", "c") @@ -1130,3 +1130,49 @@ def test_checksums(): truth = "cdseguid=5fHMG19IbYxn7Yr7_sOCkvaaw7U" assert cdseguid("ACGTT", "AACGT") == truth == Dseq("ACGTT", "AACGT", circular=True).seguid() assert cdseguid("AACGT", "ACGTT") == truth == Dseq("AACGT", "ACGTT", circular=True).seguid() + + +def test_ovhg(): + # No overhang + assert Dseq("AAAA").ovhg == 0 + assert Dseq("AAAA", circular=True).ovhg == 0 + # Sticky ends + assert Dseq("FFAA").ovhg == 2 + assert Dseq("EEAA").ovhg == -2 + + # Sticky end on the other hang does not matter + assert Dseq("AAFF").ovhg == 0 + assert Dseq("AAEE").ovhg == 0 + + # + assert Dseq("FFAAFF").ovhg == 2 + assert Dseq("FFAAEE").ovhg == 2 + assert Dseq("EEAAEE").ovhg == -2 + assert Dseq("EEAAFF").ovhg == -2 + + # Single strand + assert Dseq("EEEE").ovhg is None + assert Dseq("FFFF").ovhg is None + + +def test_watson_ovhg(): + # No overhang + for seq in [ + "AAAA", + "AAAA", + "FFAA", + "EEAA", + "AAFF", + "AAEE", + "FFAAFF", + "FFAAEE", + "EEAAEE", + "EEAAFF", + ]: + assert ( + Dseq(seq).watson_ovhg() == Dseq(seq).reverse_complement().ovhg + ), f"error for {seq}" + + # Single strand + assert Dseq("EEEE").watson_ovhg() is None + assert Dseq("FFFF").watson_ovhg() is None