|
| 1 | +from functools import reduce |
| 2 | +from nmigen import * |
| 3 | +from nmigen import tracer |
| 4 | + |
| 5 | + |
| 6 | +__all__ = ["CSRElement", "CSRMultiplexer"] |
| 7 | + |
| 8 | + |
class CSRElement(Record):
    """Peripheral-side CSR interface.

    A low-level interface to a single atomically readable and writable register in a peripheral.
    This interface supports any register width and semantics, provided that both reads and writes
    always succeed and complete in one cycle.

    Parameters
    ----------
    width : int
        Width of the register. Must be a non-negative integer.
    access : str
        Access mode: ``"r"`` (read-only), ``"w"`` (write-only), or ``"rw"`` (read-write).
        Determines which of the fields listed below are present in the record.
    name : str
        Name of the underlying record.
    src_loc_at : int
        Number of additional call frames between this constructor and the code on whose behalf
        the element is created. Forwarded to ``Record`` (offset by one to skip this constructor);
        code that wraps ``CSRElement`` construction should pass a non-zero value.

    Attributes
    ----------
    r_data : Signal(width)
        Read data. Must be always valid, and is sampled when ``r_stb`` is asserted.
        Present only if ``access`` includes ``"r"``.
    r_stb : Signal()
        Read strobe. Registers with read side effects should perform the read side effect when this
        strobe is asserted. Present only if ``access`` includes ``"r"``.
    w_data : Signal(width)
        Write data. Valid only when ``w_stb`` is asserted.
        Present only if ``access`` includes ``"w"``.
    w_stb : Signal()
        Write strobe. Registers should update their value or perform the write side effect when
        this strobe is asserted. Present only if ``access`` includes ``"w"``.

    Raises
    ------
    ValueError
        If ``width`` is not a non-negative integer, or ``access`` is not a valid access mode.
    """
    def __init__(self, width, access, *, name=None, src_loc_at=0):
        if not isinstance(width, int) or width < 0:
            raise ValueError("Width must be a non-negative integer, not {!r}"
                             .format(width))
        if access not in ("r", "w", "rw"):
            raise ValueError("Access mode must be one of \"r\", \"w\", or \"rw\", not {!r}"
                             .format(access))

        self.width = int(width)
        self.access = access

        # Only the fields that match the access mode are added to the record layout.
        layout = []
        if "r" in self.access:
            layout += [
                ("r_data", self.width),
                ("r_stb", 1),
            ]
        if "w" in self.access:
            layout += [
                ("w_data", self.width),
                ("w_stb", 1),
            ]
        # Skip this constructor's own frame, plus however many frames the caller asked for;
        # previously the caller-supplied `src_loc_at` was silently dropped.
        super().__init__(layout, name=name, src_loc_at=1 + src_loc_at)
| 59 | + |
| 60 | + |
class CSRMultiplexer(Elaboratable):
    """CPU-side CSR interface.

    A low-level interface to a set of peripheral CSR registers that implements address-based
    multiplexing and atomic updates of wide registers.

    Operation
    ---------

    The CSR multiplexer splits each CSR register into chunks according to its data width. Each
    chunk is assigned an address, and the first chunk of each register always has the provided
    minimum alignment. This allows accessing CSRs of any size using any datapath width.

    When the first chunk of a register is read, the value of a register is captured, and reads
    from subsequent chunks of the same register return the captured values. When any chunk except
    the last chunk of a register is written, the written value is captured; a write to the last
    chunk writes the captured value to the register. This allows atomically accessing CSRs larger
    than datapath width.

    Reads to padding bytes return zeroes, and writes to padding bytes are ignored.

    Writes are registered, and add 1 cycle of latency.

    Wide registers
    --------------

    Because the CSR bus conserves logic and routing resources, it is common to e.g. access
    a CSR bus with an *n*-bit data path from a CPU with a *k*-bit datapath in cases where CSR
    access latency is less important than resource usage. In this case, two strategies are
    possible for connecting the CSR bus to the CPU:

        * The CPU could access the CSR bus directly (with no intervening logic other than simple
          translation of control signals). In this case, the register alignment should be set
          to 1 chunk (``alignment=0``), and each *w*-bit register would occupy *ceil(w/n)*
          addresses from the CPU perspective, requiring the same amount of memory instructions
          to access.
        * The CPU could also access the CSR bus through a width down-converter, which would issue
          *k/n* CSR accesses for each CPU access. In this case, the register alignment should be
          set to *k/n* chunks (``alignment=log2(k/n)``), and each *w*-bit register would occupy
          *ceil(w/k)* addresses from the CPU perspective, requiring the same amount of memory
          instructions to access.

    If alignment is greater than 1, it affects which CSR bus write is considered a write to
    the last register chunk. For example, if a 24-bit register is used with a 8-bit CSR bus and
    a CPU with a 32-bit datapath, a write to this register requires 4 CSR bus writes to complete
    and the 4th write is the one that actually writes the value to the register. This allows
    determining write latency solely from the amount of addresses the register occupies in
    the CPU address space, and the width of the CSR bus.

    Parameters
    ----------
    addr_width : int
        Address width. At most ``(2 ** addr_width) * data_width`` register bits will be available.
    data_width : int
        Data width. Registers are accessed in ``data_width`` sized chunks.
    alignment : int
        Register alignment. The address assigned to each register will be a multiple of
        ``2 ** alignment``.

    Attributes
    ----------
    addr : Signal(addr_width)
        Address for reads and writes.
    r_data : Signal(data_width)
        Read data. Valid on the next cycle after ``r_stb`` is asserted.
    r_stb : Signal()
        Read strobe. If ``addr`` points to the first chunk of a register, captures register value
        and causes read side effects to be performed (if any). If ``addr`` points to any chunk
        of a register, latches the captured value to ``r_data``. Otherwise, latches zero
        to ``r_data``.
    w_data : Signal(data_width)
        Write data. Must be valid when ``w_stb`` is asserted.
    w_stb : Signal()
        Write strobe. If ``addr`` points to the last chunk of a register, writes captured value
        to the register and causes write side effects to be performed (if any). If ``addr`` points
        to any chunk of a register, latches ``w_data`` to the captured value. Otherwise, does
        nothing.

    Raises
    ------
    ValueError
        If ``addr_width`` or ``data_width`` is not a positive integer, or ``alignment`` is not
        a non-negative integer.
    """
    def __init__(self, *, addr_width, data_width, alignment=0):
        if not isinstance(addr_width, int) or addr_width <= 0:
            raise ValueError("Address width must be a positive integer, not {!r}"
                             .format(addr_width))
        if not isinstance(data_width, int) or data_width <= 0:
            raise ValueError("Data width must be a positive integer, not {!r}"
                             .format(data_width))
        if not isinstance(alignment, int) or alignment < 0:
            raise ValueError("Alignment must be a non-negative integer, not {!r}"
                             .format(alignment))

        self.addr_width = int(addr_width)
        self.data_width = int(data_width)
        self.alignment = alignment

        # Next unassigned chunk address, and a map of
        # first chunk address -> (element, size in chunks).
        self._next_addr = 0
        self._elements = dict()

        self.addr = Signal(addr_width)
        self.r_data = Signal(data_width)
        self.r_stb = Signal()
        self.w_data = Signal(data_width)
        self.w_stb = Signal()

    def add(self, element):
        """Add a register.

        Arguments
        ---------
        element : CSRElement
            Interface of the register.

        Return value
        ------------
        An ``(addr, size)`` tuple, where ``addr`` is the address assigned to the first chunk of
        the register, and ``size`` is the amount of chunks it takes, which may be greater than
        ``ceil(element.width / self.data_width)`` due to alignment.

        Raises
        ------
        TypeError
            If ``element`` is not a ``CSRElement``.
        """
        if not isinstance(element, CSRElement):
            raise TypeError("Element must be an instance of CSRElement, not {!r}"
                            .format(element))

        addr = self.align_to(self.alignment)
        # Reserve ceil(width / data_width) chunks, then pad up to the alignment boundary;
        # the padding chunks are counted as part of this register.
        self._next_addr += (element.width + self.data_width - 1) // self.data_width
        size = self.align_to(self.alignment) - addr
        self._elements[addr] = element, size
        return addr, size

    def align_to(self, alignment):
        """Align the next register explicitly.

        Arguments
        ---------
        alignment : int
            Register alignment. The address assigned to the next register will be a multiple of
            ``2 ** alignment`` or ``2 ** self.alignment``, whichever is greater.

        Return value
        ------------
        Address of the next register.

        Raises
        ------
        ValueError
            If ``alignment`` is not a non-negative integer.
        """
        if not isinstance(alignment, int) or alignment < 0:
            raise ValueError("Alignment must be a non-negative integer, not {!r}"
                             .format(alignment))

        # Round the next address up to a multiple of `2 ** alignment`. `add()` always leaves
        # the next address aligned to `2 ** self.alignment`, so a smaller request is a no-op.
        align_chunks = 1 << alignment
        if self._next_addr % align_chunks != 0:
            self._next_addr += align_chunks - (self._next_addr % align_chunks)
        return self._next_addr

    def elaborate(self, platform):
        m = Module()

        # Instead of a straightforward multiplexer for reads, use a per-element address comparator,
        # clear the shadow register when it does not match, and OR every selected shadow register
        # part to form the output. This can save a significant amount of logic; the size of
        # a complete k-OR or k-MUX gate tree for n inputs is `s = ceil((n - 1) / (k - 1))`,
        # and its logic depth is `ceil(log_k(s))`, but a 4-LUT can implement either a 4-OR or
        # a 2-MUX gate.
        r_data_fanin = 0

        for elem_addr, (elem, elem_size) in self._elements.items():
            shadow = Signal(elem.width, name="{}__shadow".format(elem.name))

            if "r" in elem.access:
                # One registered single-bit strobe per chunk selects which part of the shadow
                # register is driven onto `r_data`.
                chunk_r_stbs = [
                    Signal(name="{}__r_stb_{}".format(elem.name, chunk_offset))
                    for chunk_offset in range(elem_size)
                ]
                # Clear every strobe by default; the matching `Case` below overrides this for
                # the addressed chunk only. Without the default, a strobe would keep its old
                # value once the address moves to another chunk of the same register, and
                # several chunks would be ORed together on `r_data`.
                m.d.sync += [chunk_r_stb.eq(0) for chunk_r_stb in chunk_r_stbs]

            if "w" in elem.access:
                m.d.comb += elem.w_data.eq(shadow)
                # Clear the write strobe by default so that it is a single-cycle pulse even if
                # the address changes right after the write to the last chunk.
                m.d.sync += elem.w_stb.eq(0)

            # Enumerate every address used by the register explicitly, rather than using
            # arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer
            # carry chains for comparisons, even with a constant. (Register sizes don't have
            # to be powers of 2.)
            with m.Switch(self.addr):
                for chunk_offset in range(elem_size):
                    # For padding chunks this slice is empty (it lies past the register width).
                    chunk_slice = slice(chunk_offset * self.data_width,
                                        (chunk_offset + 1) * self.data_width)
                    with m.Case(elem_addr + chunk_offset):
                        if "r" in elem.access:
                            chunk_r_stb = chunk_r_stbs[chunk_offset]
                            r_data_fanin |= Mux(chunk_r_stb, shadow[chunk_slice], 0)
                            if chunk_offset == 0:
                                # Reading the first chunk captures the whole register.
                                m.d.comb += elem.r_stb.eq(self.r_stb)
                                with m.If(self.r_stb):
                                    m.d.sync += shadow.eq(elem.r_data)
                            # Delay by 1 cycle, allowing reads to be pipelined.
                            m.d.sync += chunk_r_stb.eq(self.r_stb)

                        if "w" in elem.access:
                            if chunk_offset == elem_size - 1:
                                # Delay by 1 cycle, avoiding combinatorial paths through
                                # the CSR bus and into CSR registers.
                                m.d.sync += elem.w_stb.eq(self.w_stb)
                            with m.If(self.w_stb):
                                m.d.sync += shadow[chunk_slice].eq(self.w_data)

                with m.Default():
                    # The address is outside this register; clear its shadow so that it
                    # contributes nothing to the OR tree.
                    m.d.sync += shadow.eq(0)

        m.d.comb += self.r_data.eq(r_data_fanin)

        return m
0 commit comments