|
| 1 | +from collections import defaultdict |
| 2 | +from math import ceil, log2 |
1 | 3 | import enum |
2 | 4 | from amaranth import * |
3 | | -from amaranth.utils import log2_int |
4 | 5 |
|
5 | 6 | from ..memory import MemoryMap |
6 | 7 |
|
@@ -171,10 +172,183 @@ def memory_map(self, memory_map): |
171 | 172 |
|
172 | 173 |
|
173 | 174 | class Multiplexer(Elaboratable): |
| 175 | + class _Shadow: |
| 176 | + class Chunk: |
| 177 | + """The interface between a CSR multiplexer and a shadow register chunk.""" |
| 178 | + def __init__(self, shadow, offset, elements): |
| 179 | + self.name = f"{shadow.name}__{offset}" |
| 180 | + self.data = Signal(shadow.granularity, name=f"{self.name}__data") |
| 181 | + self.r_en = Signal(name=f"{self.name}__r_en") |
| 182 | + self.w_en = Signal(name=f"{self.name}__w_en") |
| 183 | + self._elements = tuple(elements) |
| 184 | + |
| 185 | + def elements(self): |
| 186 | + """Iterate the address ranges of CSR elements using this chunk.""" |
| 187 | + yield from self._elements |
| 188 | + |
| 189 | + """CSR multiplexer shadow register. |
| 190 | +
|
| 191 | + Attributes |
| 192 | + ---------- |
| 193 | + name : :class:`str` |
| 194 | + Name of the shadow register. |
| 195 | + granularity : :class:`int` |
| 196 | + Amount of bits stored in a chunk of the shadow register. |
| 197 | + overlaps : :class:`int` |
| 198 | + Maximum amount of CSR elements that can share a chunk of the shadow register. Optional. |
| 199 | + If ``None``, it is implicitly set by :meth:`Multiplexer._Shadow.prepare`. |
| 200 | + """ |
| 201 | + def __init__(self, granularity, overlaps, *, name): |
| 202 | + assert isinstance(name, str) |
| 203 | + assert isinstance(granularity, int) and granularity >= 0 |
| 204 | + assert overlaps is None or isinstance(overlaps, int) and overlaps >= 0 |
| 205 | + self.name = name |
| 206 | + self.granularity = granularity |
| 207 | + self.overlaps = overlaps |
| 208 | + self._ranges = set() |
| 209 | + self._size = 1 |
| 210 | + self._chunks = None |
| 211 | + |
| 212 | + @property |
| 213 | + def size(self): |
| 214 | + """Size of the shadow register. |
| 215 | +
|
| 216 | + Returns |
| 217 | + ------- |
| 218 | + :class:`int` |
| 219 | + The number of :class:`Multiplexer._Shadow.Chunk` instances in the shadow. It may grow |
| 220 | + when :meth:`Multiplexer._Shadow.add` or :meth:`Multiplexer._Shadow.prepare` is called. |
| 221 | + """ |
| 222 | + return self._size |
| 223 | + |
| 224 | + def add(self, elem_range): |
| 225 | + """Add a CSR element to the shadow. |
| 226 | +
|
| 227 | + Arguments |
| 228 | + --------- |
| 229 | + elem_range : :class:`range` |
| 230 | + Address range of a CSR :class:`Element`. It uses ``2 ** ceil(log2(elem_range.stop - |
| 231 | + elem_range.start))`` chunks of the shadow register. If this number is greater than |
| 232 | + :attr:`~Multiplexer._Shadow.size`, the size is increased to match it. |
| 233 | + """ |
| 234 | + assert isinstance(elem_range, range) |
| 235 | + self._ranges.add(elem_range) |
| 236 | + elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start)) |
| 237 | + self._size = max(self._size, elem_size) |
| 238 | + |
| 239 | + def decode_address(self, addr, elem_range): |
| 240 | + """Decode a bus address into a shadow register offset. |
| 241 | +
|
| 242 | + Returns |
| 243 | + ------- |
| 244 | + :class:`int` |
| 245 | + The shadow register offset corresponding to the :class:`Multiplexer._Shadow.Chunk` |
| 246 | + used by ``addr``. |
| 247 | +
|
| 248 | + The address decoding scheme is illustrated by the following example: |
| 249 | + * ``addr`` is ``0x1c``; |
| 250 | + * ``elem_range`` is ``range(0x1b, 0x1f)``; |
| 251 | + * the :attr:`~Multiplexer._Shadow.size` of the shadow is ``16``. |
| 252 | +
|
| 253 | + The lower bits of the offset would be ``0b00``, extracted from ``addr``: |
| 254 | +
|
| 255 | + .. code-block:: |
| 256 | +
|
| 257 | + +----+--+--+ |
| 258 | + |0001|11|00| |
| 259 | + +----+--+--+ |
| 260 | + │ └─ 0 |
| 261 | + └──── ceil(log2(elem_range.stop - elem_range.start)) |
| 262 | +
|
| 263 | + The upper bits of the offset would be ``0b10``, extracted from ``elem_range.start``: |
| 264 | +
|
| 265 | + .. code-block:: |
| 266 | +
|
| 267 | + +----+--+--+ |
| 268 | + |0001|10|11| |
| 269 | + +----+--+--+ |
| 270 | + │ │ |
| 271 | + │ └──── ceil(log2(elem_range.stop - elem_range.start)) |
| 272 | + └─────── log2(self.size) |
| 273 | +
|
| 274 | +
|
| 275 | + The decoded offset would therefore be ``0x8`` (i.e. ``0b1000``). |
| 276 | + """ |
| 277 | + assert elem_range in self._ranges and addr in elem_range |
| 278 | + elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start)) |
| 279 | + self_mask = self.size - 1 |
| 280 | + elem_mask = elem_size - 1 |
| 281 | + return elem_range.start & self_mask & ~elem_mask | addr & elem_mask |
| 282 | + |
| 283 | + def encode_offset(self, offset, elem_range): |
| 284 | + """Encode a shadow register offset into a bus address. |
| 285 | +
|
| 286 | + Returns |
| 287 | + ------- |
| 288 | + :class:`int` |
| 289 | + The bus address in ``elem_range`` using the :class:`Multiplexer._Shadow.Chunk` |
| 290 | + located at ``offset``. See :meth:`~Multiplexer._Shadow.decode_address` for details. |
| 291 | + """ |
| 292 | + assert elem_range in self._ranges and isinstance(offset, int) |
| 293 | + elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start)) |
| 294 | + return elem_range.start + ((offset - elem_range.start) % elem_size) |
| 295 | + |
| 296 | + def prepare(self): |
| 297 | + """Balance out and instantiate the shadow register chunks. |
| 298 | +
|
| 299 | + The scheme used by :meth:`~Multiplexer._Shadow.decode_address` allows multiple bus |
| 300 | + addresses to be decoded to the same shadow register offset. Depending on the platform |
| 301 | + and its toolchain, this may create nets with high fan-in (if the chunk is read from |
| 302 | + the bus) or fan-out (if written), which may impact timing closure or resource usage. |
| 303 | +
|
| 304 | + If any shadow register offset is aliased to more bus addresses than permitted by the |
| 305 | + :attr:`~Multiplexer._Shadow.overlaps` constraint, the :attr:`~Multiplexer._Shadow.size` |
| 306 | + of the shadow is doubled. This increases the number of address bits used for decoding, |
| 307 | + which effectively balances chunk usage across the shadow register. |
| 308 | +
|
| 309 | + This method calls itself recursively until the overlap constraint is satisfied. |
| 310 | + """ |
| 311 | + if isinstance(self._ranges, frozenset): |
| 312 | + return |
| 313 | + if self.overlaps is None: |
| 314 | + self.overlaps = len(self._ranges) |
| 315 | + |
| 316 | + elements = defaultdict(list) |
| 317 | + balanced = True |
| 318 | + |
| 319 | + for elem_range in self._ranges: |
| 320 | + for chunk_addr in elem_range: |
| 321 | + chunk_offset = self.decode_address(chunk_addr, elem_range) |
| 322 | + if len(elements[chunk_offset]) > self.overlaps: |
| 323 | + balanced = False |
| 324 | + break |
| 325 | + elements[chunk_offset].append(elem_range) |
| 326 | + |
| 327 | + if balanced: |
| 328 | + self._ranges = frozenset(self._ranges) |
| 329 | + self._chunks = dict() |
| 330 | + for chunk_offset, chunk_elements in elements.items(): |
| 331 | + chunk = Multiplexer._Shadow.Chunk(self, chunk_offset, chunk_elements) |
| 332 | + self._chunks[chunk_offset] = chunk |
| 333 | + else: |
| 334 | + self._size *= 2 |
| 335 | + self.prepare() |
| 336 | + |
| 337 | + def chunks(self): |
| 338 | + """Iterate shadow register chunks used by at least one CSR element.""" |
| 339 | + for chunk_offset, chunk in self._chunks.items(): |
| 340 | + yield chunk_offset, chunk |
| 341 | + |
174 | 342 | """CSR register multiplexer. |
175 | 343 |
|
176 | 344 | An address-based multiplexer for CSR registers implementing atomic updates. |
177 | 345 |
|
| 346 | + This implementation assumes the following from the CSR bus: |
| 347 | + * an initiator must have exclusive ownership over the multiplexer for the full duration of |
| 348 | + a register transaction; |
| 349 | + * an initiator must access a register in ascending order of addresses, but it may abort a |
| 350 | + transaction after any bus cycle. |
| 351 | +
|
178 | 352 | Latency |
179 | 353 | ------- |
180 | 354 |
|
@@ -214,16 +388,22 @@ class Multiplexer(Elaboratable): |
214 | 388 | Register alignment. See :class:`..memory.MemoryMap`. |
215 | 389 | name : str |
216 | 390 | Window name. Optional. |
| 391 | + shadow_overlaps : int |
| 392 | + Maximum number of CSR registers that can share a chunk of a shadow register. |
| 393 | + Optional. If ``None``, any number of CSR registers can share a shadow chunk. |
| 394 | + See :class:`Multiplexer._Shadow` for details. |
217 | 395 |
|
218 | 396 | Attributes |
219 | 397 | ---------- |
220 | 398 | bus : :class:`Interface` |
221 | 399 | CSR bus providing access to registers. |
222 | 400 | """ |
223 | | - def __init__(self, *, addr_width, data_width, alignment=0, name=None): |
| 401 | + def __init__(self, *, addr_width, data_width, alignment=0, name=None, shadow_overlaps=None): |
224 | 402 | self._map = MemoryMap(addr_width=addr_width, data_width=data_width, alignment=alignment, |
225 | 403 | name=name) |
226 | 404 | self._bus = None |
| 405 | + self._r_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="r_shadow") |
| 406 | + self._w_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="w_shadow") |
227 | 407 |
|
228 | 408 | @property |
229 | 409 | def bus(self): |
@@ -258,50 +438,77 @@ def add(self, element, *, addr=None, alignment=None, extend=False): |
258 | 438 | def elaborate(self, platform): |
259 | 439 | m = Module() |
260 | 440 |
|
261 | | - # Instead of a straightforward multiplexer for reads, use a per-element address comparator, |
262 | | - # AND the shadow register chunk with the comparator output, and OR all of those together. |
263 | | - # If the toolchain doesn't already synthesize multiplexer trees this way, this trick can |
264 | | - # save a significant amount of logic, since e.g. one 4-LUT can pack one 2-MUX, but two |
265 | | - # 2-AND or 2-OR gates. |
266 | | - r_data_fanin = 0 |
267 | | - |
268 | 441 | for elem, _, (elem_start, elem_end) in self._map.resources(): |
269 | | - shadow = Signal(elem.width, name="{}__shadow".format(elem.name)) |
| 442 | + elem_range = range(elem_start, elem_end) |
270 | 443 | if elem.access.readable(): |
271 | | - shadow_en = Signal(elem_end - elem_start, name="{}__shadow_en".format(elem.name)) |
272 | | - m.d.sync += shadow_en.eq(0) |
| 444 | + self._r_shadow.add(elem_range) |
273 | 445 | if elem.access.writable(): |
274 | | - m.d.comb += elem.w_data.eq(shadow) |
275 | | - m.d.sync += elem.w_stb.eq(0) |
| 446 | + self._w_shadow.add(elem_range) |
| 447 | + |
| 448 | + self._r_shadow.prepare() |
| 449 | + self._w_shadow.prepare() |
| 450 | + |
| 451 | + # Instead of a straightforward multiplexer for reads, use an address comparator for each |
| 452 | + # shadow register chunk, AND the comparator output with the chunk contents, and OR all of |
| 453 | + # those together. If the toolchain doesn't already synthesize multiplexer trees this way, |
| 454 | + # this trick can save a significant amount of logic, since e.g. one 4-LUT can pack one |
| 455 | + # 2-MUX, but two 2-AND or 2-OR gates. |
| 456 | + r_data_fanin = 0 |
| 457 | + |
| 458 | + for chunk_offset, r_chunk in self._r_shadow.chunks(): |
| 459 | + # Use the same trick to select which element is read into a shadow register chunk. |
| 460 | + r_chunk_w_en_fanin = 0 |
| 461 | + r_chunk_data_fanin = 0 |
| 462 | + |
| 463 | + m.d.sync += r_chunk.r_en.eq(0) |
276 | 464 |
|
277 | | - # Enumerate every address used by the register explicitly, rather than using |
278 | | - # arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer |
279 | | - # carry chains for comparisons, even with a constant. (Register sizes don't have |
280 | | - # to be powers of 2.) |
281 | 465 | with m.Switch(self.bus.addr): |
282 | | - for chunk_offset, chunk_addr in enumerate(range(elem_start, elem_end)): |
283 | | - shadow_slice = shadow.word_select(chunk_offset, self.bus.data_width) |
| 466 | + for elem_range in r_chunk.elements(): |
| 467 | + chunk_addr = self._r_shadow.encode_offset(chunk_offset, elem_range) |
| 468 | + elem = self._map.decode_address(elem_range.start) |
| 469 | + elem_offset = chunk_addr - elem_range.start |
| 470 | + elem_slice = elem.r_data.word_select(elem_offset, self.bus.data_width) |
284 | 471 |
|
285 | 472 | with m.Case(chunk_addr): |
286 | | - if elem.access.readable(): |
287 | | - r_data_fanin |= Mux(shadow_en[chunk_offset], shadow_slice, 0) |
288 | | - if chunk_addr == elem_start: |
289 | | - m.d.comb += elem.r_stb.eq(self.bus.r_stb) |
290 | | - with m.If(self.bus.r_stb): |
291 | | - m.d.sync += shadow.eq(elem.r_data) |
292 | | - # Delay by 1 cycle, allowing reads to be pipelined. |
293 | | - m.d.sync += shadow_en.eq(self.bus.r_stb << chunk_offset) |
294 | | - |
295 | | - if elem.access.writable(): |
296 | | - if chunk_addr == elem_end - 1: |
297 | | - # Delay by 1 cycle, avoiding combinatorial paths through |
298 | | - # the CSR bus and into CSR registers. |
299 | | - m.d.sync += elem.w_stb.eq(self.bus.w_stb) |
300 | | - with m.If(self.bus.w_stb): |
301 | | - m.d.sync += shadow_slice.eq(self.bus.w_data) |
| 473 | + if chunk_addr == elem_range.start: |
| 474 | + m.d.comb += elem.r_stb.eq(self.bus.r_stb) |
| 475 | + # Delay by 1 cycle, allowing reads to be pipelined. |
| 476 | + m.d.sync += r_chunk.r_en.eq(self.bus.r_stb) |
| 477 | + |
| 478 | + r_chunk_w_en_fanin |= elem.r_stb |
| 479 | + r_chunk_data_fanin |= Mux(elem.r_stb, elem_slice, 0) |
| 480 | + |
| 481 | + m.d.comb += r_chunk.w_en.eq(r_chunk_w_en_fanin) |
| 482 | + with m.If(r_chunk.w_en): |
| 483 | + m.d.sync += r_chunk.data.eq(r_chunk_data_fanin) |
| 484 | + |
| 485 | + r_data_fanin |= Mux(r_chunk.r_en, r_chunk.data, 0) |
302 | 486 |
|
303 | 487 | m.d.comb += self.bus.r_data.eq(r_data_fanin) |
304 | 488 |
|
| 489 | + for chunk_offset, w_chunk in self._w_shadow.chunks(): |
| 490 | + with m.Switch(self.bus.addr): |
| 491 | + for elem_range in w_chunk.elements(): |
| 492 | + chunk_addr = self._w_shadow.encode_offset(chunk_offset, elem_range) |
| 493 | + elem = self._map.decode_address(elem_range.start) |
| 494 | + elem_offset = chunk_addr - elem_range.start |
| 495 | + elem_slice = elem.w_data.word_select(elem_offset, self.bus.data_width) |
| 496 | + |
| 497 | + if chunk_addr == elem_range.stop - 1: |
| 498 | + m.d.sync += elem.w_stb.eq(0) |
| 499 | + |
| 500 | + with m.Case(chunk_addr): |
| 501 | + if chunk_addr == elem_range.stop - 1: |
| 502 | + # Delay by 1 cycle, avoiding combinatorial paths through |
| 503 | + # the CSR bus and into CSR registers. |
| 504 | + m.d.sync += elem.w_stb.eq(self.bus.w_stb) |
| 505 | + m.d.comb += w_chunk.w_en.eq(self.bus.w_stb) |
| 506 | + |
| 507 | + m.d.comb += elem_slice.eq(w_chunk.data) |
| 508 | + |
| 509 | + with m.If(w_chunk.w_en): |
| 510 | + m.d.sync += w_chunk.data.eq(self.bus.w_data) |
| 511 | + |
305 | 512 | return m |
306 | 513 |
|
307 | 514 |
|
|
0 commit comments