Skip to content

Commit bc3f0f3

Browse files
committed
csr.bus: redesign Multiplexer shadow registers.
Before this commit, csr.Multiplexer had separate shadows for every element in its memory map. The same shadow was shared for read and write accesses to an element; a combined read/write transaction was impossible despite being allowed by the CSR interface. After this commit, csr.Multiplexer has separate shadows for read and write accesses, but both shadows are shared by every element using them. For multiplexers with many elements, this approach also results in significant resource savings.
1 parent d2ca157 commit bc3f0f3

File tree

2 files changed

+378
-144
lines changed

2 files changed

+378
-144
lines changed

amaranth_soc/csr/bus.py

Lines changed: 243 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
from collections import defaultdict
2+
from math import ceil, log2
13
import enum
24
from amaranth import *
3-
from amaranth.utils import log2_int
45

56
from ..memory import MemoryMap
67

@@ -171,10 +172,183 @@ def memory_map(self, memory_map):
171172

172173

173174
class Multiplexer(Elaboratable):
175+
class _Shadow:
176+
class Chunk:
177+
"""The interface between of a CSR multiplexer and a shadow register chunk."""
178+
def __init__(self, shadow, offset, elements):
179+
self.name = f"{shadow.name}__{offset}"
180+
self.data = Signal(shadow.granularity, name=f"{self.name}__data")
181+
self.r_en = Signal(name=f"{self.name}__r_en")
182+
self.w_en = Signal(name=f"{self.name}__w_en")
183+
self._elements = tuple(elements)
184+
185+
def elements(self):
186+
"""Iterate the address ranges of CSR elements using this chunk."""
187+
yield from self._elements
188+
189+
"""CSR multiplexer shadow register.
190+
191+
Attributes
192+
----------
193+
name : :class:`str`
194+
Name of the shadow register.
195+
granularity : :class:`int`
196+
Amount of bits stored in a chunk of the shadow register.
197+
overlaps : :class:`int`
198+
Maximum amount of CSR elements that can share a chunk of the shadow register. Optional.
199+
If ``None``, it is implicitly set by :meth:`Multiplexer._Shadow.prepare`.
200+
"""
201+
def __init__(self, granularity, overlaps, *, name):
202+
assert isinstance(name, str)
203+
assert isinstance(granularity, int) and granularity >= 0
204+
assert overlaps is None or isinstance(overlaps, int) and overlaps >= 0
205+
self.name = name
206+
self.granularity = granularity
207+
self.overlaps = overlaps
208+
self._ranges = set()
209+
self._size = 1
210+
self._chunks = None
211+
212+
@property
213+
def size(self):
214+
"""Size of the shadow register.
215+
216+
Returns
217+
-------
218+
:class:`int`
219+
The amount of :class:`Multiplexer._Shadow.Chunk`s of the shadow. It can increase
220+
by calling :meth:`Multiplexer._Shadow.add` or :meth:`Multiplexer._Shadow.prepare`.
221+
"""
222+
return self._size
223+
224+
def add(self, elem_range):
225+
"""Add a CSR element to the shadow.
226+
227+
Arguments
228+
---------
229+
elem_range : :class:`range`
230+
Address range of a CSR :class:`Element`. It uses ``2 ** ceil(log2(elem_range.stop -
231+
elem_range.start))`` chunks of the shadow register. If this amount is greater than
232+
:attr:`~Multiplexer._Shadow.size`, it replaces the latter.
233+
"""
234+
assert isinstance(elem_range, range)
235+
self._ranges.add(elem_range)
236+
elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start))
237+
self._size = max(self._size, elem_size)
238+
239+
def decode_address(self, addr, elem_range):
240+
"""Decode a bus address into a shadow register offset.
241+
242+
Returns
243+
-------
244+
:class:`int`
245+
The shadow register offset corresponding to the :class:`Multiplexer._Shadow.Chunk`
246+
used by ``addr``.
247+
248+
The address decoding scheme is illustrated by the following example:
249+
* ``addr`` is ``0x1c``;
250+
* ``elem_range`` is ``range(0x1b, 0x1f)``;
251+
* the :attr:`~Multiplexer._Shadow.size` of the shadow is ``16``.
252+
253+
The lower bits of the offset would be ``0b00``, extracted from ``addr``:
254+
255+
.. code-block::
256+
257+
+----+--+--+
258+
|0001|11|00|
259+
+----+--+--+
260+
│ └─ 0
261+
└──── ceil(log2(elem_range.stop - elem_range.start))
262+
263+
The upper bits of the offset would be ``0b10``, extracted from ``elem_range.start``:
264+
265+
.. code-block::
266+
267+
+----+--+--+
268+
|0001|10|11|
269+
+----+--+--+
270+
│ │
271+
│ └──── ceil(log2(elem_range.stop - elem_range.start))
272+
└─────── log2(self.size)
273+
274+
275+
The decoded offset would therefore be ``0xc`` (i.e. ``0b1100``).
276+
"""
277+
assert elem_range in self._ranges and addr in elem_range
278+
elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start))
279+
self_mask = self.size - 1
280+
elem_mask = elem_size - 1
281+
return elem_range.start & self_mask & ~elem_mask | addr & elem_mask
282+
283+
def encode_offset(self, offset, elem_range):
284+
"""Encode a shadow register offset into a bus address.
285+
286+
Returns
287+
-------
288+
:class:`int`
289+
The bus address in ``elem_range`` using the :class:`Multiplexer._Shadow.Chunk`
290+
located at ``offset``. See :meth:`~Multiplexer._Shadow.decode_address` for details.
291+
"""
292+
assert elem_range in self._ranges and isinstance(offset, int)
293+
elem_size = 2 ** ceil(log2(elem_range.stop - elem_range.start))
294+
return elem_range.start + ((offset - elem_range.start) % elem_size)
295+
296+
def prepare(self):
297+
"""Balance out and instantiate the shadow register chunks.
298+
299+
The scheme used by :meth:`~Multiplexer._Shadow.decode_address` allows multiple bus
300+
addresses to be decoded to the same shadow register offset. Depending on the platform
301+
and its toolchain, this may create nets with high fan-in (if the chunk is read from
302+
the bus) or fan-out (if written), which may impact timing closure or resource usage.
303+
304+
If any shadow register offset is aliased to more bus addresses than permitted by the
305+
:attr:`~Multiplexer._Shadow.overlaps` constraint, the :attr:`~Multiplexer._Shadow.size`
306+
of the shadow is doubled. This increases the number of address bits used for decoding,
307+
which effectively balances chunk usage across the shadow register.
308+
309+
This method is recursive until the overlap constraint is satisfied.
310+
"""
311+
if isinstance(self._ranges, frozenset):
312+
return
313+
if self.overlaps is None:
314+
self.overlaps = len(self._ranges)
315+
316+
elements = defaultdict(list)
317+
balanced = True
318+
319+
for elem_range in self._ranges:
320+
for chunk_addr in elem_range:
321+
chunk_offset = self.decode_address(chunk_addr, elem_range)
322+
if len(elements[chunk_offset]) > self.overlaps:
323+
balanced = False
324+
break
325+
elements[chunk_offset].append(elem_range)
326+
327+
if balanced:
328+
self._ranges = frozenset(self._ranges)
329+
self._chunks = dict()
330+
for chunk_offset, chunk_elements in elements.items():
331+
chunk = Multiplexer._Shadow.Chunk(self, chunk_offset, chunk_elements)
332+
self._chunks[chunk_offset] = chunk
333+
else:
334+
self._size *= 2
335+
self.prepare()
336+
337+
def chunks(self):
338+
"""Iterate shadow register chunks used by at least one CSR element."""
339+
for chunk_offset, chunk in self._chunks.items():
340+
yield chunk_offset, chunk
341+
174342
"""CSR register multiplexer.
175343
176344
An address-based multiplexer for CSR registers implementing atomic updates.
177345
346+
This implementation assumes the following from the CSR bus:
347+
* an initiator must have exclusive ownership over the multiplexer for the full duration of
348+
a register transaction;
349+
* an initiator must access a register in ascending order of addresses, but it may abort a
350+
transaction after any bus cycle.
351+
178352
Latency
179353
-------
180354
@@ -214,16 +388,22 @@ class Multiplexer(Elaboratable):
214388
Register alignment. See :class:`..memory.MemoryMap`.
215389
name : str
216390
Window name. Optional.
391+
shadow_overlaps : int
392+
Maximum number of CSR registers that can share a chunk of a shadow register.
393+
Optional. If ``None``, any number of CSR registers can share a shadow chunk.
394+
See :class:`Multiplexer._Shadow` for details.
217395
218396
Attributes
219397
----------
220398
bus : :class:`Interface`
221399
CSR bus providing access to registers.
222400
"""
223-
def __init__(self, *, addr_width, data_width, alignment=0, name=None):
401+
def __init__(self, *, addr_width, data_width, alignment=0, name=None, shadow_overlaps=None):
224402
self._map = MemoryMap(addr_width=addr_width, data_width=data_width, alignment=alignment,
225403
name=name)
226404
self._bus = None
405+
self._r_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="r_shadow")
406+
self._w_shadow = Multiplexer._Shadow(data_width, shadow_overlaps, name="w_shadow")
227407

228408
@property
229409
def bus(self):
@@ -258,50 +438,77 @@ def add(self, element, *, addr=None, alignment=None, extend=False):
258438
def elaborate(self, platform):
259439
m = Module()
260440

261-
# Instead of a straightforward multiplexer for reads, use a per-element address comparator,
262-
# AND the shadow register chunk with the comparator output, and OR all of those together.
263-
# If the toolchain doesn't already synthesize multiplexer trees this way, this trick can
264-
# save a significant amount of logic, since e.g. one 4-LUT can pack one 2-MUX, but two
265-
# 2-AND or 2-OR gates.
266-
r_data_fanin = 0
267-
268441
for elem, _, (elem_start, elem_end) in self._map.resources():
269-
shadow = Signal(elem.width, name="{}__shadow".format(elem.name))
442+
elem_range = range(elem_start, elem_end)
270443
if elem.access.readable():
271-
shadow_en = Signal(elem_end - elem_start, name="{}__shadow_en".format(elem.name))
272-
m.d.sync += shadow_en.eq(0)
444+
self._r_shadow.add(elem_range)
273445
if elem.access.writable():
274-
m.d.comb += elem.w_data.eq(shadow)
275-
m.d.sync += elem.w_stb.eq(0)
446+
self._w_shadow.add(elem_range)
447+
448+
self._r_shadow.prepare()
449+
self._w_shadow.prepare()
450+
451+
# Instead of a straightforward multiplexer for reads, use an address comparator for each
452+
# shadow register chunk, AND the comparator output with the chunk contents, and OR all of
453+
# those together. If the toolchain doesn't already synthesize multiplexer trees this way,
454+
# this trick can save a significant amount of logic, since e.g. one 4-LUT can pack one
455+
# 2-MUX, but two 2-AND or 2-OR gates.
456+
r_data_fanin = 0
457+
458+
for chunk_offset, r_chunk in self._r_shadow.chunks():
459+
# Use the same trick to select which element is read into a shadow register chunk.
460+
r_chunk_w_en_fanin = 0
461+
r_chunk_data_fanin = 0
462+
463+
m.d.sync += r_chunk.r_en.eq(0)
276464

277-
# Enumerate every address used by the register explicitly, rather than using
278-
# arithmetic comparisons, since some toolchains (e.g. Yosys) are too eager to infer
279-
# carry chains for comparisons, even with a constant. (Register sizes don't have
280-
# to be powers of 2.)
281465
with m.Switch(self.bus.addr):
282-
for chunk_offset, chunk_addr in enumerate(range(elem_start, elem_end)):
283-
shadow_slice = shadow.word_select(chunk_offset, self.bus.data_width)
466+
for elem_range in r_chunk.elements():
467+
chunk_addr = self._r_shadow.encode_offset(chunk_offset, elem_range)
468+
elem = self._map.decode_address(elem_range.start)
469+
elem_offset = chunk_addr - elem_range.start
470+
elem_slice = elem.r_data.word_select(elem_offset, self.bus.data_width)
284471

285472
with m.Case(chunk_addr):
286-
if elem.access.readable():
287-
r_data_fanin |= Mux(shadow_en[chunk_offset], shadow_slice, 0)
288-
if chunk_addr == elem_start:
289-
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
290-
with m.If(self.bus.r_stb):
291-
m.d.sync += shadow.eq(elem.r_data)
292-
# Delay by 1 cycle, allowing reads to be pipelined.
293-
m.d.sync += shadow_en.eq(self.bus.r_stb << chunk_offset)
294-
295-
if elem.access.writable():
296-
if chunk_addr == elem_end - 1:
297-
# Delay by 1 cycle, avoiding combinatorial paths through
298-
# the CSR bus and into CSR registers.
299-
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
300-
with m.If(self.bus.w_stb):
301-
m.d.sync += shadow_slice.eq(self.bus.w_data)
473+
if chunk_addr == elem_range.start:
474+
m.d.comb += elem.r_stb.eq(self.bus.r_stb)
475+
# Delay by 1 cycle, allowing reads to be pipelined.
476+
m.d.sync += r_chunk.r_en.eq(self.bus.r_stb)
477+
478+
r_chunk_w_en_fanin |= elem.r_stb
479+
r_chunk_data_fanin |= Mux(elem.r_stb, elem_slice, 0)
480+
481+
m.d.comb += r_chunk.w_en.eq(r_chunk_w_en_fanin)
482+
with m.If(r_chunk.w_en):
483+
m.d.sync += r_chunk.data.eq(r_chunk_data_fanin)
484+
485+
r_data_fanin |= Mux(r_chunk.r_en, r_chunk.data, 0)
302486

303487
m.d.comb += self.bus.r_data.eq(r_data_fanin)
304488

489+
for chunk_offset, w_chunk in self._w_shadow.chunks():
490+
with m.Switch(self.bus.addr):
491+
for elem_range in w_chunk.elements():
492+
chunk_addr = self._w_shadow.encode_offset(chunk_offset, elem_range)
493+
elem = self._map.decode_address(elem_range.start)
494+
elem_offset = chunk_addr - elem_range.start
495+
elem_slice = elem.w_data.word_select(elem_offset, self.bus.data_width)
496+
497+
if chunk_addr == elem_range.stop - 1:
498+
m.d.sync += elem.w_stb.eq(0)
499+
500+
with m.Case(chunk_addr):
501+
if chunk_addr == elem_range.stop - 1:
502+
# Delay by 1 cycle, avoiding combinatorial paths through
503+
# the CSR bus and into CSR registers.
504+
m.d.sync += elem.w_stb.eq(self.bus.w_stb)
505+
m.d.comb += w_chunk.w_en.eq(self.bus.w_stb)
506+
507+
m.d.comb += elem_slice.eq(w_chunk.data)
508+
509+
with m.If(w_chunk.w_en):
510+
m.d.sync += w_chunk.data.eq(self.bus.w_data)
511+
305512
return m
306513

307514

0 commit comments

Comments
 (0)