Skip to content

Commit b161160

Browse files
kbowers-jumpripatel-fd
authored andcommitted
vinyl completion queue and ABI
This provides the vinyl interprocess shared lockfree SPMC completion queue and specifies the client-vinyl batch completion ABI.
1 parent 4628a55 commit b161160

File tree

5 files changed

+581
-0
lines changed

5 files changed

+581
-0
lines changed

src/vinyl/cq/Local.mk

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
$(call add-hdrs,fd_vinyl_cq.h)
2+
$(call add-objs,fd_vinyl_cq,fd_vinyl)
3+
$(call make-unit-test,test_vinyl_cq,test_vinyl_cq,fd_vinyl fd_tango fd_util)
4+
$(call run-unit-test,test_vinyl_cq)

src/vinyl/cq/fd_vinyl_cq.c

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#include "fd_vinyl_cq.h"
2+
3+
ulong
4+
fd_vinyl_cq_align( void ) {
5+
return alignof(fd_vinyl_cq_t);
6+
}
7+
8+
ulong
9+
fd_vinyl_cq_footprint( ulong comp_cnt ) {
10+
if( FD_UNLIKELY( !((4UL<=comp_cnt) & (comp_cnt<(1UL<<63)/sizeof(fd_vinyl_comp_t)) & fd_ulong_is_pow2( comp_cnt )) ) ) return 0UL;
11+
return fd_ulong_align_up( sizeof(fd_vinyl_cq_t) + comp_cnt*sizeof(fd_vinyl_comp_t), alignof(fd_vinyl_cq_t) ); /* no overflow */
12+
}
13+
14+
void *
15+
fd_vinyl_cq_new( void * shmem,
16+
ulong comp_cnt ) {
17+
fd_vinyl_cq_t * cq = (fd_vinyl_cq_t *)shmem;
18+
19+
if( FD_UNLIKELY( !cq ) ) {
20+
FD_LOG_WARNING(( "NULL shmem"));
21+
return NULL;
22+
}
23+
24+
if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)cq, fd_vinyl_cq_align() ) ) ) {
25+
FD_LOG_WARNING(( "bad align"));
26+
return NULL;
27+
}
28+
29+
ulong footprint = fd_vinyl_cq_footprint( comp_cnt );
30+
if( FD_UNLIKELY( !footprint) ) {
31+
FD_LOG_WARNING(( "bad comp_cnt"));
32+
return NULL;
33+
}
34+
35+
memset( cq, 0, footprint );
36+
37+
cq->comp_cnt = comp_cnt;
38+
cq->seq = 0UL;
39+
40+
fd_vinyl_comp_t * comp = fd_vinyl_cq_comp( cq );
41+
42+
for( ulong seq=0UL; seq<comp_cnt; seq++ ) comp[ seq ].seq = seq - 1UL; /* Just before the next seq to be written to this entry */
43+
44+
FD_COMPILER_MFENCE();
45+
cq->magic = FD_VINYL_CQ_MAGIC;
46+
FD_COMPILER_MFENCE();
47+
48+
return cq;
49+
}
50+
51+
fd_vinyl_cq_t *
52+
fd_vinyl_cq_join ( void * shcq ) {
53+
fd_vinyl_cq_t * cq = (fd_vinyl_cq_t *)shcq;
54+
55+
if( FD_UNLIKELY( !cq ) ) {
56+
FD_LOG_WARNING(( "NULL shcq"));
57+
return NULL;
58+
}
59+
60+
if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)cq, fd_vinyl_cq_align() ) ) ) {
61+
FD_LOG_WARNING(( "bad align"));
62+
return NULL;
63+
}
64+
65+
if( FD_UNLIKELY( cq->magic!=FD_VINYL_CQ_MAGIC ) ) {
66+
FD_LOG_WARNING(( "bad magic"));
67+
return NULL;
68+
}
69+
70+
return (fd_vinyl_cq_t *)shcq;
71+
}
72+
73+
void *
74+
fd_vinyl_cq_leave( fd_vinyl_cq_t * cq ) {
75+
76+
if( FD_UNLIKELY( !cq ) ) {
77+
FD_LOG_WARNING(( "NULL cq"));
78+
return NULL;
79+
}
80+
81+
return cq;
82+
}
83+
84+
void *
85+
fd_vinyl_cq_delete( void * shcq ) {
86+
fd_vinyl_cq_t * cq = (fd_vinyl_cq_t *)shcq;
87+
88+
if( FD_UNLIKELY( !cq ) ) {
89+
FD_LOG_WARNING(( "NULL shcq"));
90+
return NULL;
91+
}
92+
93+
if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)cq, fd_vinyl_cq_align() ) ) ) {
94+
FD_LOG_WARNING(( "bad align"));
95+
return NULL;
96+
}
97+
98+
if( FD_UNLIKELY( cq->magic!=FD_VINYL_CQ_MAGIC ) ) {
99+
FD_LOG_WARNING(( "bad magic"));
100+
return NULL;
101+
}
102+
103+
FD_COMPILER_MFENCE();
104+
cq->magic = 0UL;
105+
FD_COMPILER_MFENCE();
106+
107+
return cq;
108+
}

src/vinyl/cq/fd_vinyl_cq.h

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
#ifndef HEADER_fd_src_vinyl_cq_fd_vinyl_cq_h
2+
#define HEADER_fd_src_vinyl_cq_fd_vinyl_cq_h
3+
4+
/* A fd_vinyl_comp_t provides details about vinyl request completions. */
5+
6+
#include "../fd_vinyl_base.h"
7+
8+
/* FD_VINYL_COMP_{ALIGN,FOOTPRINT} give the byte alignment and footprint
9+
of a fd_vinyl_comp_t. ALIGN is a reasonable power-of-2. FOOTPRINT
10+
is a multiple of ALIGN. */
11+
12+
#define FD_VINYL_COMP_ALIGN (32UL)
13+
#define FD_VINYL_COMP_FOOTPRINT (32UL)
14+
15+
/* FD_VINYL_COMP_QUOTA_MAX gives the maximum client acquire quota that
16+
can be returned by a completion. */
17+
18+
#define FD_VINYL_COMP_QUOTA_MAX (65535UL)
19+
20+
/* FIXME: consider eching back the val_gaddr and err_gaddr array back
21+
too? Maybe helpful in the case of read pipelining (e.g. request made
22+
on one thread, completion received on a different thread). But the
23+
req_id can probably encode this for a case like this without needing
24+
to bloat the completion footprint. */
25+
26+
struct __attribute__((aligned(FD_VINYL_COMP_ALIGN))) fd_vinyl_comp {
27+
ulong seq; /* Completion sequence number */
28+
ulong req_id; /* Echoed from corresponding request */
29+
ulong link_id; /* Echoed from corresponding request */
30+
short err; /* FD_VINYL_SUCCESS (0) if the request was processed (see request err array for individual failure details)
31+
FD_VINYL_ERR_* (negative) otherwise (no items in the request were processed) */
32+
ushort batch_cnt; /* Num items requested */
33+
ushort fail_cnt; /* If a successful completion, num items that failed processing, in [0,batch_cnt].
34+
If a failed completion, 0 (no items were processed). */
35+
ushort quota_rem; /* Client quota remaining when request completed processing.
36+
0<=quota_rem<=client quota_max<=FD_VINYL_COMP_QUOTA_MAX */
37+
};
38+
39+
typedef struct fd_vinyl_comp fd_vinyl_comp_t;
40+
41+
/* A fd_vinyl_cq_t is an interprocess sharable persistent SPMC
42+
queue used to communicate fd_vinyl_comp_t completions from a vinyl
43+
tile to clients. It is implemented as a hybrid direct mapped cache /
44+
queue with similar lockfree properties to a tango mcache.
45+
Specifically, when publishing a completion into a cq, a producer:
46+
47+
- Assigns the completion the cq's next sequence number and determines
48+
the corresponding cache line.
49+
50+
- Sets the cache line seq to seq-1. Because the cq's cache has at
51+
least 4 lines (2 is enough for this particular case), this is a seq
52+
that will never be held in that line. This atomically indicates to
53+
a concurrent consumer that seq-comp_cnt is no longer available in
54+
cache and that seq is in the process of being published into the
55+
line.
56+
57+
- Set the rest of the completion fields.
58+
59+
- Sets the cache line seq to seq. This atomically indicates that
60+
completion seq is ready.
61+
62+
- Advances the cq's seq cursor. From a consumer's point of view,
63+
this cursor has the property that that [0,seq) have been published
64+
and (seq,ULONG_MAX] have not been published. This cursor should
65+
only be used by consumers to synchronize their local cursors at
66+
initialization.
67+
68+
When reading completions from a cq, a consumer:
69+
70+
- Determine the cache line for consumer's seq.
71+
72+
- Reads the cache line seq.
73+
74+
- Reads the rest of the completion fields.
75+
76+
- Reads the cache line seq again.
77+
78+
If the first and second reads do not match, the vinyl tile is in the
79+
process of updating that cache line (most likely the consumer is
80+
caught up and producer is about to produce seq ... the consumer
81+
should try again soon).
82+
83+
Otherwise, if the read sequence numbers match the consumer's seq, the
84+
consumer received completion seq and should advance their local seq.
85+
86+
If they are behind the consumer's seq, the producer has not written
87+
seq yet and the consumer should try again later.
88+
89+
If they are ahead of the consumer's seq, the producer has overrun the
90+
consumer (the amount ahead gives a ballpark how far the consumer fell
91+
behind) and the consumer should recover / reinit. As such, flow
92+
control is managed by the application (e.g. if the application
93+
ensures that there are at most comp_cnt completion generating vinyl
94+
requests pending at any given time on this cq, no overruns are
95+
possible).
96+
97+
Note that because fd_vinyl_comp_t are AVX-2 friendly, it is possible
98+
to SIMD accelerate producers and consumers (also similar to
99+
fd_tango). */
100+
101+
/* FIXME: consider making comp_cnt a compile time constant? */
102+
103+
#define FD_VINYL_CQ_MAGIC (0xfd3a7352dc03a6c0UL) /* fd warm snd cq magc version 0 */
104+
105+
struct __attribute__((aligned(128))) fd_vinyl_cq_private {
106+
107+
ulong magic; /* ==FD_VINYL_CQ_MAGIC */
108+
ulong comp_cnt; /* Number of completions that can be in flight on this cq at any given time, power of 2 of least 4 */
109+
uchar _[ 112 ]; /* Put seq on a separate cache line pair */
110+
ulong seq; /* Completion sequence number to publish next */
111+
/* padding to 128 alignment */
112+
/* fd_vinyl_comp_t comp[ comp_cnt ] here, seq number at idx = seq & (comp_cnt-1UL) when available */
113+
/* padding to 128 alignment */
114+
115+
};
116+
117+
typedef struct fd_vinyl_cq_private fd_vinyl_cq_t;
118+
119+
FD_PROTOTYPES_BEGIN
120+
121+
/* fd_vinyl_cq_{align,footprint,new,join,leave_delete} have the usual
122+
interprocess shared persistent memory object semantics. comp_cnt is
123+
a power-of-2 of at least 4 that gives the number completions that can
124+
be in flight on this cq. */
125+
126+
FD_FN_CONST ulong fd_vinyl_cq_align ( void );
127+
FD_FN_CONST ulong fd_vinyl_cq_footprint( ulong comp_cnt );
128+
void * fd_vinyl_cq_new ( void * shmem, ulong comp_cnt );
129+
fd_vinyl_cq_t * fd_vinyl_cq_join ( void * shcq );
130+
void * fd_vinyl_cq_leave ( fd_vinyl_cq_t * cq );
131+
void * fd_vinyl_cq_delete ( void * shcq );
132+
133+
/* fd_vinyl_cq_comp returns the location in the caller's address space
134+
of the cq's completion array. fd_vinyl_cq_comp_const is a const
135+
correct version. fd_vinyl_cq_comp_cnt is the size of this array.
136+
The lifetime of the returned array is the lifetime of the local join.
137+
These assume cq is a current local join.
138+
139+
fd_vinyl_cq_comp_idx gives the array index that will cache completion
140+
seq in a cq completion array with comp_cnt elements. */
141+
142+
FD_FN_CONST static inline fd_vinyl_comp_t *
143+
fd_vinyl_cq_comp( fd_vinyl_cq_t * cq ) {
144+
return (fd_vinyl_comp_t *)(cq+1);
145+
}
146+
147+
FD_FN_CONST static inline fd_vinyl_comp_t const *
148+
fd_vinyl_cq_comp_const( fd_vinyl_cq_t const * cq ) {
149+
return (fd_vinyl_comp_t const *)(cq+1);
150+
}
151+
152+
FD_FN_PURE static inline ulong fd_vinyl_cq_comp_cnt( fd_vinyl_cq_t const * cq ) { return cq->comp_cnt; }
153+
154+
FD_FN_CONST static inline ulong fd_vinyl_cq_comp_idx( ulong seq, ulong comp_cnt ) { return seq & (comp_cnt-1UL); }
155+
156+
/* fd_vinyl_cq_seq returns the position of the producer's sequence
157+
number cursor. Specifically, at some point during the call,
158+
completions [0,seq) were published, completions (seq,ULONG_MAX] were
159+
not been published, and completion seq was either published, being
160+
published or not published. This is used for initial synchronization
161+
between producer and consumers. This is a compiler fence. */
162+
163+
static inline ulong
164+
fd_vinyl_cq_seq( fd_vinyl_cq_t const * cq ) {
165+
FD_COMPILER_MFENCE();
166+
ulong seq = cq->seq;
167+
FD_COMPILER_MFENCE();
168+
return seq;
169+
}
170+
171+
/* fd_vinyl_cq_send sends a completion. If comp is non-NULL, the
172+
completion will be written out-of-band to the location comp (assumes
173+
comp->seq is not 1 on entry and will set to 1 once the send it done).
174+
Otherwise, if cq is non-NULL, the completion will be enquened into
175+
the given cq (assumes cq is a current local join). If both cq and
176+
comp are NULL, this is a no-op. This is a compiler fence. */
177+
178+
/* FIXME: consider SIMD accelerating */
179+
180+
static inline void
181+
fd_vinyl_cq_send( fd_vinyl_cq_t * cq,
182+
fd_vinyl_comp_t * comp,
183+
ulong req_id,
184+
ulong link_id,
185+
int err, /* In [-2^15,2^15) */
186+
ulong batch_cnt, /* In [0,2^16) */
187+
ulong fail_cnt, /* In [0,2^16) */
188+
ulong quota_rem ) { /* In [0,2^16) */
189+
190+
ulong stack_seq[1];
191+
192+
ulong seq;
193+
ulong * _seq;
194+
195+
if( FD_UNLIKELY( comp ) ) { /* Send directly */
196+
197+
seq = 1UL; /* For direct sends, comp->seq should have already been set to something != 1 */
198+
_seq = stack_seq;
199+
200+
} else if( FD_LIKELY( cq ) ) { /* Send via cq */
201+
202+
seq = cq->seq;
203+
_seq = &cq->seq;
204+
comp = fd_vinyl_cq_comp( cq ) + fd_vinyl_cq_comp_idx( seq, cq->comp_cnt );
205+
206+
FD_COMPILER_MFENCE();
207+
comp->seq = seq - 1UL; /* Mark completion seq being written */
208+
209+
} else { /* No place to send completion */
210+
211+
FD_COMPILER_MFENCE(); /* Consistent semantics in all cases */
212+
return;
213+
214+
}
215+
216+
FD_COMPILER_MFENCE();
217+
comp->req_id = req_id;
218+
comp->link_id = link_id;
219+
comp->err = (short)err;
220+
comp->batch_cnt = (ushort)batch_cnt;
221+
comp->fail_cnt = (ushort)fail_cnt;
222+
comp->quota_rem = (ushort)quota_rem;
223+
FD_COMPILER_MFENCE();
224+
comp->seq = seq; /* Mark completion seq as written */
225+
FD_COMPILER_MFENCE();
226+
*_seq = seq + 1UL; /* Record that completions [0,seq) are published */
227+
FD_COMPILER_MFENCE();
228+
229+
}
230+
231+
/* fd_vinyl_cq_recv receives completion seq from the given cq. Returns
232+
0 on success, positive if completion seq has not been published yet
233+
and negative if the consumer has been overrun by the producer. On
234+
return, *dst contains the desired completion on success and is
235+
clobbered otherwise. Assumes cq is a current local join and dst is
236+
valid. This is a compiler fence. */
237+
238+
/* FIXME: consider SIMD accelerating */
239+
240+
static inline long
241+
fd_vinyl_cq_recv( fd_vinyl_cq_t const * cq,
242+
ulong seq,
243+
fd_vinyl_comp_t * dst ) {
244+
fd_vinyl_comp_t const * src = fd_vinyl_cq_comp_const( cq ) + fd_vinyl_cq_comp_idx( seq, cq->comp_cnt );
245+
246+
FD_COMPILER_MFENCE();
247+
ulong seq0 = src->seq;
248+
FD_COMPILER_MFENCE();
249+
*dst = *src;
250+
FD_COMPILER_MFENCE();
251+
ulong seq1 = src->seq;
252+
FD_COMPILER_MFENCE();
253+
254+
long diff0 = (long)(seq-seq0);
255+
long diff1 = (long)(seq-seq1);
256+
return fd_long_if( !diff0, diff1, diff0 );
257+
}
258+
259+
FD_PROTOTYPES_END
260+
261+
#endif /* HEADER_fd_src_vinyl_cq_fd_vinyl_cq_h */

0 commit comments

Comments
 (0)