xref: /dragonfly/sys/sys/dmsg.h (revision a988b43e78629a379190205ccd368b35bf4fb239)
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef _SYS_DMSG_H_
36 #define _SYS_DMSG_H_
37 
38 #ifndef _SYS_TYPES_H_
39 #include <sys/types.h>
40 #endif
41 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
42 #ifndef _SYS_TREE_H_
43 #include <sys/tree.h>
44 #endif
45 #ifndef _SYS_THREAD_H_
46 #include <sys/thread.h>
47 #endif
48 #endif
49 #ifndef _SYS_UUID_H_
50 #include <sys/uuid.h>
51 #endif
52 
53 /*
54  * Mesh network protocol structures.
55  *
56  *                                      CONN PROTOCOL
57  *
58  * The mesh is constructed via point-to-point streaming links with varying
59  * levels of interconnectedness, forming a graph.  Leafs of the graph are
60  * typically kernel devices (xdisk) or VFSs (HAMMER2).  Internal nodes are
61  * usually (user level) hammer2 service demons.
62  *
63  * Upon connecting and after authentication, a LNK_CONN transaction is opened
64  * to configure the link.  The SPAN protocol is then typically run over the
65  * open LNK_CONN transaction.
66  *
67  * Terminating the LNK_CONN transaction terminates everything running over it
68  * (typically open LNK_SPAN transactions), which in turn terminates everything
69  * running over the LNK_SPANs.
70  *
71  *                                      SPAN PROTOCOL
72  *
73  * The SPAN protocol runs over an open LNK_CONN transaction and is used to
74  * advertise any number of services.  For example, each PFS under a HAMMER2
75  * mount will be advertised as an open LNK_SPAN transaction.
76  *
77  * Any network node on the graph running multiple connections is capable
78  * of relaying LNK_SPANs from any connection to any other connection.  This
79  * is typically done by the user-level hammer2 service demon, and typically
80  * not done by kernel devices or VFSs (though these entities must be able
81  * to manage multiple LNK_SPANs since they might advertise or need to talk
82  * to multiple services).
83  *
84  * Relaying is not necessarily trivial as it requires internal nodes to
85  * track two open transactions (on the two iocom interfaces) and translate
86  * the msgid and circuit.  In addition, the relay may have to track multiple
87  * SPANs from the same iocom or from multiple iocoms which represent the same
88  * end-point and must select the best end-point, must send notifications when
89  * a better path is available, and must allow (when connectivity is still
90  * present) any existing, open, stacked sub-transactions to complete before
91  * terminating the less efficient SPAN.
92  *
93  * Relaying is optional.  It is perfectly acceptable for the hammer2 service
94  * to plug a received socket descriptor directly into the appropriate kernel
95  * device driver.
96  *
97  *                                   STACKED TRANSACTIONS
98  *
99  * Message transactions can be stacked.  That is, you can initiate a DMSG
100  * transaction relative to another open transaction.  Sub-transactions can
101  * be initiated without waiting for the parent transaction to complete its
102  * handshake.
103  *
104  * This is done by entering the open transaction's msgid as the circuit field
105  * in the new transaction (typically by populating msg->parent).  The
106  * transaction tracking structure will be referenced and will track the
107  * sub-transaction.  Note that msgids must still be unique on an
108  * iocom-by-iocom basis.
109  *
110  * Messages can race closing circuits.  When a circuit is lost,
111  * messages are simulated to delete any sub-transactions.
112  *
113  *                                MESSAGE TRANSACTIONAL STATES
114  *
115  * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and
116  * CREPLY flags.  Message state is typically recorded at the end points and
117  * will be maintained (preventing reuse of the transaction id) until a DELETE
118  * is both sent and received.
119  *
120  * One-way messages such as those used for debug commands are not recorded
121  * and do not require any transactional state.  These are sent without
122  * the CREATE, DELETE, or ABORT flags set.  ABORT is not supported for
123  * one-off messages.  The REPLY bit can be used to distinguish between
124  * command and status if desired.
125  *
126  * Transactional messages are messages which require a reply to be
127  * returned.  These messages can also consist of multiple message elements
128  * for the command or reply or both (or neither).  The command message
129  * sequence sets CREATE on the first message and DELETE on the last message.
130  * A single message command sets both (CREATE|DELETE).  The reply message
131  * sequence works the same way but of course also sets the REPLY bit.
132  *
133  * Transactional messages can be aborted by sending a message element
134  * with the ABORT flag set.  This flag can be combined with either or both
135  * the CREATE and DELETE flags.  When combined with the CREATE flag the
136  * command is treated as non-blocking but still executes.  When combined
137  * with the DELETE flag no additional message elements are required.
138  *
139  * Transactions are terminated by sending a message with DELETE set.
140  * Transactions must be CREATEd and DELETEd in both directions.  If a
141  * transaction is governing stacked sub-transactions the sub-transactions
142  * are automatically terminated before the governing transaction is terminated.
143  * Terminates are handled by simulating a received DELETE and expecting the
144  * normal function callback and state machine to (ultimately) issue a
145  * terminating (DELETE) response.
146  *
147  * Transactions can operate in full-duplex as both sides are fully open
148  * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone).
149  * Additional commands can be initiated from either side of the transaction.
150  *
151  * ABORT SPECIAL CASE - Mid-stream aborts.  A mid-stream abort can be sent
152  * when supported by the sender by sending an ABORT message with neither
153  * CREATE nor DELETE set.  This effectively turns the message into a
154  * non-blocking message (but depending on what is being represented can also
155  * cut short prior data elements in the stream).
156  *
157  * ABORT SPECIAL CASE - Abort-after-DELETE.  Transactional messages have to be
158  * abortable if the stream/pipe/whatever is lost.  In this situation any
159  * forwarding relay needs to unconditionally abort commands and replies that
160  * are still active.  This is done by sending an ABORT|DELETE even in
161  * situations where a DELETE has already been sent in that direction.  This
162  * is done, for example, when links are in a half-closed state.  In this
163  * situation it is possible for the abort request to race a transition to the
164  * fully closed state.  ABORT|DELETE messages which race the fully closed
165  * state are expected to be discarded by the other end.
166  *
167  * --
168  *
169  * All base and extended message headers are 64-byte aligned, and all
170  * transports must support extended message headers up to DMSG_HDR_MAX.
171  * Currently we allow extended message headers up to 2048 bytes.  Note
172  * that the extended header size is encoded in the 'cmd' field of the header.
173  *
174  * Any in-band data is padded to a 64-byte alignment and placed directly
175  * after the extended header (after the higher-level cmd/rep structure).
176  * The actual unaligned size of the in-band data is encoded in the aux_bytes
177  * field in this case.  Maximum data sizes are negotiated during registration.
178  *
179  * Auxiliary data can be in-band or out-of-band.  In-band data sets aux_descr
180  * equal to 0.  Any out-of-band data must be negotiated by the SPAN protocol.
181  *
182  * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
183  * aligned.  The aux_bytes field contains the actual byte-granular length
184  * and not the aligned length.  The crc is against the aligned length (so
185  * a faster crc algorithm can be used, theoretically).
186  *
187  * hdr_crc is calculated over the entire, ALIGNED extended header.  For
188  * the purposes of calculating the crc, the hdr_crc field is 0.  That is,
189  * if calculating the crc in HW a 32-bit '0' must be inserted in place of
190  * the hdr_crc field when reading the entire header and compared at the
191  * end (but the actual hdr_crc must be left intact in memory).  A simple
192  * counter to replace the field going into the CRC generator does the job
193  * in HW.  The CRC endian is based on the magic number field and may have
194  * to be byte-swapped, too (which is also easy to do in HW).
195  *
196  * aux_crc is calculated over the entire, ALIGNED auxiliary data.
197  *
198  *                            SHARED MEMORY IMPLEMENTATIONS
199  *
200  * Shared-memory implementations typically use a pipe to transmit the extended
201  * message header and shared memory to store any auxiliary data.  Auxiliary
202  * data in one-way (non-transactional) messages is typically required to be
203  * inline.  CRCs are still recommended and required at the beginning, but
204  * may be negotiated away later.
205  */
206 
207 #define DMSG_TERMINATE_STRING(ary)      \
208           do { (ary)[sizeof(ary) - 1] = 0; } while (0) /* zeroes the last byte; ary must be a real array since sizeof(ary) is used */
209 
210 /*
211  * dmsg_hdr must be exactly 64 bytes (leading hex in each comment = byte offset)
212  */
213 struct dmsg_hdr {
214           uint16_t  magic;              /* 00 sanity, synchro, endian */
215           uint16_t  reserved02;         /* 02 reserved */
216           uint32_t  salt;               /* 04 random salt helps w/crypto */
217 
218           uint64_t  msgid;              /* 08 message transaction id */
219           uint64_t  circuit;  /* 10 circuit id or 0         */
220           uint64_t  link_verifier;      /* 18 link verifier */
221 
222           uint32_t  cmd;                /* 20 flags | cmd | hdr_size / ALIGN */
223           uint32_t  aux_crc;  /* 24 auxiliary data crc */
224           uint32_t  aux_bytes;          /* 28 auxiliary data length (bytes) */
225           uint32_t  error;              /* 2C error code or 0 */
226           uint64_t  aux_descr;          /* 30 negotiated OOB data descr */
227           uint32_t  reserved38;         /* 38 reserved */
228           uint32_t  hdr_crc;  /* 3C (aligned) extended header crc */
229 };
230 
231 typedef struct dmsg_hdr dmsg_hdr_t;
232 
233 #define DMSG_HDR_MAGIC                  0x4832
234 #define DMSG_HDR_MAGIC_REV    0x3248    /* DMSG_HDR_MAGIC with its two bytes swapped */
235 #define DMSG_HDR_CRCOFF                 offsetof(dmsg_hdr_t, salt)    /* byte offset of salt (skips magic, reserved02) */
236 #define DMSG_HDR_CRCBYTES     (sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)  /* bytes from salt to end of base header */
237 
238 /*
239  * Administrative protocol limits.
240  *
241  * NOTE: A dmsg header must completely fit in the (fifo) buffer, but
242  *         dmsg aux data does not have to completely fit.  The dmsg
243  *         structure allows headers up to 255*64 = 16320 bytes.  There
244  *         is no real limit on the aux_data other than what we deem
245  *         reasonable and defensible (i.e. not run processes or the
246  *         kernel out of memory).  But it should be able to handle at
247  *         least MAXPHYS bytes which is typically 128KB or 256KB.
248  */
249 #define DMSG_HDR_MAX                    2048                /* <= 8192 */
250 #define DMSG_AUX_MAX                    (1024*1024)         /* <= 1MB */
251 #define DMSG_BUF_SIZE                   (DMSG_HDR_MAX * 4)  /* 8192, a power of 2 */
252 #define DMSG_BUF_MASK                   (DMSG_BUF_SIZE - 1) /* usable as a bit mask only while BUF_SIZE is 2^n */
253 
254 /*
255  * The message (cmd) field also encodes various flags and the total size
256  * of the message header.  This allows the protocol processors to validate
257  * persistency and structural settings for every command simply by
258  * switch()ing on the (cmd) field.
259  */
260 #define DMSGF_CREATE                    0x80000000U         /* msg start */
261 #define DMSGF_DELETE                    0x40000000U         /* msg end */
262 #define DMSGF_REPLY           0x20000000U         /* reply path */
263 #define DMSGF_ABORT           0x10000000U         /* abort req */
264 #define DMSGF_REVTRANS                  0x08000000U         /* opposite direction msgid */
265 #define DMSGF_REVCIRC                   0x04000000U         /* opposite direction circuit */
266 #define DMSGF_FLAG1           0x02000000U
267 #define DMSGF_FLAG0           0x01000000U
268 
269 #define DMSGF_FLAGS           0xFF000000U         /* all flags */
270 #define DMSGF_PROTOS                    0x00F00000U         /* all protos */
271 #define DMSGF_CMDS            0x000FFF00U         /* all cmds */
272 #define DMSGF_SIZE            0x000000FFU         /* hdr size, N*DMSG_ALIGN (N*64) */
273 
274 /*
275  * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command
276  *     expects some sort of acknowledgement.  Allows protocol mismatches to
277  *     be detected.
278  */
279 #define DMSGF_CMDF_EXPECT_ACK 0x00080000U         /* in-line cmd expects ack; top bit of DMSGF_CMDS */
280 
281 #define DMSGF_CMDSWMASK                 (DMSGF_CMDS |       \
282                                                    DMSGF_SIZE |       \
283                                                    DMSGF_PROTOS |     \
284                                                    DMSGF_REPLY) /* cmd, size, proto + REPLY bit */
285 
286 #define DMSGF_BASECMDMASK     (DMSGF_CMDS |       \
287                                                    DMSGF_SIZE |       \
288                                                    DMSGF_PROTOS) /* cmd, size, proto only (no flags) */
289 
290 #define DMSGF_TRANSMASK                 (DMSGF_CMDS |       \
291                                                    DMSGF_SIZE |       \
292                                                    DMSGF_PROTOS |     \
293                                                    DMSGF_REPLY |      \
294                                                    DMSGF_CREATE |     \
295                                                    DMSGF_DELETE) /* DMSGF_CMDSWMASK plus CREATE and DELETE */
296 
297 #define DMSGF_BASEFLAGS                 (DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY)
298 
299 #define DMSG_PROTO_LNK                  0x00000000U /* link layer ops; values occupy the DMSGF_PROTOS field */
300 #define DMSG_PROTO_DBG                  0x00100000U /* debug layer ops */
301 #define DMSG_PROTO_HM2                  0x00200000U /* hammer2 layer ops */
302 #define DMSG_PROTO_XX3                  0x00300000U
303 #define DMSG_PROTO_XX4                  0x00400000U
304 #define DMSG_PROTO_BLK                  0x00500000U /* BLK (device open/read/write) protocol */
305 #define DMSG_PROTO_VOP                  0x00600000U
306 
307 /*
308  * Message command constructors, sans flags: proto | (cmd << 8) | header size in DMSG_ALIGN units
309  */
310 #define DMSG_ALIGN            64        /* alignment granule in bytes; must stay a power of 2 for the mask below */
311 #define DMSG_ALIGNMASK                  (DMSG_ALIGN - 1)
312 #define DMSG_DOALIGN(bytes)   (((bytes) + DMSG_ALIGNMASK) &           \
313                                          ~DMSG_ALIGNMASK) /* rounds bytes up to a multiple of DMSG_ALIGN */
314 
315 #define DMSG_HDR_ENCODE(elm)  (((uint32_t)sizeof(struct elm) +        \
316                                           DMSG_ALIGNMASK) /                     \
317                                          DMSG_ALIGN) /* sizeof(struct elm) in DMSG_ALIGN units, rounded up */
318 
319 #define DMSG_LNK(cmd, elm)    (DMSG_PROTO_LNK |                       \
320                                                    ((cmd) << 8) |               \
321                                                    DMSG_HDR_ENCODE(elm))
322 
323 #define DMSG_DBG(cmd, elm)    (DMSG_PROTO_DBG |                       \
324                                                    ((cmd) << 8) |               \
325                                                    DMSG_HDR_ENCODE(elm))
326 
327 #define DMSG_HM2(cmd, elm)    (DMSG_PROTO_HM2 |                       \
328                                                    ((cmd) << 8) |               \
329                                                    DMSG_HDR_ENCODE(elm))
330 
331 #define DMSG_BLK(cmd, elm)    (DMSG_PROTO_BLK |                       \
332                                                    ((cmd) << 8) |               \
333                                                    DMSG_HDR_ENCODE(elm))
334 
335 #define DMSG_VOP(cmd, elm)    (DMSG_PROTO_VOP |                       \
336                                                    ((cmd) << 8) |               \
337                                                    DMSG_HDR_ENCODE(elm))
338 
339 /*
340  * Link layer ops basically talk to just the other side of a direct
341  * connection.
342  *
343  * LNK_PAD          - One-way message on circuit 0, ignored by target.  Used to
344  *                    pad message buffers on shared-memory transports.  Not
345  *                    typically used with TCP.
346  *
347  * LNK_PING         - One-way message on circuit-0, keep-alive, run by both sides
348  *                    typically 1/sec on idle link, link is lost after 10 seconds
349  *                    of inactivity.
350  *
351  * LNK_AUTH         - Authenticate the connection, negotiate administrative
352  *                    rights & encryption, protocol class, etc.  Only PAD and
353  *                    AUTH messages (not even PING) are accepted until
354  *                    authentication is complete.  This message also identifies
355  *                    the host.
356  *
357  * LNK_CONN         - Enable the SPAN protocol on circuit-0, possibly also
358  *                    installing a PFS filter (by cluster id, unique id, and/or
359  *                    wildcarded name).
360  *
361  * LNK_SPAN         - A SPAN transaction typically on iocom->state0 enables
362  *                    messages to be relayed to/from a particular cluster node.
363  *                    SPANs are received, sorted, aggregated, filtered, and
364  *                    retransmitted back out across all applicable connections.
365  *
366  *                    The leaf protocol also uses this to make a PFS available
367  *                    to the cluster (e.g. on-mount).
368  */
369 #define DMSG_LNK_PAD                    DMSG_LNK(0x000, dmsg_hdr) /* sized as the base header only */
370 #define DMSG_LNK_PING                   DMSG_LNK(0x001, dmsg_hdr) /* sized as the base header only */
371 #define DMSG_LNK_AUTH                   DMSG_LNK(0x010, dmsg_lnk_auth)
372 #define DMSG_LNK_CONN                   DMSG_LNK(0x011, dmsg_lnk_conn)
373 #define DMSG_LNK_SPAN                   DMSG_LNK(0x012, dmsg_lnk_span)
374 #define DMSG_LNK_ERROR                  DMSG_LNK(0xFFF, dmsg_hdr) /* simple result/error; sized as the base header only */
375 
376 /*
377  * Reserved command codes for third party subsystems.  Structure size is
378  * not known here so do not try to construct the full DMSG_LNK_ define.
379  */
380 #define DMSG_LNK_CMD_HAMMER2_VOLCONF    0x20      /* bare cmd code, not a complete DMSG_LNK() value */
381 
382 #define DMSG_LABEL_SIZE                 128       /* fixed at 128, do not change; sizes the label arrays in LNK_CONN and LNK_SPAN */
383 
384 /*
385  * LNK_AUTH - Authentication (often omitted)
386  */
387 struct dmsg_lnk_auth {
388           dmsg_hdr_t          head;     /* base message header */
389           char                dummy[64]; /* placeholder; no authentication fields are defined here */
390 };
391 
392 /*
393  * LNK_CONN - Register connection info for SPAN protocol
394  *              (transaction, left open, iocom->state0 only).
395  *
396  * LNK_CONN identifies a streaming connection into the cluster.
397  *
398  * peer_mask serves to filter the SPANs we receive by peer_type.  A cluster
399  * controller typically sets this to (uint64_t)-1, indicating that it wants
400  * everything.  A block devfs interface might set it to 1 << DMSG_PEER_BLOCK,
401  * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
402  *
403  * media_id allows multiple (e.g. HAMMER2) connections belonging to the same
404  * media to transmit duplicative LNK_VOLCONF updates without causing confusion
405  * in the cluster controller.
406  *
407  * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
408  * left empty (zero-fill) if not supported by a particular peer.  (XXX the pfs_* names match no field below; stale?)
409  */
410 struct dmsg_lnk_conn {
411           dmsg_hdr_t          head;
412           uuid_t              media_id; /* media configuration id */
413           uuid_t              peer_id;  /* unique peer uuid */
414           uuid_t              reserved01;
415           uint64_t  peer_mask;          /* PEER mask for SPAN filtering */
416           uint8_t             peer_type;          /* see DMSG_PEER_xxx */
417           uint8_t             reserved02;
418           uint16_t  proto_version;      /* high level protocol support */
419           uint32_t  status;             /* status flags */
420           uint32_t  rnss;               /* node's generated rnss */
421           uint8_t             reserved03[8];
422           uint32_t  reserved04[14];
423           char                peer_label[DMSG_LABEL_SIZE]; /* peer identity string */
424 };
425 
426 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
427 
428 /*
429  * PEER types 0-63 are defined here.  There is a limit of 64 types due to
430  * the width of peer_mask.
431  *
432  * PFS types depend on the peer type.  sys/dmsg.h only defines the default.
433  * Peer-specific headers define PFS types for any given peer.
434  */
435 #define DMSG_PEER_NONE                            0
436 #define DMSG_PEER_ROUTER                1         /* server: cluster controller */
437 #define DMSG_PEER_BLOCK                           2         /* server: block devices */
438 #define DMSG_PEER_HAMMER2               3         /* server: h2 mounted volume */
439 #define DMSG_PEER_CLIENT                63        /* a client connection */
440 #define DMSG_PEER_MAX                             64        /* number of peer types; bounded by the 64-bit peer_mask */
441 
442 #define DMSG_PFSTYPE_DEFAULT            0
443 #define DMSG_PFSTYPE_MASK               0x0F
444 
445 /*
446  * Structures embedded in LNK_SPAN (members of the media union in dmsg_lnk_span)
447  */
448 struct dmsg_media_block {
449           uint64_t  bytes;              /* media size in bytes */
450           uint32_t  blksize;  /* media block size */
451           uint32_t  reserved01;         /* brings the struct to 16 bytes */
452 };
453 
454 typedef struct dmsg_media_block dmsg_media_block_t;
455 
456 /*
457  * LNK_SPAN - Initiate or relay a SPAN
458  *              (transaction, left open, typically only on iocom->state0)
459  *
460  * This message registers an end-point with the other end of the connection,
461  * telling the other end who we are and what we can provide or intend to
462  * consume.  Multiple registrations can be maintained as open transactions
463  * with each one specifying a unique end-point.
464  *
465  * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
466  * as open transactions.  Registrations are also received and maintained as
467  * open transactions, creating a matrix of linkid's.
468  *
469  * While these transactions are open additional transactions can be executed
470  * between any two linkid's {source}=S (registrations we sent) to {target}=T
471  * (registrations we received).
472  *
473  * Closure of any registration transaction will automatically abort any open
474  * transactions using the related linkids.  Closure can be initiated
475  * voluntarily from either side with either end issuing a DELETE, or they
476  * can be ABORTed.
477  *
478  * Status updates are performed via the open transaction.
479  *
480  * --
481  *
482  * A registration identifies a node and its various PFS parameters including
483  * the PFS_TYPE.  For example, a diskless HAMMER2 client typically identifies
484  * itself as PFSTYPE_CLIENT.
485  *
486  * Any node may serve as a cluster controller, aggregating and passing
487  * on received registrations, but end-points do not have to implement this
488  * ability.  Most end-points typically implement a single client-style or
489  * server-style PFS_TYPE and rendezvous at a cluster controller.
490  *
491  * The cluster controller does not aggregate/pass-on all received
492  * registrations.  It typically filters what gets passed on based on what it
493  * receives, passing on only the best candidates.
494  *
495  * If a symmetric spanning tree is desired additional candidates whose
496  * {dist, rnss} fields match the last best candidate must also be propagated.
497  * This feature is not currently enabled.
498  *
499  * STATUS UPDATES: Status updates use the same structure but typically
500  *                     only contain incremental changes to e.g. pfs_type, with
501  *                     a text description sent as out-of-band data.
502  */
503 struct dmsg_lnk_span {
504           dmsg_hdr_t          head;
505           uuid_t              peer_id;
506           uuid_t              pfs_id;             /* unique pfs id */
507           uint8_t             pfs_type; /* PFS type */
508           uint8_t             peer_type;          /* PEER type */
509           uint16_t  proto_version;      /* high level protocol support */
510           uint32_t  status;             /* status flags */
511           uint8_t             reserved02[8];
512           uint32_t  dist;               /* span distance */
513           uint32_t  rnss;               /* random number sub-sort */
514           union {
515                     uint32_t  reserved03[14]; /* 56 bytes; larger than block, so it sets the union size */
516                     dmsg_media_block_t block;
517           } media;
518 
519           /*
520            * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
521            *         is the superroot directory name.
522            *         (cl_label/fs_label presumably mean peer_label/pfs_label - TODO confirm)
523            *         for PEER_BLOCK cl_label is typically host/device and
524            *         fs_label is typically the serial number string.
525            */
526           char                peer_label[DMSG_LABEL_SIZE];  /* peer label */
527           char                pfs_label[DMSG_LABEL_SIZE];   /* PFS label */
528 };
529 
530 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
531 
532 #define DMSG_SPAN_PROTO_1     1         /* presumably a proto_version value - TODO confirm */
533 
534 /*
535  * Debug layer ops operate on any link
536  *
537  * SHELL  - Persist stream, access the debug shell on the target
538  *                    registration.  Multiple shells can be operational.
539  */
540 #define DMSG_DBG_SHELL                  DMSG_DBG(0x001, dmsg_dbg_shell)
541 
542 struct dmsg_dbg_shell {
543           dmsg_hdr_t          head;     /* base header only; no extended fields */
544 };
545 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
546 
547 /*
548  * Hammer2 layer ops (low-level chain manipulation used by cluster code)
549  *
550  * HM2_OPENPFS      - Attach a PFS
551  * HM2_FLUSHPFS - Flush a PFS
552  *
553  * HM2_LOOKUP       - Lookup chain (parent-relative transaction)
554  *                    (can request multiple chains)
555  * HM2_NEXT         - Lookup next chain (parent-relative transaction)
556  *                    (can request multiple chains)
557  * HM2_LOCK         - [Re]lock a chain (chain-relative) (non-recursive)
558  * HM2_UNLOCK       - Unlock a chain (chain-relative) (non-recursive)
559  * HM2_RESIZE       - Resize a chain (chain-relative)
560  * HM2_MODIFY       - Modify a chain (chain-relative)
561  * HM2_CREATE       - Create a chain (parent-relative)
562  * HM2_DUPLICATE- Duplicate a chain (target-parent-relative)
563  * HM2_DELDUP       - Delete-Duplicate a chain (chain-relative)
564  * HM2_DELETE       - Delete a chain (chain-relative)
565  * HM2_SNAPSHOT     - Create a snapshot (snapshot-root-relative, w/clid override)
566  */
567 #define DMSG_HM2_OPENPFS      DMSG_HM2(0x001, dmsg_hm2_openpfs) /* NOTE(review): struct dmsg_hm2_openpfs is not declared in the visible part of this header */
568 
569 /*
570  * DMSG_PROTO_BLK Protocol
571  *
572  * BLK_OPEN         - Open device.  This transaction must be left open for the
573  *                    duration and the returned keyid passed in all associated
574  *                    BLK commands.  Multiple OPENs can be issued within the
575  *                    transaction.
576  *
577  * BLK_CLOSE        - Close device.  This can be used to close one of the opens
578  *                    within a BLK_OPEN transaction.  It may NOT initiate a
579  *                    transaction.  Note that a termination of the transaction
580  *                    (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
581  *                    for that transaction.  XXX not well defined atm.
582  *
583  * BLK_READ         - Strategy read.  Not typically streaming.
584  *
585  * BLK_WRITE        - Strategy write.  Not typically streaming.
586  *
587  * BLK_FLUSH        - Strategy flush.  Not typically streaming.
588  *
589  * BLK_FREEBLKS     - Strategy freeblks.  Not typically streaming.
590  */
591 #define DMSG_BLK_OPEN                   DMSG_BLK(0x001, dmsg_blk_open)
592 #define DMSG_BLK_CLOSE                  DMSG_BLK(0x002, dmsg_blk_open) /* sized with struct dmsg_blk_open, same as BLK_OPEN */
593 #define DMSG_BLK_READ                   DMSG_BLK(0x003, dmsg_blk_read)
594 #define DMSG_BLK_WRITE                  DMSG_BLK(0x004, dmsg_blk_write)
595 #define DMSG_BLK_FLUSH                  DMSG_BLK(0x005, dmsg_blk_flush)
596 #define DMSG_BLK_FREEBLKS     DMSG_BLK(0x006, dmsg_blk_freeblks)
597 #define DMSG_BLK_ERROR                  DMSG_BLK(0xFFF, dmsg_blk_error) /* extended result, see struct dmsg_blk_error */
598 
599 struct dmsg_blk_open {
600           dmsg_hdr_t          head;
601           uint32_t  modes;              /* presumably DMSG_BLKOPEN_xxx bits - TODO confirm */
602           uint32_t  reserved01;
603 };
604 
605 #define DMSG_BLKOPEN_RD                 0x0001
606 #define DMSG_BLKOPEN_WR                 0x0002
607 
608 /*
609  * DMSG_LNK_ERROR is returned for simple results,
610  * DMSG_BLK_ERROR is returned for extended results.
611  */
612 struct dmsg_blk_error {
613           dmsg_hdr_t          head;
614           uint64_t  keyid;              /* presumably the keyid returned by BLK_OPEN - TODO confirm */
615           uint32_t  resid;              /* presumably a residual byte count - TODO confirm */
616           uint32_t  reserved02;
617           char                buf[64];
618 };
619 
620 struct dmsg_blk_read {
621           dmsg_hdr_t          head;
622           uint64_t  keyid;              /* presumably the keyid returned by BLK_OPEN - TODO confirm */
623           uint64_t  offset;
624           uint32_t  bytes;
625           uint32_t  flags;
626           uint32_t  reserved01;
627           uint32_t  reserved02;
628 };
629 
630 struct dmsg_blk_write {
631           dmsg_hdr_t          head;
632           uint64_t  keyid;              /* presumably the keyid returned by BLK_OPEN - TODO confirm */
633           uint64_t  offset;
634           uint32_t  bytes;
635           uint32_t  flags;
636           uint32_t  reserved01;
637           uint32_t  reserved02;
638 };
639 
640 struct dmsg_blk_flush {
641           dmsg_hdr_t          head;
642           uint64_t  keyid;              /* presumably the keyid returned by BLK_OPEN - TODO confirm */
643           uint64_t  offset;
644           uint32_t  bytes;
645           uint32_t  flags;
646           uint32_t  reserved01;
647           uint32_t  reserved02;
648 };
649 
650 struct dmsg_blk_freeblks {
651           dmsg_hdr_t          head;
652           uint64_t  keyid;              /* presumably the keyid returned by BLK_OPEN - TODO confirm */
653           uint64_t  offset;
654           uint32_t  bytes;
655           uint32_t  flags;
656           uint32_t  reserved01;
657           uint32_t  reserved02;
658 };
659 
/* _t aliases for the BLK message structures, in declaration order. */
typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
666 
/*
 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message error codes, partitioned by range:
 *
 *	0x00 - 0x1F	Local iocom errors
 *	0x20 - 0x2F	Global errors
 *
 * The codes defined below all fall in the global range.
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */
680 
681 union dmsg_any {
682           char                          buf[DMSG_HDR_MAX];
683           dmsg_hdr_t                    head;
684 
685           dmsg_lnk_conn_t               lnk_conn;
686           dmsg_lnk_span_t               lnk_span;
687 
688           dmsg_blk_open_t               blk_open;
689           dmsg_blk_error_t    blk_error;
690           dmsg_blk_read_t               blk_read;
691           dmsg_blk_write_t    blk_write;
692           dmsg_blk_flush_t    blk_flush;
693           dmsg_blk_freeblks_t blk_freeblks;
694 };
695 
696 typedef union dmsg_any dmsg_any_t;
697 
698 /*
699  * Kernel iocom structures and prototypes for kern/kern_dmsg.c
700  */
701 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
702 
/*
 * Forward declarations so these tags can be named before (or without)
 * their full definitions.
 */
struct hammer2_mount;
struct xa_softc;

struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;
struct kdmsg_data;
709 
/*
 * Bits for msg_ctl (manipulated atomically).
 */
#define KDMSG_CLUSTERCTL_UNUSED01	0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002	/* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004	/* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008	/* interlocked w/msglk */
717 
718 /*
719  * Transactional state structure, representing an open transaction.  The
720  * transaction might represent a cache state (and thus have a chain
721  * association), or a VOP op, LNK_SPAN, or other things.
722  *
723  * NOTE: A non-empty subq represents one ref.
724  *         If we are inserted on a parent's subq, that's one ref (SUBINSERTED).
725  *         If we are inserted on a RB tree, that's one ref (RBINSERTED).
726  *         msg->state represents a ref.
727  *         Other code references may hold refs.
728  *
729  * NOTE: The parent association stays intact as long as a state has a
730  *         non-empty subq.  Otherwise simulated failures might not be able
731  *         to reach the children.
732  */
733 TAILQ_HEAD(kdmsg_state_list, kdmsg_state);
734 
735 struct kdmsg_state {
736           RB_ENTRY(kdmsg_state) rbnode;           /* indexed by msgid */
737           struct kdmsg_state  *scan;              /* scan check */
738           struct kdmsg_state_list       subq;               /* active stacked states */
739           TAILQ_ENTRY(kdmsg_state) entry;                   /* on parent subq */
740           TAILQ_ENTRY(kdmsg_state) user_entry;    /* available to devices */
741           struct kdmsg_iocom *iocom;
742           struct kdmsg_state *parent;
743           int                 refs;                         /* refs */
744           uint32_t  icmd;                         /* record cmd creating state */
745           uint32_t  txcmd;                        /* mostly for CMDF flags */
746           uint32_t  rxcmd;                        /* mostly for CMDF flags */
747           uint64_t  msgid;                        /* {parent,msgid} uniq */
748           int                 flags;
749           int                 error;
750           void                *chain;                       /* (caller's state) */
751           int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
752           union {
753                     void *any;
754                     struct hammer2_mount *hmp;
755                     struct xa_softc *xa_sc;
756           } any;
757 };
758 
/* KDMSG_STATE_* bit values; 0x0100 and 0x0200 are not assigned here. */
#define KDMSG_STATE_SUBINSERTED	0x0001
#define KDMSG_STATE_DYNAMIC	0x0002
#define KDMSG_STATE_UNUSED0004	0x0004
#define KDMSG_STATE_ABORTING	0x0008	/* avoids recursive abort */
#define KDMSG_STATE_OPPOSITE	0x0010	/* opposite direction */
#define KDMSG_STATE_DYING	0x0020	/* atomic recursive circ fail */
#define KDMSG_STATE_INTERLOCK	0x0040
#define KDMSG_STATE_RBINSERTED	0x0080
#define KDMSG_STATE_SIGNAL	0x0400
#define KDMSG_STATE_NEW		0x0800	/* defer abort processing */
769 
770 struct kdmsg_msg {
771           TAILQ_ENTRY(kdmsg_msg) qentry;                    /* serialized queue */
772           struct kdmsg_state *state;
773           size_t              hdr_size;
774           size_t              aux_size;
775           char                *aux_data;
776           uint32_t  flags;
777           uint32_t  tcmd;                         /* outer transaction cmd */
778           dmsg_any_t          any;                          /* variable sized */
779 };
780 
/*
 * Aux-data descriptor: a buffer pointer, its size and the owning iocom.
 * This is the type taken by kdmsg_detach_aux_data() and
 * kdmsg_free_aux_data().
 */
struct kdmsg_data {
	char			*aux_data;
	size_t			aux_size;
	struct kdmsg_iocom	*iocom;
};
786 
#define KDMSG_FLAG_AUXALLOC	0x0001	/* the only KDMSG_FLAG_* value defined at this point */
788 
/*
 * _t aliases for the kdmsg structures.
 *
 * NOTE(review): no definition of struct kdmsg_link is visible near these
 * typedefs; unless it is defined elsewhere, kdmsg_link_t is usable only
 * as an incomplete type.
 */
typedef struct kdmsg_link	kdmsg_link_t;
typedef struct kdmsg_state	kdmsg_state_t;
typedef struct kdmsg_msg	kdmsg_msg_t;
typedef struct kdmsg_data	kdmsg_data_t;
793 
794 struct kdmsg_state_tree;
795 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
796 RB_HEAD(kdmsg_state_tree, kdmsg_state);
797 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
798 
/* Forward declarations; struct kdmsg_iocom holds pointers to both. */
struct file;
struct malloc_type;
801 
802 /*
803  * Structure embedded in e.g. mount, master control structure for
804  * DMSG stream handling.
805  */
806 struct kdmsg_iocom {
807           struct malloc_type  *mmsg;
808           struct file                   *msg_fp;  /* cluster pipe->userland */
809           thread_t            msgrd_td; /* cluster thread */
810           thread_t            msgwr_td; /* cluster thread */
811           int                           msg_ctl;  /* wakeup flags */
812           int                           msg_seq;  /* cluster msg sequence id */
813           uint32_t            flags;
814           struct lock                   msglk;              /* lockmgr lock */
815           TAILQ_HEAD(, kdmsg_msg) msgq;           /* transmit queue */
816           void                          *handle;
817           void                          (*auto_callback)(kdmsg_msg_t *);
818           int                           (*rcvmsg)(kdmsg_msg_t *);
819           void                          (*exit_func)(struct kdmsg_iocom *);
820           struct kdmsg_state  state0;             /* root state for stacking */
821           struct kdmsg_state  *conn_state;        /* active LNK_CONN state */
822           struct kdmsg_state  *freerd_state;      /* allocation cache */
823           struct kdmsg_state  *freewr_state;      /* allocation cache */
824           struct kdmsg_state_tree staterd_tree;   /* active messages */
825           struct kdmsg_state_tree statewr_tree;   /* active messages */
826           dmsg_lnk_conn_t               auto_lnk_conn;
827           dmsg_lnk_span_t               auto_lnk_span;
828 };
829 
830 typedef struct kdmsg_iocom    kdmsg_iocom_t;
831 
/* KDMSG_IOCOMF_* bit values; 0x0004 is not assigned here. */
#define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle RX/TX LNK_CONN */
#define KDMSG_IOCOMF_AUTORXSPAN	0x0002	/* handle RX LNK_SPAN */
#define KDMSG_IOCOMF_AUTOTXSPAN	0x0008	/* handle TX LNK_SPAN */
#define KDMSG_IOCOMF_EXITNOACC	0x8000	/* cannot accept writes */

/* Union of the three AUTO* bits above. */
#define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
				 KDMSG_IOCOMF_AUTORXSPAN |	\
				 KDMSG_IOCOMF_AUTOTXSPAN)
840 
841 #endif    /* _KERNEL || _KERNEL_STRUCTURES */
842 
843 #ifdef _KERNEL
844 
845 /*
846  * kern_dmsg.c
847  */
848 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
849                               struct malloc_type *mmsg,
850                               int (*rcvmsg)(kdmsg_msg_t *msg));
851 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
852                               const char *subsysname);
853 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
854                               void (*conn_callback)(kdmsg_msg_t *msg));
855 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
856 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
857 
858 void kdmsg_msg_free(kdmsg_msg_t *msg);
859 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
860                                         int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
861                                         void *data);
862 void kdmsg_msg_write(kdmsg_msg_t *msg);
863 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
864 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
865 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
866 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
867 void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data);
868 void kdmsg_free_aux_data(kdmsg_data_t *data);
869 
870 #endif    /* _KERNEL */
871 
872 #endif    /* !_SYS_DMSG_H_ */
873