xref: /f-stack/app/redis-5.0.5/src/cluster.h (revision 572c4311)
1 #ifndef __CLUSTER_H
2 #define __CLUSTER_H
3 
4 /*-----------------------------------------------------------------------------
5  * Redis cluster data structures, defines, exported API.
6  *----------------------------------------------------------------------------*/
7 
8 #define CLUSTER_SLOTS 16384   /* Total slots; sizes the slot bitmaps (CLUSTER_SLOTS/8 bytes) and per-slot arrays */
9 #define CLUSTER_OK 0          /* clusterState.state: everything looks ok */
10 #define CLUSTER_FAIL 1        /* clusterState.state: the cluster can't work */
11 #define CLUSTER_NAMELEN 40    /* Node name length in bytes (sha1 hex length) */
12 #define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */
13 
14 /* The following defines are mostly amounts of time, sometimes expressed as
15  * multipliers of the node timeout value (when the name ends with MULT).
 * A few of them (REQUIRE_FULL_COVERAGE, SLAVE_NO_FAILOVER, MIGRATION_BARRIER)
 * are configuration defaults rather than durations. */
16 #define CLUSTER_DEFAULT_NODE_TIMEOUT 15000 /* NOTE(review): presumably milliseconds -- confirm. */
17 #define CLUSTER_DEFAULT_SLAVE_VALIDITY 10 /* Slave max data age factor. */
18 #define CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE 1
19 #define CLUSTER_DEFAULT_SLAVE_NO_FAILOVER 0 /* Failover by default. */
20 #define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
21 #define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
22 #define CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
23 #define CLUSTER_FAILOVER_DELAY 5 /* Seconds */
24 #define CLUSTER_DEFAULT_MIGRATION_BARRIER 1
25 #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
26 #define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
27 #define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration (presumably milliseconds -- confirm). */
28 
29 /* Redirection errors returned by getNodeByQuery().
 * NOTE(review): presumably also the values passed as error_code to
 * clusterRedirectClient() -- confirm in cluster.c. */
30 #define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
31 #define CLUSTER_REDIR_CROSS_SLOT 1    /* -CROSSSLOT request. */
32 #define CLUSTER_REDIR_UNSTABLE 2      /* -TRYAGAIN redirection required. */
33 #define CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
34 #define CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
35 #define CLUSTER_REDIR_DOWN_STATE 5    /* -CLUSTERDOWN, global state. */
36 #define CLUSTER_REDIR_DOWN_UNBOUND 6  /* -CLUSTERDOWN, unbound slot. */
37 
38 struct clusterNode; /* Forward declaration: clusterLink points at a node before clusterNode is defined. */
39 
40 /* clusterLink encapsulates everything needed to talk with a remote node. */
41 typedef struct clusterLink {
42     mstime_t ctime;             /* Link creation time */
43     int fd;                     /* TCP socket file descriptor */
44     sds sndbuf;                 /* Packet send buffer */
45     sds rcvbuf;                 /* Packet reception buffer */
46     struct clusterNode *node;   /* Node related to this link if any, or NULL */
47 } clusterLink;
48 
49 /* Cluster node flags and macros. The flags are single bits, OR-ed together
 * in clusterNode.flags. */
50 #define CLUSTER_NODE_MASTER 1     /* The node is a master */
51 #define CLUSTER_NODE_SLAVE 2      /* The node is a slave */
52 #define CLUSTER_NODE_PFAIL 4      /* Failure? Need acknowledge */
53 #define CLUSTER_NODE_FAIL 8       /* The node is believed to be malfunctioning */
54 #define CLUSTER_NODE_MYSELF 16    /* This node is myself */
55 #define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */
56 #define CLUSTER_NODE_NOADDR   64  /* We don't know the address of this node */
57 #define CLUSTER_NODE_MEET 128     /* Send a MEET message to this node */
58 #define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */
59 #define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
/* All-zero node name: a string literal made only of \000 escapes (expected to
 * be CLUSTER_NAMELEN of them). */
60 #define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
61 
/* Flag-test helpers taking a pointer to a structure with a `flags` member.
 * Each one expands to the masked flags value, so the result is the flag bit
 * itself (non-zero) when set and 0 otherwise -- not a normalized 0/1.
 * nodeHasAddr() is the exception: it negates the mask and yields 0 or 1.
 * The argument is evaluated exactly once per macro. */
62 #define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
63 #define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE)
64 #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE)
65 #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR))
66 #define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR)
67 #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL)
68 #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)
69 #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER)
70 
71 /* Reasons why a slave is not able to failover (values for
 * clusterState.cant_failover_reason). */
72 #define CLUSTER_CANT_FAILOVER_NONE 0
73 #define CLUSTER_CANT_FAILOVER_DATA_AGE 1
74 #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
75 #define CLUSTER_CANT_FAILOVER_EXPIRED 3
76 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
77 #define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (60*5) /* Not a reason code: a period in seconds. NOTE(review): presumably the interval between repeated logs of the reason -- confirm. */
78 
79 /* clusterState todo_before_sleep flags: independent bits that can be OR-ed
 * together; the field is documented as "things to do in clusterBeforeSleep()". */
80 #define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
81 #define CLUSTER_TODO_UPDATE_STATE (1<<1)
82 #define CLUSTER_TODO_SAVE_CONFIG (1<<2)
83 #define CLUSTER_TODO_FSYNC_CONFIG (1<<3)
84 
85 /* Message types (values of clusterMsg.type).
86  *
87  * Note that the PING, PONG and MEET messages are actually the same exact
88  * kind of packet. PONG is the reply to ping, in the exact format as a PING,
89  * while MEET is a special PING that forces the receiver to add the sender
90  * as a node (if it is not already in the list). */
91 #define CLUSTERMSG_TYPE_PING 0          /* Ping */
92 #define CLUSTERMSG_TYPE_PONG 1          /* Pong (reply to Ping) */
93 #define CLUSTERMSG_TYPE_MEET 2          /* Meet "let's join" message */
94 #define CLUSTERMSG_TYPE_FAIL 3          /* Mark node xxx as failing */
95 #define CLUSTERMSG_TYPE_PUBLISH 4       /* Pub/Sub Publish propagation */
96 #define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
97 #define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6     /* Yes, you have my vote */
98 #define CLUSTERMSG_TYPE_UPDATE 7        /* Another node slots configuration */
99 #define CLUSTERMSG_TYPE_MFSTART 8       /* Pause clients for manual failover */
100 #define CLUSTERMSG_TYPE_MODULE 9        /* Module cluster API message. */
101 #define CLUSTERMSG_TYPE_COUNT 10        /* Total number of message types; sizes the clusterState stats_bus_messages_* arrays, so keep it last. */
102 
103 /* Flags that a module can set in order to prevent certain Redis Cluster
104  * features from being enabled. Useful when implementing a different distributed
105  * system on top of the Redis Cluster message bus, using modules.
 *
 * Note: bit 0 is not assigned; the defined flags start at bit 1. */
106 #define CLUSTER_MODULE_FLAG_NONE 0
107 #define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1)
108 #define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2)
109 
110 /* This structure represents the elements of node->fail_reports. */
111 typedef struct clusterNodeFailReport {
112     struct clusterNode *node;  /* Node reporting the failure condition. */
113     mstime_t time;             /* Time of the last report from this node. */
114 } clusterNodeFailReport;
115 
/* What this instance knows about one cluster node: identity, role flags,
 * owned slots, replication relationships, timing of pings/failures, last
 * known address, and the link used to talk to it. */
116 typedef struct clusterNode {
117     mstime_t ctime; /* Node object creation time. */
118     char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size. The array
                                   is exactly CLUSTER_NAMELEN bytes, so a
                                   full-length name leaves no room for a NUL
                                   terminator. */
119     int flags;      /* CLUSTER_NODE_... */
120     uint64_t configEpoch; /* Last configEpoch observed for this node */
121     unsigned char slots[CLUSTER_SLOTS/8]; /* Bitmap (one bit per slot) of the slots handled by this node */
122     int numslots;   /* Number of slots handled by this node */
123     int numslaves;  /* Number of slave nodes, if this is a master */
124     struct clusterNode **slaves; /* pointers to slave nodes */
125     struct clusterNode *slaveof; /* pointer to the master node. Note that it
126                                     may be NULL even if the node is a slave
127                                     if we don't have the master node in our
128                                     tables. */
129     mstime_t ping_sent;      /* Unix time we sent latest ping */
130     mstime_t pong_received;  /* Unix time we received the pong */
131     mstime_t fail_time;      /* Unix time when FAIL flag was set */
132     mstime_t voted_time;     /* Last time we voted for a slave of this master */
133     mstime_t repl_offset_time;  /* Unix time we received offset for this node */
134     mstime_t orphaned_time;     /* Starting time of orphaned master condition */
135     long long repl_offset;      /* Last known repl offset for this node. */
136     char ip[NET_IP_STR_LEN];  /* Latest known IP address of this node */
137     int port;                   /* Latest known clients port of this node */
138     int cport;                  /* Latest known cluster port of this node. */
139     clusterLink *link;          /* TCP/IP link with this node */
140     list *fail_reports;         /* List of nodes signaling this as failing
                                   (elements are clusterNodeFailReport) */
141 } clusterNode;
142 
/* Cluster-wide state tracked by this node: the table of known nodes, the
 * per-slot tables, election / manual failover bookkeeping, and cluster bus
 * statistics. */
143 typedef struct clusterState {
144     clusterNode *myself;  /* This node */
145     uint64_t currentEpoch;
146     int state;            /* CLUSTER_OK, CLUSTER_FAIL, ... */
147     int size;             /* Num of master nodes with at least one slot */
148     dict *nodes;          /* Hash table of name -> clusterNode structures */
149     dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
    /* The four arrays below are indexed by slot number (CLUSTER_SLOTS entries).
     * NOTE(review): the per-array meanings are inferred from the field names
     * (migration target, import source, slot owner, keys in slot) -- confirm
     * against cluster.c. */
150     clusterNode *migrating_slots_to[CLUSTER_SLOTS];
151     clusterNode *importing_slots_from[CLUSTER_SLOTS];
152     clusterNode *slots[CLUSTER_SLOTS];
153     uint64_t slots_keys_count[CLUSTER_SLOTS];
154     rax *slots_to_keys;
155     /* The following fields are used to track the slave state on elections. */
156     mstime_t failover_auth_time; /* Time of previous or next election. */
157     int failover_auth_count;    /* Number of votes received so far. */
158     int failover_auth_sent;     /* True if we already asked for votes. */
159     int failover_auth_rank;     /* This slave rank for current auth request. */
160     uint64_t failover_auth_epoch; /* Epoch of the current election. */
161     int cant_failover_reason;   /* Why a slave is currently not able to
162                                    failover. See the CANT_FAILOVER_* macros. */
163     /* Manual failover state in common. */
164     mstime_t mf_end;            /* Manual failover time limit (ms unixtime).
165                                    It is zero if there is no MF in progress. */
166     /* Manual failover state of master. */
167     clusterNode *mf_slave;      /* Slave performing the manual failover. */
168     /* Manual failover state of slave. */
169     long long mf_master_offset; /* Master offset the slave needs to start MF
170                                    or zero if still not received. */
171     int mf_can_start;           /* If non-zero signal that the manual failover
172                                    can start requesting masters vote. */
173     /* The following fields are used by masters to track state on elections. */
174     uint64_t lastVoteEpoch;     /* Epoch of the last vote granted. */
175     int todo_before_sleep; /* Things to do in clusterBeforeSleep(); CLUSTER_TODO_* flags. */
176     /* Messages received and sent by type (indexed by CLUSTERMSG_TYPE_*). */
177     long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
178     long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
179     long long stats_pfail_nodes;    /* Number of nodes in PFAIL status,
180                                        excluding nodes without address. */
181 } clusterState;
182 
183 /* Redis cluster messages header */
184 
185 /* Initially we don't know our "name", but we'll find it once we connect
186  * to the first node, using the getsockname() function. Then we'll use this
187  * address for all the next messages. */
/* One gossip entry describing a node. PING, MEET and PONG messages carry an
 * array of these (see union clusterMsgData). The fixed-width field types define
 * the layout of the entry, so do not reorder or resize them. */
188 typedef struct {
189     char nodename[CLUSTER_NAMELEN];
190     uint32_t ping_sent;         /* 32 bits here, while clusterNode.ping_sent is
                                   mstime_t. NOTE(review): presumably a
                                   reduced-resolution copy -- confirm the unit
                                   and byte order in cluster.c. */
191     uint32_t pong_received;     /* Same remark as ping_sent. */
192     char ip[NET_IP_STR_LEN];  /* IP address last time it was seen */
193     uint16_t port;              /* base port last time it was seen */
194     uint16_t cport;             /* cluster port last time it was seen */
195     uint16_t flags;             /* node->flags copy */
196     uint32_t notused1;
197 } clusterMsgDataGossip;
198 
/* Payload of a FAIL message (see union clusterMsgData): the name of the node
 * being marked as failing. */
199 typedef struct {
200     char nodename[CLUSTER_NAMELEN];
201 } clusterMsgDataFail;
202 
/* Payload of a PUBLISH message (see union clusterMsgData): two lengths
 * followed by variable-size data. bulk_data is declared with 8 bytes only as
 * a placeholder. NOTE(review): presumably the real data is channel_len +
 * message_len bytes (channel first) -- confirm in cluster.c. */
203 typedef struct {
204     uint32_t channel_len;
205     uint32_t message_len;
206     unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */
207 } clusterMsgDataPublish;
208 
/* Payload of an UPDATE message (see union clusterMsgData): the slots
 * configuration of one node. */
209 typedef struct {
210     uint64_t configEpoch; /* Config epoch of the specified instance. */
211     char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */
212     unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap (one bit per slot). */
213 } clusterMsgDataUpdate;
214 
/* Payload of a MODULE message (see union clusterMsgData). */
215 typedef struct {
216     uint64_t module_id;     /* ID of the sender module. */
217     uint32_t len;           /* Payload length. NOTE(review): presumably the
                               number of bytes in bulk_data -- confirm. (The
                               previous comment duplicated module_id's.) */
218     uint8_t type;           /* Type from 0 to 255. */
219     unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */
220 } clusterMsgModule;
221 
/* Type-specific payload that follows the fixed clusterMsg header. Message
 * types with no member here (e.g. the failover auth and MFSTART types) have
 * no dedicated payload struct in this union. sizeof(union clusterMsgData) is
 * the size of its largest member and is used by CLUSTERMSG_MIN_LEN. */
222 union clusterMsgData {
223     /* PING, MEET and PONG */
224     struct {
225         /* Array of N clusterMsgDataGossip structures; declared with one
                element as a placeholder. */
226         clusterMsgDataGossip gossip[1];
227     } ping;
228 
229     /* FAIL */
230     struct {
231         clusterMsgDataFail about;
232     } fail;
233 
234     /* PUBLISH */
235     struct {
236         clusterMsgDataPublish msg;
237     } publish;
238 
239     /* UPDATE */
240     struct {
241         clusterMsgDataUpdate nodecfg;
242     } update;
243 
244     /* MODULE */
245     struct {
246         clusterMsgModule msg;
247     } module;
248 };
249 
250 #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */
251 
/* Cluster bus message: a fixed header followed by the type-specific payload
 * in `data`. The field order and widths define the message layout, so do not
 * reorder or resize them. */
252 typedef struct {
253     char sig[4];        /* Signature "RCmb" (Redis Cluster message bus). */
254     uint32_t totlen;    /* Total length of this message */
255     uint16_t ver;       /* Protocol version, currently set to 1. */
256     uint16_t port;      /* TCP base port number. */
257     uint16_t type;      /* Message type (CLUSTERMSG_TYPE_*) */
258     uint16_t count;     /* Only used for some kind of messages. */
259     uint64_t currentEpoch;  /* The epoch according to the sending node. */
260     uint64_t configEpoch;   /* The config epoch if it's a master, or the last
261                                epoch advertised by its master if it is a
262                                slave. */
263     uint64_t offset;    /* Master replication offset if node is a master or
264                            processed replication offset if node is a slave. */
265     char sender[CLUSTER_NAMELEN]; /* Name of the sender node */
266     unsigned char myslots[CLUSTER_SLOTS/8]; /* Slots bitmap (one bit per slot) */
267     char slaveof[CLUSTER_NAMELEN];
268     char myip[NET_IP_STR_LEN];    /* Sender IP, if not all zeroed. */
269     char notused1[34];  /* 34 bytes reserved for future usage. */
270     uint16_t cport;      /* Sender TCP cluster bus port */
271     uint16_t flags;      /* Sender node flags */
272     unsigned char state; /* Cluster state from the POV of the sender */
273     unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
274     union clusterMsgData data;
275 } clusterMsg;
276 
/* Size of the fixed header alone: clusterMsg minus its trailing data union. */
277 #define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData))
278 
279 /* Message flags better specify the packet content or are used to
280  * provide some information about the node state.
 * NOTE(review): the FLAG0 prefix presumably means these bits live in
 * clusterMsg.mflags[0] -- confirm in cluster.c. */
281 #define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */
282 #define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if
283                                             master is up. */
284 
285 /* ---------------------- API exported outside cluster.c -------------------- */
/* NOTE(review): the notes below are inferred from the signatures and from the
 * CLUSTER_REDIR_* comment in this header; confirm against cluster.c.
 * - getNodeByQuery(): returns a clusterNode pointer for the command in
 *   argv/argc; hashslot and ask are pointers and look like out-parameters.
 * - clusterRedirectClient(): error_code is presumably one of CLUSTER_REDIR_*. */
286 clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
287 int clusterRedirectBlockedClientIfNeeded(client *c);
288 void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code);
289 
290 #endif /* __CLUSTER_H */
291