1 #ifndef __TCP_STREAM_H_
2 #define __TCP_STREAM_H_
3 
4 #ifdef DARWIN
5 #include <netinet/tcp.h>
6 #else
7 #include <linux/tcp.h>
8 #endif
9 #include <netinet/ip.h>
10 #include <sys/queue.h>
11 
12 #include "mtcp.h"
13 #include "socket.h"
14 #include "memory_mgt.h"
15 #include "tcp_rb.h"
16 
/*
 * 2^32 - 1, the largest value a 32-bit sequence number can take (the
 * sequence fields in this header are uint32_t).
 * The literal is an unsuffixed decimal constant, so its type is the first of
 * int / long / long long that can hold it -- i.e. a signed type wider than
 * 32 bits, not uint32_t. Arithmetic on it is therefore not performed modulo
 * 2^32 unless the result is converted back to uint32_t.
 */
#define TCP_MAX_SEQ 		4294967295
18 
/*
 * Stream-type helpers. stream_type is a bit mask in which bit number `type`
 * is set for each type the stream has.
 *
 *   HAS_STREAM_TYPE(str, type)  non-zero when bit `type` is set in
 *                               str->stream_type
 *   IS_STREAM_TYPE(str, type)   true when bit `type` is the ONLY bit set
 *   STREAM_TYPE(type)           the mask value for bit `type`
 *
 * Every macro argument is parenthesized so that expressions such as
 * `p + 1` or `a | b` expand with the intended precedence.
 * `type` must be smaller than the bit width of int minus one, since the
 * mask is built by shifting the int constant 1.
 */
#define HAS_STREAM_TYPE(str, type)	((str)->stream_type & (1 << (type)))
#define IS_STREAM_TYPE(str, type)	((str)->stream_type == (1 << (type)))
#define STREAM_TYPE(type)		(1 << (type))
22 
23 /*----------------------------------------------------------------------------*/
24 /**
25  * routines for traversing stream + raw sockets
26  */
/*
 * Queue node that links one socket_map into a TAILQ-based socket list.
 * Nodes are allocated by SOCKQ_INSERT_TAIL and released by SOCKQ_REMOVE.
 */
struct sockent {
	struct socket_map *sock;	/* socket referenced by this node */
	TAILQ_ENTRY(sockent) link;	/* prev/next linkage inside the queue */
};
/* Declares a TAILQ head structure tagged `name` whose elements are struct sockent. */
#define SOCKQ_HEAD(name) TAILQ_HEAD(name, sockent)
/* Initializes a socket queue head to the empty state (alias of TAILQ_INIT). */
#define SOCKQ_INIT(head) TAILQ_INIT(head)
/*
 * Appends `socket` to the queue `head`.
 *
 * A struct sockent node is taken from mtcp->sockent_pool; the macro therefore
 * requires a variable named `mtcp` to be in scope at the expansion site.
 * When the pool returns NULL the socket is silently NOT queued -- callers get
 * no indication of the failure.
 * `socket` is evaluated at most once (only after a successful allocation).
 */
#define SOCKQ_INSERT_TAIL(head, socket) \
do { \
	struct sockent *__new_ent; \
	__new_ent = (struct sockent *)MPAllocateChunk(mtcp->sockent_pool); \
	if (!__new_ent) \
		break;	/* allocation failed: leave the queue untouched */ \
	__new_ent->sock = (socket); \
	TAILQ_INSERT_TAIL(head, __new_ent, link); \
} while (0)
/*
 * Removes the first node of queue `head` whose sock pointer equals `socket`
 * and returns the node to mtcp->sockent_pool. Does nothing when no node
 * matches. A variable named `mtcp` must be in scope at the expansion site.
 *
 * `socket` is evaluated exactly once (it is captured in a local before the
 * search), so arguments with side effects behave as a function call would.
 */
#define SOCKQ_REMOVE(head, socket) \
do { \
	struct socket_map *__target = (socket); \
	struct sockent *__walk; \
	/* TAILQ_FOREACH leaves __walk == NULL when the list is exhausted */ \
	TAILQ_FOREACH(__walk, head, link) { \
		if (__walk->sock == __target) \
			break; \
	} \
	if (__walk) { \
		TAILQ_REMOVE(head, __walk, link); \
		MPFreeChunk(mtcp->sockent_pool, __walk); \
	} \
} while (0)
/*
 * Opens a head-to-tail iteration over queue `head`; must be closed with
 * SOCKQ_FOREACH_END. On every pass `var` receives the node's sock pointer,
 * and it is set to NULL when the walk finishes.
 *
 * The successor is saved in __temp before the loop body runs, so the body
 * may unlink the current node. __walk (current node) and __temp are visible
 * inside the body.
 */
#define SOCKQ_FOREACH_START(var, head) \
do { \
	struct sockent *__walk, *__temp; \
	for (__walk = TAILQ_FIRST(head); \
		((var) = (__walk != NULL) ? __walk->sock : NULL, __walk != NULL); \
		__walk = __temp) { \
		__temp = TAILQ_NEXT(__walk, link);
/*
 * Opens a tail-to-head iteration over queue `head`; must be closed with
 * SOCKQ_FOREACH_END. On every pass `var` receives the node's sock pointer,
 * and it is set to NULL when the walk finishes.
 *
 * The head tag is fixed to `mlist`, so this only works on heads declared
 * with SOCKQ_HEAD(mlist). The predecessor is saved in __temp before the
 * loop body runs; __walk and __temp are visible inside the body.
 */
#define SOCKQ_FOREACH_REVERSE(var, head) \
do { \
	struct sockent *__walk, *__temp; \
	for (__walk = TAILQ_LAST(head, mlist); \
		((var) = (__walk != NULL) ? __walk->sock : NULL, __walk != NULL); \
		__walk = __temp) { \
		__temp = TAILQ_PREV(__walk, mlist, link);
/*
 * Closes the for-loop body and the do-block opened by SOCKQ_FOREACH_START or
 * SOCKQ_FOREACH_REVERSE. The caller supplies the trailing semicolon.
 */
#define SOCKQ_FOREACH_END }} while (0)
69 /*----------------------------------------------------------------------------*/
70 
/*
 * Retransmission statistics, embedded in tcp_send_vars when RTM_STAT is set.
 * Counters come in pairs: an event count (*_cnt) and a byte total (*_bytes).
 * NOTE(review): the exact meaning of the "tdp" prefix is not visible in this
 * header -- confirm against the code that updates these counters.
 */
struct rtm_stat
{
	uint32_t tdp_ack_cnt;
	uint32_t tdp_ack_bytes;
	uint32_t ack_upd_cnt;
	uint32_t ack_upd_bytes;
#if TCP_OPT_SACK_ENABLED
	/* SACK-related counters, only present when SACK support is compiled in */
	uint32_t sack_cnt;
	uint32_t sack_bytes;
	uint32_t tdp_sack_cnt;
	uint32_t tdp_sack_bytes;
#endif /* TCP_OPT_SACK_ENABLED */
	/* presumably retransmission-timeout events and bytes -- confirm */
	uint32_t rto_cnt;
	uint32_t rto_bytes;
};
86 
/*
 * One entry of the sequence-number translation table kept in
 * tcp_send_vars.sre[].
 * NOTE(review): how seq_base and seq_off are combined is not visible in this
 * header; presumably an offset that applies from seq_base onwards -- confirm.
 */
typedef struct seq_remap_entry {
	uint32_t seq_base;
	uint32_t seq_off;
} seq_remap_entry;

/* number of entries in the per-stream translation table (tcp_send_vars.sre) */
#define SRE_MAX			8
93 
#if TCP_OPT_SACK_ENABLED
/*
 * One selective-acknowledgement block, stored in tcp_recv_vars.sack_table[].
 * Only compiled in when TCP_OPT_SACK_ENABLED is non-zero.
 */
struct sack_entry
{
	uint32_t left_edge;		/* left edge of the block (sequence number) */
	uint32_t right_edge;	/* right edge of the block (sequence number) */
	uint32_t expire;		/* presumably the expiry time of this entry -- confirm units */
};
#endif /* TCP_OPT_SACK_ENABLED */
102 
/*
 * Receive-side state of a TCP stream, referenced through tcp_stream.rcvvar.
 * Besides the receive window and RTT estimator it also carries the receive
 * ring buffer, the read lock and the hash-table linkage of the stream.
 */
struct tcp_recv_vars
{
	/* receiver variables */
	uint32_t rcv_wnd;		/* receive window (unscaled) */
	//uint32_t rcv_up;		/* receive urgent pointer */
	uint32_t irs;			/* initial receiving sequence */
	uint32_t snd_wl1;		/* segment seq number for last window update */
	uint32_t snd_wl2;		/* segment ack number for last window update */

	/* variables for fast retransmission */
	uint32_t last_ack_seq;	/* highest acked seq */
	uint8_t dup_acks;		/* number of duplicated acks */

	/* timestamps */
	uint32_t ts_recent;			/* recent peer timestamp */
	uint32_t ts_lastack_rcvd;	/* last ack rcvd time */
	uint32_t ts_last_ts_upd;	/* last peer ts update time */
	uint32_t ts_tw_expire;	/* timestamp for timewait expire */

	/* RTT estimation variables */
	uint32_t srtt;			/* smoothed round trip time << 3 (scaled) */
	uint32_t mdev;			/* medium deviation */
	uint32_t mdev_max;		/* maximal mdev for the last rtt period */
	uint32_t rttvar;		/* smoothed mdev_max */
	uint32_t rtt_seq;		/* sequence number to update rttvar */

#if TCP_OPT_SACK_ENABLED		/* currently not used */
#define MAX_SACK_ENTRY 8
	struct sack_entry sack_table[MAX_SACK_ENTRY];
	/*
	 * NOTE(review): a 3-bit field holds 0..7. If this is a count of used
	 * sack_table entries it cannot represent MAX_SACK_ENTRY (8) and would
	 * wrap to 0 -- confirm whether it is a count or an index.
	 */
	uint8_t sacks:3;
#endif /* TCP_OPT_SACK_ENABLED */

	tcprb_t *rcvbuf;		/* receive ring buffer (type from tcp_rb.h) */

	/* lock type is selected at build time by USE_SPIN_LOCK */
#if USE_SPIN_LOCK
	pthread_spinlock_t read_lock;
#else
	pthread_mutex_t read_lock;
#endif
	struct hash_bucket_head *he_mybucket;	/* presumably the bucket holding this stream -- confirm */
	TAILQ_ENTRY(tcp_stream) he_link;	/* hash table entry link */
};
145 
/*
 * Send-side state of a TCP stream, referenced through tcp_stream.sndvar.
 * Holds cached IP/link information, send sequence variables, RTO and
 * congestion-control state, list-membership flags with their TAILQ links,
 * the send buffer, the sequence translation table and the write lock.
 */
struct tcp_send_vars
{
	/* IP-level information */
	uint16_t ip_id;

	uint16_t mss;			/* maximum segment size */
	uint16_t eff_mss;		/* effective segment size (excluding tcp option) */

	uint8_t wscale_mine;		/* my window scale (advertising window) */
	uint8_t wscale_peer;		/* peer's window scale (advertised window) */
	int8_t nif_out;			/* cached output network interface */
	unsigned char *d_haddr;	/* cached destination MAC address */

	/* send sequence variables */
	uint32_t snd_una;		/* send unacknowledged */
	uint32_t snd_wnd;		/* send window (unscaled) */
	uint32_t peer_wnd;		/* client window size */
	//uint32_t snd_up;		/* send urgent pointer (not used) */
	uint32_t iss;			/* initial sending sequence */
	uint32_t fss;			/* final sending sequence */

	/* retransmission timeout variables */
	uint8_t nrtx;			/* number of retransmission */
	uint8_t max_nrtx;		/* max number of retransmission */
	uint32_t rto;			/* retransmission timeout */
	uint32_t ts_rto;		/* timestamp for retransmission timeout */

	/* congestion control variables */
	uint32_t cwnd;				/* congestion window */
	uint32_t ssthresh;			/* slow start threshold */

	/* timestamp */
	uint32_t ts_lastack_sent;	/* last ack sent time */

	uint8_t is_wack:1, 			/* is ack for window advertisement? */
			ack_cnt:6;			/* number of acks to send; 6-bit field, so max 63 */

	/*
	 * Membership flags. Judging by the names, each one records whether the
	 * stream currently sits on the corresponding list or queue -- confirm
	 * against the code that sets them.
	 */
	uint8_t on_control_list;
	uint8_t on_send_list;
	uint8_t on_ack_list;
	uint8_t on_sendq;
	uint8_t on_ackq;
	uint8_t on_closeq;
	uint8_t on_resetq;

	uint8_t on_closeq_int:1,
			on_resetq_int:1,
			is_fin_sent:1,
			is_fin_ackd:1;

	/* linkage for the control / send / ack lists */
	TAILQ_ENTRY(tcp_stream) control_link;
	TAILQ_ENTRY(tcp_stream) send_link;
	TAILQ_ENTRY(tcp_stream) ack_link;

	TAILQ_ENTRY(tcp_stream) timer_link;		/* timer link (rto list, tw list) */
	TAILQ_ENTRY(tcp_stream) timeout_link;	/* connection timeout link */

	struct tcp_send_buffer *sndbuf;
	struct seq_remap_entry sre[SRE_MAX];	/* seq # translation table */
	uint8_t sre_index;			/* seq # translation index */

	/* lock type is selected at build time by USE_SPIN_LOCK */
#if USE_SPIN_LOCK
	pthread_spinlock_t write_lock;
#else
	pthread_mutex_t write_lock;
#endif

#if RTM_STAT
	struct rtm_stat rstat;			/* retransmission statistics */
#endif
};
217 
/*
 * Per-connection TCP state. Identifies the flow by its address/port 4-tuple
 * (all in network byte order), tracks the TCP state and list membership, and
 * points to the separately stored receive (rcvvar) and send (sndvar) halves.
 */
typedef struct tcp_stream
{
	/*
	 * This is a direct replacement for fctx...
	 * However this could be replaced by some
	 * more elaborate data structure that supports
	 * multiple monitors in the future...
	 *
	 * In case no monitor is attached, msock will be
	 * NULL.
	 *
	 * Support for standalone monitors will be patched
	 * in future revisions...
	 *
	 * NOTE(review): the field below is a queue head (msocks), not a
	 * pointer, so "NULL" above presumably means "empty queue" -- confirm.
	 */

	SOCKQ_HEAD(mlist) msocks;        /* in case monitoring is enabled */
	socket_map_t socket;		/* relating to MOS_SOCK_STREAM */

	uint32_t id;
	uint32_t stream_type;		/* to identify sock_stream/mon_stream; bit mask, see STREAM_TYPE() */

	uint32_t saddr;			/* in network order */
	uint32_t daddr;			/* in network order */
	uint16_t sport;			/* in network order */
	uint16_t dport;			/* in network order */

	uint32_t actions;
	uint64_t cb_events;

	uint8_t state;			/* tcp state */
	uint8_t close_reason;	/* close reason */
	uint8_t on_hash_table;
	uint8_t on_timewait_list;
	uint8_t ht_idx;
	uint8_t closed;
	uint8_t is_bound_addr;
	uint8_t need_wnd_adv;
	int16_t on_rto_idx;

	/* 13 of the 16 bits are used by the flags below */
	uint16_t on_timeout_list:1,
		on_rcv_br_list:1,
		on_snd_br_list:1,
		saw_timestamp:1,	/* whether peer sends timestamp */
		sack_permit:1,		/* whether peer permits SACK */
		control_list_waiting:1,
		have_reset:1,
		side:2,
		buffer_mgmt:2,
		status_mgmt:1,
		allow_pkt_modification:1;

	uint32_t snd_nxt;		/* send next */
	uint32_t rcv_nxt;		/* receive next */

	struct tcp_recv_vars *rcvvar;	/* receive-side state */
	struct tcp_send_vars *sndvar;	/* send-side state */

	uint32_t last_active_ts;		/* ts_last_ack_sent or ts_last_ts_upd */

	struct tcp_stream *pair_stream; /* pair stream in case of monitor / proxy socket */
#ifdef RECORDPKT_PER_STREAM
	/* copy of the most recent packet, kept only when RECORDPKT_PER_STREAM is defined */
	struct pkt_ctx last_pctx;
	unsigned char  last_pkt_data[ETHERNET_FRAME_LEN];
#endif

} tcp_stream;
284 
285 extern inline char *
286 TCPStateToString(const tcp_stream *cur_stream);
287 
288 extern inline int
289 AddEpollEvent(struct mtcp_epoll *ep,
290 		int queue_type, socket_map_t socket, uint32_t event);
291 
292 extern inline void
293 RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream);
294 
295 extern inline void
296 RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream);
297 
298 extern inline void
299 RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream);
300 
301 extern inline int
302 RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream);
303 
304 tcp_stream *
305 CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
306 		uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport,
307 		unsigned int *hash);
308 
309 extern inline tcp_stream *
310 CreateDualTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type, uint32_t saddr,
311 		    uint16_t sport, uint32_t daddr, uint16_t dport, unsigned int *hash);
312 
313 extern inline tcp_stream *
314 CreateClientTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
315 			uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport, unsigned int *hash);
316 
317 extern inline tcp_stream *
318 AttachServerTCPStream(mtcp_manager_t mtcp, tcp_stream *cs, int type,
319 			uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport);
320 
/*
 * Stream teardown, debugging and option-query entry points.
 * The definitions live outside this header; the summaries below are inferred
 * from names and signatures only -- confirm against the definitions.
 */

/* Presumably releases `stream` and the resources it owns. */
void
DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream);

/* Presumably prints the state of `stream` for debugging. */
void
DumpStream(mtcp_manager_t mtcp, tcp_stream *stream);

/*
 * The Get* functions below take a getsockopt-style (optval, optlen) pair;
 * `side` presumably selects which direction of the connection is queried.
 */
int
GetFragInfo(socket_map_t sock, int side, void *optval, socklen_t *optlen);

int
GetBufInfo(socket_map_t sock, int side, void *optval, socklen_t *optlen);

int
GetTCPState(struct tcp_stream *stream, int side,
			void *optval, socklen_t *optlen);

/* Presumably turns off buffering for the given side of `sock`. */
int
DisableBuf(socket_map_t sock, int side);

/* Presumably reports the last activity time of `stream` via `usecs`; role of `sz` not visible here. */
int
GetLastTimestamp(struct tcp_stream *stream, uint32_t *usecs, socklen_t *sz);

/* Presumably seeds the generator used for sequence numbers. */
void
posix_seq_srand(unsigned seed);
345 
346 #endif /* __TCP_STREAM_H_ */
347