1 #ifndef __TCP_STREAM_H_
2 #define __TCP_STREAM_H_
3 
4 #ifdef DARWIN
5 #include <netinet/tcp.h>
6 #else
7 #include <linux/tcp.h>
8 #endif
9 #include <netinet/ip.h>
10 #include <sys/queue.h>
11 
12 #include "mtcp.h"
13 #include "socket.h"
14 #include "memory_mgt.h"
15 #include "tcp_rb.h"
16 
/* Largest value a 32-bit TCP sequence number can take (2^32 - 1). */
#define TCP_MAX_SEQ 		4294967295

/*
 * Stream-type helpers.  `type` is a bit position and `str` points to an
 * object with a `stream_type` bit-mask member.
 *
 *   HAS_STREAM_TYPE - non-zero when the bit for `type` is set in the mask
 *   IS_STREAM_TYPE  - true when the mask equals exactly the bit for `type`
 *   STREAM_TYPE     - the mask value for `type`
 *
 * Every argument is parenthesized so that expressions such as `&stream`
 * or `cond ? A : B` expand with the intended precedence.
 */
#define HAS_STREAM_TYPE(str, type)	((str)->stream_type & (1 << (type)))
#define IS_STREAM_TYPE(str, type)	((str)->stream_type == (1 << (type)))
#define STREAM_TYPE(type)		(1 << (type))
22 
23 /*----------------------------------------------------------------------------*/
24 /**
25  * routines for traversing stream + raw sockets
26  */
/* Queue node: carries one socket_map pointer on a TAILQ. */
struct sockent {
	struct socket_map *sock;
	TAILQ_ENTRY(sockent) link;
};

/* Declare a queue-head type / reset a queue head to empty. */
#define SOCKQ_HEAD(name) TAILQ_HEAD(name, sockent)
#define SOCKQ_INIT(head) TAILQ_INIT(head)

/*
 * Append `socket` to the tail of `head`.
 * The node comes from mtcp->sockent_pool, so a variable named `mtcp`
 * must be in scope at the call site.  When the pool hands back NULL the
 * queue is left untouched and no error is reported.
 */
#define SOCKQ_INSERT_TAIL(head, socket) \
do { \
	struct sockent *__sq_new = \
			(struct sockent *)MPAllocateChunk(mtcp->sockent_pool); \
	if (__sq_new == NULL) \
		break; \
	__sq_new->sock = (socket); \
	TAILQ_INSERT_TAIL(head, __sq_new, link); \
} while (0)

/*
 * Unlink the first node of `head` whose sock equals `socket` and give
 * the node back to mtcp->sockent_pool.  Does nothing when no node
 * matches.  Requires `mtcp` in scope, like SOCKQ_INSERT_TAIL.
 */
#define SOCKQ_REMOVE(head, socket) \
do { \
	struct sockent *__sq_hit; \
	TAILQ_FOREACH(__sq_hit, head, link) { \
		if (__sq_hit->sock == (socket)) \
			break; \
	} \
	if (__sq_hit != NULL) { \
		TAILQ_REMOVE(head, __sq_hit, link); \
		MPFreeChunk(mtcp->sockent_pool, __sq_hit); \
	} \
} while (0)

/*
 * Iterate over every socket queued on `head`:
 *
 *     SOCKQ_FOREACH_START(var, head) {
 *         ... use var ...
 *     } SOCKQ_FOREACH_END;
 *
 * `var` receives each node's sock in turn and is set to NULL once the
 * walk runs off the end of the queue.  The successor is fetched before
 * the loop body runs, so the body may unlink the current node.
 */
#define SOCKQ_FOREACH_START(var, head) \
do { \
	struct sockent *__walk, *__temp; \
	for (__walk = TAILQ_FIRST(head); \
		((var) = (__walk != NULL) ? __walk->sock : NULL, __walk != NULL); \
		__walk = __temp) { \
		__temp = TAILQ_NEXT(__walk, link);
#define SOCKQ_FOREACH_END }} while (0)
62 /*----------------------------------------------------------------------------*/
63 
/*
 * Retransmission statistics, kept as count/bytes counter pairs.
 * struct tcp_send_vars embeds one instance (`rstat`) when RTM_STAT is set.
 * The SACK-related pairs are compiled in only when TCP_OPT_SACK_ENABLED
 * is non-zero.
 * NOTE(review): the meaning of the "tdp" prefix is not visible in this
 * header - confirm against the code that updates these counters.
 */
struct rtm_stat {
	uint32_t tdp_ack_cnt;
	uint32_t tdp_ack_bytes;
	uint32_t ack_upd_cnt;
	uint32_t ack_upd_bytes;
#if TCP_OPT_SACK_ENABLED
	uint32_t sack_cnt;
	uint32_t sack_bytes;
	uint32_t tdp_sack_cnt;
	uint32_t tdp_sack_bytes;
#endif /* TCP_OPT_SACK_ENABLED */
	uint32_t rto_cnt;
	uint32_t rto_bytes;
};
79 
/*
 * One entry of the sequence-number translation table
 * (struct tcp_send_vars holds SRE_MAX of these in `sre`).
 * NOTE(review): presumably seq_off is the drift applied from seq_base
 * onwards - confirm against TcpSeqChange()/FetchSeqDrift().
 */
struct seq_remap_entry {
	uint32_t seq_base;
	uint32_t seq_off;
};
typedef struct seq_remap_entry seq_remap_entry;

/* Capacity of a seq # translation table. */
#define SRE_MAX			8
86 
#if TCP_OPT_SACK_ENABLED
/*
 * One selective-acknowledgement block, stored in
 * tcp_recv_vars.sack_table.  Only compiled when TCP_OPT_SACK_ENABLED
 * is non-zero.
 * NOTE(review): units/clock of `expire` are not visible here - confirm.
 */
struct sack_entry {
	uint32_t left_edge;
	uint32_t right_edge;
	uint32_t expire;
};
#endif /* TCP_OPT_SACK_ENABLED */
95 
/*
 * Receive-side state of a TCP stream (reached via tcp_stream.rcvvar).
 * Member order and types determine the layout; do not reorder.
 */
struct tcp_recv_vars {
	/* ---- receiver variables ---- */
	uint32_t rcv_wnd;		/* receive window (unscaled) */
	/* uint32_t rcv_up; */		/* receive urgent pointer (disabled) */
	uint32_t irs;			/* initial receiving sequence */
	uint32_t snd_wl1;		/* segment seq number for last window update */
	uint32_t snd_wl2;		/* segment ack number for last window update */

	/* ---- fast retransmission ---- */
	uint32_t last_ack_seq;		/* highest ack'd seq */
	uint8_t dup_acks;		/* number of duplicated acks */

	/* ---- timestamps ---- */
	uint32_t ts_recent;		/* recent peer timestamp */
	uint32_t ts_lastack_rcvd;	/* last ack rcvd time */
	uint32_t ts_last_ts_upd;	/* last peer ts update time */
	uint32_t ts_tw_expire;		/* timestamp for timewait expire */

	/* ---- RTT estimation ---- */
	uint32_t srtt;			/* smoothed round trip time << 3 (scaled) */
	uint32_t mdev;			/* medium deviation */
	uint32_t mdev_max;		/* maximal mdev for the last rtt period */
	uint32_t rttvar;		/* smoothed mdev_max */
	uint32_t rtt_seq;		/* sequence number to update rttvar */

#if TCP_OPT_SACK_ENABLED		/* currently not used */
#define MAX_SACK_ENTRY 8
	struct sack_entry sack_table[MAX_SACK_ENTRY];
	/*
	 * NOTE(review): a 3-bit field tops out at 7, one short of
	 * MAX_SACK_ENTRY - confirm this is intended before enabling SACK.
	 */
	uint8_t sacks:3;
#endif /* TCP_OPT_SACK_ENABLED */

	/* receive buffer; its type depends on the NEWRB build switch */
#ifdef NEWRB
	tcprb_t *rcvbuf;
#else
	struct tcp_ring_buffer *rcvbuf;
#endif

	/* read-side lock; spin lock or mutex depending on USE_SPIN_LOCK */
#if USE_SPIN_LOCK
	pthread_spinlock_t read_lock;
#else
	pthread_mutex_t read_lock;
#endif

	struct hash_bucket_head *he_mybucket;
	TAILQ_ENTRY(tcp_stream) he_link;	/* hash table entry link */
};
142 
143 struct tcp_send_vars
144 {
145 	/* IP-level information */
146 	uint16_t ip_id;
147 
148 	uint16_t mss;			/* maximum segment size */
149 	uint16_t eff_mss;		/* effective segment size (excluding tcp option) */
150 
151 	uint8_t wscale_mine;		/* my window scale (advertising window) */
152 	uint8_t wscale_peer;		/* peer's window scale (advertised window) */
153 	int8_t nif_out;			/* cached output network interface */
154 	unsigned char *d_haddr;	/* cached destination MAC address */
155 
156 	/* send sequence variables */
157 	uint32_t snd_una;		/* send unacknoledged */
158 	uint32_t snd_wnd;		/* send window (unscaled) */
159 	uint32_t peer_wnd;		/* client window size */
160 	//uint32_t snd_up;		/* send urgent pointer (not used) */
161 	uint32_t iss;			/* initial sending sequence */
162 	uint32_t fss;			/* final sending sequence */
163 
164 	/* retransmission timeout variables */
165 	uint8_t nrtx;			/* number of retransmission */
166 	uint8_t max_nrtx;		/* max number of retransmission */
167 	uint32_t rto;			/* retransmission timeout */
168 	uint32_t ts_rto;		/* timestamp for retransmission timeout */
169 
170 	/* congestion control variables */
171 	uint32_t cwnd;				/* congestion window */
172 	uint32_t ssthresh;			/* slow start threshold */
173 
174 	/* timestamp */
175 	uint32_t ts_lastack_sent;	/* last ack sent time */
176 
177 	uint8_t is_wack:1, 			/* is ack for window adertisement? */
178 			ack_cnt:6;			/* number of acks to send. max 64 */
179 
180 	uint8_t on_control_list;
181 	uint8_t on_send_list;
182 	uint8_t on_ack_list;
183 	uint8_t on_sendq;
184 	uint8_t on_ackq;
185 	uint8_t on_closeq;
186 	uint8_t on_resetq;
187 
188 	uint8_t on_closeq_int:1,
189 			on_resetq_int:1,
190 			is_fin_sent:1,
191 			is_fin_ackd:1;
192 
193 	TAILQ_ENTRY(tcp_stream) control_link;
194 	TAILQ_ENTRY(tcp_stream) send_link;
195 	TAILQ_ENTRY(tcp_stream) ack_link;
196 
197 	TAILQ_ENTRY(tcp_stream) timer_link;		/* timer link (rto list, tw list) */
198 	TAILQ_ENTRY(tcp_stream) timeout_link;	/* connection timeout link */
199 
200 	struct tcp_send_buffer *sndbuf;
201 	struct seq_remap_entry sre[SRE_MAX];	/* seq # translation table */
202 	uint8_t sre_index;			/* seq # translation index */
203 
204 #if USE_SPIN_LOCK
205 	pthread_spinlock_t write_lock;
206 #else
207 	pthread_mutex_t write_lock;
208 #endif
209 
210 #if RTM_STAT
211 	struct rtm_stat rstat;			/* retransmission statistics */
212 #endif
213 };
214 
215 typedef struct tcp_stream
216 {
217 	/*
218 	 * This is a direct replacement for fctx...
219 	 * However this could be replaced by some
220 	 * more elaborate data structure that supports
221 	 * multiple monitors in the future...
222 	 *
223 	 * In case no monitor is attached, msock will be
224 	 * NULL.
225 	 *
226 	 * Support for standalone monitors will be patched
227 	 * in future revisions...
228 	 */
229 
230 	SOCKQ_HEAD() msocks;        /* in case monitoring is enabled */
231 	socket_map_t socket;		/* relating to MOS_SOCK_STREAM */
232 
233 	uint32_t id;
234 	uint32_t stream_type;		/* to identify sock_stream/mon_stream */
235 
236 	uint32_t saddr;			/* in network order */
237 	uint32_t daddr;			/* in network order */
238 	uint16_t sport;			/* in network order */
239 	uint16_t dport;			/* in network order */
240 
241 	uint32_t actions;
242 	uint64_t cb_events;
243 
244 	uint8_t state;			/* tcp state */
245 	uint8_t close_reason;	/* close reason */
246 	uint8_t on_hash_table;
247 	uint8_t on_timewait_list;
248 	uint8_t ht_idx;
249 	uint8_t closed;
250 	uint8_t is_bound_addr;
251 	uint8_t need_wnd_adv;
252 	int16_t on_rto_idx;
253 
254 	uint16_t on_timeout_list:1,
255 		on_rcv_br_list:1,
256 		on_snd_br_list:1,
257 		saw_timestamp:1,	/* whether peer sends timestamp */
258 		sack_permit:1,		/* whether peer permits SACK */
259 		control_list_waiting:1,
260 		have_reset:1,
261 		side:2,
262 		buffer_mgmt:2,
263 		status_mgmt:1,
264 		allow_pkt_modification:1;
265 
266 	uint32_t snd_nxt;		/* send next */
267 	uint32_t rcv_nxt;		/* receive next */
268 
269 	struct tcp_recv_vars *rcvvar;
270 	struct tcp_send_vars *sndvar;
271 
272 	uint32_t last_active_ts;		/* ts_last_ack_sent or ts_last_ts_upd */
273 
274 	struct tcp_stream *pair_stream; /* pair stream in case of monitor / proxy socket */
275 
276 	struct pkt_ctx last_pctx;
277 	unsigned char  last_pkt_data[ETHERNET_FRAME_LEN];
278 
279 } tcp_stream;
280 
281 extern inline char *
282 TCPStateToString(const tcp_stream *cur_stream);
283 
284 extern inline int
285 AddEpollEvent(struct mtcp_epoll *ep,
286 		int queue_type, socket_map_t socket, uint32_t event);
287 
288 extern inline void
289 RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream);
290 
291 extern inline void
292 RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream);
293 
294 extern inline void
295 RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream);
296 
297 extern inline int
298 RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream);
299 
300 tcp_stream *
301 CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
302 		uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport,
303 		unsigned int *hash);
304 
305 extern inline tcp_stream *
306 CreateDualTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type, uint32_t saddr,
307 		    uint16_t sport, uint32_t daddr, uint16_t dport, unsigned int *hash);
308 
309 extern inline tcp_stream *
310 CreateClientTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
311 			uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport, unsigned int *hash);
312 
313 extern inline tcp_stream *
314 AttachServerTCPStream(mtcp_manager_t mtcp, tcp_stream *cs, int type,
315 			uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport);
316 
317 void
318 DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream);
319 
320 void
321 DumpStream(mtcp_manager_t mtcp, tcp_stream *stream);
322 
323 int
324 GetFragInfo(socket_map_t sock, int side, void *optval, socklen_t *optlen);
325 
326 int
327 GetBufInfo(socket_map_t sock, int side, void *optval, socklen_t *optlen);
328 
329 int
330 GetTCPState(struct tcp_stream *stream, int side,
331 			void *optval, socklen_t *optlen);
332 
333 int
334 DisableBuf(socket_map_t sock, int side);
335 
336 int
337 GetLastTimestamp(struct tcp_stream *stream, uint32_t *usecs, socklen_t *sz);
338 
339 int
340 TcpSeqChange(socket_map_t socket, uint32_t seq_drift, int side, uint32_t seqno);
341 
342 uint32_t
343 FetchSeqDrift(struct tcp_stream *stream, uint32_t seqno);
344 
345 void
346 posix_seq_srand(unsigned seed);
347 
348 #endif /* __TCP_STREAM_H_ */
349