1 #ifndef __MOS_API_H_
2 #define __MOS_API_H_
3 
4 #ifdef DARWIN
5 #include <netinet/tcp.h>
6 #include <netinet/if_ether.h>
7 #else
8 #include <linux/tcp.h>
9 #include <linux/if_ether.h>
10 #endif
11 #include <netinet/in.h>
12 #include <arpa/inet.h>
13 #include <netinet/ip.h>
14 #include <stddef.h> /* for offsetof */
15 #include "mtcp_epoll.h"
16 #include <stdbool.h>
17 
#ifndef __MTCP_MANAGER
#define __MTCP_MANAGER
/*
 * Opaque handle to an mTCP manager. struct mtcp_manager is only
 * forward-declared in this header. The __MTCP_MANAGER guard keeps the
 * typedef from being emitted again when the macro is already defined.
 */
typedef struct mtcp_manager *mtcp_manager_t;
#endif /* __MTCP_MANAGER */
#ifndef __SOCKET_MAP
#define __SOCKET_MAP
/*
 * Opaque handle to a socket map entry. struct socket_map is only
 * forward-declared in this header. The __SOCKET_MAP guard keeps the
 * typedef from being emitted again when the macro is already defined.
 */
typedef struct socket_map *socket_map_t;
#endif /* __SOCKET_MAP */
26 
/**
 * Available hooking points.
 *
 * Each hooking point is a single bit. The bits used here (29..31) are
 * chosen so that they do not overlap with any value of
 * enum mos_event_type, which occupies the low bits.
 */
enum mtcp_hook_point
{
	/* NOTE: The value of hooking points should not overlap with any of
	 * mos_event_types */

	/** Very first hooking point of incoming packet even before flow
	 * identification */
	MOS_NULL	= (1 << 29),
	/** Hooking point before TCP receiver */
	MOS_HK_RCV	= (1 << 30),
	/** Hooking point after TCP sender.
	 *
	 * Bit 31 is the sign bit of a 32-bit int, so writing (1 << 31)
	 * shifts a signed value out of range, which is undefined behavior
	 * in C. The bit is therefore built as an unsigned value and then
	 * converted back to int; that conversion is implementation-defined
	 * rather than undefined, and on the usual two's complement
	 * compilers it yields the same bit pattern (0x80000000) as before. */
	MOS_HK_SND	= (int)(1U << 31),
};
41 
/**
 * Built-in events provided by mOS.
 *
 * Apart from MOS_NULL_EVENT, every event is a distinct single bit
 * (bits 0 through 11).
 */
enum mos_event_type
{
	/** Invalid event. */
	MOS_NULL_EVENT		= 0,

	/* ---- mOS-defined built-in TCP events ---- */

	/** A packet is coming in. */
	MOS_ON_PKT_IN		= 1 << 0,

	/** A packet is going out.
	 * THIS EVENT IS NOW DEPRECATED (USED ONLY FOR DEBUGGING). */
	MOS_ON_PKT_OUT		= 1 << 1,

	/** SYN packet as seen by the monitor.
	 *  client side: activated when the client state is set to SYN_SENT
	 *  server side: activated when the server state is set to SYN_RCVD
	 *
	 *  Retransmitted SYN packets don't activate this event. */
	MOS_ON_CONN_START	= 1 << 2,

	/** 3-way handshake is finished.
	 *  server side: ACK is coming in as a response of SYNACK.
	 *  client side: SYNACK is coming in as a response of SYN.
	 * THIS EVENT IS NOW DEPRECATED. */
	MOS_ON_CONN_SETUP	= 1 << 3,

	/** New data is now readable.
	 * This event is available only in the MOS_NULL hook point.
	 * mOS raises this event only once per batch of processed packets. */
	MOS_ON_CONN_NEW_DATA	= 1 << 4,

	/** Abnormal behavior is detected.
	 * NOTE: This is not fully implemented yet. */
	MOS_ON_ERROR		= 1 << 5,

	/** No packet is seen for a long time.
	 * This is implemented as mtcp_cb_settimer(). */
	MOS_ON_TIMEOUT		= 1 << 6,

	/** TCP state is being changed. */
	MOS_ON_TCP_STATE_CHANGE	= 1 << 7,

	/** A packet is not SYN and has no identified flow. */
	MOS_ON_ORPHAN		= 1 << 8,

	/** Retransmission is detected. */
	MOS_ON_REXMIT		= 1 << 9,

	/** A flow is about to be destroyed.
	 * 4-way handshake, RST packet or timeout could be the reason.
	 * NOTE: In the current implementation, mOS raises this event while
	 * destroying `struct tcp_stream`. False positives are possible,
	 * especially when mOS is running out of memory. */
	MOS_ON_CONN_END		= 1 << 10,

	/** This event is for debugging. We can easily mute this later. */
	MOS_ON_DEBUG_MESSAGE	= 1 << 11,
};
92 
#if 0
/* This may go away in future revisions */
/* Disabled definition: the enclosing #if 0 keeps the preprocessor from
 * emitting this union, so event_data_t is not declared by this header. */
typedef union event_data {
	uint32_t u32;
	uint64_t u64;
	void *ptr;
} event_data_t;
#endif
101 
/*
 * Flags for updating the packet context. Each flag is a distinct bit so
 * that several of them can be OR-ed into one option word (see the
 * `option` argument of mtcp_setlastpkt()).
 */

/* Portion of the packet being addressed */
#define MOS_ETH_HDR		(1 << 0)
#define MOS_IP_HDR		(1 << 1)
#define MOS_TCP_HDR		(1 << 2)
#define MOS_TCP_PAYLOAD		(1 << 3)

/* Checksum update requests */
#define MOS_UPDATE_IP_CHKSUM	(1 << 4)
#define MOS_UPDATE_TCP_CHKSUM	(1 << 5)

/* Operations (documented at mtcp_setlastpkt() as mutually exclusive) */
#define MOS_DROP		(1 << 6)
#define MOS_OVERWRITE		(1 << 7)
#define MOS_CHOMP		(1 << 8)
#define MOS_INSERT		(1 << 9)
113 
/**
 * struct pkt_info is the packet description that is actually exposed
 * to the monitor application.
 *
 * NOTE: When the packet information is retrieved with mtcp_getlastpkt()
 * through a MOS_SOCK_MONITOR_RAW socket, only the information up to L3
 * is usable (cur_ts, eth_len, ip_len, ethh, iph).
 */
struct pkt_info {
	uint32_t	cur_ts;		/**< packet receiving time (read-only:ro) */
	int8_t		in_ifidx;	/**< input interface (ro) */

	/* ETH */
	uint16_t	eth_len;

	/* IP */
	uint16_t	ip_len;

	/* TCP */
	uint64_t	offset;		/**< TCP ring buffer offset */
	uint16_t	payloadlen;
	uint32_t	seq;
	uint32_t	ack_seq;
	uint16_t	window;

	/*
	 * End of the scalar part; PKT_INFO_LEN is the offset of the next
	 * field.
	 * NOTE(review): an earlier remark called this a "28 byte boundary";
	 * with compiler-inserted padding the real offset is likely larger,
	 * so rely on PKT_INFO_LEN rather than a literal - confirm.
	 */

	/*
	 * CAUTION!!!
	 * It is extremely critical that the trailing pointer fields
	 * (ethh .. payload) always stay at the very end of the definition,
	 * in this order. MOS relies on this specific arrangement when it
	 * creates a new instantiation of pctx during an mtcp_getlastpkt()
	 * invocation.
	 */
	struct ethhdr	*ethh;
	struct iphdr	*iph;
	struct tcphdr	*tcph;
	uint8_t		*payload;
};
153 
154 /**
155  * PACKET CONTEXT is the packet structure that goes through
156  * the mOS core...
157  */
158 struct pkt_ctx {
159 	struct pkt_info  p;
160 
161 	int8_t        direction; /**< where does this packet originate from? (ro)*/
162 	uint8_t       forward;   /**< 0: drop, 1: forward to out_ifidx (rw) */
163 	int8_t        out_ifidx; /**< output interface (rw) */
164 	int8_t        batch_index; /**< index of packet in the rx batch */
165 	/* ~~ 64 byte boundary ~~ */
166 };
167 #define PKT_INFO_LEN		offsetof(struct pkt_info, ethh)
168 
/*
 * Describes a sequence number change.
 * Used with the MOS_SEQ_REMAP socket option.
 */
typedef struct {
	/* the amount of sequence number drift */
	int64_t		seq_off;
	/* which side does this sequence number change apply to? */
	int		side;
	/* seq # of the flow where the actual sequence # translation starts */
	uint32_t	base_seq;
} seq_remap_info;
178 
/*
 * Argument handed to a filter or callback function (see filter_t and
 * callback_t): a pointer paired with a length.
 * NOTE(review): `len` is presumably the size in bytes of the data that
 * `arg` points to - confirm against the implementation.
 */
typedef struct filter_arg {
	void	*arg;
	size_t	len;
} filter_arg_t;
183 
/**
 * Socket option levels.
 *
 * The level that the POSIX library offers for sockets is SOL_SOCKET;
 * it is defined here only when no earlier header has defined it.
 * SOL_MONSOCKET is the level for mOS monitor socket options.
 */
#ifndef SOL_SOCKET
/* Level number for (get/set)sockopt() to apply to socket itself. */
#define SOL_SOCKET		0xffff	/* options for socket level */
#endif /* SOL_SOCKET */

#define SOL_MONSOCKET		0xfffe	/* MOS monitor socket level */
193 
/**
 * MOS monitor socket option names (and values).
 * These are the options pertaining to monitor stream sockets. Each
 * documented option notes whether it is meant for getsockopt() or
 * setsockopt(); options without a description are not documented in
 * this header.
 */
enum mos_socket_opts {
	/** (getsockopt) Gives back offsets to fragments of buffers
	 * currently stored in client's TCP ring buffer. */
	MOS_FRAGINFO_CLIBUF	= 0x01,
	/** (getsockopt) Gives back offsets to fragments of buffers
	 * currently stored in server's TCP ring buffer. */
	MOS_FRAGINFO_SVRBUF	= 0x02,
	/** (getsockopt) Gives back tcp info for client-side ring buffer. */
	MOS_INFO_CLIBUF		= 0x03,
	/** (getsockopt) Gives back tcp info for server-side ring buffer. */
	MOS_INFO_SVRBUF		= 0x04,
	/** (getsockopt) Retrieves current TCP state for client side. */
	MOS_TCP_STATE_CLI	= 0x05,
	/** (getsockopt) Retrieves current TCP state for server side. */
	MOS_TCP_STATE_SVR	= 0x06,
	/** (getsockopt) Retrieves timestamp of last packet seen for
	 * given flow (in usecs). */
	MOS_TIMESTAMP		= 0x07,
	MOS_MONLEVEL		= 0x08,
	MOS_CLIBUF		= 0x09,
	MOS_SVRBUF		= 0x0a,
	/** (setsockopt) Changes the sequence number (see seq_remap_info). */
	MOS_SEQ_REMAP		= 0x0b,
	/** (setsockopt) Stop monitoring. */
	MOS_STOP_MON		= 0x0c,
	MOS_FRAG_CLIBUF		= 0x0d,
	MOS_FRAG_SVRBUF		= 0x0e,
	MOS_CLIOVERLAP		= 0x0f,
	MOS_SVROVERLAP		= 0x10,
#ifdef OLD_API
	/* Old names; they share the numeric values of MOS_CLIOVERLAP and
	 * MOS_SVROVERLAP above. */
	MOS_NO_CLIBUF		= 0x0f,
	MOS_NO_SVRBUF		= 0x10,
#endif /* OLD_API */
};
250 
/**
 * MOS tcp buf info structure.
 * Used by the monitor application to retrieve tcp_stream-related
 * information; usually obtained through the getsockopt() function.
 */
struct tcp_buf_info {
	/** The initial TCP sequence number of TCP ring buffer. */
	uint32_t	tcpbi_init_seq;

	/** TCP sequence number of the 'last byte of payload that has
	 * already been read by the end application' (applies in the case
	 * of embedded monitor setup). */
	uint32_t	tcpbi_last_byte_read;

	/** TCP sequence number of the 'last byte of the payload that
	 * is currently buffered and needs to be read by the end
	 * application' (applies in the case of embedded monitor setup).
	 *
	 * In case of standalone monitors, tcpbi_last_byte_read =
	 * tcpbi_next_byte_expected. */
	uint32_t	tcpbi_next_byte_expected;

	/** TCP sequence number of the 'last byte of the payload that
	 * is currently stored' in the TCP ring buffer. This value
	 * may be greater than tcpbi_next_byte_expected if packets
	 * arrive out of order. */
	uint32_t	tcpbi_last_byte_received;
};
280 
/**
 * Structure to expose TCP ring buffer's fragment information:
 * one fragment is described by an offset and a length.
 */
struct tcp_ring_fragment {
	uint64_t	offset;
	uint32_t	len;
};
286 
/**
 * mOS tcp stream states.
 * Used by the monitor application to retrieve tcp_stream-state
 * information; usually obtained through the getsockopt() function.
 *
 * The numeric values are spelled out explicitly for every state.
 */
enum tcpstate
{
	TCP_CLOSED	= 0,
	TCP_LISTEN	= 1,
	TCP_SYN_SENT	= 2,
	TCP_SYN_RCVD	= 3,
	TCP_ESTABLISHED	= 4,
	TCP_FIN_WAIT_1	= 5,
	TCP_FIN_WAIT_2	= 6,
	TCP_CLOSE_WAIT	= 7,
	TCP_CLOSING	= 8,
	TCP_LAST_ACK	= 9,
	TCP_TIME_WAIT	= 10
};
307 
/**
 * mOS segment overlapping policies.
 * MOS_OVERLAP_CNT is the last enumerator and therefore equals the
 * number of policies listed before it.
 */
enum {
	MOS_OVERLAP_POLICY_FIRST = 0,
	MOS_OVERLAP_POLICY_LAST,
	MOS_OVERLAP_CNT
};
314 
/** Event identifier type: an unsigned 64-bit integer. */
typedef uint64_t event_t;
317 
/** Definition of monitor side (client, server, or both). */
enum {
	MOS_SIDE_CLI = 0,
	MOS_SIDE_SVR,
	MOS_SIDE_BOTH
};
320 
321 /* mos callback/filter function type definition */
322 /** Prototype of callback function */
323 typedef void (*callback_t)(mctx_t mctx, int sock, int side,
324 			 event_t event, filter_arg_t *arg);
325 /** Prototype of UDE's filter function */
326 typedef bool (*filter_t)(mctx_t mctx, int sock, int side,
327 		       event_t event, filter_arg_t *arg);
328 
/*----------------------------------------------------------------------------*/
/**
 * Definition of the monitor_filter type.
 * Which member is meaningful depends on the type of the monitoring
 * socket the filter is bound to; monitor_filter_t is a pointer to it.
 */
union monitor_filter {
	/** For MOS_SOCK_MONITOR_RAW type socket */
	char	*raw_pkt_filter;

	/** For MOS_SOCK_MONITOR_STREAM type socket */
	struct {
		char	*stream_syn_filter;
		char	*stream_orphan_filter;
	};
};
typedef union monitor_filter *monitor_filter_t;
341 
342 /* Assign an address range (specified by ft) to monitor via sock
343  *
344  * (1) If sock is MOS_SOCK_MONITOR_RAW type, ft.raw_pkt_filter is applied to
345  *     every packet coming in.
346  * (2) If sock is MOS_SOCK_MONITOR_STREAM type,
347  *     ft.stream_syn_filter is applied to the first SYN pkt of the flow.
348  *     (The succeeding packets of that flow will bypass the filter operation.)
349  *     ft.stream_orphan_filter is applied to the pkts that don't belong to any
350  *     of the existing TCP streams which are being monitored.
351  *     (e.g., non-SYN pkt with no identified flow)
352  * [*] ft.stream_syn_filter and ft.stream_orphan_filter should be consisted
353  *     only of the following keywords:
354  *     - 'tcp, 'host', 'src', 'dst', 'net', 'mask', 'port', 'portrange'
355  *     - 'and', 'or', '&', '|'
356  *
357  * @param [in] mctx: mtcp context
358  * @param [in] sock: socket id (should be MOS_SOCK_MONITOR_RAW
359  *                   or MOS_SOCK_MONITOR_STREAM type)
360  * @param [in] cf: Describe a set of connections to accept
361  *                 in a BPF (Berkerley Packet Filter) format
362  *                 NULL if you want to monitor any packet
363  * @return zero on success, -1 on error
364  */
365 int
366 mtcp_bind_monitor_filter(mctx_t mctx, int sock, monitor_filter_t ft);
367 /*----------------------------------------------------------------------------*/
368 
369 /** Register a callback function in hook_point
370  * @param [in] mctx: mtcp context
371  * @param [in] sock: socket id
372  * @param [in] event: event id
373  * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND, MOS_DONTCARE
374  * @param [in] cb: callback fucntion
375  * @return zero on success, -1 on error
376  *
377  * (both for packet-level and flow-level) for events in hook_point
378  */
379 int
380 mtcp_register_callback(mctx_t mctx, int sock, event_t event,
381 		                       int hook_point, callback_t cb);
382 
383 /** Remove registered callback functions
384  * @param [in] mctx: mtcp context
385  * @param [in] sock: socket id
386  * @param [in] event: event id
387  * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND, MOS_NULL
388  * @return zero on success, -1 on error
389  *
390  * (both for packet-level and flow-level) for events in hook_point
391  */
392 //int
393 //mtcp_unregister_callback(mctx_t mctx, int sock, event_t event,
394 //		                       int hook_point);
395 
396 /** Allocate a child event
397  * @param [in] event: event id
398  * @return new event id on success, 0 on error
399  */
400 event_t
401 mtcp_alloc_event(event_t event);
402 
403 /** Define a user-defined event function
404  * @param [in] event: event id
405  * @param [in] filter: filter fucntion for new event
406  * @param [in] arg: a filter argument to be delivered to the filter
407  * @return new event id on success, 0 on error
408  *
409  * (both for packet-level and flow-level)
410  */
411 event_t
412 mtcp_define_event(event_t event, filter_t filter, struct filter_arg *arg);
413 
414 /** Raise a event
415  * @param [in] mctx: mtcp context
416  * @param [in] event: event id
417  * @return 0 on success, -1 on error
418  */
419 int
420 mtcp_raise_event(mctx_t mctx, event_t event);
421 
422 /*
423  * Callback only functions
424  */
425 
426 /** Set user-level context
427  * (e.g., to store any per-flow user-defined meatadata)
428  * @param [in] mctx: mtcp context
429  * @param [in] sock: the monitor socket id
430  * @param [in] uctx: user-level context
431  */
432 void
433 mtcp_set_uctx(mctx_t mctx, int sock, void *uctx);
434 
435 /** Get user-level context
436  * (e.g., to retrieve user-defined metadata stored in mtcp_set_uctx())
437  * @param [in] mctx: mtcp context
438  * @param [in] sock: the monitor socket id
439  * @return user-level context for input flow_ocntext
440  */
441 void *
442 mtcp_get_uctx(mctx_t mctx, int sock);
443 
444 /** Peeking bytestream from flow_context
445  * @param [in] mctx: mtcp context
446  * @param [in] sock: monitoring stream socket id
447  * @param [in] side: side of monitoring (client side, server side or both)
448  * @param [in] buf: buffer for read byte stream
449  * @param [in] len: requested length
450  *
451  * It will return the number of bytes actually read.
452  * It will return -1 if there is an error
453 */
454 ssize_t
455 mtcp_peek(mctx_t mctx, int sock, int side,
456 	     char *buf, size_t len);
457 
458 /**
459  * The mtcp_ppeek() function reads up to count bytes from the TCP ring
460  * buffer of the monitor socket sock in mctx into buf, starting from
461  * the TCP sequence number seq_num.
462  * Note that seq_num can point the data in the fragmented buffer list
463  * of the TCP ring buffer. If there is no received byte with TCP sequence
464  * number seq_num in the TCP ring buffer, it returns error. If there are
465  * received bytes starting from seq_num, count is set to be the number
466  * of bytes read from the buffer. After mtcp_ppeek(), the data in the
467  * TCP ring buffer will not be flushed, and the monitor offset used by
468  * mtcp_peek() is not changed.
469  *
470  * @param [in] mctx: mtcp context
471  * @param [in] sock: monitoring stream socket id
472  * @param [in] side: side of monitoring (client side, server side or both)
473  * @param [in] buf: buffer for read byte stream
474  * @param [in] count: No. of bytes to be read
475  * @param [in] seq_num: byte offset of the TCP bytestream (absolute offset: offset 0 = init_seq_num)
476  * @return # of bytes actually read on success, -1 for error
477  */
478 ssize_t mtcp_ppeek(mctx_t mctx, int sock, int side,
479 			  char *buf, size_t count, uint64_t off);
480 
481 /* Use this macro to copy packets when mtcp_getlastpkt is called */
482 #define MTCP_CB_GETCURPKT_CREATE_COPY
483 
484 /** Get current packet of mtcp context
485  * @param [in] mctx: mTCP/mOS context
486  * @param [in] sock: monitoring stream socket id
487  * @param [in] side: side of monitoring
488  *                   (MOS_NULL for MOS_SOCK_MONITOR_RAW socket)
489  * @param [in] p: ptr to packet info ptr
490  * (only L2-L3 information is available for MOS_SOCK_MONITOR_RAW socket)
491  * @return 0 on success, -1 on failure
492  * This is useful for running callback-only applications
493  */
494 int
495 mtcp_getlastpkt(mctx_t mctx, int sock, int side, struct pkt_info *p);
496 
497 /** Register user's custom timer
498  * @param [in] mctx: mtcp context
499  * @param [in] id: timer id
500  * @param [in] timeout: timeout length
501  * @param [in] cb: callback function
502  */
503 int
504 mtcp_settimer(mctx_t mctx, int id, struct timeval *timeout, callback_t cb);
505 
506 /** A sibling function to mtcp_settimer that returns
507  * the current timestamp of the machine in microseconds.
508  * This avoids the monitor application to call current
509  * time getter functions (e.g. gettimeofday) that may
510  * incur overhead.
511  *
512  * @param [in] mctx: mtcp context
513  * Returns timestamp on success, 0 on failure.
514  */
515 uint32_t
516 mtcp_cb_get_ts(mctx_t mctx);
517 
518 /** Pause mtcp application context since it is not running anything
519  * @param [in] mctx: mtcp context
520  *
521  * This is useful for running callback-only applications
522  */
523 void
524 mtcp_app_join(mctx_t mctx);
525 
526 /** Get IP addrs/ports for both sides.
527  * (Server IP/port in 0th element) (Client IP/port in 1st element)
528  * Should only be called with MOS_SOCK_MONITOR_STREAM_ACTIVE socket
529  * _NOTE_: Code is currently not set for MOS_SOCK_STREAM!!!
530  * Returns 0 on success, -1 on failure
531  */
532 int
533 mtcp_getpeername(mctx_t mctx, int sock, struct sockaddr *saddr, socklen_t *addrlen, int side);
534 
535 /**
536  * Updates the Ethernet frame at a given offset across
537  * datalen bytes.
538  *
539  * @param [in] mctx: mtcp context
540  * @param [in] sock: monitoring socket
541  * @param [in] side: monitoring side
542  *                   (MOS_NULL for MOS_SOCK_MONITOR_RAW socket)
543  * @param [in] offset: the offset from where the data needs to be written
544  * @param [in] data: the data buffer that needs to be written
545  * @param [in] datalen: the length of data that needs to be written
546  * @param [in] option: disjunction of MOS_ETH_HDR, MOS_IP_HDR, MOS_TCP_HDR,
547  *			MOS_TCP_PAYLOAD, MOS_DROP_PKT, MOS_UPDATE_TCP_CHKSUM,
548  *			MOS_UPDATE_IP_CHKSUM
549  * @return Returns 0 on success, -1 on failure
550  *
551  * If you want to chomp/insert something in the payload:
552  * (i) first update the ip header to adjust iph->tot_len field; (MOS_OVERWRITE)
553  * (ii) then update the tcp payload accordingly (MOS_CHOMP or MOS_INSERT)
554  *
555  * MOS_DROP, MOS_OVERWRITE, MOS_CHOMP and MOS_INSERT are mutually
556  * exclusive operations
557  */
558 int
559 mtcp_setlastpkt(mctx_t mctx, int sock, int side, off_t offset,
560 		byte *data, uint16_t datalen, int option);
561 
562 /** Drop current packet (don't forward it to the peer node)
563  * @param [in] mctx: mtcp context
564  *
565  * This is useful for running callback-only applications
566  * This function is now deprecated...
567  */
568 //int
569 //mtcp_cb_dropcurpkt(mctx_t mctx);
570 
571 /* Reset the connection (send RST to both sides)
572  * (This API will be updated after discussion.)
573  */
574 int
575 mtcp_reset_conn(mctx_t mctx, int sock);
576 
577 int
578 mtcp_set_debug_string(mtcp_manager_t mtcp, const char *fmt, ...);
579 
580 int
581 mtcp_get_debug_string(mctx_t mctx, char *buf, int len);
582 
583 /**************************************************************************/
584 /** Send a TCP packet of struct pkt_info
585  * @param [in] mctx: mTCP/mOS context
586  * @param [in] sock: monitoring stream socket id
587  * @param [in] pkt: ptr to packet info (e.g., captured by mtcp_getlastpkt)
588  * @return 0 on success, -1 on failure
589  * (NOTE: this function supports only TCP packet for now.
590  *  we will add the support for any ethernet packets when required)
591  */
592 int
593 mtcp_sendpkt(mctx_t mctx, int sock, const struct pkt_info *pkt);
594 
595 /**************************************************************************/
596 
597 #endif /* __MOS_API_H_ */
598