1 #ifndef __MOS_API_H_
2 #define __MOS_API_H_
3 
4 #ifdef DARWIN
5 #include <netinet/tcp.h>
6 #include <netinet/if_ether.h>
7 #else
8 #include <linux/tcp.h>
9 #include <linux/if_ether.h>
10 #endif
11 #include <netinet/in.h>
12 #include <arpa/inet.h>
13 #include <netinet/ip.h>
14 #include <stddef.h> /* for offsetof */
15 #include "mtcp_epoll.h"
16 #include <stdbool.h>
17 
/*
 * Opaque handle types: only the struct tags are named here, the struct
 * bodies are not visible in this header. Each typedef is wrapped in its
 * own guard macro so that it is emitted at most once per translation unit
 * (presumably other headers define the same typedefs under the same
 * guards -- TODO confirm).
 */
#ifndef __MTCP_MANAGER
#define __MTCP_MANAGER
typedef struct mtcp_manager * mtcp_manager_t;
#endif
#ifndef __SOCKET_MAP
#define __SOCKET_MAP
typedef struct socket_map * socket_map_t;
#endif
26 
/**
 * Available hooking points.
 *
 * NOTE: The value of a hooking point must not overlap with any value of
 * enum mos_event_type (the built-in events use the low-order bits).
 */
enum mtcp_hook_point
{
	/** Very first hooking point of an incoming packet, even before flow
	 * identification */
	MOS_NULL	= (1 << 29),
	/** Hooking point before TCP receiver */
	MOS_HK_RCV	= (1 << 30),
	/** Hooking point after TCP sender.
	 * Bit 31 is the sign bit of a 32-bit int, so `1 << 31` on a signed
	 * operand is undefined behavior. The bit is built as unsigned and
	 * then converted, which yields the same bit pattern (0x80000000). */
	MOS_HK_SND	= (int)(1U << 31),
};
41 
/**
 * Built-in events provided by mOS.
 * Every event other than MOS_NULL_EVENT occupies a single, distinct bit.
 */
enum mos_event_type
{
	/** Invalid event */
	MOS_NULL_EVENT		= 0x000,

	/* mOS-defined TCP built-in events */

	/** A packet is coming in. */
	MOS_ON_PKT_IN		= 0x001,

	/** A packet is going out.
	 * DEPRECATED: this event is now used only for debugging. */
	MOS_ON_PKT_OUT		= 0x002,

	/** SYN packet as seen by the monitor.
	 *  client side: activated when the client state is set to SYN_SENT
	 *  server side: activated when the server state is set to SYN_RCVD
	 *
	 *  Retransmitted SYN packets do not activate this event. */
	MOS_ON_CONN_START	= 0x004,

	/** 3-way handshake is finished.
	 *  server side: ACK is coming in as a response to SYNACK.
	 *  client side: SYNACK is coming in as a response to SYN.
	 * DEPRECATED. */
	MOS_ON_CONN_SETUP	= 0x008,

	/** New data is now readable.
	 * This event is available only in the MOS_NULL hook point.
	 * mOS raises this event only once during batched packet processing. */
	MOS_ON_CONN_NEW_DATA	= 0x010,

	/** Abnormal behavior is detected.
	 * NOTE: This is not fully implemented yet. */
	MOS_ON_ERROR		= 0x020,

	/** No packet is seen for a long time.
	 * This is implemented as mtcp_cb_settimer(). */
	MOS_ON_TIMEOUT		= 0x040,

	/** TCP state is being changed. */
	MOS_ON_TCP_STATE_CHANGE	= 0x080,

	/** A packet is not SYN and has no identified flow. */
	MOS_ON_ORPHAN		= 0x100,

	/** Retransmission is detected. */
	MOS_ON_REXMIT		= 0x200,

	/** A flow is about to be destroyed.
	 * A 4-way handshake, an RST packet or a timeout could be the reason.
	 * NOTE: In the current implementation, mOS raises this event while
	 * destroying `struct tcp_stream`. False positives are possible,
	 * especially when mOS is running out of memory. */
	MOS_ON_CONN_END		= 0x400,

	/** This event is for debugging. It can easily be muted later. */
	MOS_ON_DEBUG_MESSAGE	= 0x800,
};
92 
#if 0
/* Compiled out by the enclosing `#if 0`: event_data_t is not available
 * to code that includes this header. */
/* This may go away in future revisions */
typedef union event_data {
	uint32_t u32;
	uint64_t u64;
	void *ptr;
} event_data_t;
#endif
101 
/*
 * Flag bits for updating the packet context; each flag is one distinct bit.
 * mtcp_setlastpkt() documents its `option` argument as a disjunction of
 * these flags, and states that MOS_DROP, MOS_OVERWRITE, MOS_CHOMP and
 * MOS_INSERT are mutually exclusive operations.
 */
#define MOS_ETH_HDR		0x001
#define MOS_IP_HDR		0x002
#define MOS_TCP_HDR		0x004
#define MOS_TCP_PAYLOAD		0x008
#define MOS_UPDATE_IP_CHKSUM	0x010
#define MOS_UPDATE_TCP_CHKSUM	0x020
#define MOS_DROP		0x040
#define MOS_OVERWRITE		0x080
#define MOS_CHOMP		0x100
#define MOS_INSERT		0x200
113 
/**
 * struct pkt_info is the packet description that is actually
 * exposed to the monitor application.
 *
 * NOTE: When you retrieve the packet information using mtcp_getlastpkt()
 * via a MOS_SOCK_MONITOR_RAW socket, you can only use up to L3 information
 * (cur_ts, eth_len, ip_len, ethh, iph).
 */
struct pkt_info {
	uint32_t      cur_ts;    /**< packet receiving time (read-only:ro) */

	/* ETH */
	uint16_t      eth_len;   /* presumably the Ethernet frame length -- TODO confirm */

	/* IP */
	uint16_t      ip_len;    /* presumably the IP datagram length -- TODO confirm */

	/* TCP */
	uint64_t      offset;    /**< TCP ring buffer offset */
	uint16_t      payloadlen; /* presumably the TCP payload length -- TODO confirm */
	uint32_t      seq;
	uint32_t      ack_seq;
	uint16_t      window;

	/*
	 * ~~ 28 byte boundary ~~
	 * The members above add up to 28 bytes of data. The compiler may
	 * insert padding between them, so use PKT_INFO_LEN (the offset of
	 * `ethh`) rather than the literal 28 when a size is needed.
	 */

	/*
	 * CAUTION!!!
	 * It is extremely critical that the trailing pointer fields
	 * (ethh .. payload) are always placed at the end of the definition.
	 * MOS relies on this specific arrangement when it is creating a new
	 * instantiation of pctx during mtcp_getlastpkt() invocation.
	 *
	 * NOTE(review): this comment used to say "last 5 fields
	 * (ethh .. frame)", but only the four pointers below are declared
	 * and there is no `frame` member -- confirm nothing else depends on
	 * a fifth field.
	 */
	struct ethhdr *ethh;
	struct iphdr  *iph;
	struct tcphdr *tcph;
	uint8_t       *payload;
};
152 
/**
 * PACKET CONTEXT is the packet structure that goes through
 * the mOS core. Its first member is the application-visible
 * struct pkt_info; the remaining members carry direction, forwarding
 * and interface information.
 */
struct pkt_ctx {
	struct pkt_info  p;

	int8_t        direction; /**< where does this packet originate from? (ro)*/
	uint8_t       forward;   /**< 0: drop, 1: forward to out_ifidx (rw) */
	int8_t        in_ifidx;  /**< input interface (ro) */
	int8_t        out_ifidx; /**< output interface (rw) */
	int8_t        batch_index; /**< index of packet in the rx batch */
	/* ~~ 64 byte boundary ~~ */
	/* NOTE(review): with 8-byte pointers and natural alignment `p` alone
	 * would already be 64 bytes, which would put this marker before
	 * `direction` rather than here -- confirm what the marker refers to. */
};
/* Byte offset of the `ethh` member inside struct pkt_info, i.e. the size
 * of the part of pkt_info that precedes the header/payload pointers. */
#define PKT_INFO_LEN		offsetof(struct pkt_info, ethh)
168 
/*
 * Sequence number change structure.
 * Used for the MOS_SEQ_REMAP socket option.
 */
typedef struct {
	int64_t seq_off;	/* the amount of sequence number drift */
	int side;		/* which side does this sequence number change apply to?
				 * (presumably one of the MOS_SIDE_* values -- TODO confirm) */
	uint32_t base_seq;	/* seq # of the flow where the actual sequence # translation starts */
} seq_remap_info;
178 
/*
 * Argument blob handed to callback_t and filter_t functions and accepted
 * by mtcp_define_event().
 */
typedef struct filter_arg {
	void *arg;	/* pointer to the argument data */
	size_t len;	/* presumably the size of *arg in bytes -- TODO confirm */
} filter_arg_t;
183 
/**
 * Socket option levels.
 *
 * SOL_SOCKET is the level number the POSIX socket library uses for
 * options that apply to the socket itself. It is defined here only when
 * no previously included header has defined it. SOL_MONSOCKET is the
 * level for mOS monitor sockets.
 */
#ifndef SOL_SOCKET
/* Level number for (get/set)sockopt() to apply to socket itself. */
#define SOL_SOCKET 		0xffff	/* options for socket level */
#endif
#define SOL_MONSOCKET		0xfffe	/* MOS monitor socket level */
193 
/**
 * mOS monitor socket option names (and values).
 * These are the options pertaining to monitor stream sockets.
 *
 * Options read with getsockopt():
 *   MOS_FRAGINFO_CLIBUF - gives back offsets to fragments of buffers
 *                         currently stored in the client's TCP ring buffer
 *   MOS_FRAGINFO_SVRBUF - gives back offsets to fragments of buffers
 *                         currently stored in the server's TCP ring buffer
 *   MOS_INFO_CLIBUF     - gives back tcp info for the client-side ring buffer
 *   MOS_INFO_SVRBUF     - gives back tcp info for the server-side ring buffer
 *   MOS_TCP_STATE_CLI   - retrieves the current TCP state of the client side
 *   MOS_TCP_STATE_SVR   - retrieves the current TCP state of the server side
 *   MOS_TIMESTAMP       - retrieves the timestamp (in usecs) of the last
 *                         packet seen for the given flow
 *
 * Options written with setsockopt():
 *   MOS_SEQ_REMAP       - changes the sequence number translation
 *   MOS_STOP_MON        - stops monitoring
 *
 * MOS_MONLEVEL, MOS_CLIBUF, MOS_SVRBUF, MOS_FRAG_CLIBUF, MOS_FRAG_SVRBUF
 * and the OLD_API-only MOS_NO_CLIBUF / MOS_NO_SVRBUF carry no description
 * in this header.
 */
enum mos_socket_opts {
	MOS_FRAGINFO_CLIBUF	= 1,
	MOS_FRAGINFO_SVRBUF	= 2,
	MOS_INFO_CLIBUF		= 3,
	MOS_INFO_SVRBUF		= 4,
	MOS_TCP_STATE_CLI	= 5,
	MOS_TCP_STATE_SVR	= 6,
	MOS_TIMESTAMP		= 7,
	MOS_MONLEVEL		= 8,
	MOS_CLIBUF		= 9,
	MOS_SVRBUF		= 10,
	MOS_SEQ_REMAP		= 11,
	MOS_STOP_MON		= 12,
	MOS_FRAG_CLIBUF		= 13,
	MOS_FRAG_SVRBUF		= 14,
#ifdef OLD_API
	/* Only present when the header is compiled with OLD_API defined */
	MOS_NO_CLIBUF		= 15,
	MOS_NO_SVRBUF		= 16,
#endif
};
248 
/**
 * mOS TCP buffer info structure.
 * Used by the monitor application to retrieve
 * tcp_stream-related info. Usually obtained via
 * the getsockopt() function (see MOS_INFO_CLIBUF / MOS_INFO_SVRBUF).
 */
struct tcp_buf_info {
	/** The initial TCP sequence number of TCP ring buffer. */
	uint32_t tcpbi_init_seq;
	/** TCP sequence number of the 'last byte of payload that has
	 * already been read by the end application' (applies in the case
	 * of embedded monitor setup)
	 */
	uint32_t tcpbi_last_byte_read;
	/** TCP sequence number of the 'last byte of the payload that
	 * is currently buffered and needs to be read by the end
	 * application' (applies in the case of embedded monitor setup).
	 *
	 * In case of standalone monitors, tcpbi_last_byte_read =
	 * tcpbi_next_byte_expected
	 */
	uint32_t tcpbi_next_byte_expected;
	/** TCP sequence number of the 'last byte of the payload that
	 * is currently stored' in the TCP ring buffer. This value
	 * may be greater than tcpbi_next_byte_expected if packets
	 * arrive out of order.
	 */
	uint32_t tcpbi_last_byte_received;
};
278 
/*
 * struct tcp_ring_fragment has two alternative layouts, selected at
 * compile time by NEWPPEEK: an (offset, len) pair, or a linked-list node
 * keyed by TCP sequence number.
 */
#ifdef NEWPPEEK
/** Structure to expose TCP ring buffer's fragment information. */
struct tcp_ring_fragment {
	/** offset of the fragment (presumably a byte offset in the TCP
	 * bytestream, like pkt_info.offset -- TODO confirm) */
	uint64_t offset;
	/** length of the fragment */
	uint32_t len;
};
#else
/** Structure to expose TCP ring buffer's fragment information. */
struct tcp_ring_fragment {
	/** TCP sequence number of the packet */
	uint32_t seq_num;
	/** length of the fragment (this was previously described as a
	 * "TCP sequence number", which does not match the field name) */
	uint32_t len;
	/** points to the next fragment, NULL if it is the end of the list */
	struct tcp_ring_fragment *next;
};
#endif
296 
/**
 * mOS TCP stream states.
 * Used by the monitor application to retrieve the state of a
 * tcp_stream, usually via getsockopt() (see MOS_TCP_STATE_CLI and
 * MOS_TCP_STATE_SVR).
 *
 * The enumerators are numbered consecutively from 0; the trailing
 * comments give the resulting value. Do not reorder them.
 */
enum tcpstate
{
	TCP_CLOSED = 0,		/*  0 */
	TCP_LISTEN,		/*  1 */
	TCP_SYN_SENT,		/*  2 */
	TCP_SYN_RCVD,		/*  3 */
	TCP_ESTABLISHED,	/*  4 */
	TCP_FIN_WAIT_1,		/*  5 */
	TCP_FIN_WAIT_2,		/*  6 */
	TCP_CLOSE_WAIT,		/*  7 */
	TCP_CLOSING,		/*  8 */
	TCP_LAST_ACK,		/*  9 */
	TCP_TIME_WAIT		/* 10 */
};
317 
/** Event identifier type (64-bit unsigned) */
typedef uint64_t event_t;

/** Monitor side selectors */
enum {
	MOS_SIDE_CLI  = 0,	/* client side */
	MOS_SIDE_SVR  = 1,	/* server side */
	MOS_SIDE_BOTH = 2	/* both sides */
};
323 
/* mOS callback/filter function type definitions */
/**
 * Prototype of a callback function.
 * Receives the mtcp context, the socket id, the side (see MOS_SIDE_*),
 * the event id and a filter argument. Returns nothing.
 */
typedef void (*callback_t)(mctx_t mctx, int sock, int side,
			 event_t event, filter_arg_t *arg);
/**
 * Prototype of UDE's filter function (UDE: presumably "user-defined
 * event", cf. mtcp_define_event()). Takes the same arguments as
 * callback_t and returns a bool (presumably true when the new event
 * should be raised -- TODO confirm).
 */
typedef bool (*filter_t)(mctx_t mctx, int sock, int side,
		       event_t event, filter_arg_t *arg);
331 
332 /*----------------------------------------------------------------------------*/
/*
 * Definition of monitor_filter type.
 * This is a union: raw_pkt_filter shares storage with the anonymous
 * struct holding the two stream filters, so only the member(s) matching
 * the socket type are meaningful.
 */
union monitor_filter {
	/** For MOS_SOCK_MONITOR_RAW type socket **/
	char *raw_pkt_filter;
	/** For MOS_SOCK_MONITOR_STREAM type socket **/
	struct {
		char *stream_syn_filter;
		char *stream_orphan_filter;
	};
};
/* monitor_filter_t is a POINTER to union monitor_filter */
typedef union monitor_filter *monitor_filter_t;
344 
/* Assign an address range (specified by ft) to monitor via sock
 *
 * (1) If sock is of MOS_SOCK_MONITOR_RAW type, ft->raw_pkt_filter is
 *     applied to every packet coming in.
 * (2) If sock is of MOS_SOCK_MONITOR_STREAM type,
 *     ft->stream_syn_filter is applied to the first SYN pkt of the flow.
 *     (The succeeding packets of that flow will bypass the filter operation.)
 *     ft->stream_orphan_filter is applied to the pkts that don't belong to
 *     any of the existing TCP streams which are being monitored.
 *     (e.g., non-SYN pkt with no identified flow)
 * [*] ft->stream_syn_filter and ft->stream_orphan_filter should consist
 *     only of the following keywords:
 *     - 'tcp', 'host', 'src', 'dst', 'net', 'mask', 'port', 'portrange'
 *     - 'and', 'or', '&', '|'
 *
 * @param [in] mctx: mtcp context
 * @param [in] sock: socket id (should be MOS_SOCK_MONITOR_RAW
 *                   or MOS_SOCK_MONITOR_STREAM type)
 * @param [in] ft: describes the set of connections to accept
 *                 in BPF (Berkeley Packet Filter) format;
 *                 NULL if you want to monitor any packet
 * @return zero on success, -1 on error
 */
int
mtcp_bind_monitor_filter(mctx_t mctx, int sock, monitor_filter_t ft);
370 /*----------------------------------------------------------------------------*/
371 
/** Register a callback function (both for packet-level and flow-level)
 * for events in hook_point
 * @param [in] mctx: mtcp context
 * @param [in] sock: socket id
 * @param [in] event: event id
 * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND or MOS_NULL
 *             (NOTE(review): this list used to name MOS_DONTCARE, which
 *             this header does not define; MOS_NULL is the third member
 *             of enum mtcp_hook_point -- confirm it is the intended value)
 * @param [in] cb: callback function
 * @return zero on success, -1 on error
 */
int
mtcp_register_callback(mctx_t mctx, int sock, event_t event,
		                       int hook_point, callback_t cb);
385 
386 /** Remove registered callback functions
387  * @param [in] mctx: mtcp context
388  * @param [in] sock: socket id
389  * @param [in] event: event id
390  * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND, MOS_NULL
391  * @return zero on success, -1 on error
392  *
393  * (both for packet-level and flow-level) for events in hook_point
394  */
395 //int
396 //mtcp_unregister_callback(mctx_t mctx, int sock, event_t event,
397 //		                       int hook_point);
398 
/** Allocate a child event
 * @param [in] event: event id (presumably the parent of the new child
 *                    event -- TODO confirm)
 * @return new event id on success, 0 on error
 */
event_t
mtcp_alloc_event(event_t event);
405 
/** Define a user-defined event (both for packet-level and flow-level)
 * @param [in] event: event id (presumably the existing event the new
 *                    one is derived from -- TODO confirm)
 * @param [in] filter: filter function for the new event
 * @param [in] arg: a filter argument to be delivered to the filter
 * @return new event id on success, 0 on error
 */
event_t
mtcp_define_event(event_t event, filter_t filter, struct filter_arg *arg);
416 
/** Raise an event
 * @param [in] mctx: mtcp context
 * @param [in] event: event id
 * @return 0 on success, -1 on error
 */
int
mtcp_raise_event(mctx_t mctx, event_t event);
424 
425 /*
426  * Callback only functions
427  */
428 
/** Set user-level context
 * (e.g., to store any per-flow user-defined metadata)
 * @param [in] mctx: mtcp context
 * @param [in] sock: the monitor socket id
 * @param [in] uctx: user-level context
 */
void
mtcp_set_uctx(mctx_t mctx, int sock, void *uctx);
437 
/** Get user-level context
 * (e.g., to retrieve user-defined metadata stored with mtcp_set_uctx())
 * @param [in] mctx: mtcp context
 * @param [in] sock: the monitor socket id
 * @return user-level context for the given socket
 */
void *
mtcp_get_uctx(mctx_t mctx, int sock);
446 
/** Peek at the bytestream of a monitored flow
 * @param [in] mctx: mtcp context
 * @param [in] sock: monitoring stream socket id
 * @param [in] side: side of monitoring (client side, server side or both)
 * @param [out] buf: buffer that receives the bytes read
 * @param [in] len: requested length
 * @return the number of bytes actually read, or -1 if there is an error
 */
ssize_t
mtcp_peek(mctx_t mctx, int sock, int side,
	     char *buf, size_t len);
460 
/**
 * The mtcp_ppeek() function reads up to count bytes from the TCP ring
 * buffer of the monitor socket sock in mctx into buf, starting from
 * the position given by the last argument (seq_num, or off when built
 * with NEWPPEEK).
 * Note that this position can point to data in the fragmented buffer
 * list of the TCP ring buffer. If there is no received byte at that
 * position in the TCP ring buffer, it returns error. If there are
 * received bytes starting from that position, the number of bytes read
 * from the buffer is returned (count is passed by value, so it is not
 * modified). After mtcp_ppeek(), the data in the TCP ring buffer will
 * not be flushed, and the monitor offset used by mtcp_peek() is not
 * changed.
 *
 * @param [in] mctx: mtcp context
 * @param [in] sock: monitoring stream socket id
 * @param [in] side: side of monitoring (client side, server side or both)
 * @param [out] buf: buffer that receives the bytes read
 * @param [in] count: No. of bytes to be read
 * @param [in] seq_num / off: byte offset of the TCP bytestream
 *             (absolute offset: offset 0 = init_seq_num)
 * @return # of bytes actually read on success, -1 for error
 */
#ifdef NEWPPEEK
ssize_t mtcp_ppeek(mctx_t mctx, int sock, int side,
			  char *buf, size_t count, uint64_t off);
#else
ssize_t mtcp_ppeek(mctx_t mctx, int sock, int side,
		      char *buf, size_t count, off_t seq_num);
#endif
488 
/* Use this macro to copy packets when mtcp_getlastpkt is called.
 * It is defined with no value, so it can only be tested with
 * #ifdef / #if defined(). */
#define MTCP_CB_GETCURPKT_CREATE_COPY
491 
/** Get the current (last) packet of an mtcp context
 * This is useful for running callback-only applications.
 * @param [in] mctx: mTCP/mOS context
 * @param [in] sock: monitoring stream socket id
 * @param [in] side: side of monitoring
 *                   (MOS_NULL for MOS_SOCK_MONITOR_RAW socket)
 * @param [out] p: pointer to a struct pkt_info (presumably filled in by
 *                 this call -- TODO confirm); only L2-L3 information is
 *                 available for a MOS_SOCK_MONITOR_RAW socket
 * @return 0 on success, -1 on failure
 */
int
mtcp_getlastpkt(mctx_t mctx, int sock, int side, struct pkt_info *p);
504 
/** Register user's custom timer
 * @param [in] mctx: mtcp context
 * @param [in] id: timer id
 * @param [in] timeout: timeout length
 * @param [in] cb: callback function
 * @return an int whose meaning is not documented in this header
 *         (presumably 0 on success, -1 on error like the other calls
 *         here -- TODO confirm)
 */
int
mtcp_settimer(mctx_t mctx, int id, struct timeval *timeout, callback_t cb);
513 
/** A sibling function to mtcp_settimer that returns
 * the current timestamp of the machine in microseconds.
 * This saves the monitor application from calling current
 * time getter functions (e.g. gettimeofday) that may
 * incur overhead.
 *
 * @param [in] mctx: mtcp context
 * @return timestamp on success, 0 on failure
 */
uint32_t
mtcp_cb_get_ts(mctx_t mctx);
525 
/** Pause the mtcp application context, since it is not running anything
 * @param [in] mctx: mtcp context
 *
 * This is useful for running callback-only applications.
 */
void
mtcp_app_join(mctx_t mctx);
533 
/** Get IP addrs/ports for both sides.
 * (Server IP/port in 0th element) (Client IP/port in 1st element)
 * Should only be called with MOS_SOCK_MONITOR_STREAM_ACTIVE socket
 *
 * @param [in] mctx: mtcp context
 * @param [in] sock: monitoring socket id
 * @param [out] saddr: receives the address(es); presumably an array of
 *                     two elements when both sides are requested --
 *                     TODO confirm
 * @param [in,out] addrlen: length of the saddr storage (presumably
 *                     updated on return, as with POSIX getpeername() --
 *                     TODO confirm)
 * @param [in] side: monitoring side (see MOS_SIDE_*)
 * @return 0 on success, -1 on failure
 */
int
mtcp_getpeername(mctx_t mctx, int sock, struct sockaddr *saddr, socklen_t *addrlen, int side);
541 
/**
 * Updates the Ethernet frame at a given offset across
 * datalen bytes.
 *
 * @param [in] mctx: mtcp context
 * @param [in] sock: monitoring socket
 * @param [in] side: monitoring side
 *                   (MOS_NULL for MOS_SOCK_MONITOR_RAW socket)
 * @param [in] offset: the offset from where the data needs to be written
 * @param [in] data: the data buffer that needs to be written
 *                   (the `byte` type is not declared in this header; it
 *                   must come from a header included before this point)
 * @param [in] datalen: the length of data that needs to be written
 * @param [in] option: disjunction of MOS_ETH_HDR, MOS_IP_HDR, MOS_TCP_HDR,
 *			MOS_TCP_PAYLOAD, MOS_UPDATE_TCP_CHKSUM,
 *			MOS_UPDATE_IP_CHKSUM, and one of MOS_DROP,
 *			MOS_OVERWRITE, MOS_CHOMP or MOS_INSERT
 *			(this list used to name MOS_DROP_PKT; the macro
 *			defined in this header is MOS_DROP)
 * @return Returns 0 on success, -1 on failure
 *
 * If you want to chomp/insert something in the payload:
 * (i) first update the ip header to adjust iph->tot_len field; (MOS_OVERWRITE)
 * (ii) then update the tcp payload accordingly (MOS_CHOMP or MOS_INSERT)
 *
 * MOS_DROP, MOS_OVERWRITE, MOS_CHOMP and MOS_INSERT are mutually
 * exclusive operations
 */
int
mtcp_setlastpkt(mctx_t mctx, int sock, int side, off_t offset,
		byte *data, uint16_t datalen, int option);
568 
569 /** Drop current packet (don't forward it to the peer node)
570  * @param [in] mctx: mtcp context
571  *
572  * This is useful for running callback-only applications
573  * This function is now deprecated...
574  */
575 //int
576 //mtcp_cb_dropcurpkt(mctx_t mctx);
577 
/** Reset the connection (send RST to both sides)
 * (This API will be updated after discussion.)
 * @param [in] mctx: mtcp context
 * @param [in] sock: socket id of the connection to reset
 * @return an int whose meaning is not documented in this header
 *         (presumably 0 on success, -1 on failure -- TODO confirm)
 */
int
mtcp_reset_conn(mctx_t mctx, int sock);
583 
/** Set the debug string.
 * NOTE(review): unlike the other functions in this header, this one takes
 * an mtcp_manager_t rather than an mctx_t. fmt and the variadic arguments
 * look printf-style -- confirm against the implementation. The meaning of
 * the return value is not documented here.
 */
int
mtcp_set_debug_string(mtcp_manager_t mtcp, const char *fmt, ...);
586 
/** Get the debug string.
 * @param [in] mctx: mtcp context
 * @param [out] buf: caller-provided buffer (presumably receives the debug
 *                   string -- TODO confirm)
 * @param [in] len: presumably the capacity of buf in bytes -- TODO confirm
 * @return an int whose meaning is not documented in this header
 */
int
mtcp_get_debug_string(mctx_t mctx, char *buf, int len);
589 
590 #endif /* __MOS_API_H_ */
591