1 #ifndef __MOS_API_H_ 2 #define __MOS_API_H_ 3 4 #ifdef DARWIN 5 #include <netinet/tcp.h> 6 #include <netinet/if_ether.h> 7 #else 8 #include <linux/tcp.h> 9 #include <linux/if_ether.h> 10 #endif 11 #include <netinet/in.h> 12 #include <arpa/inet.h> 13 #include <netinet/ip.h> 14 #include <stddef.h> /* for offsetof */ 15 #include "mtcp_epoll.h" 16 #include <stdbool.h> 17 18 #ifndef __MTCP_MANAGER 19 #define __MTCP_MANAGER 20 typedef struct mtcp_manager * mtcp_manager_t; 21 #endif 22 #ifndef __SOCKET_MAP 23 #define __SOCKET_MAP 24 typedef struct socket_map * socket_map_t; 25 #endif 26 27 /** Available hooking points */ 28 enum mtcp_hook_point 29 { 30 /* NOTE: The value of hooking points should not overlap with any of 31 * mos_event_types */ 32 33 /** Very first hooking point of incoming packet even before flow 34 * identification*/ 35 MOS_NULL = (1 << 29), 36 /** Hooking point before TCP receiver */ 37 MOS_HK_RCV = (1 << 30), 38 /** Hooking point after TCP sender */ 39 MOS_HK_SND = (1 << 31), 40 }; 41 42 /** Built-in events provided by mOS */ 43 enum mos_event_type 44 { 45 /** invalid event */ 46 MOS_NULL_EVENT = (0), 47 /* mos-defined tcp build-in events */ 48 /** A packet is coming in. */ 49 MOS_ON_PKT_IN = (0x1<<0), 50 /** A packet is going out. */ 51 /* THIS EVENT IS NOW DEPRECATED (USED ONLY FOR DEBUGGING) */ 52 MOS_ON_PKT_OUT = (0x1<<1), 53 /** SYN packet as seen by the monitor 54 * client side: activated when the client state is set to SYN_SENT 55 * server side: activated when the server state is set to SYN_RCVD 56 * 57 * Retransmitted SYN packets don't activate this event. 58 */ 59 MOS_ON_CONN_START = (0x1<<2), 60 /** 3-way handshake is finished. 61 * server side: ACK is coming in as a response of SYNACK. 62 * client side: SYNACK is coming in as a response of SYN. */ 63 /* THIS EVENT IS NOW DEPRECATED */ 64 MOS_ON_CONN_SETUP = (0x1<<3), 65 /** New data is now readable. 66 * This event is available in only MOS_NULL hook point. 67 * mOS raises this event only once while batched packet processing. */ 68 MOS_ON_CONN_NEW_DATA = (0x1<<4), 69 /** Abnormal behavior is detected. 70 * NOTE: This is not fully implemented yet. */ 71 MOS_ON_ERROR = (0x1<<5), 72 /** No packet is seen for a long time. 73 * This is implemented as mtcp_cb_settimer() 74 */ 75 MOS_ON_TIMEOUT = (0x1<<6), 76 /** TCP state is being changed. */ 77 MOS_ON_TCP_STATE_CHANGE = (0x1<<7), 78 /** A packet is not SYN and has no identified flow. */ 79 MOS_ON_ORPHAN = (0x1<<8), 80 /** Retransmission is detected */ 81 MOS_ON_REXMIT = (0x1<<9), 82 /** A flow is about to be destroyed. 83 * 4-way handshake, RST packet or timeout could be the reason. 84 * NOTE: In current implementation, mOS raises this event while destroying 85 * `struct tcp_stream`. There is possibility of false-positive especially 86 * when mOS is running out of memory. */ 87 MOS_ON_CONN_END = (0x1<<10), 88 89 /** This event is for debugging. We can easily mute this later. */ 90 MOS_ON_DEBUG_MESSAGE = (0x1<<11), 91 }; 92 93 #if 0 94 /* This may go away in future revisions */ 95 typedef union event_data { 96 uint32_t u32; 97 uint64_t u64; 98 void *ptr; 99 } event_data_t; 100 #endif 101 102 /* Macros for updating packet context */ 103 #define MOS_ETH_HDR (1 << 0) 104 #define MOS_IP_HDR (1 << 1) 105 #define MOS_TCP_HDR (1 << 2) 106 #define MOS_TCP_PAYLOAD (1 << 3) 107 #define MOS_UPDATE_IP_CHKSUM (1 << 4) 108 #define MOS_UPDATE_TCP_CHKSUM (1 << 5) 109 #define MOS_DROP (1 << 6) 110 #define MOS_OVERWRITE (1 << 7) 111 #define MOS_CHOMP (1 << 8) 112 #define MOS_INSERT (1 << 9) 113 114 /** 115 * struct pkt_info is the struct that is actually 116 * exposed to the monitor application. 117 * 118 * NOTE: When you retrieve the packet information using mtcp_getlastpkt() 119 * via MOS_SOCK_MONITOR_RAW socket, you can only use up to L3 information. 120 * (cur_ts, eth_len, ip_len, ethh, iph) 121 */ 122 struct pkt_info { 123 uint32_t cur_ts; /**< packet receiving time (read-only:ro) */ 124 125 /* ETH */ 126 uint16_t eth_len; 127 128 /* IP */ 129 uint16_t ip_len; 130 131 /* TCP */ 132 uint64_t offset; /**< TCP ring buffer offset */ 133 uint16_t payloadlen; 134 uint32_t seq; 135 uint32_t ack_seq; 136 uint16_t window; 137 138 /* ~~ 28 byte boundary ~~ */ 139 140 /* 141 * CAUTION!!! 142 * It is extremely critical that the last 5 fields (ethh .. frame) 143 * are always placed at the end of the definition. MOS relies on 144 * this specific arrangement when it is creating a new instantiation 145 * of pctx during mtcp_getlastpkt() invocation. 146 */ 147 struct ethhdr *ethh; 148 struct iphdr *iph; 149 struct tcphdr *tcph; 150 uint8_t *payload; 151 }; 152 153 /** 154 * PACKET CONTEXT is the packet structure that goes through 155 * the mOS core... 156 */ 157 struct pkt_ctx { 158 struct pkt_info p; 159 160 int8_t direction; /**< where does this packet originate from? (ro)*/ 161 uint8_t forward; /**< 0: drop, 1: forward to out_ifidx (rw) */ 162 int8_t in_ifidx; /**< input interface (ro) */ 163 int8_t out_ifidx; /**< output interface (rw) */ 164 int8_t batch_index; /**< index of packet in the rx batch */ 165 /* ~~ 64 byte boundary ~~ */ 166 }; 167 #define PKT_INFO_LEN offsetof(struct pkt_info, ethh) 168 169 /* 170 * Sequence number change structure. 171 * Used for MOS_SEQ_REMAP. 172 */ 173 typedef struct { 174 int64_t seq_off; /* the amount of sequence number drift */ 175 int side; /* which side does this sequence number change apply to? */ 176 uint32_t base_seq; /* seq # of the flow where the actual sequence # translation starts */ 177 } seq_remap_info; 178 179 typedef struct filter_arg { 180 void *arg; 181 size_t len; 182 } filter_arg_t; 183 184 /** 185 * The available level number in the POSIX library for sockets is 186 * on SOL_SOCKET 187 */ 188 #ifndef SOL_SOCKET 189 /* Level number for (get/set)sockopt() to apply to socket itself. */ 190 #define SOL_SOCKET 0xffff /* options for socket level */ 191 #endif 192 #define SOL_MONSOCKET 0xfffe /* MOS monitor socket level */ 193 194 /** 195 * MOS monitor socket option names (and values) 196 * This will contain options pertaining to monitor stream sockets 197 * 198 * MOS_FRAGINFO_CLIBUF : Gives back offsets to fragments of buffers 199 * (optname) currently stored in client's TCP ring buffer. 200 * (getsockopt) 201 * 202 * MOS_FRAGINFO_SVRBUF : Gives back offsets to fragments of buffers 203 * (optname) currently stored in server's TCP ring buffer. 204 * (getsockopt) 205 * 206 * MOS_INFO_CLIBUF : Gives back tcp info for client-side ring buffer. 207 * (optname) (getsockopt) 208 * 209 * MOS_INFO_SVRBUF : Gives back tcp info for server-side ring buffer. 210 * (optname) (getsockopt) 211 * 212 * MOS_TCP_STATE_CLI : Retrieves current TCP state for client side 213 * (optname) (getsockopt) 214 * 215 * MOS_TCP_STATE_SVR : Retrieves current TCP state for server side 216 * (optname) (getsockopt) 217 * 218 * MOS_TIMESTAMP : Retrieves timestamp of last packet seen for 219 * (optname) given flow. (in usecs) 220 * (getsockopt) 221 * 222 * MOS_SEQ_REMAP : Changes the sequence number change 223 * (optname) (setsockopt) 224 * 225 * MOS_STOP_MON : Stop monitoring 226 * (optname) (setsockopt) 227 */ 228 enum mos_socket_opts { 229 MOS_FRAGINFO_CLIBUF = 0x01, 230 MOS_FRAGINFO_SVRBUF = 0x02, 231 MOS_INFO_CLIBUF = 0x03, 232 MOS_INFO_SVRBUF = 0x04, 233 MOS_TCP_STATE_CLI = 0x05, 234 MOS_TCP_STATE_SVR = 0x06, 235 MOS_TIMESTAMP = 0x07, 236 MOS_MONLEVEL = 0x08, 237 MOS_CLIBUF = 0x09, 238 MOS_SVRBUF = 0x0a, 239 MOS_SEQ_REMAP = 0x0b, 240 MOS_STOP_MON = 0x0c, 241 MOS_FRAG_CLIBUF = 0x0d, 242 MOS_FRAG_SVRBUF = 0x0e, 243 #ifdef OLD_API 244 MOS_NO_CLIBUF = 0x0f, 245 MOS_NO_SVRBUF = 0x10, 246 #endif 247 }; 248 249 /** 250 * MOS tcp buf info structure. 251 * Used by the monitor application to retreive 252 * tcp_stream-related info. Usually called via 253 * getsockopt() function 254 */ 255 struct tcp_buf_info { 256 /** The initial TCP sequence number of TCP ring buffer. */ 257 uint32_t tcpbi_init_seq; 258 /** TCP sequence number of the 'last byte of payload that has 259 * already been read by the end application' (applies in the case 260 * of embedded monitor setup) 261 */ 262 uint32_t tcpbi_last_byte_read; 263 /** TCP sequence number of the 'last byte of the payload that 264 * is currently buffered and needs to be read by the end 265 * application' (applies in the case of embedded monitor setup). 266 * 267 * In case of standalone monitors, tcpbi_last_byte_read = 268 * tcpbi_next_byte_expected 269 */ 270 uint32_t tcpbi_next_byte_expected; 271 /** TCP sequence number of the 'last byte of the payload that 272 * is currently stored' in the TCP ring buffer. This value 273 * may be greater than tcpbi_next_byte_expected if packets 274 * arrive out of order. 275 */ 276 uint32_t tcpbi_last_byte_received; 277 }; 278 279 #ifdef NEWPPEEK 280 /** Structure to expose TCP ring buffer's fragment information. */ 281 struct tcp_ring_fragment { 282 uint64_t offset; 283 uint32_t len; 284 }; 285 #else 286 /** Structure to expose TCP ring buffer's fragment information. */ 287 struct tcp_ring_fragment { 288 /** TCP sequence number of the packet */ 289 uint32_t seq_num; 290 /** TCP sequence number */ 291 uint32_t len; 292 /** points the next fragment argument, NULL if it is the end of the list */ 293 struct tcp_ring_fragment *next; 294 }; 295 #endif 296 297 /** 298 * mOS tcp stream states. 299 * used by the monitor application to retreive 300 * tcp_stream-state info. Usually called via 301 * getsockopt() function 302 */ 303 enum tcpstate 304 { 305 TCP_CLOSED = 0, 306 TCP_LISTEN = 1, 307 TCP_SYN_SENT = 2, 308 TCP_SYN_RCVD = 3, 309 TCP_ESTABLISHED = 4, 310 TCP_FIN_WAIT_1 = 5, 311 TCP_FIN_WAIT_2 = 6, 312 TCP_CLOSE_WAIT = 7, 313 TCP_CLOSING = 8, 314 TCP_LAST_ACK = 9, 315 TCP_TIME_WAIT = 10 316 }; 317 318 /** Definition of event type */ 319 typedef uint64_t event_t; 320 321 /** Definition of monitor side */ 322 enum {MOS_SIDE_CLI=0, MOS_SIDE_SVR, MOS_SIDE_BOTH}; 323 324 /* mos callback/filter function type definition */ 325 /** Prototype of callback function */ 326 typedef void (*callback_t)(mctx_t mctx, int sock, int side, 327 event_t event, filter_arg_t *arg); 328 /** Prototype of UDE's filter function */ 329 typedef bool (*filter_t)(mctx_t mctx, int sock, int side, 330 event_t event, filter_arg_t *arg); 331 332 /*----------------------------------------------------------------------------*/ 333 /* Definition of monitor_filter type */ 334 union monitor_filter { 335 /** For MOS_SOCK_MONITOR_RAW type socket **/ 336 char *raw_pkt_filter; 337 /** For MOS_SOCK_MONITOR_STREAM type socket **/ 338 struct { 339 char *stream_syn_filter; 340 char *stream_orphan_filter; 341 }; 342 }; 343 typedef union monitor_filter *monitor_filter_t; 344 345 /* Assign an address range (specified by ft) to monitor via sock 346 * 347 * (1) If sock is MOS_SOCK_MONITOR_RAW type, ft.raw_pkt_filter is applied to 348 * every packet coming in. 349 * (2) If sock is MOS_SOCK_MONITOR_STREAM type, 350 * ft.stream_syn_filter is applied to the first SYN pkt of the flow. 351 * (The succeeding packets of that flow will bypass the filter operation.) 352 * ft.stream_orphan_filter is applied to the pkts that don't belong to any 353 * of the existing TCP streams which are being monitored. 354 * (e.g., non-SYN pkt with no identified flow) 355 * [*] ft.stream_syn_filter and ft.stream_orphan_filter should be consisted 356 * only of the following keywords: 357 * - 'tcp, 'host', 'src', 'dst', 'net', 'mask', 'port', 'portrange' 358 * - 'and', 'or', '&', '|' 359 * 360 * @param [in] mctx: mtcp context 361 * @param [in] sock: socket id (should be MOS_SOCK_MONITOR_RAW 362 * or MOS_SOCK_MONITOR_STREAM type) 363 * @param [in] cf: Describe a set of connections to accept 364 * in a BPF (Berkerley Packet Filter) format 365 * NULL if you want to monitor any packet 366 * @return zero on success, -1 on error 367 */ 368 int 369 mtcp_bind_monitor_filter(mctx_t mctx, int sock, monitor_filter_t ft); 370 /*----------------------------------------------------------------------------*/ 371 372 /** Register a callback function in hook_point 373 * @param [in] mctx: mtcp context 374 * @param [in] sock: socket id 375 * @param [in] event: event id 376 * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND, MOS_DONTCARE 377 * @param [in] cb: callback fucntion 378 * @return zero on success, -1 on error 379 * 380 * (both for packet-level and flow-level) for events in hook_point 381 */ 382 int 383 mtcp_register_callback(mctx_t mctx, int sock, event_t event, 384 int hook_point, callback_t cb); 385 386 /** Remove registered callback functions 387 * @param [in] mctx: mtcp context 388 * @param [in] sock: socket id 389 * @param [in] event: event id 390 * @param [in] hook_point: MOS_HK_RCV, MOS_HK_SND, MOS_NULL 391 * @return zero on success, -1 on error 392 * 393 * (both for packet-level and flow-level) for events in hook_point 394 */ 395 //int 396 //mtcp_unregister_callback(mctx_t mctx, int sock, event_t event, 397 // int hook_point); 398 399 /** Allocate a child event 400 * @param [in] event: event id 401 * @return new event id on success, 0 on error 402 */ 403 event_t 404 mtcp_alloc_event(event_t event); 405 406 /** Define a user-defined event function 407 * @param [in] event: event id 408 * @param [in] filter: filter fucntion for new event 409 * @param [in] arg: a filter argument to be delivered to the filter 410 * @return new event id on success, 0 on error 411 * 412 * (both for packet-level and flow-level) 413 */ 414 event_t 415 mtcp_define_event(event_t event, filter_t filter, struct filter_arg *arg); 416 417 /** Raise a event 418 * @param [in] mctx: mtcp context 419 * @param [in] event: event id 420 * @return 0 on success, -1 on error 421 */ 422 int 423 mtcp_raise_event(mctx_t mctx, event_t event); 424 425 /* 426 * Callback only functions 427 */ 428 429 /** Set user-level context 430 * (e.g., to store any per-flow user-defined meatadata) 431 * @param [in] mctx: mtcp context 432 * @param [in] sock: the monitor socket id 433 * @param [in] uctx: user-level context 434 */ 435 void 436 mtcp_set_uctx(mctx_t mctx, int sock, void *uctx); 437 438 /** Get user-level context 439 * (e.g., to retrieve user-defined metadata stored in mtcp_set_uctx()) 440 * @param [in] mctx: mtcp context 441 * @param [in] sock: the monitor socket id 442 * @return user-level context for input flow_ocntext 443 */ 444 void * 445 mtcp_get_uctx(mctx_t mctx, int sock); 446 447 /** Peeking bytestream from flow_context 448 * @param [in] mctx: mtcp context 449 * @param [in] sock: monitoring stream socket id 450 * @param [in] side: side of monitoring (client side, server side or both) 451 * @param [in] buf: buffer for read byte stream 452 * @param [in] len: requested length 453 * 454 * It will return the number of bytes actually read. 455 * It will return -1 if there is an error 456 */ 457 ssize_t 458 mtcp_peek(mctx_t mctx, int sock, int side, 459 char *buf, size_t len); 460 461 /** 462 * The mtcp_ppeek() function reads up to count bytes from the TCP ring 463 * buffer of the monitor socket sock in mctx into buf, starting from 464 * the TCP sequence number seq_num. 465 * Note that seq_num can point the data in the fragmented buffer list 466 * of the TCP ring buffer. If there is no received byte with TCP sequence 467 * number seq_num in the TCP ring buffer, it returns error. If there are 468 * received bytes starting from seq_num, count is set to be the number 469 * of bytes read from the buffer. After mtcp_ppeek(), the data in the 470 * TCP ring buffer will not be flushed, and the monitor offset used by 471 * mtcp_peek() is not changed. 472 * 473 * @param [in] mctx: mtcp context 474 * @param [in] sock: monitoring stream socket id 475 * @param [in] side: side of monitoring (client side, server side or both) 476 * @param [in] buf: buffer for read byte stream 477 * @param [in] count: No. of bytes to be read 478 * @param [in] seq_num: byte offset of the TCP bytestream (absolute offset: offset 0 = init_seq_num) 479 * @return # of bytes actually read on success, -1 for error 480 */ 481 #ifdef NEWPPEEK 482 ssize_t mtcp_ppeek(mctx_t mctx, int sock, int side, 483 char *buf, size_t count, uint64_t off); 484 #else 485 ssize_t mtcp_ppeek(mctx_t mctx, int sock, int side, 486 char *buf, size_t count, off_t seq_num); 487 #endif 488 489 /* Use this macro to copy packets when mtcp_getlastpkt is called */ 490 #define MTCP_CB_GETCURPKT_CREATE_COPY 491 492 /** Get current packet of mtcp context 493 * @param [in] mctx: mTCP/mOS context 494 * @param [in] sock: monitoring stream socket id 495 * @param [in] side: side of monitoring 496 * (MOS_NULL for MOS_SOCK_MONITOR_RAW socket) 497 * @param [in] p: ptr to packet info ptr 498 * (only L2-L3 information is available for MOS_SOCK_MONITOR_RAW socket) 499 * @return 0 on success, -1 on failure 500 * This is useful for running callback-only applications 501 */ 502 int 503 mtcp_getlastpkt(mctx_t mctx, int sock, int side, struct pkt_info *p); 504 505 /** Register user's custom timer 506 * @param [in] mctx: mtcp context 507 * @param [in] id: timer id 508 * @param [in] timeout: timeout length 509 * @param [in] cb: callback function 510 */ 511 int 512 mtcp_settimer(mctx_t mctx, int id, struct timeval *timeout, callback_t cb); 513 514 /** A sibling function to mtcp_settimer that returns 515 * the current timestamp of the machine in microseconds. 516 * This avoids the monitor application to call current 517 * time getter functions (e.g. gettimeofday) that may 518 * incur overhead. 519 * 520 * @param [in] mctx: mtcp context 521 * Returns timestamp on success, 0 on failure. 522 */ 523 uint32_t 524 mtcp_cb_get_ts(mctx_t mctx); 525 526 /** Pause mtcp application context since it is not running anything 527 * @param [in] mctx: mtcp context 528 * 529 * This is useful for running callback-only applications 530 */ 531 void 532 mtcp_app_join(mctx_t mctx); 533 534 /** Get IP addrs/ports for both sides. 535 * (Server IP/port in 0th element) (Client IP/port in 1st element) 536 * Should only be called with MOS_SOCK_MONITOR_STREAM_ACTIVE socket 537 * Returns 0 on success, -1 on failure 538 */ 539 int 540 mtcp_getpeername(mctx_t mctx, int sock, struct sockaddr *saddr, socklen_t *addrlen, int side); 541 542 /** 543 * Updates the Ethernet frame at a given offset across 544 * datalen bytes. 545 * 546 * @param [in] mctx: mtcp context 547 * @param [in] sock: monitoring socket 548 * @param [in] side: monitoring side 549 * (MOS_NULL for MOS_SOCK_MONITOR_RAW socket) 550 * @param [in] offset: the offset from where the data needs to be written 551 * @param [in] data: the data buffer that needs to be written 552 * @param [in] datalen: the length of data that needs to be written 553 * @param [in] option: disjunction of MOS_ETH_HDR, MOS_IP_HDR, MOS_TCP_HDR, 554 * MOS_TCP_PAYLOAD, MOS_DROP_PKT, MOS_UPDATE_TCP_CHKSUM, 555 * MOS_UPDATE_IP_CHKSUM 556 * @return Returns 0 on success, -1 on failure 557 * 558 * If you want to chomp/insert something in the payload: 559 * (i) first update the ip header to adjust iph->tot_len field; (MOS_OVERWRITE) 560 * (ii) then update the tcp payload accordingly (MOS_CHOMP or MOS_INSERT) 561 * 562 * MOS_DROP, MOS_OVERWRITE, MOS_CHOMP and MOS_INSERT are mutually 563 * exclusive operations. 564 * 565 * # At the moment, mtcp_setlastpkt() can only be used for updating # 566 * # packet as long as the overall TCP payload size remains the same. # 567 */ 568 int 569 mtcp_setlastpkt(mctx_t mctx, int sock, int side, off_t offset, 570 byte *data, uint16_t datalen, int option); 571 572 /** Drop current packet (don't forward it to the peer node) 573 * @param [in] mctx: mtcp context 574 * 575 * This is useful for running callback-only applications 576 * This function is now deprecated... 577 */ 578 //int 579 //mtcp_cb_dropcurpkt(mctx_t mctx); 580 581 /* Reset the connection (send RST to both sides) 582 * (This API will be updated after discussion.) 583 */ 584 int 585 mtcp_reset_conn(mctx_t mctx, int sock); 586 587 int 588 mtcp_set_debug_string(mtcp_manager_t mtcp, const char *fmt, ...); 589 590 int 591 mtcp_get_debug_string(mctx_t mctx, char *buf, int len); 592 593 #endif /* __MOS_API_H_ */ 594