1 #define _GNU_SOURCE 2 #include <sched.h> 3 #include <unistd.h> 4 #include <sys/time.h> 5 #include <semaphore.h> 6 #include <sys/mman.h> 7 #include <signal.h> 8 #include <assert.h> 9 #include <string.h> 10 11 #include "cpu.h" 12 #include "eth_in.h" 13 #include "fhash.h" 14 #include "tcp_send_buffer.h" 15 #include "tcp_ring_buffer.h" 16 #include "socket.h" 17 #include "eth_out.h" 18 #include "tcp.h" 19 #include "tcp_in.h" 20 #include "tcp_out.h" 21 #include "mtcp_api.h" 22 #include "eventpoll.h" 23 #include "logger.h" 24 #include "config.h" 25 #include "arp.h" 26 #include "ip_out.h" 27 #include "timer.h" 28 #include "debug.h" 29 #include "event_callback.h" 30 #include "tcp_rb.h" 31 #include "tcp_stream.h" 32 #include "io_module.h" 33 34 #ifdef ENABLE_DPDK 35 /* for launching rte thread */ 36 #include <rte_launch.h> 37 #include <rte_lcore.h> 38 #endif /* !ENABLE_DPDK */ 39 #define PS_CHUNK_SIZE 64 40 #define RX_THRESH (PS_CHUNK_SIZE * 0.8) 41 42 #define ROUND_STAT FALSE 43 #define TIME_STAT FALSE 44 #define EVENT_STAT FALSE 45 #define TESTING FALSE 46 47 #define LOG_FILE_NAME "log" 48 #define MAX_FILE_NAME 1024 49 50 #define MAX(a, b) ((a)>(b)?(a):(b)) 51 #define MIN(a, b) ((a)<(b)?(a):(b)) 52 53 #define PER_STREAM_SLICE 0.1 // in ms 54 #define PER_STREAM_TCHECK 1 // in ms 55 #define PS_SELECT_TIMEOUT 100 // in us 56 57 #define GBPS(bytes) (bytes * 8.0 / (1000 * 1000 * 1000)) 58 59 /*----------------------------------------------------------------------------*/ 60 /* handlers for threads */ 61 struct mtcp_thread_context *g_pctx[MAX_CPUS] = {0}; 62 struct log_thread_context *g_logctx[MAX_CPUS] = {0}; 63 /*----------------------------------------------------------------------------*/ 64 static pthread_t g_thread[MAX_CPUS] = {0}; 65 static pthread_t log_thread[MAX_CPUS] = {0}; 66 /*----------------------------------------------------------------------------*/ 67 static sem_t g_init_sem[MAX_CPUS]; 68 static sem_t g_done_sem[MAX_CPUS]; 69 static int running[MAX_CPUS] = {0}; 70 /*----------------------------------------------------------------------------*/ 71 mtcp_sighandler_t app_signal_handler; 72 static int sigint_cnt[MAX_CPUS] = {0}; 73 static struct timespec sigint_ts[MAX_CPUS]; 74 /*----------------------------------------------------------------------------*/ 75 #ifdef NETSTAT 76 #if NETSTAT_TOTAL 77 static int printer = -1; 78 #if ROUND_STAT 79 #endif /* ROUND_STAT */ 80 #endif /* NETSTAT_TOTAL */ 81 #endif /* NETSTAT */ 82 /*----------------------------------------------------------------------------*/ 83 void 84 HandleSignal(int signal) 85 { 86 int i = 0; 87 88 if (signal == SIGINT) { 89 FreeConfigResources(); 90 #ifdef DARWIN 91 int core = 0; 92 #else 93 int core = sched_getcpu(); 94 #endif 95 struct timespec cur_ts; 96 97 clock_gettime(CLOCK_REALTIME, &cur_ts); 98 99 if (sigint_cnt[core] > 0 && cur_ts.tv_sec == sigint_ts[core].tv_sec) { 100 for (i = 0; i < g_config.mos->num_cores; i++) { 101 if (running[i]) { 102 exit(0); 103 g_pctx[i]->exit = TRUE; 104 } 105 } 106 } else { 107 for (i = 0; i < g_config.mos->num_cores; i++) { 108 if (g_pctx[i]) 109 g_pctx[i]->interrupt = TRUE; 110 } 111 if (!app_signal_handler) { 112 for (i = 0; i < g_config.mos->num_cores; i++) { 113 if (running[i]) { 114 exit(0); 115 g_pctx[i]->exit = TRUE; 116 } 117 } 118 } 119 } 120 sigint_cnt[core]++; 121 clock_gettime(CLOCK_REALTIME, &sigint_ts[core]); 122 } 123 124 if (signal != SIGUSR1) { 125 if (app_signal_handler) { 126 app_signal_handler(signal); 127 } 128 } 129 } 130 /*----------------------------------------------------------------------------*/ 131 static int 132 AttachDevice(struct mtcp_thread_context* ctx) 133 { 134 int working = -1; 135 mtcp_manager_t mtcp = ctx->mtcp_manager; 136 137 if (mtcp->iom->link_devices) 138 working = mtcp->iom->link_devices(ctx); 139 else 140 return 0; 141 142 return working; 143 } 144 /*----------------------------------------------------------------------------*/ 145 #ifdef TIMESTAT 146 static inline void 147 InitStatCounter(struct stat_counter *counter) 148 { 149 counter->cnt = 0; 150 counter->sum = 0; 151 counter->max = 0; 152 counter->min = 0; 153 } 154 /*----------------------------------------------------------------------------*/ 155 static inline void 156 UpdateStatCounter(struct stat_counter *counter, int64_t value) 157 { 158 counter->cnt++; 159 counter->sum += value; 160 if (value > counter->max) 161 counter->max = value; 162 if (counter->min == 0 || value < counter->min) 163 counter->min = value; 164 } 165 /*----------------------------------------------------------------------------*/ 166 static inline uint64_t 167 GetAverageStat(struct stat_counter *counter) 168 { 169 return counter->cnt ? (counter->sum / counter->cnt) : 0; 170 } 171 /*----------------------------------------------------------------------------*/ 172 static inline int64_t 173 TimeDiffUs(struct timeval *t2, struct timeval *t1) 174 { 175 return (t2->tv_sec - t1->tv_sec) * 1000000 + 176 (int64_t)(t2->tv_usec - t1->tv_usec); 177 } 178 /*----------------------------------------------------------------------------*/ 179 #endif 180 #ifdef NETSTAT 181 static inline void 182 PrintThreadNetworkStats(mtcp_manager_t mtcp, struct net_stat *ns) 183 { 184 int i; 185 186 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 187 ns->rx_packets[i] = mtcp->nstat.rx_packets[i] - mtcp->p_nstat.rx_packets[i]; 188 ns->rx_errors[i] = mtcp->nstat.rx_errors[i] - mtcp->p_nstat.rx_errors[i]; 189 ns->rx_bytes[i] = mtcp->nstat.rx_bytes[i] - mtcp->p_nstat.rx_bytes[i]; 190 ns->tx_packets[i] = mtcp->nstat.tx_packets[i] - mtcp->p_nstat.tx_packets[i]; 191 ns->tx_drops[i] = mtcp->nstat.tx_drops[i] - mtcp->p_nstat.tx_drops[i]; 192 ns->tx_bytes[i] = mtcp->nstat.tx_bytes[i] - mtcp->p_nstat.tx_bytes[i]; 193 #if NETSTAT_PERTHREAD 194 if (g_config.mos->netdev_table->ent[i]->stat_print) { 195 fprintf(stderr, "[CPU%2d] %s flows: %6u, " 196 "RX: %7llu(pps) (err: %5llu), %5.2lf(Gbps), " 197 "TX: %7llu(pps), %5.2lf(Gbps)\n", 198 mtcp->ctx->cpu, 199 g_config.mos->netdev_table->ent[i]->dev_name, 200 (unsigned)mtcp->flow_cnt, 201 (long long unsigned)ns->rx_packets[i], 202 (long long unsigned)ns->rx_errors[i], 203 GBPS(ns->rx_bytes[i]), 204 (long long unsigned)ns->tx_packets[i], 205 GBPS(ns->tx_bytes[i])); 206 } 207 #endif 208 } 209 mtcp->p_nstat = mtcp->nstat; 210 211 } 212 /*----------------------------------------------------------------------------*/ 213 #if ROUND_STAT 214 static inline void 215 PrintThreadRoundStats(mtcp_manager_t mtcp, struct run_stat *rs) 216 { 217 #define ROUND_DIV (1000) 218 rs->rounds = mtcp->runstat.rounds - mtcp->p_runstat.rounds; 219 rs->rounds_rx = mtcp->runstat.rounds_rx - mtcp->p_runstat.rounds_rx; 220 rs->rounds_rx_try = mtcp->runstat.rounds_rx_try - mtcp->p_runstat.rounds_rx_try; 221 rs->rounds_tx = mtcp->runstat.rounds_tx - mtcp->p_runstat.rounds_tx; 222 rs->rounds_tx_try = mtcp->runstat.rounds_tx_try - mtcp->p_runstat.rounds_tx_try; 223 rs->rounds_select = mtcp->runstat.rounds_select - mtcp->p_runstat.rounds_select; 224 rs->rounds_select_rx = mtcp->runstat.rounds_select_rx - mtcp->p_runstat.rounds_select_rx; 225 rs->rounds_select_tx = mtcp->runstat.rounds_select_tx - mtcp->p_runstat.rounds_select_tx; 226 rs->rounds_select_intr = mtcp->runstat.rounds_select_intr - mtcp->p_runstat.rounds_select_intr; 227 rs->rounds_twcheck = mtcp->runstat.rounds_twcheck - mtcp->p_runstat.rounds_twcheck; 228 mtcp->p_runstat = mtcp->runstat; 229 #if NETSTAT_PERTHREAD 230 fprintf(stderr, "[CPU%2d] Rounds: %4lluK, " 231 "rx: %3lluK (try: %4lluK), tx: %3lluK (try: %4lluK), " 232 "ps_select: %4llu (rx: %4llu, tx: %4llu, intr: %3llu)\n", 233 mtcp->ctx->cpu, rs->rounds / ROUND_DIV, 234 rs->rounds_rx / ROUND_DIV, rs->rounds_rx_try / ROUND_DIV, 235 rs->rounds_tx / ROUND_DIV, rs->rounds_tx_try / ROUND_DIV, 236 rs->rounds_select, 237 rs->rounds_select_rx, rs->rounds_select_tx, rs->rounds_select_intr); 238 #endif 239 } 240 #endif /* ROUND_STAT */ 241 /*----------------------------------------------------------------------------*/ 242 #if TIME_STAT 243 static inline void 244 PrintThreadRoundTime(mtcp_manager_t mtcp) 245 { 246 fprintf(stderr, "[CPU%2d] Time: (avg, max) " 247 "round: (%4luus, %4luus), processing: (%4luus, %4luus), " 248 "tcheck: (%4luus, %4luus), epoll: (%4luus, %4luus), " 249 "handle: (%4luus, %4luus), xmit: (%4luus, %4luus), " 250 "select: (%4luus, %4luus)\n", mtcp->ctx->cpu, 251 GetAverageStat(&mtcp->rtstat.round), mtcp->rtstat.round.max, 252 GetAverageStat(&mtcp->rtstat.processing), mtcp->rtstat.processing.max, 253 GetAverageStat(&mtcp->rtstat.tcheck), mtcp->rtstat.tcheck.max, 254 GetAverageStat(&mtcp->rtstat.epoll), mtcp->rtstat.epoll.max, 255 GetAverageStat(&mtcp->rtstat.handle), mtcp->rtstat.handle.max, 256 GetAverageStat(&mtcp->rtstat.xmit), mtcp->rtstat.xmit.max, 257 GetAverageStat(&mtcp->rtstat.select), mtcp->rtstat.select.max); 258 259 InitStatCounter(&mtcp->rtstat.round); 260 InitStatCounter(&mtcp->rtstat.processing); 261 InitStatCounter(&mtcp->rtstat.tcheck); 262 InitStatCounter(&mtcp->rtstat.epoll); 263 InitStatCounter(&mtcp->rtstat.handle); 264 InitStatCounter(&mtcp->rtstat.xmit); 265 InitStatCounter(&mtcp->rtstat.select); 266 } 267 #endif 268 #endif /* NETSTAT */ 269 /*----------------------------------------------------------------------------*/ 270 #if EVENT_STAT 271 static inline void 272 PrintEventStat(int core, struct mtcp_epoll_stat *stat) 273 { 274 fprintf(stderr, "[CPU%2d] calls: %lu, waits: %lu, wakes: %lu, " 275 "issued: %lu, registered: %lu, invalidated: %lu, handled: %lu\n", 276 core, stat->calls, stat->waits, stat->wakes, 277 stat->issued, stat->registered, stat->invalidated, stat->handled); 278 memset(stat, 0, sizeof(struct mtcp_epoll_stat)); 279 } 280 #endif /* EVENT_STAT */ 281 /*----------------------------------------------------------------------------*/ 282 #ifdef NETSTAT 283 static inline void 284 PrintNetworkStats(mtcp_manager_t mtcp, uint32_t cur_ts) 285 { 286 #define TIMEOUT 1 287 int i; 288 struct net_stat ns; 289 bool stat_print = false; 290 #if ROUND_STAT 291 struct run_stat rs; 292 #endif /* ROUND_STAT */ 293 #ifdef NETSTAT_TOTAL 294 static double peak_total_rx_gbps = 0; 295 static double peak_total_tx_gbps = 0; 296 static double avg_total_rx_gbps = 0; 297 static double avg_total_tx_gbps = 0; 298 299 double total_rx_gbps = 0, total_tx_gbps = 0; 300 int j; 301 uint32_t gflow_cnt = 0; 302 struct net_stat g_nstat; 303 #if ROUND_STAT 304 struct run_stat g_runstat; 305 #endif /* ROUND_STAT */ 306 #endif /* NETSTAT_TOTAL */ 307 308 if (TS_TO_MSEC(cur_ts - mtcp->p_nstat_ts) < SEC_TO_MSEC(TIMEOUT)) { 309 return; 310 } 311 312 mtcp->p_nstat_ts = cur_ts; 313 gflow_cnt = 0; 314 memset(&g_nstat, 0, sizeof(struct net_stat)); 315 for (i = 0; i < g_config.mos->num_cores; i++) { 316 if (running[i]) { 317 PrintThreadNetworkStats(g_mtcp[i], &ns); 318 #if NETSTAT_TOTAL 319 gflow_cnt += g_mtcp[i]->flow_cnt; 320 for (j = 0; j < g_config.mos->netdev_table->num; j++) { 321 g_nstat.rx_packets[j] += ns.rx_packets[j]; 322 g_nstat.rx_errors[j] += ns.rx_errors[j]; 323 g_nstat.rx_bytes[j] += ns.rx_bytes[j]; 324 g_nstat.tx_packets[j] += ns.tx_packets[j]; 325 g_nstat.tx_drops[j] += ns.tx_drops[j]; 326 g_nstat.tx_bytes[j] += ns.tx_bytes[j]; 327 } 328 #endif 329 } 330 } 331 #if NETSTAT_TOTAL 332 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 333 if (g_config.mos->netdev_table->ent[i]->stat_print) { 334 fprintf(stderr, "[ ALL ] %s, " 335 "RX: %7llu(pps) (err: %5llu), %5.2lf(Gbps), " 336 "TX: %7llu(pps), %5.2lf(Gbps)\n", 337 g_config.mos->netdev_table->ent[i]->dev_name, 338 (long long unsigned)g_nstat.rx_packets[i], 339 (long long unsigned)g_nstat.rx_errors[i], 340 GBPS(g_nstat.rx_bytes[i]), 341 (long long unsigned)g_nstat.tx_packets[i], 342 GBPS(g_nstat.tx_bytes[i])); 343 total_rx_gbps += GBPS(g_nstat.rx_bytes[i]); 344 total_tx_gbps += GBPS(g_nstat.tx_bytes[i]); 345 stat_print = true; 346 } 347 } 348 if (stat_print) { 349 fprintf(stderr, "[ ALL ] flows: %6u\n", gflow_cnt); 350 if (avg_total_rx_gbps == 0) 351 avg_total_rx_gbps = total_rx_gbps; 352 else 353 avg_total_rx_gbps = avg_total_rx_gbps * 0.6 + total_rx_gbps * 0.4; 354 355 if (avg_total_tx_gbps == 0) 356 avg_total_tx_gbps = total_tx_gbps; 357 else 358 avg_total_tx_gbps = avg_total_tx_gbps * 0.6 + total_tx_gbps * 0.4; 359 360 if (peak_total_rx_gbps < total_rx_gbps) 361 peak_total_rx_gbps = total_rx_gbps; 362 if (peak_total_tx_gbps < total_tx_gbps) 363 peak_total_tx_gbps = total_tx_gbps; 364 365 fprintf(stderr, "[ PEAK ] RX: %5.2lf(Gbps), TX: %5.2lf(Gbps)\n" 366 "[ RECENT AVG ] RX: %5.2lf(Gbps), TX: %5.2lf(Gbps)\n", 367 peak_total_rx_gbps, peak_total_tx_gbps, 368 avg_total_rx_gbps, avg_total_tx_gbps); 369 } 370 #endif 371 372 #if ROUND_STAT 373 memset(&g_runstat, 0, sizeof(struct run_stat)); 374 for (i = 0; i < g_config.mos->num_cores; i++) { 375 if (running[i]) { 376 PrintThreadRoundStats(g_mtcp[i], &rs); 377 #if DBGMSG 378 g_runstat.rounds += rs.rounds; 379 g_runstat.rounds_rx += rs.rounds_rx; 380 g_runstat.rounds_rx_try += rs.rounds_rx_try; 381 g_runstat.rounds_tx += rs.rounds_tx; 382 g_runstat.rounds_tx_try += rs.rounds_tx_try; 383 g_runstat.rounds_select += rs.rounds_select; 384 g_runstat.rounds_select_rx += rs.rounds_select_rx; 385 g_runstat.rounds_select_tx += rs.rounds_select_tx; 386 #endif 387 } 388 } 389 390 TRACE_DBG("[ ALL ] Rounds: %4ldK, " 391 "rx: %3ldK (try: %4ldK), tx: %3ldK (try: %4ldK), " 392 "ps_select: %4ld (rx: %4ld, tx: %4ld)\n", 393 g_runstat.rounds / 1000, g_runstat.rounds_rx / 1000, 394 g_runstat.rounds_rx_try / 1000, g_runstat.rounds_tx / 1000, 395 g_runstat.rounds_tx_try / 1000, g_runstat.rounds_select, 396 g_runstat.rounds_select_rx, g_runstat.rounds_select_tx); 397 #endif /* ROUND_STAT */ 398 399 #if TIME_STAT 400 for (i = 0; i < g_config.mos->num_cores; i++) { 401 if (running[i]) { 402 PrintThreadRoundTime(g_mtcp[i]); 403 } 404 } 405 #endif 406 407 #if EVENT_STAT 408 for (i = 0; i < g_config.mos->num_cores; i++) { 409 if (running[i] && g_mtcp[i]->ep) { 410 PrintEventStat(i, &g_mtcp[i]->ep->stat); 411 } 412 } 413 #endif 414 415 fflush(stderr); 416 } 417 #endif /* NETSTAT */ 418 /*----------------------------------------------------------------------------*/ 419 static inline void 420 FlushMonitorReadEvents(mtcp_manager_t mtcp) 421 { 422 struct event_queue *mtcpq; 423 struct tcp_stream *cur_stream; 424 struct mon_listener *walk; 425 426 /* check if monitor sockets should be passed data */ 427 TAILQ_FOREACH(walk, &mtcp->monitors, link) { 428 if (walk->socket->socktype != MOS_SOCK_MONITOR_STREAM || 429 !(mtcpq = walk->eq)) 430 continue; 431 432 while (mtcpq->num_events > 0) { 433 cur_stream = 434 (struct tcp_stream *)mtcpq->events[mtcpq->start++].ev.data.ptr; 435 /* only read events */ 436 if (cur_stream != NULL && 437 (cur_stream->actions & MOS_ACT_READ_DATA)) { 438 if (cur_stream->rcvvar != NULL && 439 cur_stream->rcvvar->rcvbuf != NULL) { 440 /* no need to pass pkt context */ 441 struct socket_map *walk; 442 SOCKQ_FOREACH_START(walk, &cur_stream->msocks) { 443 HandleCallback(mtcp, MOS_NULL, walk, 444 cur_stream->side, NULL, 445 MOS_ON_CONN_NEW_DATA); 446 } SOCKQ_FOREACH_END; 447 } 448 /* reset the actions now */ 449 cur_stream->actions = 0; 450 } 451 if (mtcpq->start >= mtcpq->size) 452 mtcpq->start = 0; 453 mtcpq->num_events--; 454 } 455 } 456 } 457 /*----------------------------------------------------------------------------*/ 458 static inline void 459 FlushBufferedReadEvents(mtcp_manager_t mtcp) 460 { 461 int i; 462 int offset; 463 struct event_queue *mtcpq; 464 struct tcp_stream *cur_stream; 465 466 if (mtcp->ep == NULL) { 467 TRACE_EPOLL("No epoll socket has been registered yet!\n"); 468 return; 469 } else { 470 /* case when mtcpq exists */ 471 mtcpq = mtcp->ep->mtcp_queue; 472 offset = mtcpq->start; 473 } 474 475 /* we will use queued-up epoll read-in events 476 * to trigger buffered read monitor events */ 477 for (i = 0; i < mtcpq->num_events; i++) { 478 cur_stream = mtcp->smap[mtcpq->events[offset++].sockid].stream; 479 /* only read events */ 480 /* Raise new data callback event */ 481 if (cur_stream != NULL && 482 (cur_stream->socket->events | MOS_EPOLLIN)) { 483 if (cur_stream->rcvvar != NULL && 484 cur_stream->rcvvar->rcvbuf != NULL) { 485 /* no need to pass pkt context */ 486 struct socket_map *walk; 487 SOCKQ_FOREACH_START(walk, &cur_stream->msocks) { 488 HandleCallback(mtcp, MOS_NULL, walk, cur_stream->side, 489 NULL, MOS_ON_CONN_NEW_DATA); 490 } SOCKQ_FOREACH_END; 491 } 492 } 493 if (offset >= mtcpq->size) 494 offset = 0; 495 } 496 } 497 /*----------------------------------------------------------------------------*/ 498 static inline void 499 FlushEpollEvents(mtcp_manager_t mtcp, uint32_t cur_ts) 500 { 501 struct mtcp_epoll *ep = mtcp->ep; 502 struct event_queue *usrq = ep->usr_queue; 503 struct event_queue *mtcpq = ep->mtcp_queue; 504 505 pthread_mutex_lock(&ep->epoll_lock); 506 if (ep->mtcp_queue->num_events > 0) { 507 /* while mtcp_queue have events */ 508 /* and usr_queue is not full */ 509 while (mtcpq->num_events > 0 && usrq->num_events < usrq->size) { 510 /* copy the event from mtcp_queue to usr_queue */ 511 usrq->events[usrq->end++] = mtcpq->events[mtcpq->start++]; 512 513 if (usrq->end >= usrq->size) 514 usrq->end = 0; 515 usrq->num_events++; 516 517 if (mtcpq->start >= mtcpq->size) 518 mtcpq->start = 0; 519 mtcpq->num_events--; 520 } 521 } 522 523 /* if there are pending events, wake up user */ 524 if (ep->waiting && (ep->usr_queue->num_events > 0 || 525 ep->usr_shadow_queue->num_events > 0)) { 526 STAT_COUNT(mtcp->runstat.rounds_epoll); 527 TRACE_EPOLL("Broadcasting events. num: %d, cur_ts: %u, prev_ts: %u\n", 528 ep->usr_queue->num_events, cur_ts, mtcp->ts_last_event); 529 mtcp->ts_last_event = cur_ts; 530 ep->stat.wakes++; 531 pthread_cond_signal(&ep->epoll_cond); 532 } 533 pthread_mutex_unlock(&ep->epoll_lock); 534 } 535 /*----------------------------------------------------------------------------*/ 536 static inline void 537 HandleApplicationCalls(mtcp_manager_t mtcp, uint32_t cur_ts) 538 { 539 tcp_stream *stream; 540 int cnt, max_cnt; 541 int handled, delayed; 542 int control, send, ack; 543 544 /* connect handling */ 545 while ((stream = StreamDequeue(mtcp->connectq))) { 546 if (stream->state != TCP_ST_SYN_SENT) { 547 TRACE_INFO("Got a connection request from app with state: %s", 548 TCPStateToString(stream)); 549 exit(EXIT_FAILURE); 550 } else { 551 stream->cb_events |= MOS_ON_CONN_START | 552 MOS_ON_TCP_STATE_CHANGE; 553 /* if monitor is on... */ 554 if (stream->pair_stream != NULL) 555 stream->pair_stream->cb_events |= 556 MOS_ON_CONN_START; 557 } 558 AddtoControlList(mtcp, stream, cur_ts); 559 } 560 561 /* send queue handling */ 562 while ((stream = StreamDequeue(mtcp->sendq))) { 563 stream->sndvar->on_sendq = FALSE; 564 AddtoSendList(mtcp, stream); 565 } 566 567 /* ack queue handling */ 568 while ((stream = StreamDequeue(mtcp->ackq))) { 569 stream->sndvar->on_ackq = FALSE; 570 EnqueueACK(mtcp, stream, cur_ts, ACK_OPT_AGGREGATE); 571 } 572 573 /* close handling */ 574 handled = delayed = 0; 575 control = send = ack = 0; 576 while ((stream = StreamDequeue(mtcp->closeq))) { 577 struct tcp_send_vars *sndvar = stream->sndvar; 578 sndvar->on_closeq = FALSE; 579 580 if (sndvar->sndbuf) { 581 sndvar->fss = sndvar->sndbuf->head_seq + sndvar->sndbuf->len; 582 } else { 583 sndvar->fss = stream->snd_nxt; 584 } 585 586 if (g_config.mos->tcp_timeout > 0) 587 RemoveFromTimeoutList(mtcp, stream); 588 589 if (stream->have_reset) { 590 handled++; 591 if (stream->state != TCP_ST_CLOSED_RSVD) { 592 stream->close_reason = TCP_RESET; 593 stream->state = TCP_ST_CLOSED_RSVD; 594 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 595 TRACE_STATE("Stream %d: TCP_ST_CLOSED_RSVD\n", stream->id); 596 DestroyTCPStream(mtcp, stream); 597 } else { 598 TRACE_ERROR("Stream already closed.\n"); 599 } 600 601 } else if (sndvar->on_control_list) { 602 sndvar->on_closeq_int = TRUE; 603 StreamInternalEnqueue(mtcp->closeq_int, stream); 604 delayed++; 605 if (sndvar->on_control_list) 606 control++; 607 if (sndvar->on_send_list) 608 send++; 609 if (sndvar->on_ack_list) 610 ack++; 611 612 } else if (sndvar->on_send_list || sndvar->on_ack_list) { 613 handled++; 614 if (stream->state == TCP_ST_ESTABLISHED) { 615 stream->state = TCP_ST_FIN_WAIT_1; 616 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 617 TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); 618 619 } else if (stream->state == TCP_ST_CLOSE_WAIT) { 620 stream->state = TCP_ST_LAST_ACK; 621 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 622 TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); 623 } 624 stream->control_list_waiting = TRUE; 625 626 } else if (stream->state != TCP_ST_CLOSED_RSVD) { 627 handled++; 628 if (stream->state == TCP_ST_ESTABLISHED) { 629 stream->state = TCP_ST_FIN_WAIT_1; 630 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 631 TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); 632 633 } else if (stream->state == TCP_ST_CLOSE_WAIT) { 634 stream->state = TCP_ST_LAST_ACK; 635 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 636 TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); 637 } 638 //sndvar->rto = TCP_FIN_RTO; 639 //UpdateRetransmissionTimer(mtcp, stream, mtcp->cur_ts); 640 AddtoControlList(mtcp, stream, cur_ts); 641 } else { 642 TRACE_ERROR("Already closed connection!\n"); 643 } 644 } 645 TRACE_ROUND("Handling close connections. cnt: %d\n", cnt); 646 647 cnt = 0; 648 max_cnt = mtcp->closeq_int->count; 649 while (cnt++ < max_cnt) { 650 stream = StreamInternalDequeue(mtcp->closeq_int); 651 652 if (stream->sndvar->on_control_list) { 653 StreamInternalEnqueue(mtcp->closeq_int, stream); 654 655 } else if (stream->state != TCP_ST_CLOSED_RSVD) { 656 handled++; 657 stream->sndvar->on_closeq_int = FALSE; 658 if (stream->state == TCP_ST_ESTABLISHED) { 659 stream->state = TCP_ST_FIN_WAIT_1; 660 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 661 TRACE_STATE("Stream %d: TCP_ST_FIN_WAIT_1\n", stream->id); 662 663 } else if (stream->state == TCP_ST_CLOSE_WAIT) { 664 stream->state = TCP_ST_LAST_ACK; 665 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 666 TRACE_STATE("Stream %d: TCP_ST_LAST_ACK\n", stream->id); 667 } 668 AddtoControlList(mtcp, stream, cur_ts); 669 } else { 670 stream->sndvar->on_closeq_int = FALSE; 671 TRACE_ERROR("Already closed connection!\n"); 672 } 673 } 674 675 /* reset handling */ 676 while ((stream = StreamDequeue(mtcp->resetq))) { 677 stream->sndvar->on_resetq = FALSE; 678 679 if (g_config.mos->tcp_timeout > 0) 680 RemoveFromTimeoutList(mtcp, stream); 681 682 if (stream->have_reset) { 683 if (stream->state != TCP_ST_CLOSED_RSVD) { 684 stream->close_reason = TCP_RESET; 685 stream->state = TCP_ST_CLOSED_RSVD; 686 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 687 TRACE_STATE("Stream %d: TCP_ST_CLOSED_RSVD\n", stream->id); 688 DestroyTCPStream(mtcp, stream); 689 } else { 690 TRACE_ERROR("Stream already closed.\n"); 691 } 692 693 } else if (stream->sndvar->on_control_list || 694 stream->sndvar->on_send_list || stream->sndvar->on_ack_list) { 695 /* wait until all the queues are flushed */ 696 stream->sndvar->on_resetq_int = TRUE; 697 StreamInternalEnqueue(mtcp->resetq_int, stream); 698 699 } else { 700 if (stream->state != TCP_ST_CLOSED_RSVD) { 701 stream->close_reason = TCP_ACTIVE_CLOSE; 702 stream->state = TCP_ST_CLOSED_RSVD; 703 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 704 TRACE_STATE("Stream %d: TCP_ST_CLOSED_RSVD\n", stream->id); 705 AddtoControlList(mtcp, stream, cur_ts); 706 } else { 707 TRACE_ERROR("Stream already closed.\n"); 708 } 709 } 710 } 711 TRACE_ROUND("Handling reset connections. cnt: %d\n", cnt); 712 713 cnt = 0; 714 max_cnt = mtcp->resetq_int->count; 715 while (cnt++ < max_cnt) { 716 stream = StreamInternalDequeue(mtcp->resetq_int); 717 718 if (stream->sndvar->on_control_list || 719 stream->sndvar->on_send_list || stream->sndvar->on_ack_list) { 720 /* wait until all the queues are flushed */ 721 StreamInternalEnqueue(mtcp->resetq_int, stream); 722 723 } else { 724 stream->sndvar->on_resetq_int = FALSE; 725 726 if (stream->state != TCP_ST_CLOSED_RSVD) { 727 stream->close_reason = TCP_ACTIVE_CLOSE; 728 stream->state = TCP_ST_CLOSED_RSVD; 729 stream->cb_events |= MOS_ON_TCP_STATE_CHANGE; 730 TRACE_STATE("Stream %d: TCP_ST_CLOSED_RSVD\n", stream->id); 731 AddtoControlList(mtcp, stream, cur_ts); 732 } else { 733 TRACE_ERROR("Stream already closed.\n"); 734 } 735 } 736 } 737 738 /* destroy streams in destroyq */ 739 while ((stream = StreamDequeue(mtcp->destroyq))) { 740 DestroyTCPStream(mtcp, stream); 741 } 742 743 mtcp->wakeup_flag = FALSE; 744 } 745 /*----------------------------------------------------------------------------*/ 746 static inline void 747 WritePacketsToChunks(mtcp_manager_t mtcp, uint32_t cur_ts) 748 { 749 int thresh = g_config.mos->max_concurrency; 750 int i; 751 752 /* Set the threshold to g_config.mos->max_concurrency to send ACK immediately */ 753 /* Otherwise, set to appropriate value (e.g. thresh) */ 754 assert(mtcp->g_sender != NULL); 755 if (mtcp->g_sender->control_list_cnt) 756 WriteTCPControlList(mtcp, mtcp->g_sender, cur_ts, thresh); 757 if (mtcp->g_sender->ack_list_cnt) 758 WriteTCPACKList(mtcp, mtcp->g_sender, cur_ts, thresh); 759 if (mtcp->g_sender->send_list_cnt) 760 WriteTCPDataList(mtcp, mtcp->g_sender, cur_ts, thresh); 761 762 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 763 assert(mtcp->n_sender[i] != NULL); 764 if (mtcp->n_sender[i]->control_list_cnt) 765 WriteTCPControlList(mtcp, mtcp->n_sender[i], cur_ts, thresh); 766 if (mtcp->n_sender[i]->ack_list_cnt) 767 WriteTCPACKList(mtcp, mtcp->n_sender[i], cur_ts, thresh); 768 if (mtcp->n_sender[i]->send_list_cnt) 769 WriteTCPDataList(mtcp, mtcp->n_sender[i], cur_ts, thresh); 770 } 771 } 772 /*----------------------------------------------------------------------------*/ 773 #if TESTING 774 static int 775 DestroyRemainingFlows(mtcp_manager_t mtcp) 776 { 777 struct hashtable *ht = mtcp->tcp_flow_table; 778 tcp_stream *walk; 779 int cnt, i; 780 781 cnt = 0; 782 783 thread_printf(mtcp, mtcp->log_fp, 784 "CPU %d: Flushing remaining flows.\n", mtcp->ctx->cpu); 785 786 for (i = 0; i < NUM_BINS; i++) { 787 TAILQ_FOREACH(walk, &ht->ht_table[i], rcvvar->he_link) { 788 thread_printf(mtcp, mtcp->log_fp, 789 "CPU %d: Destroying stream %d\n", mtcp->ctx->cpu, walk->id); 790 #ifdef DUMP_STREAM 791 DumpStream(mtcp, walk); 792 #endif 793 DestroyTCPStream(mtcp, walk); 794 cnt++; 795 } 796 } 797 798 return cnt; 799 } 800 #endif 801 /*----------------------------------------------------------------------------*/ 802 static void 803 InterruptApplication(mtcp_manager_t mtcp) 804 { 805 /* interrupt if the mtcp_epoll_wait() is waiting */ 806 if (mtcp->ep) { 807 pthread_mutex_lock(&mtcp->ep->epoll_lock); 808 if (mtcp->ep->waiting) { 809 pthread_cond_signal(&mtcp->ep->epoll_cond); 810 } 811 pthread_mutex_unlock(&mtcp->ep->epoll_lock); 812 } 813 /* interrupt if the accept() is waiting */ 814 if (mtcp->listener) { 815 if (mtcp->listener->socket) { 816 pthread_mutex_lock(&mtcp->listener->accept_lock); 817 if (!(mtcp->listener->socket->opts & MTCP_NONBLOCK)) { 818 pthread_cond_signal(&mtcp->listener->accept_cond); 819 } 820 pthread_mutex_unlock(&mtcp->listener->accept_lock); 821 } 822 } 823 } 824 /*----------------------------------------------------------------------------*/ 825 void 826 RunPassiveLoop(mtcp_manager_t mtcp) 827 { 828 sem_wait(&g_done_sem[mtcp->ctx->cpu]); 829 sem_destroy(&g_done_sem[mtcp->ctx->cpu]); 830 return; 831 } 832 /*----------------------------------------------------------------------------*/ 833 static void 834 RunMainLoop(struct mtcp_thread_context *ctx) 835 { 836 mtcp_manager_t mtcp = ctx->mtcp_manager; 837 int i; 838 int recv_cnt; 839 int rx_inf, tx_inf; 840 struct timeval cur_ts = {0}; 841 uint32_t ts, ts_prev; 842 843 #if TIME_STAT 844 struct timeval prev_ts, processing_ts, tcheck_ts, 845 epoll_ts, handle_ts, xmit_ts, select_ts; 846 #endif 847 int thresh; 848 849 gettimeofday(&cur_ts, NULL); 850 851 TRACE_DBG("CPU %d: mtcp thread running.\n", ctx->cpu); 852 853 #if TIME_STAT 854 prev_ts = cur_ts; 855 InitStatCounter(&mtcp->rtstat.round); 856 InitStatCounter(&mtcp->rtstat.processing); 857 InitStatCounter(&mtcp->rtstat.tcheck); 858 InitStatCounter(&mtcp->rtstat.epoll); 859 InitStatCounter(&mtcp->rtstat.handle); 860 InitStatCounter(&mtcp->rtstat.xmit); 861 InitStatCounter(&mtcp->rtstat.select); 862 #endif 863 864 ts = ts_prev = 0; 865 while ((!ctx->done || mtcp->flow_cnt) && !ctx->exit) { 866 867 STAT_COUNT(mtcp->runstat.rounds); 868 recv_cnt = 0; 869 gettimeofday(&cur_ts, NULL); 870 #if TIME_STAT 871 /* measure the inter-round delay */ 872 UpdateStatCounter(&mtcp->rtstat.round, TimeDiffUs(&cur_ts, &prev_ts)); 873 prev_ts = cur_ts; 874 #endif 875 876 ts = TIMEVAL_TO_TS(&cur_ts); 877 mtcp->cur_ts = ts; 878 879 for (rx_inf = 0; rx_inf < g_config.mos->netdev_table->num; rx_inf++) { 880 881 recv_cnt = mtcp->iom->recv_pkts(ctx, rx_inf); 882 STAT_COUNT(mtcp->runstat.rounds_rx_try); 883 884 for (i = 0; i < recv_cnt; i++) { 885 uint16_t len; 886 uint8_t *pktbuf; 887 pktbuf = mtcp->iom->get_rptr(mtcp->ctx, rx_inf, i, &len); 888 ProcessPacket(mtcp, rx_inf, i, ts, pktbuf, len); 889 } 890 } 891 STAT_COUNT(mtcp->runstat.rounds_rx); 892 893 #if TIME_STAT 894 gettimeofday(&processing_ts, NULL); 895 UpdateStatCounter(&mtcp->rtstat.processing, 896 TimeDiffUs(&processing_ts, &cur_ts)); 897 #endif /* TIME_STAT */ 898 899 /* Handle user defined timeout */ 900 struct timer *walk, *tmp; 901 for (walk = TAILQ_FIRST(&mtcp->timer_list); walk != NULL; walk = tmp) { 902 tmp = TAILQ_NEXT(walk, timer_link); 903 if (TIMEVAL_LT(&cur_ts, &walk->exp)) 904 break; 905 906 struct mtcp_context mctx = {.cpu = ctx->cpu}; 907 walk->cb(&mctx, walk->id, 0, 0 /* FIXME */, NULL); 908 DelTimer(mtcp, walk); 909 } 910 911 /* interaction with application */ 912 if (mtcp->flow_cnt > 0) { 913 914 /* check retransmission timeout and timewait expire */ 915 #if 0 916 thresh = (int)mtcp->flow_cnt / (TS_TO_USEC(PER_STREAM_TCHECK)); 917 assert(thresh >= 0); 918 if (thresh == 0) 919 thresh = 1; 920 if (recv_cnt > 0 && thresh > recv_cnt) 921 thresh = recv_cnt; 922 #else 923 thresh = g_config.mos->max_concurrency; 924 #endif 925 926 /* Eunyoung, you may fix this later 927 * if there is no rcv packet, we will send as much as possible 928 */ 929 if (thresh == -1) 930 thresh = g_config.mos->max_concurrency; 931 932 CheckRtmTimeout(mtcp, ts, thresh); 933 CheckTimewaitExpire(mtcp, ts, thresh); 934 935 if (g_config.mos->tcp_timeout > 0 && ts != ts_prev) { 936 CheckConnectionTimeout(mtcp, ts, thresh); 937 } 938 939 #if TIME_STAT 940 } 941 gettimeofday(&tcheck_ts, NULL); 942 UpdateStatCounter(&mtcp->rtstat.tcheck, 943 TimeDiffUs(&tcheck_ts, &processing_ts)); 944 945 if (mtcp->flow_cnt > 0) { 946 #endif /* TIME_STAT */ 947 948 } 949 950 /* 951 * before flushing epoll events, call monitor events for 952 * all registered `read` events 953 */ 954 if (mtcp->num_msp > 0) 955 /* call this when only a standalone monitor is running */ 956 FlushMonitorReadEvents(mtcp); 957 958 /* if epoll is in use, flush all the queued events */ 959 if (mtcp->ep) { 960 FlushBufferedReadEvents(mtcp); 961 FlushEpollEvents(mtcp, ts); 962 } 963 #if TIME_STAT 964 gettimeofday(&epoll_ts, NULL); 965 UpdateStatCounter(&mtcp->rtstat.epoll, 966 TimeDiffUs(&epoll_ts, &tcheck_ts)); 967 #endif /* TIME_STAT */ 968 969 if (end_app_exists && mtcp->flow_cnt > 0) { 970 /* handle stream queues */ 971 HandleApplicationCalls(mtcp, ts); 972 } 973 974 #if TIME_STAT 975 gettimeofday(&handle_ts, NULL); 976 UpdateStatCounter(&mtcp->rtstat.handle, 977 TimeDiffUs(&handle_ts, &epoll_ts)); 978 #endif /* TIME_STAT */ 979 980 WritePacketsToChunks(mtcp, ts); 981 982 /* send packets from write buffer */ 983 /* Send until tx is available */ 984 int num_dev = g_config.mos->netdev_table->num; 985 if (likely(mtcp->iom->send_pkts != NULL)) 986 for (tx_inf = 0; tx_inf < num_dev; tx_inf++) { 987 mtcp->iom->send_pkts(ctx, tx_inf); 988 } 989 990 #if TIME_STAT 991 gettimeofday(&xmit_ts, NULL); 992 UpdateStatCounter(&mtcp->rtstat.xmit, 993 TimeDiffUs(&xmit_ts, &handle_ts)); 994 #endif /* TIME_STAT */ 995 996 if (ts != ts_prev) { 997 ts_prev = ts; 998 #ifdef NETSTAT 999 if (ctx->cpu == printer) { 1000 #ifdef RUN_ARP 1001 ARPTimer(mtcp, ts); 1002 #endif 1003 #ifdef NETSTAT 1004 PrintNetworkStats(mtcp, ts); 1005 #endif 1006 } 1007 #endif /* NETSTAT */ 1008 } 1009 1010 if (mtcp->iom->select) 1011 mtcp->iom->select(ctx); 1012 1013 if (ctx->interrupt) { 1014 InterruptApplication(mtcp); 1015 } 1016 } 1017 1018 #if TESTING 1019 DestroyRemainingFlows(mtcp); 1020 #endif 1021 1022 TRACE_DBG("MTCP thread %d out of main loop.\n", ctx->cpu); 1023 /* flush logs */ 1024 flush_log_data(mtcp); 1025 TRACE_DBG("MTCP thread %d flushed logs.\n", ctx->cpu); 1026 InterruptApplication(mtcp); 1027 TRACE_INFO("MTCP thread %d finished.\n", ctx->cpu); 1028 } 1029 /*----------------------------------------------------------------------------*/ 1030 struct mtcp_sender * 1031 CreateMTCPSender(int ifidx) 1032 { 1033 struct mtcp_sender *sender; 1034 1035 sender = (struct mtcp_sender *)calloc(1, sizeof(struct mtcp_sender)); 1036 if (!sender) { 1037 return NULL; 1038 } 1039 1040 sender->ifidx = ifidx; 1041 1042 TAILQ_INIT(&sender->control_list); 1043 TAILQ_INIT(&sender->send_list); 1044 TAILQ_INIT(&sender->ack_list); 1045 1046 sender->control_list_cnt = 0; 1047 sender->send_list_cnt = 0; 1048 sender->ack_list_cnt = 0; 1049 1050 return sender; 1051 } 1052 /*----------------------------------------------------------------------------*/ 1053 void 1054 DestroyMTCPSender(struct mtcp_sender *sender) 1055 { 1056 free(sender); 1057 } 1058 /*----------------------------------------------------------------------------*/ 1059 static mtcp_manager_t 1060 InitializeMTCPManager(struct mtcp_thread_context* ctx) 1061 { 1062 mtcp_manager_t mtcp; 1063 char log_name[MAX_FILE_NAME]; 1064 int i; 1065 1066 posix_seq_srand((unsigned)pthread_self()); 1067 1068 mtcp = (mtcp_manager_t)calloc(1, sizeof(struct mtcp_manager)); 1069 if (!mtcp) { 1070 perror("malloc"); 1071 fprintf(stderr, "Failed to allocate mtcp_manager.\n"); 1072 return NULL; 1073 } 1074 g_mtcp[ctx->cpu] = mtcp; 1075 1076 mtcp->tcp_flow_table = CreateHashtable(); 1077 if (!mtcp->tcp_flow_table) { 1078 CTRACE_ERROR("Falied to allocate tcp flow table.\n"); 1079 return NULL; 1080 } 1081 1082 #ifdef HUGEPAGE 1083 #define IS_HUGEPAGE 1 1084 #else 1085 #define IS_HUGEPAGE 0 1086 #endif 1087 if (mon_app_exists) { 1088 /* initialize event callback */ 1089 #ifdef NEWEV 1090 InitEvent(mtcp); 1091 #else 1092 InitEvent(mtcp, NUM_EV_TABLE); 1093 #endif 1094 } 1095 1096 if (!(mtcp->bufseg_pool = MPCreate(sizeof(tcpbufseg_t), 1097 sizeof(tcpbufseg_t) * g_config.mos->max_concurrency * 1098 ((g_config.mos->rmem_size - 1) / UNITBUFSIZE + 1), 0))) { 1099 TRACE_ERROR("Failed to allocate ev_table pool\n"); 1100 exit(0); 1101 } 1102 if (!(mtcp->sockent_pool = MPCreate(sizeof(struct sockent), 1103 sizeof(struct sockent) * g_config.mos->max_concurrency * 3, 0))) { 1104 TRACE_ERROR("Failed to allocate ev_table pool\n"); 1105 exit(0); 1106 } 1107 #ifdef USE_TIMER_POOL 1108 if (!(mtcp->timer_pool = MPCreate(sizeof(struct timer), 1109 sizeof(struct timer) * g_config.mos->max_concurrency * 10, 0))) { 1110 TRACE_ERROR("Failed to allocate ev_table pool\n"); 1111 exit(0); 1112 } 1113 #endif 1114 mtcp->flow_pool = MPCreate(sizeof(tcp_stream), 1115 sizeof(tcp_stream) * g_config.mos->max_concurrency, IS_HUGEPAGE); 1116 if (!mtcp->flow_pool) { 1117 CTRACE_ERROR("Failed to allocate tcp flow pool.\n"); 1118 return NULL; 1119 } 1120 mtcp->rv_pool = MPCreate(sizeof(struct tcp_recv_vars), 1121 sizeof(struct tcp_recv_vars) * g_config.mos->max_concurrency, IS_HUGEPAGE); 1122 if (!mtcp->rv_pool) { 1123 CTRACE_ERROR("Failed to allocate tcp recv variable pool.\n"); 1124 return NULL; 1125 } 1126 mtcp->sv_pool = MPCreate(sizeof(struct tcp_send_vars), 1127 sizeof(struct tcp_send_vars) * g_config.mos->max_concurrency, IS_HUGEPAGE); 1128 if (!mtcp->sv_pool) { 1129 CTRACE_ERROR("Failed to allocate tcp send variable pool.\n"); 1130 return NULL; 1131 } 1132 1133 mtcp->rbm_snd = SBManagerCreate(g_config.mos->wmem_size, g_config.mos->no_ring_buffers, 1134 g_config.mos->max_concurrency); 1135 if (!mtcp->rbm_snd) { 1136 CTRACE_ERROR("Failed to create send ring buffer.\n"); 1137 return NULL; 1138 } 1139 1140 mtcp->smap = (socket_map_t)calloc(g_config.mos->max_concurrency, sizeof(struct socket_map)); 1141 if (!mtcp->smap) { 1142 perror("calloc"); 1143 CTRACE_ERROR("Failed to allocate memory for stream map.\n"); 1144 return NULL; 1145 } 1146 1147 if (mon_app_exists) { 1148 mtcp->msmap = (socket_map_t)calloc(g_config.mos->max_concurrency, sizeof(struct socket_map)); 1149 if (!mtcp->msmap) { 1150 perror("calloc"); 1151 CTRACE_ERROR("Failed to allocate memory for monitor stream map.\n"); 1152 return NULL; 1153 } 1154 1155 for (i = 0; i < g_config.mos->max_concurrency; i++) { 1156 mtcp->msmap[i].monitor_stream = calloc(1, sizeof(struct mon_stream)); 1157 if (!mtcp->msmap[i].monitor_stream) { 1158 perror("calloc"); 1159 CTRACE_ERROR("Failed to allocate memory for monitr stream map\n"); 1160 return NULL; 1161 } 1162 } 1163 } 1164 1165 TAILQ_INIT(&mtcp->timer_list); 1166 TAILQ_INIT(&mtcp->monitors); 1167 1168 TAILQ_INIT(&mtcp->free_smap); 1169 for (i = 0; i < g_config.mos->max_concurrency; i++) { 1170 mtcp->smap[i].id = i; 1171 mtcp->smap[i].socktype = MOS_SOCK_UNUSED; 1172 memset(&mtcp->smap[i].saddr, 0, sizeof(struct sockaddr_in)); 1173 mtcp->smap[i].stream = NULL; 1174 TAILQ_INSERT_TAIL(&mtcp->free_smap, &mtcp->smap[i], link); 1175 } 1176 1177 if (mon_app_exists) { 1178 TAILQ_INIT(&mtcp->free_msmap); 1179 for (i = 0; i < g_config.mos->max_concurrency; i++) { 1180 mtcp->msmap[i].id = i; 1181 mtcp->msmap[i].socktype = MOS_SOCK_UNUSED; 1182 memset(&mtcp->msmap[i].saddr, 0, sizeof(struct sockaddr_in)); 1183 TAILQ_INSERT_TAIL(&mtcp->free_msmap, &mtcp->msmap[i], link); 1184 } 1185 } 1186 1187 mtcp->ctx = ctx; 1188 mtcp->ep = NULL; 1189 1190 snprintf(log_name, MAX_FILE_NAME, "%s/"LOG_FILE_NAME"_%d", 1191 g_config.mos->mos_log, ctx->cpu); 1192 mtcp->log_fp = fopen(log_name, "w+"); 1193 if (!mtcp->log_fp) { 1194 perror("fopen"); 1195 CTRACE_ERROR("Failed to create file for logging. (%s)\n", log_name); 1196 return NULL; 1197 } 1198 mtcp->sp_fd = g_logctx[ctx->cpu]->pair_sp_fd; 1199 mtcp->logger = g_logctx[ctx->cpu]; 1200 1201 mtcp->connectq = CreateStreamQueue(BACKLOG_SIZE); 1202 if (!mtcp->connectq) { 1203 CTRACE_ERROR("Failed to create connect queue.\n"); 1204 return NULL; 1205 } 1206 mtcp->sendq = CreateStreamQueue(g_config.mos->max_concurrency); 1207 if (!mtcp->sendq) { 1208 CTRACE_ERROR("Failed to create send queue.\n"); 1209 return NULL; 1210 } 1211 mtcp->ackq = CreateStreamQueue(g_config.mos->max_concurrency); 1212 if (!mtcp->ackq) { 1213 CTRACE_ERROR("Failed to create ack queue.\n"); 1214 return NULL; 1215 } 1216 mtcp->closeq = CreateStreamQueue(g_config.mos->max_concurrency); 1217 if (!mtcp->closeq) { 1218 CTRACE_ERROR("Failed to create close queue.\n"); 1219 return NULL; 1220 } 1221 mtcp->closeq_int = CreateInternalStreamQueue(g_config.mos->max_concurrency); 1222 if (!mtcp->closeq_int) { 1223 CTRACE_ERROR("Failed to create close queue.\n"); 1224 return NULL; 1225 } 1226 mtcp->resetq = CreateStreamQueue(g_config.mos->max_concurrency); 1227 if (!mtcp->resetq) { 1228 CTRACE_ERROR("Failed to create reset queue.\n"); 1229 return NULL; 1230 } 1231 mtcp->resetq_int = CreateInternalStreamQueue(g_config.mos->max_concurrency); 1232 if (!mtcp->resetq_int) { 1233 CTRACE_ERROR("Failed to create reset queue.\n"); 1234 return NULL; 1235 } 1236 mtcp->destroyq = CreateStreamQueue(g_config.mos->max_concurrency); 1237 if (!mtcp->destroyq) { 1238 CTRACE_ERROR("Failed to create destroy queue.\n"); 1239 return NULL; 1240 } 1241 1242 mtcp->g_sender = CreateMTCPSender(-1); 1243 if (!mtcp->g_sender) { 1244 CTRACE_ERROR("Failed to create global sender structure.\n"); 1245 return NULL; 1246 } 1247 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 1248 mtcp->n_sender[i] = CreateMTCPSender(i); 1249 if (!mtcp->n_sender[i]) { 1250 CTRACE_ERROR("Failed to create per-nic sender structure.\n"); 1251 return NULL; 1252 } 1253 } 1254 1255 mtcp->rto_store = InitRTOHashstore(); 1256 TAILQ_INIT(&mtcp->timewait_list); 1257 TAILQ_INIT(&mtcp->timeout_list); 1258 1259 return mtcp; 1260 } 1261 /*----------------------------------------------------------------------------*/ 1262 static void * 1263 MTCPRunThread(void *arg) 1264 { 1265 mctx_t mctx = (mctx_t)arg; 1266 int cpu = mctx->cpu; 1267 int working; 1268 struct mtcp_manager *mtcp; 1269 struct mtcp_thread_context *ctx; 1270 1271 /* affinitize the thread to this core first */ 1272 mtcp_core_affinitize(cpu); 1273 1274 /* memory alloc after core affinitization would use local memory 1275 most time */ 1276 ctx = calloc(1, sizeof(*ctx)); 1277 if (!ctx) { 1278 perror("calloc"); 1279 TRACE_ERROR("Failed to calloc mtcp context.\n"); 1280 exit(-1); 1281 } 1282 ctx->thread = pthread_self(); 1283 ctx->cpu = cpu; 1284 mtcp = ctx->mtcp_manager = InitializeMTCPManager(ctx); 1285 if (!mtcp) { 1286 TRACE_ERROR("Failed to initialize mtcp manager.\n"); 1287 exit(-1); 1288 } 1289 1290 /* assign mtcp context's underlying I/O module */ 1291 mtcp->iom = current_iomodule_func; 1292 1293 /* I/O initializing */ 1294 if (mtcp->iom->init_handle) 1295 mtcp->iom->init_handle(ctx); 1296 1297 if (pthread_mutex_init(&ctx->flow_pool_lock, NULL)) { 1298 perror("pthread_mutex_init of ctx->flow_pool_lock\n"); 1299 exit(-1); 1300 } 1301 1302 if (pthread_mutex_init(&ctx->socket_pool_lock, NULL)) { 1303 perror("pthread_mutex_init of ctx->socket_pool_lock\n"); 1304 exit(-1); 1305 } 1306 1307 SQ_LOCK_INIT(&ctx->connect_lock, "ctx->connect_lock", exit(-1)); 1308 SQ_LOCK_INIT(&ctx->close_lock, "ctx->close_lock", exit(-1)); 1309 SQ_LOCK_INIT(&ctx->reset_lock, "ctx->reset_lock", exit(-1)); 1310 SQ_LOCK_INIT(&ctx->sendq_lock, "ctx->sendq_lock", exit(-1)); 1311 SQ_LOCK_INIT(&ctx->ackq_lock, "ctx->ackq_lock", exit(-1)); 1312 SQ_LOCK_INIT(&ctx->destroyq_lock, "ctx->destroyq_lock", exit(-1)); 1313 1314 /* remember this context pointer for signal processing */ 1315 g_pctx[cpu] = ctx; 1316 mlockall(MCL_CURRENT); 1317 1318 // attach (nic device, queue) 1319 working = AttachDevice(ctx); 1320 if (working != 0) { 1321 sem_post(&g_init_sem[ctx->cpu]); 1322 TRACE_DBG("MTCP thread %d finished. Not attached any device\n", ctx->cpu); 1323 pthread_exit(NULL); 1324 } 1325 1326 TRACE_DBG("CPU %d: initialization finished.\n", cpu); 1327 sem_post(&g_init_sem[ctx->cpu]); 1328 1329 /* start the main loop */ 1330 RunMainLoop(ctx); 1331 1332 TRACE_DBG("MTCP thread %d finished.\n", ctx->cpu); 1333 1334 /* signaling mTCP thread is done */ 1335 sem_post(&g_done_sem[mctx->cpu]); 1336 1337 //pthread_exit(NULL); 1338 return 0; 1339 } 1340 /*----------------------------------------------------------------------------*/ 1341 #ifdef ENABLE_DPDK 1342 static int MTCPDPDKRunThread(void *arg) 1343 { 1344 MTCPRunThread(arg); 1345 return 0; 1346 } 1347 #endif /* !ENABLE_DPDK */ 1348 /*----------------------------------------------------------------------------*/ 1349 mctx_t 1350 mtcp_create_context(int cpu) 1351 { 1352 mctx_t mctx; 1353 int ret; 1354 1355 if (cpu >= g_config.mos->num_cores) { 1356 TRACE_ERROR("Failed initialize new mtcp context. " 1357 "Requested cpu id %d exceed the number of cores %d configured to use.\n", 1358 cpu, g_config.mos->num_cores); 1359 return NULL; 1360 } 1361 1362 /* check if mtcp_create_context() was already initialized */ 1363 if (g_logctx[cpu] != NULL) { 1364 TRACE_ERROR("%s was already initialized before!\n", 1365 __FUNCTION__); 1366 return NULL; 1367 } 1368 1369 ret = sem_init(&g_init_sem[cpu], 0, 0); 1370 if (ret) { 1371 TRACE_ERROR("Failed initialize init_sem.\n"); 1372 return NULL; 1373 } 1374 1375 ret = sem_init(&g_done_sem[cpu], 0, 0); 1376 if (ret) { 1377 TRACE_ERROR("Failed initialize done_sem.\n"); 1378 return NULL; 1379 } 1380 1381 mctx = (mctx_t)calloc(1, sizeof(struct mtcp_context)); 1382 if (!mctx) { 1383 TRACE_ERROR("Failed to allocate memory for mtcp_context.\n"); 1384 return NULL; 1385 } 1386 mctx->cpu = cpu; 1387 g_ctx[cpu] = mctx; 1388 1389 /* initialize logger */ 1390 g_logctx[cpu] = (struct log_thread_context *) 1391 calloc(1, sizeof(struct log_thread_context)); 1392 if (!g_logctx[cpu]) { 1393 perror("malloc"); 1394 TRACE_ERROR("Failed to allocate memory for log thread context.\n"); 1395 return NULL; 1396 } 1397 InitLogThreadContext(g_logctx[cpu], cpu); 1398 if (pthread_create(&log_thread[cpu], 1399 NULL, ThreadLogMain, (void *)g_logctx[cpu])) { 1400 perror("pthread_create"); 1401 TRACE_ERROR("Failed to create log thread\n"); 1402 return NULL; 1403 } 1404 1405 #ifdef ENABLE_DPDK 1406 /* Wake up mTCP threads (wake up I/O threads) */ 1407 if (current_iomodule_func == &dpdk_module_func) { 1408 int master; 1409 master = rte_get_master_lcore(); 1410 if (master == cpu) { 1411 lcore_config[master].ret = 0; 1412 lcore_config[master].state = FINISHED; 1413 if (pthread_create(&g_thread[cpu], 1414 NULL, MTCPRunThread, (void *)mctx) != 0) { 1415 TRACE_ERROR("pthread_create of mtcp thread failed!\n"); 1416 return NULL; 1417 } 1418 } else 1419 rte_eal_remote_launch(MTCPDPDKRunThread, mctx, cpu); 1420 } else 1421 #endif /* !ENABLE_DPDK */ 1422 { 1423 if (pthread_create(&g_thread[cpu], 1424 NULL, MTCPRunThread, (void *)mctx) != 0) { 1425 TRACE_ERROR("pthread_create of mtcp thread failed!\n"); 1426 return NULL; 1427 } 1428 } 1429 1430 sem_wait(&g_init_sem[cpu]); 1431 sem_destroy(&g_init_sem[cpu]); 1432 1433 running[cpu] = TRUE; 1434 1435 #ifdef NETSTAT 1436 #if NETSTAT_TOTAL 1437 if (printer < 0) { 1438 printer = cpu; 1439 TRACE_INFO("CPU %d is in charge of printing stats.\n", printer); 1440 } 1441 #endif 1442 #endif 1443 1444 return mctx; 1445 } 1446 /*----------------------------------------------------------------------------*/ 1447 /** 1448 * TODO: It currently always returns 0. Add appropriate error return values 1449 */ 1450 int 1451 mtcp_destroy_context(mctx_t mctx) 1452 { 1453 struct mtcp_thread_context *ctx = g_pctx[mctx->cpu]; 1454 struct mtcp_manager *mtcp = ctx->mtcp_manager; 1455 struct log_thread_context *log_ctx = mtcp->logger; 1456 int ret, i; 1457 1458 TRACE_DBG("CPU %d: mtcp_destroy_context()\n", mctx->cpu); 1459 1460 /* close all stream sockets that are still open */ 1461 if (!ctx->exit) { 1462 for (i = 0; i < g_config.mos->max_concurrency; i++) { 1463 if (mtcp->smap[i].socktype == MOS_SOCK_STREAM) { 1464 TRACE_DBG("Closing remaining socket %d (%s)\n", 1465 i, TCPStateToString(mtcp->smap[i].stream)); 1466 #ifdef DUMP_STREAM 1467 DumpStream(mtcp, mtcp->smap[i].stream); 1468 #endif 1469 mtcp_close(mctx, i); 1470 } 1471 } 1472 } 1473 1474 ctx->done = 1; 1475 1476 //pthread_kill(g_thread[mctx->cpu], SIGINT); 1477 #ifdef ENABLE_DPDK 1478 ctx->exit = 1; 1479 /* XXX - dpdk logic changes */ 1480 if (current_iomodule_func == &dpdk_module_func) { 1481 int master = rte_get_master_lcore(); 1482 if (master == mctx->cpu) 1483 pthread_join(g_thread[mctx->cpu], NULL); 1484 else 1485 rte_eal_wait_lcore(mctx->cpu); 1486 } else 1487 #endif /* !ENABLE_DPDK */ 1488 { 1489 pthread_join(g_thread[mctx->cpu], NULL); 1490 } 1491 1492 TRACE_INFO("MTCP thread %d joined.\n", mctx->cpu); 1493 running[mctx->cpu] = FALSE; 1494 1495 #ifdef NETSTAT 1496 #if NETSTAT_TOTAL 1497 if (printer == mctx->cpu) { 1498 for (i = 0; i < num_cpus; i++) { 1499 if (i != mctx->cpu && running[i]) { 1500 printer = i; 1501 break; 1502 } 1503 } 1504 } 1505 #endif 1506 #endif 1507 1508 log_ctx->done = 1; 1509 ret = write(log_ctx->pair_sp_fd, "F", 1); 1510 if (ret != 1) 1511 TRACE_ERROR("CPU %d: Fail to signal socket pair\n", mctx->cpu); 1512 1513 pthread_join(log_thread[ctx->cpu], NULL); 1514 fclose(mtcp->log_fp); 1515 TRACE_LOG("Log thread %d joined.\n", mctx->cpu); 1516 1517 if (mtcp->connectq) { 1518 DestroyStreamQueue(mtcp->connectq); 1519 mtcp->connectq = NULL; 1520 } 1521 if (mtcp->sendq) { 1522 DestroyStreamQueue(mtcp->sendq); 1523 mtcp->sendq = NULL; 1524 } 1525 if (mtcp->ackq) { 1526 DestroyStreamQueue(mtcp->ackq); 1527 mtcp->ackq = NULL; 1528 } 1529 if (mtcp->closeq) { 1530 DestroyStreamQueue(mtcp->closeq); 1531 mtcp->closeq = NULL; 1532 } 1533 if (mtcp->closeq_int) { 1534 DestroyInternalStreamQueue(mtcp->closeq_int); 1535 mtcp->closeq_int = NULL; 1536 } 1537 if (mtcp->resetq) { 1538 DestroyStreamQueue(mtcp->resetq); 1539 mtcp->resetq = NULL; 1540 } 1541 if (mtcp->resetq_int) { 1542 DestroyInternalStreamQueue(mtcp->resetq_int); 1543 mtcp->resetq_int = NULL; 1544 } 1545 if (mtcp->destroyq) { 1546 DestroyStreamQueue(mtcp->destroyq); 1547 mtcp->destroyq = NULL; 1548 } 1549 1550 DestroyMTCPSender(mtcp->g_sender); 1551 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 1552 DestroyMTCPSender(mtcp->n_sender[i]); 1553 } 1554 1555 MPDestroy(mtcp->rv_pool); 1556 MPDestroy(mtcp->sv_pool); 1557 MPDestroy(mtcp->flow_pool); 1558 1559 if (mtcp->ap) { 1560 DestroyAddressPool(mtcp->ap); 1561 } 1562 1563 SQ_LOCK_DESTROY(&ctx->connect_lock); 1564 SQ_LOCK_DESTROY(&ctx->close_lock); 1565 SQ_LOCK_DESTROY(&ctx->reset_lock); 1566 SQ_LOCK_DESTROY(&ctx->sendq_lock); 1567 SQ_LOCK_DESTROY(&ctx->ackq_lock); 1568 SQ_LOCK_DESTROY(&ctx->destroyq_lock); 1569 1570 //TRACE_INFO("MTCP thread %d destroyed.\n", mctx->cpu); 1571 if (mtcp->iom->destroy_handle) 1572 mtcp->iom->destroy_handle(ctx); 1573 free(ctx); 1574 free(mctx); 1575 1576 return 0; 1577 } 1578 /*----------------------------------------------------------------------------*/ 1579 mtcp_sighandler_t 1580 mtcp_register_signal(int signum, mtcp_sighandler_t handler) 1581 { 1582 mtcp_sighandler_t prev; 1583 1584 if (signum == SIGINT) { 1585 prev = app_signal_handler; 1586 app_signal_handler = handler; 1587 } else { 1588 if ((prev = signal(signum, handler)) == SIG_ERR) { 1589 perror("signal"); 1590 return SIG_ERR; 1591 } 1592 } 1593 1594 return prev; 1595 } 1596 /*----------------------------------------------------------------------------*/ 1597 int 1598 mtcp_getconf(struct mtcp_conf *conf) 1599 { 1600 int i, j; 1601 1602 if (!conf) { 1603 errno = EINVAL; 1604 return -1; 1605 } 1606 1607 conf->num_cores = g_config.mos->num_cores; 1608 conf->max_concurrency = g_config.mos->max_concurrency; 1609 conf->cpu_mask = g_config.mos->cpu_mask; 1610 1611 conf->rcvbuf_size = g_config.mos->rmem_size; 1612 conf->sndbuf_size = g_config.mos->wmem_size; 1613 1614 conf->tcp_timewait = g_config.mos->tcp_tw_interval; 1615 conf->tcp_timeout = g_config.mos->tcp_timeout; 1616 1617 i = 0; 1618 struct conf_block *bwalk; 1619 TAILQ_FOREACH(bwalk, &g_config.app_blkh, link) { 1620 struct app_conf *app_conf = (struct app_conf *)bwalk->conf; 1621 for (j = 0; j < app_conf->app_argc; j++) 1622 conf->app_argv[i][j] = app_conf->app_argv[j]; 1623 conf->app_argc[i] = app_conf->app_argc; 1624 conf->app_cpu_mask[i] = app_conf->cpu_mask; 1625 i++; 1626 } 1627 conf->num_app = i; 1628 1629 return 0; 1630 } 1631 /*----------------------------------------------------------------------------*/ 1632 int 1633 mtcp_setconf(const struct mtcp_conf *conf) 1634 { 1635 if (!conf) 1636 return -1; 1637 1638 g_config.mos->num_cores = conf->num_cores; 1639 g_config.mos->max_concurrency = conf->max_concurrency; 1640 1641 g_config.mos->rmem_size = conf->rcvbuf_size; 1642 g_config.mos->wmem_size = conf->sndbuf_size; 1643 1644 g_config.mos->tcp_tw_interval = conf->tcp_timewait; 1645 g_config.mos->tcp_timeout = conf->tcp_timeout; 1646 1647 TRACE_CONFIG("Configuration updated by mtcp_setconf().\n"); 1648 //PrintConfiguration(); 1649 1650 return 0; 1651 } 1652 /*----------------------------------------------------------------------------*/ 1653 int 1654 mtcp_init(const char *config_file) 1655 { 1656 int i; 1657 int ret; 1658 1659 if (geteuid()) { 1660 TRACE_CONFIG("[CAUTION] Run as root if mlock is necessary.\n"); 1661 #if defined(ENABLE_DPDK) || defined(ENABLE_NETMAP) 1662 TRACE_CONFIG("[CAUTION] Run the app as root!\n"); 1663 exit(EXIT_FAILURE); 1664 #endif 1665 } 1666 1667 /* getting cpu and NIC */ 1668 num_cpus = GetNumCPUs(); 1669 assert(num_cpus >= 1); 1670 for (i = 0; i < num_cpus; i++) { 1671 g_mtcp[i] = NULL; 1672 running[i] = FALSE; 1673 sigint_cnt[i] = 0; 1674 } 1675 1676 ret = LoadConfigurationUpperHalf(config_file); 1677 if (ret) { 1678 TRACE_CONFIG("Error occured while loading configuration.\n"); 1679 return -1; 1680 } 1681 1682 #if defined(ENABLE_PSIO) 1683 current_iomodule_func = &ps_module_func; 1684 #elif defined(ENABLE_DPDK) 1685 current_iomodule_func = &dpdk_module_func; 1686 #elif defined(ENABLE_PCAP) 1687 current_iomodule_func = &pcap_module_func; 1688 #elif defined(ENABLE_NETMAP) 1689 current_iomodule_func = &netmap_module_func; 1690 #endif 1691 1692 if (current_iomodule_func->load_module_upper_half) 1693 current_iomodule_func->load_module_upper_half(); 1694 1695 LoadConfigurationLowerHalf(); 1696 1697 //PrintConfiguration(); 1698 1699 for (i = 0; i < g_config.mos->netdev_table->num; i++) { 1700 ap[i] = CreateAddressPool(g_config.mos->netdev_table->ent[i]->ip_addr, 1); 1701 if (!ap[i]) { 1702 TRACE_CONFIG("Error occured while create address pool[%d]\n", 1703 i); 1704 return -1; 1705 } 1706 } 1707 1708 //PrintInterfaceInfo(); 1709 //PrintRoutingTable(); 1710 //PrintARPTable(); 1711 InitARPTable(); 1712 1713 if (signal(SIGUSR1, HandleSignal) == SIG_ERR) { 1714 perror("signal, SIGUSR1"); 1715 return -1; 1716 } 1717 if (signal(SIGINT, HandleSignal) == SIG_ERR) { 1718 perror("signal, SIGINT"); 1719 return -1; 1720 } 1721 app_signal_handler = NULL; 1722 1723 printf("load_module(): %p\n", current_iomodule_func); 1724 /* load system-wide io module specs */ 1725 if (current_iomodule_func->load_module_lower_half) 1726 current_iomodule_func->load_module_lower_half(); 1727 1728 GlobInitEvent(); 1729 1730 PrintConf(&g_config); 1731 1732 return 0; 1733 } 1734 /*----------------------------------------------------------------------------*/ 1735 int 1736 mtcp_destroy() 1737 { 1738 int i; 1739 1740 /* wait until all threads are closed */ 1741 for (i = 0; i < num_cpus; i++) { 1742 if (running[i]) { 1743 if (pthread_join(g_thread[i], NULL) != 0) 1744 return -1; 1745 } 1746 } 1747 1748 for (i = 0; i < g_config.mos->netdev_table->num; i++) 1749 DestroyAddressPool(ap[i]); 1750 1751 TRACE_INFO("All MTCP threads are joined.\n"); 1752 1753 return 0; 1754 } 1755 /*----------------------------------------------------------------------------*/ 1756