xref: /libevent-2.1.12/epoll.c (revision 1df324d4)
13e41f17aSNiels Provos /*
2b85b710cSNick Mathewson  * Copyright 2000-2007 Niels Provos <[email protected]>
3e49e2891SNick Mathewson  * Copyright 2007-2012 Niels Provos, Nick Mathewson
43e41f17aSNiels Provos  *
53e41f17aSNiels Provos  * Redistribution and use in source and binary forms, with or without
63e41f17aSNiels Provos  * modification, are permitted provided that the following conditions
73e41f17aSNiels Provos  * are met:
83e41f17aSNiels Provos  * 1. Redistributions of source code must retain the above copyright
93e41f17aSNiels Provos  *    notice, this list of conditions and the following disclaimer.
103e41f17aSNiels Provos  * 2. Redistributions in binary form must reproduce the above copyright
113e41f17aSNiels Provos  *    notice, this list of conditions and the following disclaimer in the
123e41f17aSNiels Provos  *    documentation and/or other materials provided with the distribution.
13c3f496c7SNiels Provos  * 3. The name of the author may not be used to endorse or promote products
143e41f17aSNiels Provos  *    derived from this software without specific prior written permission.
153e41f17aSNiels Provos  *
163e41f17aSNiels Provos  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
173e41f17aSNiels Provos  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
183e41f17aSNiels Provos  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
193e41f17aSNiels Provos  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
203e41f17aSNiels Provos  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
213e41f17aSNiels Provos  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
223e41f17aSNiels Provos  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
233e41f17aSNiels Provos  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
243e41f17aSNiels Provos  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
253e41f17aSNiels Provos  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
263e41f17aSNiels Provos  */
27ec347b92SNick Mathewson #include "event2/event-config.h"
280915ca0aSKevin Bowling #include "evconfig-private.h"
293e41f17aSNiels Provos 
3068120d9bSNick Mathewson #ifdef EVENT__HAVE_EPOLL
3176d4c929SRoss Lagerwall 
323e41f17aSNiels Provos #include <stdint.h>
333e41f17aSNiels Provos #include <sys/types.h>
34b0b72eb0SNiels Provos #include <sys/resource.h>
3568120d9bSNick Mathewson #ifdef EVENT__HAVE_SYS_TIME_H
363e41f17aSNiels Provos #include <sys/time.h>
373e41f17aSNiels Provos #endif
383e41f17aSNiels Provos #include <sys/queue.h>
393e41f17aSNiels Provos #include <sys/epoll.h>
403e41f17aSNiels Provos #include <signal.h>
41850c3ff2SChristopher Davis #include <limits.h>
423e41f17aSNiels Provos #include <stdio.h>
433e41f17aSNiels Provos #include <stdlib.h>
443e41f17aSNiels Provos #include <string.h>
453e41f17aSNiels Provos #include <unistd.h>
463e41f17aSNiels Provos #include <errno.h>
4768120d9bSNick Mathewson #ifdef EVENT__HAVE_FCNTL_H
486df2ede5SNiels Provos #include <fcntl.h>
496df2ede5SNiels Provos #endif
5026c75828SNick Mathewson #ifdef EVENT__HAVE_SYS_TIMERFD_H
5126c75828SNick Mathewson #include <sys/timerfd.h>
5226c75828SNick Mathewson #endif
533e41f17aSNiels Provos 
5441b7cbc3SNiels Provos #include "event-internal.h"
55169321c9SNick Mathewson #include "evsignal-internal.h"
566b22e74aSNick Mathewson #include "event2/thread.h"
576b22e74aSNick Mathewson #include "evthread-internal.h"
58169321c9SNick Mathewson #include "log-internal.h"
59169321c9SNick Mathewson #include "evmap-internal.h"
60c8c6a897SNick Mathewson #include "changelist-internal.h"
6171bca50fSNick Mathewson #include "time-internal.h"
62ff266332SJoakim Soderberg 
/*
 * Since Linux 2.6.17, epoll can report a half-closed peer connection by
 * setting the EPOLLRDHUP flag on a read event.  When the system headers do
 * not provide that flag, define it to 0 (so that testing for it never
 * matches) and make the early-close feature bit expand to nothing.
 */
#ifdef EPOLLRDHUP
#define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
#else
#define EPOLLRDHUP 0
#define EARLY_CLOSE_IF_HAVE_RDHUP 0
#endif
72ff266332SJoakim Soderberg 
7343ffcf69SNick Mathewson #include "epolltable-internal.h"
743e41f17aSNiels Provos 
/*
 * timerfd is only used when its header and timerfd_create() were detected,
 * a POSIX monotonic clock is available, and both TFD_NONBLOCK and
 * TFD_CLOEXEC are defined.  Those flags were introduced in Linux 2.6.27, so
 * timerfd cannot be supported on 2.6.25 (where timerfd first appeared) or
 * on 2.6.26.
 */
#if defined(HAVE_POSIX_MONOTONIC) &&				  \
	defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
	defined(TFD_NONBLOCK) && defined(TFD_CLOEXEC)
#define USING_TIMERFD
#endif
8526c75828SNick Mathewson 
/* State kept by the epoll backend for one event_base. */
struct epollop {
	/* Buffer that epoll_wait() fills with ready events. */
	struct epoll_event *events;
	/* Number of entries that fit in the events buffer. */
	int nevents;
	/* Descriptor of the epoll instance. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered on epfd for precise timeouts, or -1 if unused. */
	int timerfd;
#endif
};
943e41f17aSNiels Provos 
/* Backend entry points defined later in this file. */
static void *epoll_init(struct event_base *base);
static int epoll_dispatch(struct event_base *base, struct timeval *tv);
static void epoll_dealloc(struct event_base *base);
983e41f17aSNiels Provos 
999531763aSNick Mathewson static const struct eventop epollops_changelist = {
1009531763aSNick Mathewson 	"epoll (with changelist)",
1013e41f17aSNiels Provos 	epoll_init,
1028ac3c4c2SNick Mathewson 	event_changelist_add_,
1038ac3c4c2SNick Mathewson 	event_changelist_del_,
1042e8051f5SNiels Provos 	epoll_dispatch,
10588897852SNiels Provos 	epoll_dealloc,
10605965921SNick Mathewson 	1, /* need reinit */
1073908a5e3SNick Mathewson 	EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
108c8c6a897SNick Mathewson 	EVENT_CHANGELIST_FDINFO_SIZE
1093e41f17aSNiels Provos };
1103e41f17aSNiels Provos 
1119531763aSNick Mathewson 
1129531763aSNick Mathewson static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
1139531763aSNick Mathewson     short old, short events, void *p);
1149531763aSNick Mathewson static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
1159531763aSNick Mathewson     short old, short events, void *p);
1169531763aSNick Mathewson 
1179531763aSNick Mathewson const struct eventop epollops = {
1189531763aSNick Mathewson 	"epoll",
1199531763aSNick Mathewson 	epoll_init,
1209531763aSNick Mathewson 	epoll_nochangelist_add,
1219531763aSNick Mathewson 	epoll_nochangelist_del,
1229531763aSNick Mathewson 	epoll_dispatch,
1239531763aSNick Mathewson 	epoll_dealloc,
1249531763aSNick Mathewson 	1, /* need reinit */
125b1b69ac7SDiego Giagio 	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
1269531763aSNick Mathewson 	0
1279531763aSNick Mathewson };
1289531763aSNick Mathewson 
/* Number of epoll_event slots allocated when a backend is created. */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the event buffer at this many slots. */
#define MAX_NEVENT 4096

/*
 * Linux kernels at least up to 2.6.24.4 cannot handle epoll timeouts larger
 * than (LONG_MAX - 999ULL)/HZ.  HZ can be as large as 1000 in the wild and
 * LONG_MAX as small as (1<<31)-1, which makes 2147482 msec the largest
 * value that is safe everywhere.  Round that down by 47 seconds to get an
 * even 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
139f9f4d4feSNick Mathewson 
140ca42671aSNiels Provos static void *
epoll_init(struct event_base * base)14141b7cbc3SNiels Provos epoll_init(struct event_base *base)
1423e41f17aSNiels Provos {
143bac906c7SNick Mathewson 	int epfd = -1;
1443ba224dbSNiels Provos 	struct epollop *epollop;
1453e41f17aSNiels Provos 
14668120d9bSNick Mathewson #ifdef EVENT__HAVE_EPOLL_CREATE1
147bac906c7SNick Mathewson 	/* First, try the shiny new epoll_create1 interface, if we have it. */
148bac906c7SNick Mathewson 	epfd = epoll_create1(EPOLL_CLOEXEC);
149bac906c7SNick Mathewson #endif
150bac906c7SNick Mathewson 	if (epfd == -1) {
151bac906c7SNick Mathewson 		/* Initialize the kernel queue using the old interface.  (The
152bac906c7SNick Mathewson 		size field is ignored   since 2.6.8.) */
15385255a63SNick Mathewson 		if ((epfd = epoll_create(32000)) == -1) {
15444ceb945SNick Mathewson 			if (errno != ENOSYS)
155fbdaf3abSNiels Provos 				event_warn("epoll_create");
1563e41f17aSNiels Provos 			return (NULL);
1573e41f17aSNiels Provos 		}
158d0939d2bSJardel Weyrich 		evutil_make_socket_closeonexec(epfd);
159bac906c7SNick Mathewson 	}
1606df2ede5SNiels Provos 
1611aee7183SJamie Iles 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
1621aee7183SJamie Iles 		close(epfd);
1633ba224dbSNiels Provos 		return (NULL);
1641aee7183SJamie Iles 	}
1653ba224dbSNiels Provos 
1663ba224dbSNiels Provos 	epollop->epfd = epfd;
1673e41f17aSNiels Provos 
168e3fd294aSNick Mathewson 	/* Initialize fields */
16918a8cfacSNick Mathewson 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
1703ba224dbSNiels Provos 	if (epollop->events == NULL) {
17149868b61SNick Mathewson 		mm_free(epollop);
1721aee7183SJamie Iles 		close(epfd);
1733e41f17aSNiels Provos 		return (NULL);
1743e41f17aSNiels Provos 	}
17585255a63SNick Mathewson 	epollop->nevents = INITIAL_NEVENT;
1763e41f17aSNiels Provos 
1779531763aSNick Mathewson 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
1789531763aSNick Mathewson 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
17926c75828SNick Mathewson 		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
18026c75828SNick Mathewson 
1819531763aSNick Mathewson 		base->evsel = &epollops_changelist;
18226c75828SNick Mathewson 	}
18326c75828SNick Mathewson 
18426c75828SNick Mathewson #ifdef USING_TIMERFD
18526c75828SNick Mathewson 	/*
18626c75828SNick Mathewson 	  The epoll interface ordinarily gives us one-millisecond precision,
18726c75828SNick Mathewson 	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
18826c75828SNick Mathewson 	  timer.  But when the user has set the new PRECISE_TIMER flag for an
18926c75828SNick Mathewson 	  event_base, we can try to use timerfd to give them finer granularity.
19026c75828SNick Mathewson 	*/
19126c75828SNick Mathewson 	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
19226c75828SNick Mathewson 	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
19326c75828SNick Mathewson 		int fd;
19426c75828SNick Mathewson 		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
19526c75828SNick Mathewson 		if (epollop->timerfd >= 0) {
19626c75828SNick Mathewson 			struct epoll_event epev;
1971258614fSPatrick Pelletier 			memset(&epev, 0, sizeof(epev));
19826c75828SNick Mathewson 			epev.data.fd = epollop->timerfd;
19926c75828SNick Mathewson 			epev.events = EPOLLIN;
20026c75828SNick Mathewson 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
20126c75828SNick Mathewson 				event_warn("epoll_ctl(timerfd)");
20226c75828SNick Mathewson 				close(fd);
20326c75828SNick Mathewson 				epollop->timerfd = -1;
20426c75828SNick Mathewson 			}
20526c75828SNick Mathewson 		} else {
2061aaf9f01SDave Hart 			if (errno != EINVAL && errno != ENOSYS) {
2071aaf9f01SDave Hart 				/* These errors probably mean that we were
2081aaf9f01SDave Hart 				 * compiled with timerfd/TFD_* support, but
2091aaf9f01SDave Hart 				 * we're running on a kernel that lacks those.
2101aaf9f01SDave Hart 				 */
21126c75828SNick Mathewson 				event_warn("timerfd_create");
21226c75828SNick Mathewson 			}
2131aaf9f01SDave Hart 			epollop->timerfd = -1;
2141aaf9f01SDave Hart 		}
21526c75828SNick Mathewson 	} else {
21626c75828SNick Mathewson 		epollop->timerfd = -1;
21726c75828SNick Mathewson 	}
21826c75828SNick Mathewson #endif
2199531763aSNick Mathewson 
2208ac3c4c2SNick Mathewson 	evsig_init_(base);
2213ba224dbSNiels Provos 
2223ba224dbSNiels Provos 	return (epollop);
2233e41f17aSNiels Provos }
2243e41f17aSNiels Provos 
2259e725f72SNick Mathewson static const char *
change_to_string(int change)2269e725f72SNick Mathewson change_to_string(int change)
2279e725f72SNick Mathewson {
2289e725f72SNick Mathewson 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
2299e725f72SNick Mathewson 	if (change == EV_CHANGE_ADD) {
2309e725f72SNick Mathewson 		return "add";
2319e725f72SNick Mathewson 	} else if (change == EV_CHANGE_DEL) {
2329e725f72SNick Mathewson 		return "del";
2339e725f72SNick Mathewson 	} else if (change == 0) {
2349e725f72SNick Mathewson 		return "none";
2359e725f72SNick Mathewson 	} else {
2369e725f72SNick Mathewson 		return "???";
2379e725f72SNick Mathewson 	}
2389e725f72SNick Mathewson }
2399e725f72SNick Mathewson 
/*
 * Name an epoll_ctl() operation for log messages.  Returns "ADD", "DEL" or
 * "MOD" for the matching EPOLL_CTL_* value and "???" for anything else.
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
2489e725f72SNick Mathewson 
/*
 * Expands to a format string followed by its matching argument list, for
 * use as the arguments of event_debug()/event_warn() when reporting the
 * outcome of an epoll_ctl() call.
 *
 *   op      the EPOLL_CTL_* operation that was attempted
 *   events  the epoll event mask that was passed to the kernel
 *   ch      pointer to the struct event_change being applied
 *   status  string literal spliced into the format string ("okay", ...)
 *
 * `events` is cast to int so that it matches its %d conversion; the other
 * log calls in this file do the same for epev.events.
 */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "       \
	"Old events were %d; "                     \
	"read change was %d (%s); "                \
	"write change was %d (%s); "               \
	"close change was %d (%s)",                \
	epoll_op_to_string(op),                    \
	(int)(events),                             \
	(ch)->fd,                                  \
	(ch)->old_events,                          \
	(ch)->read_change,                         \
	change_to_string((ch)->read_change),       \
	(ch)->write_change,                        \
	change_to_string((ch)->write_change),      \
	(ch)->close_change,                        \
	change_to_string((ch)->close_change)
265a1b142bdSAzat Khuzhin 
266d80c1c36SNick Mathewson static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)2679531763aSNick Mathewson epoll_apply_one_change(struct event_base *base,
2689531763aSNick Mathewson     struct epollop *epollop,
2699531763aSNick Mathewson     const struct event_change *ch)
270c8c6a897SNick Mathewson {
271c8c6a897SNick Mathewson 	struct epoll_event epev;
2729531763aSNick Mathewson 	int op, events = 0;
2738c83eb69SNick Mathewson 	int idx;
274c8c6a897SNick Mathewson 
27543ffcf69SNick Mathewson 	idx = EPOLL_OP_TABLE_INDEX(ch);
27643ffcf69SNick Mathewson 	op = epoll_op_table[idx].op;
27743ffcf69SNick Mathewson 	events = epoll_op_table[idx].events;
278c8c6a897SNick Mathewson 
2798c83eb69SNick Mathewson 	if (!events) {
2808c83eb69SNick Mathewson 		EVUTIL_ASSERT(op == 0);
2819531763aSNick Mathewson 		return 0;
2828c83eb69SNick Mathewson 	}
2838c83eb69SNick Mathewson 
284db2efdf5SAzat Khuzhin 	if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
2858c83eb69SNick Mathewson 		events |= EPOLLET;
286c8c6a897SNick Mathewson 
287c8c6a897SNick Mathewson 	memset(&epev, 0, sizeof(epev));
288c8c6a897SNick Mathewson 	epev.data.fd = ch->fd;
289c8c6a897SNick Mathewson 	epev.events = events;
2902d55a190SNick Mathewson 	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
291a1b142bdSAzat Khuzhin 		event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
2922d55a190SNick Mathewson 		return 0;
2932d55a190SNick Mathewson 	}
2942d55a190SNick Mathewson 
2952d55a190SNick Mathewson 	switch (op) {
2962d55a190SNick Mathewson 	case EPOLL_CTL_MOD:
2972d55a190SNick Mathewson 		if (errno == ENOENT) {
298c8c6a897SNick Mathewson 			/* If a MOD operation fails with ENOENT, the
299c8c6a897SNick Mathewson 			 * fd was probably closed and re-opened.  We
300c8c6a897SNick Mathewson 			 * should retry the operation as an ADD.
301c8c6a897SNick Mathewson 			 */
302c8c6a897SNick Mathewson 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
303ec2b05edSNick Mathewson 				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
304ec2b05edSNick Mathewson 				    (int)epev.events, ch->fd);
3059531763aSNick Mathewson 				return -1;
306c8c6a897SNick Mathewson 			} else {
307ec2b05edSNick Mathewson 				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
308ec2b05edSNick Mathewson 					(int)epev.events,
309ec2b05edSNick Mathewson 					ch->fd));
3102d55a190SNick Mathewson 				return 0;
311c8c6a897SNick Mathewson 			}
3122d55a190SNick Mathewson 		}
3132d55a190SNick Mathewson 		break;
3142d55a190SNick Mathewson 	case EPOLL_CTL_ADD:
3152d55a190SNick Mathewson 		if (errno == EEXIST) {
316c281aba3SNick Mathewson 			/* If an ADD operation fails with EEXIST,
317c281aba3SNick Mathewson 			 * either the operation was redundant (as with a
318c281aba3SNick Mathewson 			 * precautionary add), or we ran into a fun
319c281aba3SNick Mathewson 			 * kernel bug where using dup*() to duplicate the
320c281aba3SNick Mathewson 			 * same file into the same fd gives you the same epitem
321c281aba3SNick Mathewson 			 * rather than a fresh one.  For the second case,
322c281aba3SNick Mathewson 			 * we must retry with MOD. */
323c281aba3SNick Mathewson 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
324c281aba3SNick Mathewson 				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
325c281aba3SNick Mathewson 				    (int)epev.events, ch->fd);
3269531763aSNick Mathewson 				return -1;
327c281aba3SNick Mathewson 			} else {
328c281aba3SNick Mathewson 				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
329ec2b05edSNick Mathewson 					(int)epev.events,
330c281aba3SNick Mathewson 					ch->fd));
3312d55a190SNick Mathewson 				return 0;
332c281aba3SNick Mathewson 			}
3332d55a190SNick Mathewson 		}
3342d55a190SNick Mathewson 		break;
3352d55a190SNick Mathewson 	case EPOLL_CTL_DEL:
336d4970d4eSNick Mathewson 		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
337c8c6a897SNick Mathewson 			/* If a delete fails with one of these errors,
338c8c6a897SNick Mathewson 			 * that's fine too: we closed the fd before we
339c8c6a897SNick Mathewson 			 * got around to calling epoll_dispatch. */
340ec2b05edSNick Mathewson 			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
341ec2b05edSNick Mathewson 				(int)epev.events,
342ec2b05edSNick Mathewson 				ch->fd,
343ec2b05edSNick Mathewson 				strerror(errno)));
3442d55a190SNick Mathewson 			return 0;
3452d55a190SNick Mathewson 		}
3462d55a190SNick Mathewson 		break;
3472d55a190SNick Mathewson 	default:
3482d55a190SNick Mathewson 		break;
3492d55a190SNick Mathewson 	}
3502d55a190SNick Mathewson 
351a1b142bdSAzat Khuzhin 	event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
3529531763aSNick Mathewson 	return -1;
353c8c6a897SNick Mathewson }
354c8c6a897SNick Mathewson 
3559531763aSNick Mathewson static int
epoll_apply_changes(struct event_base * base)3569531763aSNick Mathewson epoll_apply_changes(struct event_base *base)
3579531763aSNick Mathewson {
3589531763aSNick Mathewson 	struct event_changelist *changelist = &base->changelist;
3599531763aSNick Mathewson 	struct epollop *epollop = base->evbase;
3609531763aSNick Mathewson 	struct event_change *ch;
3619531763aSNick Mathewson 
3629531763aSNick Mathewson 	int r = 0;
3639531763aSNick Mathewson 	int i;
3649531763aSNick Mathewson 
3659531763aSNick Mathewson 	for (i = 0; i < changelist->n_changes; ++i) {
3669531763aSNick Mathewson 		ch = &changelist->changes[i];
3679531763aSNick Mathewson 		if (epoll_apply_one_change(base, epollop, ch) < 0)
3689531763aSNick Mathewson 			r = -1;
3699531763aSNick Mathewson 	}
3709531763aSNick Mathewson 
3719531763aSNick Mathewson 	return (r);
3729531763aSNick Mathewson }
3739531763aSNick Mathewson 
3749531763aSNick Mathewson static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3759531763aSNick Mathewson epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
3769531763aSNick Mathewson     short old, short events, void *p)
3779531763aSNick Mathewson {
3789531763aSNick Mathewson 	struct event_change ch;
3799531763aSNick Mathewson 	ch.fd = fd;
3809531763aSNick Mathewson 	ch.old_events = old;
381b1b69ac7SDiego Giagio 	ch.read_change = ch.write_change = ch.close_change = 0;
3829531763aSNick Mathewson 	if (events & EV_WRITE)
3839531763aSNick Mathewson 		ch.write_change = EV_CHANGE_ADD |
3849531763aSNick Mathewson 		    (events & EV_ET);
3859531763aSNick Mathewson 	if (events & EV_READ)
3869531763aSNick Mathewson 		ch.read_change = EV_CHANGE_ADD |
3879531763aSNick Mathewson 		    (events & EV_ET);
388b1b69ac7SDiego Giagio 	if (events & EV_CLOSED)
389b1b69ac7SDiego Giagio 		ch.close_change = EV_CHANGE_ADD |
390b1b69ac7SDiego Giagio 		    (events & EV_ET);
3919531763aSNick Mathewson 
3929531763aSNick Mathewson 	return epoll_apply_one_change(base, base->evbase, &ch);
3939531763aSNick Mathewson }
3949531763aSNick Mathewson 
3959531763aSNick Mathewson static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)3969531763aSNick Mathewson epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
3979531763aSNick Mathewson     short old, short events, void *p)
3989531763aSNick Mathewson {
3999531763aSNick Mathewson 	struct event_change ch;
4009531763aSNick Mathewson 	ch.fd = fd;
4019531763aSNick Mathewson 	ch.old_events = old;
402b1b69ac7SDiego Giagio 	ch.read_change = ch.write_change = ch.close_change = 0;
4039531763aSNick Mathewson 	if (events & EV_WRITE)
404ca4b6404SAzat Khuzhin 		ch.write_change = EV_CHANGE_DEL |
405ca4b6404SAzat Khuzhin 		    (events & EV_ET);
4069531763aSNick Mathewson 	if (events & EV_READ)
407ca4b6404SAzat Khuzhin 		ch.read_change = EV_CHANGE_DEL |
408ca4b6404SAzat Khuzhin 		    (events & EV_ET);
409b1b69ac7SDiego Giagio 	if (events & EV_CLOSED)
410ca4b6404SAzat Khuzhin 		ch.close_change = EV_CHANGE_DEL |
411ca4b6404SAzat Khuzhin 		    (events & EV_ET);
4129531763aSNick Mathewson 
4139531763aSNick Mathewson 	return epoll_apply_one_change(base, base->evbase, &ch);
414c8c6a897SNick Mathewson }
415c8c6a897SNick Mathewson 
416c8c6a897SNick Mathewson static int
epoll_dispatch(struct event_base * base,struct timeval * tv)41702b2b4d1SNiels Provos epoll_dispatch(struct event_base *base, struct timeval *tv)
4183e41f17aSNiels Provos {
41902b2b4d1SNiels Provos 	struct epollop *epollop = base->evbase;
4203e41f17aSNiels Provos 	struct epoll_event *events = epollop->events;
421850c3ff2SChristopher Davis 	int i, res;
422850c3ff2SChristopher Davis 	long timeout = -1;
4233e41f17aSNiels Provos 
42426c75828SNick Mathewson #ifdef USING_TIMERFD
42526c75828SNick Mathewson 	if (epollop->timerfd >= 0) {
42626c75828SNick Mathewson 		struct itimerspec is;
42726c75828SNick Mathewson 		is.it_interval.tv_sec = 0;
42826c75828SNick Mathewson 		is.it_interval.tv_nsec = 0;
42926c75828SNick Mathewson 		if (tv == NULL) {
43026c75828SNick Mathewson 			/* No timeout; disarm the timer. */
43126c75828SNick Mathewson 			is.it_value.tv_sec = 0;
43226c75828SNick Mathewson 			is.it_value.tv_nsec = 0;
43326c75828SNick Mathewson 		} else {
43426c75828SNick Mathewson 			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
43526c75828SNick Mathewson 				/* we need to exit immediately; timerfd can't
43626c75828SNick Mathewson 				 * do that. */
43726c75828SNick Mathewson 				timeout = 0;
43826c75828SNick Mathewson 			}
43926c75828SNick Mathewson 			is.it_value.tv_sec = tv->tv_sec;
44026c75828SNick Mathewson 			is.it_value.tv_nsec = tv->tv_usec * 1000;
44126c75828SNick Mathewson 		}
44226c75828SNick Mathewson 		/* TODO: we could avoid unnecessary syscalls here by only
44326c75828SNick Mathewson 		   calling timerfd_settime when the top timeout changes, or
44426c75828SNick Mathewson 		   when we're called with a different timeval.
44526c75828SNick Mathewson 		*/
44626c75828SNick Mathewson 		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
44726c75828SNick Mathewson 			event_warn("timerfd_settime");
44826c75828SNick Mathewson 		}
44926c75828SNick Mathewson 	} else
45026c75828SNick Mathewson #endif
451850c3ff2SChristopher Davis 	if (tv != NULL) {
4528ac3c4c2SNick Mathewson 		timeout = evutil_tv_to_msec_(tv);
453850c3ff2SChristopher Davis 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
454850c3ff2SChristopher Davis 			/* Linux kernels can wait forever if the timeout is
455850c3ff2SChristopher Davis 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
456f9f4d4feSNick Mathewson 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
457f9f4d4feSNick Mathewson 		}
458850c3ff2SChristopher Davis 	}
459f9f4d4feSNick Mathewson 
460c8c6a897SNick Mathewson 	epoll_apply_changes(base);
4618ac3c4c2SNick Mathewson 	event_changelist_remove_all_(&base->changelist, base);
462c8c6a897SNick Mathewson 
46376cd2b70SNick Mathewson 	EVBASE_RELEASE_LOCK(base, th_base_lock);
4646b22e74aSNick Mathewson 
4653e41f17aSNiels Provos 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
4663e41f17aSNiels Provos 
46776cd2b70SNick Mathewson 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
4686b22e74aSNick Mathewson 
4693e41f17aSNiels Provos 	if (res == -1) {
4703e41f17aSNiels Provos 		if (errno != EINTR) {
471fbdaf3abSNiels Provos 			event_warn("epoll_wait");
4723e41f17aSNiels Provos 			return (-1);
4733e41f17aSNiels Provos 		}
4743e41f17aSNiels Provos 
4753e41f17aSNiels Provos 		return (0);
47641b7cbc3SNiels Provos 	}
4773e41f17aSNiels Provos 
478fbdaf3abSNiels Provos 	event_debug(("%s: epoll_wait reports %d", __func__, res));
4792e36dbe1SNick Mathewson 	EVUTIL_ASSERT(res <= epollop->nevents);
4803e41f17aSNiels Provos 
4813e41f17aSNiels Provos 	for (i = 0; i < res; i++) {
482e1cd86d7SNiels Provos 		int what = events[i].events;
48385255a63SNick Mathewson 		short ev = 0;
48426c75828SNick Mathewson #ifdef USING_TIMERFD
48526c75828SNick Mathewson 		if (events[i].data.fd == epollop->timerfd)
48626c75828SNick Mathewson 			continue;
48726c75828SNick Mathewson #endif
488bbed0954SNiels Provos 
489*1df324d4SAzat Khuzhin 		if (what & EPOLLERR) {
490*1df324d4SAzat Khuzhin 			ev = EV_READ | EV_WRITE;
491*1df324d4SAzat Khuzhin 		} else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
49285255a63SNick Mathewson 			ev = EV_READ | EV_WRITE;
493bbed0954SNiels Provos 		} else {
49402b2b4d1SNiels Provos 			if (what & EPOLLIN)
49585255a63SNick Mathewson 				ev |= EV_READ;
49602b2b4d1SNiels Provos 			if (what & EPOLLOUT)
49785255a63SNick Mathewson 				ev |= EV_WRITE;
498b1b69ac7SDiego Giagio 			if (what & EPOLLRDHUP)
499b1b69ac7SDiego Giagio 				ev |= EV_CLOSED;
500bbed0954SNiels Provos 		}
5013e41f17aSNiels Provos 
5020faaee01SNick Mathewson 		if (!ev)
503bbed0954SNiels Provos 			continue;
504bbed0954SNiels Provos 
5058ac3c4c2SNick Mathewson 		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
50685255a63SNick Mathewson 	}
50785255a63SNick Mathewson 
50885255a63SNick Mathewson 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
50985255a63SNick Mathewson 		/* We used all of the event space this time.  We should
51085255a63SNick Mathewson 		   be ready for more events next time. */
51185255a63SNick Mathewson 		int new_nevents = epollop->nevents * 2;
51285255a63SNick Mathewson 		struct epoll_event *new_events;
51385255a63SNick Mathewson 
51485255a63SNick Mathewson 		new_events = mm_realloc(epollop->events,
51585255a63SNick Mathewson 		    new_nevents * sizeof(struct epoll_event));
51685255a63SNick Mathewson 		if (new_events) {
51785255a63SNick Mathewson 			epollop->events = new_events;
51885255a63SNick Mathewson 			epollop->nevents = new_nevents;
51985255a63SNick Mathewson 		}
5203e41f17aSNiels Provos 	}
5213e41f17aSNiels Provos 
5223e41f17aSNiels Provos 	return (0);
5233e41f17aSNiels Provos }
5243e41f17aSNiels Provos 
5252e8051f5SNiels Provos 
526ca42671aSNiels Provos static void
epoll_dealloc(struct event_base * base)52702b2b4d1SNiels Provos epoll_dealloc(struct event_base *base)
5282e8051f5SNiels Provos {
52902b2b4d1SNiels Provos 	struct epollop *epollop = base->evbase;
5302e8051f5SNiels Provos 
5318ac3c4c2SNick Mathewson 	evsig_dealloc_(base);
5322e8051f5SNiels Provos 	if (epollop->events)
53349868b61SNick Mathewson 		mm_free(epollop->events);
5342e8051f5SNiels Provos 	if (epollop->epfd >= 0)
5352e8051f5SNiels Provos 		close(epollop->epfd);
53626c75828SNick Mathewson #ifdef USING_TIMERFD
53726c75828SNick Mathewson 	if (epollop->timerfd >= 0)
53826c75828SNick Mathewson 		close(epollop->timerfd);
53926c75828SNick Mathewson #endif
5402e8051f5SNiels Provos 
5412e8051f5SNiels Provos 	memset(epollop, 0, sizeof(struct epollop));
54249868b61SNick Mathewson 	mm_free(epollop);
5432e8051f5SNiels Provos }
54476d4c929SRoss Lagerwall 
54568120d9bSNick Mathewson #endif /* EVENT__HAVE_EPOLL */
546