1 #include "network_backends.h"
2 
3 #ifdef USE_MTCP
4 
5 #include "network.h"
6 #include "fdevent.h"
7 #include "log.h"
8 #include "stat_cache.h"
9 
10 #include <sys/types.h>
11 #include <sys/socket.h>
12 #include <sys/stat.h>
13 #include <sys/time.h>
14 #include <sys/resource.h>
15 
16 #include <netinet/in.h>
17 #include <netinet/tcp.h>
18 
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <unistd.h>
22 #include <netdb.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <fcntl.h>
26 /*------------------------------------------------------------------------------------*/
27 /* on linux 2.4.29 + debian/ubuntu we have crashes if this is enabled */
28 #undef HAVE_POSIX_FADVISE
29 /*------------------------------------------------------------------------------------*/
30 #if 0
/*
 * Disabled (#if 0) mtcp_writev()-based variant of the mTCP chunkqueue writer.
 *
 * Walks cq starting at cq->first while max_bytes > 0:
 *  - a run of consecutive MEM_CHUNKs is gathered into an iovec array and
 *    handed to a single mtcp_writev() call;
 *  - a FILE_CHUNK is mmap()ed in windows of 512 KByte and sent with
 *    mtcp_write().
 * Chunk offsets, cq->bytes_out and max_bytes are advanced by the number of
 * bytes the write call reported.
 *
 * Returns 0 when the loop ends (EAGAIN/EINTR are treated as "0 bytes
 * written"), -2 on EPIPE/ECONNRESET and -1 on every other error, including an
 * unknown chunk type. A negative toSend in the FILE_CHUNK path terminates the
 * process via exit(EXIT_FAILURE).
 */
int
network_write_chunkqueue_mtcp_writev(server *srv, connection *con, int fd, chunkqueue *cq, off_t max_bytes)
{
	chunk *c;

	for(c = cq->first; (max_bytes > 0) && (NULL != c); c = c->next) {
		int chunk_finished = 0;

		switch(c->type) {
		case MEM_CHUNK: {
			char * offset;
			off_t toSend;
			ssize_t r;

			size_t num_chunks, i;
			struct iovec *chunks;
			chunk *tc;
			size_t num_bytes = 0;
			/* pick the upper bound for the number of iovec entries per call */
#if defined(_SC_IOV_MAX) /* IRIX, MacOS X, FreeBSD, Solaris, ... */
			const size_t max_chunks = sysconf(_SC_IOV_MAX);
#elif defined(IOV_MAX) /* Linux x86 (glibc-2.3.6-3) */
			const size_t max_chunks = IOV_MAX;
#elif defined(MAX_IOVEC) /* Linux ia64 (glibc-2.3.3-98.28) */
			const size_t max_chunks = MAX_IOVEC;
#elif defined(UIO_MAXIOV) /* Linux x86 (glibc-2.2.5-233) */
			const size_t max_chunks = UIO_MAXIOV;
#elif (defined(__FreeBSD__) && __FreeBSD_version < 500000) || defined(__DragonFly__) || defined(__APPLE__)
			/* - FreeBSD 4.x
			 * - MacOS X 10.3.x
			 *   (covered in -DKERNEL)
			 *  */
			const size_t max_chunks = 1024; /* UIO_MAXIOV value from sys/uio.h */
#else
#error "sysconf() doesnt return _SC_IOV_MAX ..., check the output of 'man writev' for the EINVAL error and send the output to [email protected]"
#endif

			/* build writev list
			 *
			 * 1. limit: num_chunks < max_chunks
			 * 2. limit: num_bytes < max_bytes
			 */
			/* empty-body loop: count the run of MEM_CHUNKs starting at c, capped at max_chunks */
			for (num_chunks = 0, tc = c; tc && tc->type == MEM_CHUNK && num_chunks < max_chunks; num_chunks++, tc = tc->next);

			/* NOTE(review): the calloc() result is used below without a NULL check */
			chunks = calloc(num_chunks, sizeof(*chunks));

			for(tc = c, i = 0; i < num_chunks; tc = tc->next, i++) {
				if (tc->mem->used == 0) {
					/* empty buffer: keep a zero-length entry so iovec index i still maps to chunk i */
					chunks[i].iov_base = tc->mem->ptr;
					chunks[i].iov_len  = 0;
				} else {
					offset = tc->mem->ptr + tc->offset;
					/* presumably mem->used counts a trailing terminator byte, hence the -1 -- TODO confirm */
					toSend = tc->mem->used - 1 - tc->offset;

					chunks[i].iov_base = offset;

					/* protect the return value of writev() */
					if (toSend > max_bytes ||
					    (off_t) num_bytes + toSend > max_bytes) {
						/* byte budget exceeded: cut this entry to what is left and make it the last iovec */
						chunks[i].iov_len = max_bytes - num_bytes;

						num_chunks = i + 1;
						break;
					} else {
						chunks[i].iov_len = toSend;
					}

					num_bytes += toSend;
				}
			}

			if ((r = mtcp_writev(srv->mctx, fd, chunks, num_chunks)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written this time; fall through to the bookkeeping with r == 0 */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					free(chunks);
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"writev failed:", strerror(errno), fd);

					free(chunks);
					return -1;
				}
			}

			cq->bytes_out += r;
			max_bytes -= r;

			/* check which chunks have been written */

			/* r is consumed entry by entry; the first entry that r does not cover is the partial one */
			for(i = 0, tc = c; i < num_chunks; i++, tc = tc->next) {
				if (r >= (ssize_t)chunks[i].iov_len) {
					/* written */
					r -= chunks[i].iov_len;
					tc->offset += chunks[i].iov_len;

					/* the first fully written chunk only sets the flag; every further one
					 * advances c, so the outer for()'s c = c->next ends up just past the
					 * last fully written chunk */
					if (chunk_finished) {
						/* skip the chunks from further touches */
						c = c->next;
					} else {
						/* chunks_written + c = c->next is done in the for()*/
						chunk_finished = 1;
					}
				} else {
					/* partially written */

					tc->offset += r;
					chunk_finished = 0;

					break;
				}
			}
			free(chunks);

			break;
		}
		case FILE_CHUNK: {
			ssize_t r;
			off_t abs_offset;
			off_t toSend;
			stat_cache_entry *sce = NULL;

			/* postfix multiplier macros: "512 KByte" expands to "512 * 1024" */
#define KByte * 1024
#define MByte * 1024 KByte
#define GByte * 1024 MByte
			const off_t we_want_to_mmap = 512 KByte;
			char *start = NULL;

			if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						strerror(errno), c->file.name);
				return -1;
			}

			/* absolute position in the file of the next byte to send */
			abs_offset = c->file.start + c->offset;

			if (abs_offset > sce->st.st_size) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						"file was shrinked:", c->file.name);

				return -1;
			}

			/* mmap the buffer
			 * - first mmap
			 * - new mmap as we are at the end of the last one */
			if (c->file.mmap.start == MAP_FAILED ||
			    abs_offset == (off_t)(c->file.mmap.offset + c->file.mmap.length)) {

				/* Optimizations for the future:
				 *
				 * adaptive mem-mapping
				 *   the problem:
				 *     we mmap() the whole file. If someone has a lot of large files on a 32bit
				 *     machine the virtual address space will run out and we will have a failing
				 *     mmap() call.
				 *   solution:
				 *     only mmap 16M in one chunk and move the window as soon as we have finished
				 *     the first 8M
				 *
				 * read-ahead buffering
				 *   the problem:
				 *     sending out several large files in parallel trashes the read-ahead of the
				 *     kernel leading to long wait-for-seek times.
				 *   solutions: (increasing complexity)
				 *     1. use madvise
				 *     2. use an internal read-ahead buffer in the chunk-structure
				 *     3. use non-blocking IO for file-transfers
				 *   */

				/* all mmap()ed areas are 512kb except the last which might be smaller */
				off_t we_want_to_send;
				size_t to_mmap;

				/* this is a remap, move the mmap-offset */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.offset += we_want_to_mmap;
				} else {
					/* in case the range-offset is after the first mmap()ed area we skip the area */
					c->file.mmap.offset = 0;

					/* step the window forward in 512 KByte increments towards c->file.start */
					while (c->file.mmap.offset + we_want_to_mmap < c->file.start) {
						c->file.mmap.offset += we_want_to_mmap;
					}
				}

				/* length is rel, c->offset too, assume there is no limit at the mmap-boundaries */
				we_want_to_send = c->file.length - c->offset;
				to_mmap = (c->file.start + c->file.length) - c->file.mmap.offset;

				/* we have more to send than we can mmap() at once */
				if (abs_offset + we_want_to_send > c->file.mmap.offset + we_want_to_mmap) {
					/* NOTE(review): we_want_to_send is assigned here but not read again in this block */
					we_want_to_send = (c->file.mmap.offset + we_want_to_mmap) - abs_offset;
					to_mmap = we_want_to_mmap;
				}

				if (-1 == c->file.fd) {  /* open the file if not already open */
					if (-1 == (c->file.fd = open(c->file.name->ptr, O_RDONLY))) {
						log_error_write(srv, __FILE__, __LINE__, "sbs", "open failed for:", c->file.name, strerror(errno));

						return -1;
					}
#ifdef FD_CLOEXEC
					fcntl(c->file.fd, F_SETFD, FD_CLOEXEC);
#endif
				}

				if (MAP_FAILED == (c->file.mmap.start = mmap(NULL, to_mmap, PROT_READ, MAP_SHARED, c->file.fd, c->file.mmap.offset))) {
					/* close it here, otherwise we'd have to set FD_CLOEXEC */
					/* NOTE(review): despite the comment above, c->file.fd is not closed on this path */

					log_error_write(srv, __FILE__, __LINE__, "ssbd", "mmap failed:",
							strerror(errno), c->file.name, c->file.fd);

					return -1;
				}

				c->file.mmap.length = to_mmap;
#ifdef LOCAL_BUFFERING
				buffer_copy_string_len(c->mem, c->file.mmap.start, c->file.mmap.length);
#else
#ifdef HAVE_MADVISE
				/* don't advise files < 64Kb */
				if (c->file.mmap.length > (64 KByte)) {
					/* darwin 7 is returning EINVAL all the time and I don't know how to
					 * detect this at runtime.
					 *
					 * ignore the return value for now */
					madvise(c->file.mmap.start, c->file.mmap.length, MADV_WILLNEED);
				}
#endif
#endif

				/* chunk_reset() or chunk_free() will cleanup for us */
			}

			/* to_send = abs_mmap_end - abs_offset */
			toSend = (c->file.mmap.offset + c->file.mmap.length) - (abs_offset);

			/* a negative value means abs_offset lies beyond the mapped window; treated as fatal */
			if (toSend < 0) {
				log_error_write(srv, __FILE__, __LINE__, "soooo",
						"toSend is negative:",
						toSend,
						c->file.mmap.length,
						abs_offset,
						c->file.mmap.offset);
				exit(EXIT_FAILURE);
			}

			if (toSend > max_bytes) toSend = max_bytes;

#ifdef LOCAL_BUFFERING
			start = c->mem->ptr;
#else
			start = c->file.mmap.start;
#endif

			/* start points at file position c->file.mmap.offset, so index relative to the window */
			if ((r = mtcp_write(srv->mctx, fd, start + (abs_offset - c->file.mmap.offset), toSend)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"write failed:", strerror(errno), fd);

					return -1;
				}
			}

			c->offset += r;
			cq->bytes_out += r;
			max_bytes -= r;

			if (c->offset == c->file.length) {
				chunk_finished = 1;

				/* we don't need the mmaping anymore */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.start = MAP_FAILED;
				}
			}

			break;
		}
		default:

			log_error_write(srv, __FILE__, __LINE__, "ds", c, "type not known");

			return -1;
		}

		if (!chunk_finished) {
			/* not finished yet */
			/* stop at the first chunk that could not be sent completely */

			break;
		}
	}

	return 0;
}
341 #endif
342 /*----------------------------------------------------------------------------------*/
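/*
 * Active implementation. Despite the `_writev' suffix in its name, the
 * function below sends one chunk at a time with mtcp_write(); it is the
 * `write' version of the mTCP backend, kept for debugging purposes.
 */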
#ifdef USE_MMAP
/* The mmap()-based file path that used to sit in the FILE_CHUNK case sent data
 * with plain write() and was guarded by bare text, so it could never compile.
 * Fail with an explicit diagnostic instead of a syntax error. */
#error "USE_MMAP is not supported by the mTCP write backend"
#endif

/*
 * Send up to len bytes from buf on socket fd.
 *
 * On success stores the number of bytes the write call accepted in *written
 * and returns 0; EAGAIN and EINTR are reported as success with *written == 0.
 * Returns -2 on EPIPE/ECONNRESET and -1 (after logging) on any other error.
 */
static int network_mtcp_write_buf(server *srv, int fd, char *buf, off_t len, ssize_t *written) {
	ssize_t r;

#ifdef __WIN32
	if ((r = send(fd, buf, len, 0)) < 0) {
		/* no error handling for windows... */
		log_error_write(srv, __FILE__, __LINE__, "ssd", "send failed: ", strerror(errno), fd);

		return -1;
	}
#else
	if ((r = mtcp_write(srv->mctx, fd, buf, len)) < 0) {
		switch (errno) {
		case EAGAIN:
		case EINTR:
			/* nothing was sent this time; the caller keeps the chunk and stops */
			r = 0;
			break;
		case EPIPE:
		case ECONNRESET:
			return -2;
		default:
			log_error_write(srv, __FILE__, __LINE__, "ssd",
				"write failed:", strerror(errno), fd);

			return -1;
		}
	}
#endif

	*written = r;
	return 0;
}

/*
 * Write chunks from cq to socket fd, one chunk per write call.
 *
 * Starting at cq->first, each chunk is sent with a single write:
 *  - MEM_CHUNK:  the unsent part of c->mem is written directly;
 *  - FILE_CHUNK: the file is opened, up to 64 KByte are read into
 *                srv->tmp_buf at the current position, and that buffer is
 *                written.
 * c->offset, cq->bytes_out and max_bytes are advanced by the number of bytes
 * actually written. The loop stops at the first chunk that was not sent
 * completely, when max_bytes is used up, or at the end of the queue.
 *
 * Parameters:
 *   srv        server context (provides srv->mctx and srv->tmp_buf)
 *   con        connection, passed through to stat_cache_get_entry()
 *   fd         socket to write to
 *   cq         chunk queue to send from
 *   max_bytes  upper bound on the number of bytes to send in this call
 *
 * Returns 0 on success (including "nothing could be written right now"),
 * -2 when the write failed with EPIPE or ECONNRESET, and -1 on any other
 * error (stat/open/lseek/read failure, file shrunk, unknown chunk type).
 */
int network_write_chunkqueue_mtcp_writev(server *srv, connection *con, int fd, chunkqueue *cq, off_t max_bytes) {
	/* Upper bound for one read() of a file chunk. The write below may accept
	 * fewer bytes than offered and the data is read again on the next call,
	 * so reading more than this per step only wastes memory and I/O. */
	const off_t max_read_bytes = 64 * 1024;
	chunk *c;

	for (c = cq->first; (max_bytes > 0) && (NULL != c); c = c->next) {
		int chunk_finished = 0;

		switch (c->type) {
		case MEM_CHUNK: {
			off_t toSend;
			ssize_t r = 0;
			int rc;

			if (c->mem->used == 0) {
				/* empty buffer: nothing to send, move on to the next chunk */
				chunk_finished = 1;
				break;
			}

			/* the last byte counted by mem->used is not sent (hence the -1) */
			toSend = c->mem->used - 1 - c->offset;
			if (toSend > max_bytes) toSend = max_bytes;

			rc = network_mtcp_write_buf(srv, fd, c->mem->ptr + c->offset, toSend, &r);
			if (0 != rc) return rc;

			c->offset += r;
			cq->bytes_out += r;
			max_bytes -= r;

			if (c->offset == (off_t)c->mem->used - 1) {
				chunk_finished = 1;
			}

			break;
		}
		case FILE_CHUNK: {
			ssize_t r = 0;
			ssize_t nread;
			off_t offset;
			off_t toSend;
			stat_cache_entry *sce = NULL;
			int ifd;
			int rc;

			if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						strerror(errno), c->file.name);
				return -1;
			}

			offset = c->file.start + c->offset;
			toSend = c->file.length - c->offset;

			if (toSend > max_bytes) toSend = max_bytes;
			if (toSend > max_read_bytes) toSend = max_read_bytes;

			if (offset > sce->st.st_size) {
				log_error_write(srv, __FILE__, __LINE__, "sb", "file was shrinked:", c->file.name);

				return -1;
			}

			if (-1 == (ifd = open(c->file.name->ptr, O_RDONLY))) {
				log_error_write(srv, __FILE__, __LINE__, "ss", "open failed: ", strerror(errno));

				return -1;
			}

			buffer_prepare_copy(srv->tmp_buf, toSend);

			/* a failed seek would make read() return data from the wrong position */
			if ((off_t)-1 == lseek(ifd, offset, SEEK_SET)) {
				log_error_write(srv, __FILE__, __LINE__, "ss", "lseek: ", strerror(errno));
				close(ifd);

				return -1;
			}

			if (-1 == (nread = read(ifd, srv->tmp_buf->ptr, toSend))) {
				log_error_write(srv, __FILE__, __LINE__, "ss", "read: ", strerror(errno));
				close(ifd);

				return -1;
			}
			close(ifd);

			/* EOF although bytes are still owed: the file got shorter after the
			 * size check. Without this the chunk could never finish and the
			 * function would keep returning 0 without sending anything. */
			if (0 == nread && toSend > 0) {
				log_error_write(srv, __FILE__, __LINE__, "sb", "file was shrinked:", c->file.name);

				return -1;
			}

			rc = network_mtcp_write_buf(srv, fd, srv->tmp_buf->ptr, nread, &r);
			if (0 != rc) return rc;

			c->offset += r;
			cq->bytes_out += r;
			max_bytes -= r;

			if (c->offset == c->file.length) {
				chunk_finished = 1;
			}

			break;
		}
		default:

			log_error_write(srv, __FILE__, __LINE__, "ds", c, "type not known");

			return -1;
		}

		if (!chunk_finished) {
			/* not finished yet: stop and let the caller retry later */

			break;
		}
	}

	return 0;
}
532 /*------------------------------------------------------------------------------------*/
533 #endif /* USE_MTCP */
534 #if 0
/* Disabled leftover: would install network_linuxsendfile_write_chunkset as
 * the write handler of p.
 * NOTE(review): the fragment has no return type and p is not declared here,
 * so it cannot be enabled as written. */
network_linuxsendfile_init(void) {
	p->write = network_linuxsendfile_write_chunkset;
}
538 #endif
539