1*76404edcSAsim Jamshed #include "network_backends.h"
2*76404edcSAsim Jamshed 
3*76404edcSAsim Jamshed #ifdef USE_WRITEV
4*76404edcSAsim Jamshed 
5*76404edcSAsim Jamshed #include "network.h"
6*76404edcSAsim Jamshed #include "fdevent.h"
7*76404edcSAsim Jamshed #include "log.h"
8*76404edcSAsim Jamshed #include "stat_cache.h"
9*76404edcSAsim Jamshed 
10*76404edcSAsim Jamshed #include <sys/types.h>
11*76404edcSAsim Jamshed #include <sys/socket.h>
12*76404edcSAsim Jamshed #include <sys/uio.h>
13*76404edcSAsim Jamshed #include <sys/stat.h>
14*76404edcSAsim Jamshed #include <sys/time.h>
15*76404edcSAsim Jamshed #include <sys/resource.h>
16*76404edcSAsim Jamshed #include <netinet/in.h>
17*76404edcSAsim Jamshed #include <netinet/tcp.h>
18*76404edcSAsim Jamshed 
19*76404edcSAsim Jamshed #include <errno.h>
20*76404edcSAsim Jamshed #include <fcntl.h>
21*76404edcSAsim Jamshed #include <unistd.h>
22*76404edcSAsim Jamshed #include <netdb.h>
23*76404edcSAsim Jamshed #include <string.h>
24*76404edcSAsim Jamshed #include <stdlib.h>
25*76404edcSAsim Jamshed #include <limits.h>
26*76404edcSAsim Jamshed #include <stdio.h>
27*76404edcSAsim Jamshed #include <assert.h>
28*76404edcSAsim Jamshed 
29*76404edcSAsim Jamshed #if 0
30*76404edcSAsim Jamshed #define LOCAL_BUFFERING 1
31*76404edcSAsim Jamshed #endif
32*76404edcSAsim Jamshed 
/**
 * Write as much of the chunkqueue as possible to a socket.
 *
 * Runs of consecutive MEM_CHUNKs are gathered into one writev() call;
 * FILE_CHUNKs are mmap()ed in windows of at most 512 KByte and sent with
 * write().
 *
 * @param srv        server context, used for logging and the stat cache
 * @param con        connection the queue belongs to (passed to the stat cache)
 * @param fd         descriptor to write to
 * @param cq         queue to send; chunk offsets and cq->bytes_out are advanced
 *                   by the number of bytes actually written
 * @param max_bytes  upper bound on the number of bytes to send in this call
 *
 * @return  0 when the call completed (everything allowed was sent, or the
 *            write was short / would block and must be retried later),
 *         -1 on an unrecoverable error (the reason is logged),
 *         -2 when the write failed with EPIPE or ECONNRESET
 */
int network_write_chunkqueue_writev(server *srv, connection *con, int fd, chunkqueue *cq, off_t max_bytes) {
	chunk *c;

	for(c = cq->first; (max_bytes > 0) && (NULL != c); c = c->next) {
		int chunk_finished = 0;

		switch(c->type) {
		case MEM_CHUNK: {
			char * offset;
			off_t toSend;
			ssize_t r;

			size_t num_chunks, i;
			struct iovec *chunks;
			chunk *tc;
			size_t num_bytes = 0;
#if defined(_SC_IOV_MAX) /* IRIX, MacOS X, FreeBSD, Solaris, ... */
			/* sysconf() returns -1 when the limit is indeterminate; stored in a
			 * size_t that would become a huge bound and make writev() fail with
			 * EINVAL, so fall back to 16, the minimum IOV_MAX POSIX guarantees */
			const long sc_iov_max = sysconf(_SC_IOV_MAX);
			const size_t max_chunks = (sc_iov_max > 0) ? (size_t) sc_iov_max : 16;
#elif defined(IOV_MAX) /* Linux x86 (glibc-2.3.6-3) */
			const size_t max_chunks = IOV_MAX;
#elif defined(MAX_IOVEC) /* Linux ia64 (glibc-2.3.3-98.28) */
			const size_t max_chunks = MAX_IOVEC;
#elif defined(UIO_MAXIOV) /* Linux x86 (glibc-2.2.5-233) */
			const size_t max_chunks = UIO_MAXIOV;
#elif (defined(__FreeBSD__) && __FreeBSD_version < 500000) || defined(__DragonFly__) || defined(__APPLE__)
			/* - FreeBSD 4.x
			 * - MacOS X 10.3.x
			 *   (covered in -DKERNEL)
			 *  */
			const size_t max_chunks = 1024; /* UIO_MAXIOV value from sys/uio.h */
#else
#error "sysconf() doesnt return _SC_IOV_MAX ..., check the output of 'man writev' for the EINVAL error and send the output to [email protected]"
#endif

			/* build writev list
			 *
			 * 1. limit: num_chunks < max_chunks
			 * 2. limit: num_bytes < max_bytes
			 */
			for (num_chunks = 0, tc = c; tc && tc->type == MEM_CHUNK && num_chunks < max_chunks; num_chunks++, tc = tc->next);

			chunks = calloc(num_chunks, sizeof(*chunks));
			if (NULL == chunks) {
				/* out of memory: report it instead of dereferencing NULL below */
				log_error_write(srv, __FILE__, __LINE__, "ss",
						"calloc failed:", strerror(errno));

				return -1;
			}

			for(tc = c, i = 0; i < num_chunks; tc = tc->next, i++) {
				if (tc->mem->used == 0) {
					/* empty buffer: keep the slot, but send nothing from it */
					chunks[i].iov_base = tc->mem->ptr;
					chunks[i].iov_len  = 0;
				} else {
					offset = tc->mem->ptr + tc->offset;
					/* one byte of 'used' is excluded from the payload */
					toSend = tc->mem->used - 1 - tc->offset;

					chunks[i].iov_base = offset;

					/* protect the return value of writev() */
					if (toSend > max_bytes ||
					    (off_t) num_bytes + toSend > max_bytes) {
						/* this chunk would exceed the budget: truncate it and
						 * make it the last entry of the vector */
						chunks[i].iov_len = max_bytes - num_bytes;

						num_chunks = i + 1;
						break;
					} else {
						chunks[i].iov_len = toSend;
					}

					num_bytes += toSend;
				}
			}

			if ((r = writev(fd, chunks, num_chunks)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written, try again later */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					free(chunks);
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"writev failed:", strerror(errno), fd);

					free(chunks);
					return -1;
				}
			}

			cq->bytes_out += r;
			max_bytes -= r;

			/* check which chunks have been written */

			for(i = 0, tc = c; i < num_chunks; i++, tc = tc->next) {
				if (r >= (ssize_t)chunks[i].iov_len) {
					/* written */
					r -= chunks[i].iov_len;
					tc->offset += chunks[i].iov_len;

					if (chunk_finished) {
						/* skip the chunks from further touches */
						c = c->next;
					} else {
						/* chunks_written + c = c->next is done in the for()*/
						chunk_finished = 1;
					}
				} else {
					/* partially written */

					tc->offset += r;
					chunk_finished = 0;

					break;
				}
			}
			free(chunks);

			break;
		}
		case FILE_CHUNK: {
			ssize_t r;
			off_t abs_offset;
			off_t toSend;
			stat_cache_entry *sce = NULL;

#define KByte * 1024
#define MByte * 1024 KByte
#define GByte * 1024 MByte
			const off_t we_want_to_mmap = 512 KByte;
			char *start = NULL;

			if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						strerror(errno), c->file.name);
				return -1;
			}

			/* absolute position in the file of the next byte to send */
			abs_offset = c->file.start + c->offset;

			if (abs_offset > sce->st.st_size) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						"file was shrinked:", c->file.name);

				return -1;
			}

			/* mmap the buffer
			 * - first mmap
			 * - new mmap as we are at the end of the last one */
			if (c->file.mmap.start == MAP_FAILED ||
			    abs_offset == (off_t)(c->file.mmap.offset + c->file.mmap.length)) {

				/* Optimizations for the future:
				 *
				 * adaptive mem-mapping
				 *   the problem:
				 *     we mmap() the whole file. If someone has a lot of large files and a
				 *     32bit machine, the virtual address space will be exhausted and we
				 *     will have a failing mmap() call.
				 *   solution:
				 *     only mmap 16M in one chunk and move the window as soon as we have finished
				 *     the first 8M
				 *
				 * read-ahead buffering
				 *   the problem:
				 *     sending out several large files in parallel trashes the read-ahead of the
				 *     kernel leading to long wait-for-seek times.
				 *   solutions: (increasing complexity)
				 *     1. use madvise
				 *     2. use an internal read-ahead buffer in the chunk-structure
				 *     3. use non-blocking IO for file-transfers
				 *   */

				/* all mmap()ed areas are 512kb except the last which might be smaller */
				off_t we_want_to_send;
				size_t to_mmap;

				/* this is a remap, move the mmap-offset */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.offset += we_want_to_mmap;
				} else {
					/* in case the range-offset is after the first mmap()ed area we skip the area */
					c->file.mmap.offset = 0;

					while (c->file.mmap.offset + we_want_to_mmap < c->file.start) {
						c->file.mmap.offset += we_want_to_mmap;
					}
				}

				/* length is rel, c->offset too, assume there is no limit at the mmap-boundaries */
				we_want_to_send = c->file.length - c->offset;
				to_mmap = (c->file.start + c->file.length) - c->file.mmap.offset;

				/* we have more to send than we can mmap() at once */
				if (abs_offset + we_want_to_send > c->file.mmap.offset + we_want_to_mmap) {
					we_want_to_send = (c->file.mmap.offset + we_want_to_mmap) - abs_offset;
					to_mmap = we_want_to_mmap;
				}

				if (-1 == c->file.fd) {  /* open the file if not already open */
					if (-1 == (c->file.fd = open(c->file.name->ptr, O_RDONLY))) {
						log_error_write(srv, __FILE__, __LINE__, "sbs", "open failed for:", c->file.name, strerror(errno));

						return -1;
					}
#ifdef FD_CLOEXEC
					fcntl(c->file.fd, F_SETFD, FD_CLOEXEC);
#endif
				}

				if (MAP_FAILED == (c->file.mmap.start = mmap(NULL, to_mmap, PROT_READ, MAP_SHARED, c->file.fd, c->file.mmap.offset))) {
					/* close it here, otherwise we'd have to set FD_CLOEXEC */

					log_error_write(srv, __FILE__, __LINE__, "ssbd", "mmap failed:",
							strerror(errno), c->file.name, c->file.fd);

					return -1;
				}

				c->file.mmap.length = to_mmap;
#ifdef LOCAL_BUFFERING
				buffer_copy_string_len(c->mem, c->file.mmap.start, c->file.mmap.length);
#else
#ifdef HAVE_MADVISE
				/* don't advise files < 64Kb */
				if (c->file.mmap.length > (64 KByte)) {
					/* darwin 7 is returning EINVAL all the time and I don't know how to
					 * detect this at runtime.
					 *
					 * ignore the return value for now */
					madvise(c->file.mmap.start, c->file.mmap.length, MADV_WILLNEED);
				}
#endif
#endif

				/* chunk_reset() or chunk_free() will cleanup for us */
			}

			/* to_send = abs_mmap_end - abs_offset */
			toSend = (c->file.mmap.offset + c->file.mmap.length) - (abs_offset);

			if (toSend < 0) {
				log_error_write(srv, __FILE__, __LINE__, "soooo",
						"toSend is negative:",
						toSend,
						c->file.mmap.length,
						abs_offset,
						c->file.mmap.offset);

				/* a negative length would be converted to a huge size_t by
				 * write(); the mapping does not cover abs_offset, so give up */
				return -1;
			}

			if (toSend > max_bytes) toSend = max_bytes;

#ifdef LOCAL_BUFFERING
			start = c->mem->ptr;
#else
			start = c->file.mmap.start;
#endif

			if ((r = write(fd, start + (abs_offset - c->file.mmap.offset), toSend)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written, try again later */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"write failed:", strerror(errno), fd);

					return -1;
				}
			}

			c->offset += r;
			cq->bytes_out += r;
			max_bytes -= r;

			if (c->offset == c->file.length) {
				chunk_finished = 1;

				/* we don't need the mmaping anymore */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.start = MAP_FAILED;
				}
			}

			break;
		}
		default:

			log_error_write(srv, __FILE__, __LINE__, "ds", c, "type not known");

			return -1;
		}

		if (!chunk_finished) {
			/* not finished yet */

			break;
		}
	}

	return 0;
}
341*76404edcSAsim Jamshed 
342*76404edcSAsim Jamshed #endif
343