1 #include "network_backends.h"
2
3 #ifdef USE_WRITEV
4
5 #include "network.h"
6 #include "fdevent.h"
7 #include "log.h"
8 #include "stat_cache.h"
9
10 #include <sys/types.h>
11 #include <sys/socket.h>
12 #include <sys/uio.h>
13 #include <sys/stat.h>
14 #include <sys/time.h>
15 #include <sys/resource.h>
16 #include <netinet/in.h>
17 #include <netinet/tcp.h>
18
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <unistd.h>
22 #include <netdb.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <limits.h>
26 #include <stdio.h>
27 #include <assert.h>
28
29 #if 0
30 #define LOCAL_BUFFERING 1
31 #endif
32
/**
 * Write queued chunks to fd: runs of memory chunks go out through a single
 * writev() call, file chunks are mmap()ed in windows and sent with write().
 *
 * The queue is walked from cq->first. The walk stops at the first chunk that
 * was not written completely, or as soon as max_bytes is used up.
 *
 * @param srv        server context, used for logging and the stat cache
 * @param con        connection, handed to stat_cache_get_entry()
 * @param fd         descriptor the data is written to
 * @param cq         chunk queue to send; the chunk offsets and cq->bytes_out
 *                   are advanced by the number of bytes written
 * @param max_bytes  upper bound on the number of bytes written by this call
 * @return 0 on success (this includes partial writes, EAGAIN and EINTR),
 *         -1 on error, -2 on EPIPE or ECONNRESET
 */
int network_write_chunkqueue_writev(server *srv, connection *con, int fd, chunkqueue *cq, off_t max_bytes) {
	chunk *c;

	for(c = cq->first; (max_bytes > 0) && (NULL != c); c = c->next) {
		int chunk_finished = 0;

		switch(c->type) {
		case MEM_CHUNK: {
			char * offset;
			off_t toSend;
			ssize_t r;

			size_t num_chunks, i;
			struct iovec *chunks;
			chunk *tc;
			size_t num_bytes = 0;
			/* NOTE(review): sysconf() returns -1 when the limit is indeterminate;
			 * stored in a size_t that becomes a very large value - confirm this is
			 * acceptable on the supported platforms */
#if defined(_SC_IOV_MAX) /* IRIX, MacOS X, FreeBSD, Solaris, ... */
			const size_t max_chunks = sysconf(_SC_IOV_MAX);
#elif defined(IOV_MAX) /* Linux x86 (glibc-2.3.6-3) */
			const size_t max_chunks = IOV_MAX;
#elif defined(MAX_IOVEC) /* Linux ia64 (glibc-2.3.3-98.28) */
			const size_t max_chunks = MAX_IOVEC;
#elif defined(UIO_MAXIOV) /* Linux x86 (glibc-2.2.5-233) */
			const size_t max_chunks = UIO_MAXIOV;
#elif (defined(__FreeBSD__) && __FreeBSD_version < 500000) || defined(__DragonFly__) || defined(__APPLE__)
			/* - FreeBSD 4.x
			 * - MacOS X 10.3.x
			 *   (covered in -DKERNEL)
			 *  */
			const size_t max_chunks = 1024; /* UIO_MAXIOV value from sys/uio.h */
#else
#error "sysconf() doesnt return _SC_IOV_MAX ..., check the output of 'man writev' for the EINVAL error and send the output to [email protected]"
#endif

			/* build writev list
			 *
			 * 1. limit: num_chunks < max_chunks
			 * 2. limit: num_bytes < max_bytes
			 */

			/* count the consecutive MEM_CHUNKs starting at c */
			for (num_chunks = 0, tc = c; tc && tc->type == MEM_CHUNK && num_chunks < max_chunks; num_chunks++, tc = tc->next);

			chunks = calloc(num_chunks, sizeof(*chunks));
			if (NULL == chunks) {
				/* out of memory: do not dereference the NULL vector below */
				log_error_write(srv, __FILE__, __LINE__, "ss",
						"calloc failed:", strerror(errno));

				return -1;
			}

			for(tc = c, i = 0; i < num_chunks; tc = tc->next, i++) {
				if (tc->mem->used == 0) {
					/* empty buffer: contributes a zero-length entry */
					chunks[i].iov_base = tc->mem->ptr;
					chunks[i].iov_len  = 0;
				} else {
					offset = tc->mem->ptr + tc->offset;
					/* 'used - 1': presumably 'used' counts a trailing NUL
					 * that must not be sent - confirm against buffer.h */
					toSend = tc->mem->used - 1 - tc->offset;

					chunks[i].iov_base = offset;

					/* protect the return value of writev() */
					if (toSend > max_bytes ||
					    (off_t) num_bytes + toSend > max_bytes) {
						/* this entry would exceed max_bytes: truncate it
						 * and make it the last entry of the vector */
						chunks[i].iov_len = max_bytes - num_bytes;

						num_chunks = i + 1;
						break;
					} else {
						chunks[i].iov_len = toSend;
					}

					num_bytes += toSend;
				}
			}

			if ((r = writev(fd, chunks, num_chunks)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written, not an error */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					free(chunks);
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"writev failed:", strerror(errno), fd);

					free(chunks);
					return -1;
				}
			}

			cq->bytes_out += r;
			max_bytes -= r;

			/* check which chunks have been written */

			for(i = 0, tc = c; i < num_chunks; i++, tc = tc->next) {
				if (r >= (ssize_t)chunks[i].iov_len) {
					/* written */
					r -= chunks[i].iov_len;
					tc->offset += chunks[i].iov_len;

					if (chunk_finished) {
						/* skip the chunks from further touches */
						c = c->next;
					} else {
						/* the first finished chunk is skipped by the
						 * c = c->next of the outer for() */
						chunk_finished = 1;
					}
				} else {
					/* partially written */

					tc->offset += r;
					chunk_finished = 0;

					break;
				}
			}
			free(chunks);

			break;
		}
		case FILE_CHUNK: {
			ssize_t r;
			off_t abs_offset;
			off_t toSend;
			stat_cache_entry *sce = NULL;

#define KByte * 1024
#define MByte * 1024 KByte
#define GByte * 1024 MByte
			const off_t we_want_to_mmap = 512 KByte;
			char *start = NULL;

			if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						strerror(errno), c->file.name);
				return -1;
			}

			/* absolute position in the file of the next byte to send */
			abs_offset = c->file.start + c->offset;

			if (abs_offset > sce->st.st_size) {
				log_error_write(srv, __FILE__, __LINE__, "sb",
						"file was shrinked:", c->file.name);

				return -1;
			}

			/* mmap the buffer
			 * - first mmap
			 * - new mmap as we are at the end of the last one */
			if (c->file.mmap.start == MAP_FAILED ||
			    abs_offset == (off_t)(c->file.mmap.offset + c->file.mmap.length)) {

				/* Optimizations for the future:
				 *
				 * adaptive mem-mapping
				 *   the problem:
				 *     we mmap() the whole file. If someone has a lot of large files and a
				 *     32bit machine the virtual address area will run out and we will have
				 *     a failing mmap() call.
				 *   solution:
				 *     only mmap 16M in one chunk and move the window as soon as we have
				 *     finished the first 8M
				 *
				 * read-ahead buffering
				 *   the problem:
				 *     sending out several large files in parallel trashes the read-ahead of
				 *     the kernel leading to long wait-for-seek times.
				 *   solutions: (increasing complexity)
				 *     1. use madvise
				 *     2. use a internal read-ahead buffer in the chunk-structure
				 *     3. use non-blocking IO for file-transfers
				 *   */

				/* all mmap()ed areas are 512kb except the last which might be smaller */
				off_t we_want_to_send;
				size_t to_mmap;

				/* this is a remap, move the mmap-offset */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.offset += we_want_to_mmap;
				} else {
					/* in case the range-offset is after the first mmap()ed area we skip the area */
					c->file.mmap.offset = 0;

					while (c->file.mmap.offset + we_want_to_mmap < c->file.start) {
						c->file.mmap.offset += we_want_to_mmap;
					}
				}

				/* length is rel, c->offset too, assume there is no limit at the mmap-boundaries */
				we_want_to_send = c->file.length - c->offset;
				to_mmap = (c->file.start + c->file.length) - c->file.mmap.offset;

				/* we have more to send than we can mmap() at once */
				if (abs_offset + we_want_to_send > c->file.mmap.offset + we_want_to_mmap) {
					to_mmap = we_want_to_mmap;
				}

				if (-1 == c->file.fd) {  /* open the file if not already open */
					if (-1 == (c->file.fd = open(c->file.name->ptr, O_RDONLY))) {
						log_error_write(srv, __FILE__, __LINE__, "sbs", "open failed for:", c->file.name, strerror(errno));

						return -1;
					}
#ifdef FD_CLOEXEC
					fcntl(c->file.fd, F_SETFD, FD_CLOEXEC);
#endif
				}

				if (MAP_FAILED == (c->file.mmap.start = mmap(NULL, to_mmap, PROT_READ, MAP_SHARED, c->file.fd, c->file.mmap.offset))) {
					/* the fd is NOT closed here; presumably chunk_reset() or
					 * chunk_free() releases it later - TODO confirm */

					log_error_write(srv, __FILE__, __LINE__, "ssbd", "mmap failed:",
							strerror(errno), c->file.name, c->file.fd);

					return -1;
				}

				c->file.mmap.length = to_mmap;
#ifdef LOCAL_BUFFERING
				buffer_copy_string_len(c->mem, c->file.mmap.start, c->file.mmap.length);
#else
#ifdef HAVE_MADVISE
				/* don't advise files < 64Kb */
				if (c->file.mmap.length > (64 KByte)) {
					/* darwin 7 is returning EINVAL all the time and I don't know how to
					 * detect this at runtime.
					 *
					 * ignore the return value for now */
					madvise(c->file.mmap.start, c->file.mmap.length, MADV_WILLNEED);
				}
#endif
#endif

				/* chunk_reset() or chunk_free() will cleanup for us */
			}

			/* to_send = abs_mmap_end - abs_offset */
			toSend = (c->file.mmap.offset + c->file.mmap.length) - (abs_offset);

			if (toSend < 0) {
				log_error_write(srv, __FILE__, __LINE__, "soooo",
						"toSend is negative:",
						toSend,
						c->file.mmap.length,
						abs_offset,
						c->file.mmap.offset);

				/* a negative length would be converted to a huge size_t by
				 * write(); fail the request instead of sending from outside
				 * the mapped window */
				return -1;
			}

			if (toSend > max_bytes) toSend = max_bytes;

#ifdef LOCAL_BUFFERING
			start = c->mem->ptr;
#else
			start = c->file.mmap.start;
#endif

			if ((r = write(fd, start + (abs_offset - c->file.mmap.offset), toSend)) < 0) {
				switch (errno) {
				case EAGAIN:
				case EINTR:
					/* nothing written, not an error */
					r = 0;
					break;
				case EPIPE:
				case ECONNRESET:
					return -2;
				default:
					log_error_write(srv, __FILE__, __LINE__, "ssd",
							"write failed:", strerror(errno), fd);

					return -1;
				}
			}

			c->offset += r;
			cq->bytes_out += r;
			max_bytes -= r;

			if (c->offset == c->file.length) {
				chunk_finished = 1;

				/* we don't need the mmaping anymore */
				if (c->file.mmap.start != MAP_FAILED) {
					munmap(c->file.mmap.start, c->file.mmap.length);
					c->file.mmap.start = MAP_FAILED;
				}
			}

			break;
		}
		default:

			/* log the numeric chunk type; 'd' is used with int arguments
			 * elsewhere in this function */
			log_error_write(srv, __FILE__, __LINE__, "ds", (int) c->type, "type not known");

			return -1;
		}

		if (!chunk_finished) {
			/* not finished yet */

			break;
		}
	}

	return 0;
}
341
342 #endif
343