1 #include "network_backends.h" 2 3 #ifdef USE_WRITEV 4 5 #include "network.h" 6 #include "fdevent.h" 7 #include "log.h" 8 #include "stat_cache.h" 9 10 #include <sys/types.h> 11 #include <sys/socket.h> 12 #include <sys/uio.h> 13 #include <sys/stat.h> 14 #include <sys/time.h> 15 #include <sys/resource.h> 16 #include <netinet/in.h> 17 #include <netinet/tcp.h> 18 19 #include <errno.h> 20 #include <fcntl.h> 21 #include <unistd.h> 22 #include <netdb.h> 23 #include <string.h> 24 #include <stdlib.h> 25 #include <limits.h> 26 #include <stdio.h> 27 #include <assert.h> 28 29 #if 0 30 #define LOCAL_BUFFERING 1 31 #endif 32 33 int network_write_chunkqueue_writev(server *srv, connection *con, int fd, chunkqueue *cq, off_t max_bytes) { 34 chunk *c; 35 36 for(c = cq->first; (max_bytes > 0) && (NULL != c); c = c->next) { 37 int chunk_finished = 0; 38 39 switch(c->type) { 40 case MEM_CHUNK: { 41 char * offset; 42 off_t toSend; 43 ssize_t r; 44 45 size_t num_chunks, i; 46 struct iovec *chunks; 47 chunk *tc; 48 size_t num_bytes = 0; 49 #if defined(_SC_IOV_MAX) /* IRIX, MacOS X, FreeBSD, Solaris, ... */ 50 const size_t max_chunks = sysconf(_SC_IOV_MAX); 51 #elif defined(IOV_MAX) /* Linux x86 (glibc-2.3.6-3) */ 52 const size_t max_chunks = IOV_MAX; 53 #elif defined(MAX_IOVEC) /* Linux ia64 (glibc-2.3.3-98.28) */ 54 const size_t max_chunks = MAX_IOVEC; 55 #elif defined(UIO_MAXIOV) /* Linux x86 (glibc-2.2.5-233) */ 56 const size_t max_chunks = UIO_MAXIOV; 57 #elif (defined(__FreeBSD__) && __FreeBSD_version < 500000) || defined(__DragonFly__) || defined(__APPLE__) 58 /* - FreeBSD 4.x 59 * - MacOS X 10.3.x 60 * (covered in -DKERNEL) 61 * */ 62 const size_t max_chunks = 1024; /* UIO_MAXIOV value from sys/uio.h */ 63 #else 64 #error "sysconf() doesnt return _SC_IOV_MAX ..., check the output of 'man writev' for the EINVAL error and send the output to [email protected]" 65 #endif 66 67 /* build writev list 68 * 69 * 1. limit: num_chunks < max_chunks 70 * 2. limit: num_bytes < max_bytes 71 */ 72 for (num_chunks = 0, tc = c; tc && tc->type == MEM_CHUNK && num_chunks < max_chunks; num_chunks++, tc = tc->next); 73 74 chunks = calloc(num_chunks, sizeof(*chunks)); 75 76 for(tc = c, i = 0; i < num_chunks; tc = tc->next, i++) { 77 if (tc->mem->used == 0) { 78 chunks[i].iov_base = tc->mem->ptr; 79 chunks[i].iov_len = 0; 80 } else { 81 offset = tc->mem->ptr + tc->offset; 82 toSend = tc->mem->used - 1 - tc->offset; 83 84 chunks[i].iov_base = offset; 85 86 /* protect the return value of writev() */ 87 if (toSend > max_bytes || 88 (off_t) num_bytes + toSend > max_bytes) { 89 chunks[i].iov_len = max_bytes - num_bytes; 90 91 num_chunks = i + 1; 92 break; 93 } else { 94 chunks[i].iov_len = toSend; 95 } 96 97 num_bytes += toSend; 98 } 99 } 100 101 if ((r = writev(fd, chunks, num_chunks)) < 0) { 102 switch (errno) { 103 case EAGAIN: 104 case EINTR: 105 r = 0; 106 break; 107 case EPIPE: 108 case ECONNRESET: 109 free(chunks); 110 return -2; 111 default: 112 log_error_write(srv, __FILE__, __LINE__, "ssd", 113 "writev failed:", strerror(errno), fd); 114 115 free(chunks); 116 return -1; 117 } 118 } 119 120 cq->bytes_out += r; 121 max_bytes -= r; 122 123 /* check which chunks have been written */ 124 125 for(i = 0, tc = c; i < num_chunks; i++, tc = tc->next) { 126 if (r >= (ssize_t)chunks[i].iov_len) { 127 /* written */ 128 r -= chunks[i].iov_len; 129 tc->offset += chunks[i].iov_len; 130 131 if (chunk_finished) { 132 /* skip the chunks from further touches */ 133 c = c->next; 134 } else { 135 /* chunks_written + c = c->next is done in the for()*/ 136 chunk_finished = 1; 137 } 138 } else { 139 /* partially written */ 140 141 tc->offset += r; 142 chunk_finished = 0; 143 144 break; 145 } 146 } 147 free(chunks); 148 149 break; 150 } 151 case FILE_CHUNK: { 152 ssize_t r; 153 off_t abs_offset; 154 off_t toSend; 155 stat_cache_entry *sce = NULL; 156 157 #define KByte * 1024 158 #define MByte * 1024 KByte 159 #define GByte * 1024 MByte 160 const off_t we_want_to_mmap = 512 KByte; 161 char *start = NULL; 162 163 if (HANDLER_ERROR == stat_cache_get_entry(srv, con, c->file.name, &sce)) { 164 log_error_write(srv, __FILE__, __LINE__, "sb", 165 strerror(errno), c->file.name); 166 return -1; 167 } 168 169 abs_offset = c->file.start + c->offset; 170 171 if (abs_offset > sce->st.st_size) { 172 log_error_write(srv, __FILE__, __LINE__, "sb", 173 "file was shrinked:", c->file.name); 174 175 return -1; 176 } 177 178 /* mmap the buffer 179 * - first mmap 180 * - new mmap as the we are at the end of the last one */ 181 if (c->file.mmap.start == MAP_FAILED || 182 abs_offset == (off_t)(c->file.mmap.offset + c->file.mmap.length)) { 183 184 /* Optimizations for the future: 185 * 186 * adaptive mem-mapping 187 * the problem: 188 * we mmap() the whole file. If someone has alot large files and 32bit 189 * machine the virtual address area will be unrun and we will have a failing 190 * mmap() call. 191 * solution: 192 * only mmap 16M in one chunk and move the window as soon as we have finished 193 * the first 8M 194 * 195 * read-ahead buffering 196 * the problem: 197 * sending out several large files in parallel trashes the read-ahead of the 198 * kernel leading to long wait-for-seek times. 199 * solutions: (increasing complexity) 200 * 1. use madvise 201 * 2. use a internal read-ahead buffer in the chunk-structure 202 * 3. use non-blocking IO for file-transfers 203 * */ 204 205 /* all mmap()ed areas are 512kb expect the last which might be smaller */ 206 off_t we_want_to_send; 207 size_t to_mmap; 208 209 /* this is a remap, move the mmap-offset */ 210 if (c->file.mmap.start != MAP_FAILED) { 211 munmap(c->file.mmap.start, c->file.mmap.length); 212 c->file.mmap.offset += we_want_to_mmap; 213 } else { 214 /* in case the range-offset is after the first mmap()ed area we skip the area */ 215 c->file.mmap.offset = 0; 216 217 while (c->file.mmap.offset + we_want_to_mmap < c->file.start) { 218 c->file.mmap.offset += we_want_to_mmap; 219 } 220 } 221 222 /* length is rel, c->offset too, assume there is no limit at the mmap-boundaries */ 223 we_want_to_send = c->file.length - c->offset; 224 to_mmap = (c->file.start + c->file.length) - c->file.mmap.offset; 225 226 /* we have more to send than we can mmap() at once */ 227 if (abs_offset + we_want_to_send > c->file.mmap.offset + we_want_to_mmap) { 228 we_want_to_send = (c->file.mmap.offset + we_want_to_mmap) - abs_offset; 229 to_mmap = we_want_to_mmap; 230 } 231 232 if (-1 == c->file.fd) { /* open the file if not already open */ 233 if (-1 == (c->file.fd = open(c->file.name->ptr, O_RDONLY))) { 234 log_error_write(srv, __FILE__, __LINE__, "sbs", "open failed for:", c->file.name, strerror(errno)); 235 236 return -1; 237 } 238 #ifdef FD_CLOEXEC 239 fcntl(c->file.fd, F_SETFD, FD_CLOEXEC); 240 #endif 241 } 242 243 if (MAP_FAILED == (c->file.mmap.start = mmap(NULL, to_mmap, PROT_READ, MAP_SHARED, c->file.fd, c->file.mmap.offset))) { 244 /* close it here, otherwise we'd have to set FD_CLOEXEC */ 245 246 log_error_write(srv, __FILE__, __LINE__, "ssbd", "mmap failed:", 247 strerror(errno), c->file.name, c->file.fd); 248 249 return -1; 250 } 251 252 c->file.mmap.length = to_mmap; 253 #ifdef LOCAL_BUFFERING 254 buffer_copy_string_len(c->mem, c->file.mmap.start, c->file.mmap.length); 255 #else 256 #ifdef HAVE_MADVISE 257 /* don't advise files < 64Kb */ 258 if (c->file.mmap.length > (64 KByte)) { 259 /* darwin 7 is returning EINVAL all the time and I don't know how to 260 * detect this at runtime.i 261 * 262 * ignore the return value for now */ 263 madvise(c->file.mmap.start, c->file.mmap.length, MADV_WILLNEED); 264 } 265 #endif 266 #endif 267 268 /* chunk_reset() or chunk_free() will cleanup for us */ 269 } 270 271 /* to_send = abs_mmap_end - abs_offset */ 272 toSend = (c->file.mmap.offset + c->file.mmap.length) - (abs_offset); 273 274 if (toSend < 0) { 275 log_error_write(srv, __FILE__, __LINE__, "soooo", 276 "toSend is negative:", 277 toSend, 278 c->file.mmap.length, 279 abs_offset, 280 c->file.mmap.offset); 281 assert(toSend < 0); 282 } 283 284 if (toSend > max_bytes) toSend = max_bytes; 285 286 #ifdef LOCAL_BUFFERING 287 start = c->mem->ptr; 288 #else 289 start = c->file.mmap.start; 290 #endif 291 292 if ((r = write(fd, start + (abs_offset - c->file.mmap.offset), toSend)) < 0) { 293 switch (errno) { 294 case EAGAIN: 295 case EINTR: 296 r = 0; 297 break; 298 case EPIPE: 299 case ECONNRESET: 300 return -2; 301 default: 302 log_error_write(srv, __FILE__, __LINE__, "ssd", 303 "write failed:", strerror(errno), fd); 304 305 return -1; 306 } 307 } 308 309 c->offset += r; 310 cq->bytes_out += r; 311 max_bytes -= r; 312 313 if (c->offset == c->file.length) { 314 chunk_finished = 1; 315 316 /* we don't need the mmaping anymore */ 317 if (c->file.mmap.start != MAP_FAILED) { 318 munmap(c->file.mmap.start, c->file.mmap.length); 319 c->file.mmap.start = MAP_FAILED; 320 } 321 } 322 323 break; 324 } 325 default: 326 327 log_error_write(srv, __FILE__, __LINE__, "ds", c, "type not known"); 328 329 return -1; 330 } 331 332 if (!chunk_finished) { 333 /* not finished yet */ 334 335 break; 336 } 337 } 338 339 return 0; 340 } 341 342 #endif 343