1 //===-- working_set.cpp ---------------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file is a part of EfficiencySanitizer, a family of performance tuners. 11 // 12 // This file contains working-set-specific code. 13 //===----------------------------------------------------------------------===// 14 15 #include "working_set.h" 16 #include "esan.h" 17 #include "esan_circular_buffer.h" 18 #include "esan_flags.h" 19 #include "esan_shadow.h" 20 #include "esan_sideline.h" 21 #include "sanitizer_common/sanitizer_procmaps.h" 22 23 // We shadow every cache line of app memory with one shadow byte. 24 // - The highest bit of each shadow byte indicates whether the corresponding 25 // cache line has ever been accessed. 26 // - The lowest bit of each shadow byte indicates whether the corresponding 27 // cache line was accessed since the last sample. 28 // - The other bits are used for working set snapshots at successively 29 // lower frequencies, each bit to the left from the lowest bit stepping 30 // down the frequency by 2 to the power of getFlags()->snapshot_step. 31 // Thus we have something like this: 32 // Bit 0: Since last sample 33 // Bit 1: Since last 2^2 samples 34 // Bit 2: Since last 2^4 samples 35 // Bit 3: ... 36 // Bit 7: Ever accessed. 37 // We live with races in accessing each shadow byte. 38 typedef unsigned char byte; 39 40 namespace __esan { 41 42 // Our shadow memory assumes that the line size is 64. 43 static const u32 CacheLineSize = 64; 44 45 // See the shadow byte layout description above. 46 static const u32 TotalWorkingSetBitIdx = 7; 47 // We accumulate to the left until we hit this bit. 48 // We don't need to accumulate to the final bit as it's set on each ref 49 // by the compiler instrumentation. 50 static const u32 MaxAccumBitIdx = 6; 51 static const u32 CurWorkingSetBitIdx = 0; 52 static const byte ShadowAccessedVal = 53 (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); 54 55 static SidelineThread Thread; 56 // If we use real-time-based timer samples this won't overflow in any realistic 57 // scenario, but if we switch to some other unit (such as memory accesses) we 58 // may want to consider a 64-bit int. 59 static u32 SnapshotNum; 60 61 // We store the wset size for each of 8 different sampling frequencies. 62 static const u32 NumFreq = 8; // One for each bit of our shadow bytes. 63 // We cannot use static objects as the global destructor is called 64 // prior to our finalize routine. 65 // These are each circular buffers, sized up front. 66 CircularBuffer<u32> SizePerFreq[NumFreq]; 67 // We cannot rely on static initializers (they may run too late) but 68 // we record the size here for clarity: 69 u32 CircularBufferSizes[NumFreq] = { 70 // These are each mmap-ed so our minimum is one page. 71 32*1024, 72 16*1024, 73 8*1024, 74 4*1024, 75 4*1024, 76 4*1024, 77 4*1024, 78 4*1024, 79 }; 80 81 void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, 82 bool IsWrite) { 83 if (Size == 0) 84 return; 85 SIZE_T I = 0; 86 uptr LineSize = getFlags()->cache_line_size; 87 // As Addr+Size could overflow at the top of a 32-bit address space, 88 // we avoid the simpler formula that rounds the start and end. 89 SIZE_T NumLines = Size / LineSize + 90 // Add any extra at the start or end adding on an extra line: 91 (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; 92 byte *Shadow = (byte *)appToShadow(Addr); 93 // Write shadow bytes until we're word-aligned. 94 while (I < NumLines && (uptr)Shadow % 4 != 0) { 95 if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) 96 *Shadow |= ShadowAccessedVal; 97 ++Shadow; 98 ++I; 99 } 100 // Write whole shadow words at a time. 101 // Using a word-stride loop improves the runtime of a microbenchmark of 102 // memset calls by 10%. 103 u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | 104 ShadowAccessedVal << 16 | ShadowAccessedVal << 24; 105 while (I + 4 <= NumLines) { 106 if ((*(u32*)Shadow & WordValue) != WordValue) 107 *(u32*)Shadow |= WordValue; 108 Shadow += 4; 109 I += 4; 110 } 111 // Write any trailing shadow bytes. 112 while (I < NumLines) { 113 if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) 114 *Shadow |= ShadowAccessedVal; 115 ++Shadow; 116 ++I; 117 } 118 } 119 120 // This routine will word-align ShadowStart and ShadowEnd prior to scanning. 121 // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit 122 // measures the access during the entire execution and should never be cleared. 123 static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, 124 uptr ShadowEnd) { 125 u32 WorkingSetSize = 0; 126 u32 ByteValue = 0x1 << BitIdx; 127 u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | 128 ByteValue << 24; 129 // Get word aligned start. 130 ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); 131 bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; 132 // Do not clear the bit that measures access during the entire execution. 133 bool Clear = BitIdx < TotalWorkingSetBitIdx; 134 for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { 135 if ((*Ptr & WordValue) != 0) { 136 byte *BytePtr = (byte *)Ptr; 137 for (u32 j = 0; j < sizeof(u32); ++j) { 138 if (BytePtr[j] & ByteValue) { 139 ++WorkingSetSize; 140 if (Accum) { 141 // Accumulate to the lower-frequency bit to the left. 142 BytePtr[j] |= (ByteValue << 1); 143 } 144 } 145 } 146 if (Clear) { 147 // Clear this bit from every shadow byte. 148 *Ptr &= ~WordValue; 149 } 150 } 151 } 152 return WorkingSetSize; 153 } 154 155 // Scan shadow memory to calculate the number of cache lines being accessed, 156 // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. 157 // We also clear the lowest bits (most recent working set snapshot). 158 // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit 159 // measures the access during the entire execution and should never be cleared. 160 static u32 computeWorkingSizeAndReset(u32 BitIdx) { 161 u32 WorkingSetSize = 0; 162 MemoryMappingLayout MemIter(true/*cache*/); 163 MemoryMappedSegment Segment; 164 while (MemIter.Next(&Segment)) { 165 VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__, 166 Segment.start, Segment.end, Segment.protection, 167 isAppMem(Segment.start), isShadowMem(Segment.start)); 168 if (isShadowMem(Segment.start) && Segment.IsWritable()) { 169 VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start, 170 Segment.end); 171 WorkingSetSize += 172 countAndClearShadowValues(BitIdx, Segment.start, Segment.end); 173 } 174 } 175 return WorkingSetSize; 176 } 177 178 // This is invoked from a signal handler but in a sideline thread doing nothing 179 // else so it is a little less fragile than a typical signal handler. 180 static void takeSample(void *Arg) { 181 u32 BitIdx = CurWorkingSetBitIdx; 182 u32 Freq = 1; 183 ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. 184 while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { 185 u32 NumLines = computeWorkingSizeAndReset(BitIdx); 186 VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, 187 SnapshotNum, BitIdx, Freq, NumLines); 188 SizePerFreq[BitIdx].push_back(NumLines); 189 Freq = Freq << getFlags()->snapshot_step; 190 BitIdx++; 191 } 192 } 193 194 unsigned int getSampleCountWorkingSet() 195 { 196 return SnapshotNum; 197 } 198 199 // Initialization that must be done before any instrumented code is executed. 200 void initializeShadowWorkingSet() { 201 CHECK(getFlags()->cache_line_size == CacheLineSize); 202 registerMemoryFaultHandler(); 203 } 204 205 void initializeWorkingSet() { 206 if (getFlags()->record_snapshots) { 207 for (u32 i = 0; i < NumFreq; ++i) 208 SizePerFreq[i].initialize(CircularBufferSizes[i]); 209 Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); 210 } 211 } 212 213 static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { 214 if (MilliSec > 600000) { 215 Unit = "min"; 216 return MilliSec / 60000; 217 } else if (MilliSec > 10000) { 218 Unit = "sec"; 219 return MilliSec / 1000; 220 } else { 221 Unit = "ms"; 222 return MilliSec; 223 } 224 } 225 226 static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { 227 // We need a constant to avoid software divide support: 228 static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize; 229 static const u32 MegabyteCachelines = KilobyteCachelines << 10; 230 231 if (NumOfCachelines > 10 * MegabyteCachelines) { 232 Unit = "MB"; 233 return NumOfCachelines / MegabyteCachelines; 234 } else if (NumOfCachelines > 10 * KilobyteCachelines) { 235 Unit = "KB"; 236 return NumOfCachelines / KilobyteCachelines; 237 } else { 238 Unit = "Bytes"; 239 return NumOfCachelines * CacheLineSize; 240 } 241 } 242 243 void reportWorkingSet() { 244 const char *Unit; 245 if (getFlags()->record_snapshots) { 246 u32 Freq = 1; 247 Report(" Total number of samples: %u\n", SnapshotNum); 248 for (u32 i = 0; i < NumFreq; ++i) { 249 u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); 250 Report(" Samples array #%d at period %u %s\n", i, Time, Unit); 251 // FIXME: report whether we wrapped around and thus whether we 252 // have data on the whole run or just the last N samples. 253 for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { 254 u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); 255 Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, 256 SizePerFreq[i][j]); 257 } 258 Freq = Freq << getFlags()->snapshot_step; 259 } 260 } 261 262 // Get the working set size for the entire execution. 263 u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); 264 u32 Size = getSizeForPrinting(NumOfCachelines, Unit); 265 Report(" %s: the total working set size: %u %s (%u cache lines)\n", 266 SanitizerToolName, Size, Unit, NumOfCachelines); 267 } 268 269 int finalizeWorkingSet() { 270 if (getFlags()->record_snapshots) 271 Thread.joinThread(); 272 reportWorkingSet(); 273 if (getFlags()->record_snapshots) { 274 for (u32 i = 0; i < NumFreq; ++i) 275 SizePerFreq[i].free(); 276 } 277 return 0; 278 } 279 280 } // namespace __esan 281