automemcpy/lib/ResultAnalyzer.cpp

//===-- Analyze benchmark JSON files --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This code analyzes the json file produced by the `automemcpy` binary.
//
// As a remainder, `automemcpy` will benchmark each autogenerated memory
// functions against one of the predefined distributions available in the
// `libc/benchmarks/distributions` folder.
//
// It works as follows:
// - Reads one or more json files.
// - If there are several runs for the same function and distribution, picks the
//   median throughput (aka `BytesPerSecond`).
// - Aggregates the throughput per distributions and scores them from worst (0)
//   to best (1).
// - Each distribution categorizes each function into one of the following
//   categories: EXCELLENT, VERY_GOOD, GOOD, PASSABLE, INADEQUATE, MEDIOCRE,
//   BAD.
// - A process similar to the Majority Judgment voting system is used to `elect`
//   the best function. The histogram of grades is returned so we can
//   distinguish between functions with the same final grade. In the following
//   example both functions grade EXCELLENT but we may prefer the second one.
//
//   |            | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | ...
//   |------------|-----------|-----------|------|----------| ...
//   | Function_1 |     7     |     1     |   2  |          | ...
//   | Function_2 |     6     |     4     |      |          | ...

#include "automemcpy/ResultAnalyzer.h"
#include "llvm/ADT/StringRef.h"
#include <numeric>
#include <unordered_map>

namespace llvm {

namespace automemcpy {

StringRef Grade::getString(const GradeEnum &GE) {
  switch (GE) {
  case EXCELLENT:
    return "EXCELLENT";
  case VERY_GOOD:
    return "VERY_GOOD";
  case GOOD:
    return "GOOD";
  case PASSABLE:
    return "PASSABLE";
  case INADEQUATE:
    return "INADEQUATE";
  case MEDIOCRE:
    return "MEDIOCRE";
  case BAD:
    return "BAD";
  case ARRAY_SIZE:
    report_fatal_error("logic error");
  }
}

Grade::GradeEnum Grade::judge(double Score) {
  if (Score >= 6. / 7)
    return EXCELLENT;
  if (Score >= 5. / 7)
    return VERY_GOOD;
  if (Score >= 4. / 7)
    return GOOD;
  if (Score >= 3. / 7)
    return PASSABLE;
  if (Score >= 2. / 7)
    return INADEQUATE;
  if (Score >= 1. / 7)
    return MEDIOCRE;
  return BAD;
}

std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples) {
  std::unordered_map<SampleId, std::vector<double>, SampleId::Hasher>
      BucketedSamples;
  for (const auto &S : Samples)
    BucketedSamples[S.Id].push_back(S.BytesPerSecond);
  std::unordered_map<FunctionId, StringMap<double>, FunctionId::Hasher>
      Throughputs;
  for (auto &Pair : BucketedSamples) {
    const auto &Id = Pair.first;
    auto &Values = Pair.second;
    const size_t HalfSize = Values.size() / 2;
    std::nth_element(Values.begin(), Values.begin() + HalfSize, Values.end());
    const double MedianValue = Values[HalfSize];
    Throughputs[Id.Function][Id.Distribution.Name] = MedianValue;
  }
  std::vector<FunctionData> Output;
  for (auto &Pair : Throughputs) {
    FunctionData Data;
    Data.Id = Pair.first;
    for (const auto &Pair : Pair.second)
      Data.PerDistributionData[Pair.getKey()].MedianBytesPerSecond =
          Pair.getValue();
    Output.push_back(std::move(Data));
  }
  return Output;
}

void fillScores(MutableArrayRef<FunctionData> Functions) {
  // A key to bucket throughput per function type and distribution.
  struct Key {
    FunctionType Type;
    StringRef Distribution;

    COMPARABLE_AND_HASHABLE(Key, Type, Distribution)
  };

  // Tracks minimum and maximum values.
  struct MinMax {
    double Min = std::numeric_limits<double>::max();
    double Max = std::numeric_limits<double>::min();
    void update(double Value) {
      if (Value < Min)
        Min = Value;
      if (Value > Max)
        Max = Value;
    }
    double normalize(double Value) const { return (Value - Min) / (Max - Min); }
  };

  std::unordered_map<Key, MinMax, Key::Hasher> ThroughputMinMax;
  for (const auto &Function : Functions) {
    const FunctionType Type = Function.Id.Type;
    for (const auto &Pair : Function.PerDistributionData) {
      const auto &Distribution = Pair.getKey();
      const double Throughput = Pair.getValue().MedianBytesPerSecond;
      const Key K{Type, Distribution};
      ThroughputMinMax[K].update(Throughput);
    }
  }

  for (auto &Function : Functions) {
    const FunctionType Type = Function.Id.Type;
    for (const auto &Pair : Function.PerDistributionData) {
      const auto &Distribution = Pair.getKey();
      const double Throughput = Pair.getValue().MedianBytesPerSecond;
      const Key K{Type, Distribution};
      Function.PerDistributionData[Distribution].Score =
          ThroughputMinMax[K].normalize(Throughput);
    }
  }
}

void castVotes(MutableArrayRef<FunctionData> Functions) {
  for (FunctionData &Function : Functions)
    for (const auto &Pair : Function.PerDistributionData) {
      const StringRef Distribution = Pair.getKey();
      const double Score = Pair.getValue().Score;
      const auto G = Grade::judge(Score);
      ++(Function.GradeHisto[G]);
      Function.PerDistributionData[Distribution].Grade = G;
    }

  for (FunctionData &Function : Functions) {
    const auto &GradeHisto = Function.GradeHisto;
    const size_t Votes =
        std::accumulate(GradeHisto.begin(), GradeHisto.end(), 0U);
    const size_t MedianVote = Votes / 2;
    size_t CountedVotes = 0;
    Grade::GradeEnum MedianGrade = Grade::BAD;
    for (size_t I = 0; I < GradeHisto.size(); ++I) {
      CountedVotes += GradeHisto[I];
      if (CountedVotes > MedianVote) {
        MedianGrade = Grade::GradeEnum(I);
        break;
      }
    }
    Function.FinalGrade = MedianGrade;
  }
}

} // namespace automemcpy
} // namespace llvm