#!/usr/local/bin/recon

local benchrun = require 'benchrun'
local csv = require 'csv'
local os = require 'os'
local perfdata = require 'perfdata'
local sysctl = require 'sysctl'

require 'strict'

-- Declares this benchmark's command-line options on the argument parser that
-- is passed in through the `modify_argparser` hook below. Options are declared
-- in a fixed order; the strings are the user-visible names and help text.
local function add_benchmark_options(parser)
  -- Worker-count selection. When --cpu-workers is omitted the script falls
  -- back to the number of logical CPUs.
  parser:option({
    name = '--cpu-workers',
    description = 'Number of threads to bring up to do faulting work',
    convert = tonumber,
    argname = 'count',
  })
  parser:flag({
    name = '--through-max-workers',
    description = 'Run with [1..n] CPU workers',
  })
  parser:flag({
    name = '--through-max-workers-fast',
    description = 'Run with 1, 2, and each power of four value in [4..n] CPU workers',
  })

  -- Location of the binary that performs the actual faulting work.
  parser:option({
    name = '--path',
    description = 'Path to fault throughput binary',
    count = 1, -- This is a required option.
  })

  -- Per-iteration parameters that are passed on to the binary.
  parser:option({
    name = '--duration',
    description = 'How long, in seconds, to run each iteration',
    default = 30,
    convert = tonumber,
    argname = 'seconds',
  })
  parser:option({
    name = '--variant',
    description = 'Which benchmark variant to run',
    choices = { 'separate-objects', 'share-objects' },
    default = 'separate-objects',
    argname = 'name',
  })

  -- The default of -1 is a sentinel: the script only passes a CPU ID to the
  -- binary when this option differs from -1.
  parser:option({
    name = '--first-cpu',
    description = 'Pin threads to CPUs, starting with this CPU ID; requires enable_skstb=1 boot-arg',
    default = -1,
    convert = tonumber,
    argname = 'cpu-id',
  })

  parser:flag({
    name = '--verbose',
    description = 'Enable verbose logging at a performance cost',
  })
end

-- Benchmark harness instance. The rest of the script reads the parsed options
-- back through `benchmark.opt` and reports results via `benchmark.writer`.
local benchmark = benchrun.new({
  name = 'xnu.zero_fill_fault_throughput',
  version = 1,
  arg = arg,
  modify_argparser = add_benchmark_options,
})

-- Logical CPU count taken from the first value returned for the
-- hw.logicalcpu_max sysctl. It is the default worker count and the exclusive
-- upper bound for --first-cpu.
local ncpus = sysctl('hw.logicalcpu_max')
benchmark:assert(ncpus > 0, 'invalid number of logical CPUs')

-- An explicit --cpu-workers value wins; otherwise use one worker per CPU.
local cpu_workers = benchmark.opt.cpu_workers or ncpus
benchmark:assert(cpu_workers > 0, 'invalid number of CPU workers')

do
  -- -1 is the option's default and means no CPU ID is passed to the binary,
  -- so the lower bound deliberately lets -1 through.
  local first_cpu = benchmark.opt.first_cpu
  benchmark:assert(first_cpu > -2, 'negative first CPU')
  benchmark:assert(first_cpu < ncpus, 'invalid first CPU')
end

-- Custom unit attached to every value recorded by this benchmark.
local page_throughput_unit = perfdata.unit.custom('pages/sec')

-- Ordered list of worker counts; the benchmark binary is run once per entry.
local test_threads = {}

do
  -- Appends one worker count to the end of the run list.
  local function schedule(worker_count)
    test_threads[#test_threads + 1] = worker_count
  end

  local opt = benchmark.opt
  if opt.through_max_workers then
    -- Exhaustive sweep: every worker count from 1 up to the maximum.
    for worker_count = 1, cpu_workers do
      schedule(worker_count)
    end
  elseif opt.through_max_workers_fast then
    -- Sparse sweep: 1, 2, then successive powers of four up to the maximum.
    local worker_count = 1
    while worker_count <= cpu_workers do
      schedule(worker_count)
      -- The two-thread run is added right after the single-thread one so the
      -- start of the scaling curve is visible (this also covers dual core
      -- systems).
      if worker_count == 1 and cpu_workers >= 2 then
        schedule(worker_count + 1)
      end
      worker_count = worker_count * 4
    end
  else
    -- No sweep requested: a single run at the maximum worker count.
    schedule(cpu_workers)
  end
end

-- Run the fault throughput binary once for every entry in `test_threads` and
-- record each value found in the CSV section of its output.
for _, thread_count in ipairs(test_threads) do
  -- Positional arguments are appended in this order:
  --   <path> [-v] <variant> <duration> <thread_count> [<first_cpu>]
  local cmd = {
    benchmark.opt.path;
    echo = true,
    -- The base word is singular because the trailing %s supplies the plural
    -- 's' for every count other than one.
    name = ('with %d CPU worker%s'):format(thread_count,
        thread_count == 1 and '' or 's'),
  }
  if benchmark.opt.verbose then
    cmd[#cmd + 1] = '-v'
  end
  cmd[#cmd + 1] = benchmark.opt.variant
  cmd[#cmd + 1] = benchmark.opt.duration
  cmd[#cmd + 1] = thread_count
  -- -1 is the "no pinning requested" default; only pass a real CPU ID.
  if benchmark.opt.first_cpu ~= -1 then
    cmd[#cmd + 1] = benchmark.opt.first_cpu
  end

  for out in benchmark:run(cmd) do
    -- Everything after the results header line is treated as CSV. The dashes
    -- are escaped because a bare '-' is a lazy repetition operator in Lua
    -- patterns and would not match the header literally.
    local result = out:match('%-%-%-%-%-Results%-%-%-%-%-\n(.*)')
    benchmark:assert(result, 'unable to find result data in output')
    local data = csv.openstring(result, { header = true })
    for field in data:lines() do
      -- Each key/value pair of a parsed row is recorded as its own value,
      -- tagged with the worker count and variant of this run.
      -- NOTE(review): tonumber() yields nil for a non-numeric cell; confirm
      -- the binary only ever emits numeric columns.
      for k, v in pairs(field) do
        benchmark.writer:add_value(k, page_throughput_unit, tonumber(v), {
          [perfdata.larger_better] = true,
          threads = thread_count,
          variant = benchmark.opt.variant
        })
      end
    end
  end
end

benchmark:finish()