17f9d289eSantirez# hll-err.rb - Copyright (C) 2014 Salvatore Sanfilippo 27f9d289eSantirez# BSD license, See the COPYING file for more information. 37f9d289eSantirez# 47f9d289eSantirez# This program is suited to output average and maximum errors of 57f9d289eSantirez# the Redis HyperLogLog implementation in a format suitable to print 67f9d289eSantirez# graphs using gnuplot. 77f9d289eSantirez 87f9d289eSantirezrequire 'rubygems' 97f9d289eSantirezrequire 'redis' 107f9d289eSantirezrequire 'digest/sha1' 117f9d289eSantirez 127f9d289eSantirez# Generate an array of [cardinality,relative_error] pairs 13cf34507bSantirez# in the 0 - max range, with the specified step. 147f9d289eSantirez# 157f9d289eSantirez# 'r' is the Redis object used to perform the queries. 167f9d289eSantirez# 'seed' must be different every time you want a test performed 177f9d289eSantirez# with a different set. The function guarantees that if 'seed' is the 187f9d289eSantirez# same, exactly the same dataset is used, and when it is different, 197f9d289eSantirez# a totally unrelated different data set is used (without any common 207f9d289eSantirez# element in practice). 217f9d289eSantirezdef run_experiment(r,seed,max,step) 227f9d289eSantirez r.del('hll') 237f9d289eSantirez i = 0 247f9d289eSantirez samples = [] 25cf34507bSantirez step = 1000 if step > 1000 267f9d289eSantirez while i < max do 277f9d289eSantirez elements = [] 28cf34507bSantirez step.times { 297f9d289eSantirez ele = Digest::SHA1.hexdigest(i.to_s+seed.to_s) 307f9d289eSantirez elements << ele 317f9d289eSantirez i += 1 327f9d289eSantirez } 335afcca34Santirez r.pfadd('hll',*elements) 345afcca34Santirez approx = r.pfcount('hll') 357f9d289eSantirez err = approx-i 367f9d289eSantirez rel_err = 100.to_f*err/i 377f9d289eSantirez samples << [i,rel_err] 387f9d289eSantirez end 397f9d289eSantirez samples 407f9d289eSantirezend 417f9d289eSantirez 42cf34507bSantirezdef filter_samples(numsets,max,step,filter) 437f9d289eSantirez r = Redis.new 447f9d289eSantirez dataset = {} 457f9d289eSantirez (0...numsets).each{|i| 46cf34507bSantirez dataset[i] = run_experiment(r,i,max,step) 47cf34507bSantirez STDERR.puts "Set #{i}" 487f9d289eSantirez } 497f9d289eSantirez dataset[0].each_with_index{|ele,index| 507f9d289eSantirez if filter == :max 510c9f06a2Santirez card=ele[0] 520c9f06a2Santirez err=ele[1].abs 537f9d289eSantirez (1...numsets).each{|i| 547f9d289eSantirez err = dataset[i][index][1] if err < dataset[i][index][1] 557f9d289eSantirez } 560c9f06a2Santirez puts "#{card} #{err}" 577f9d289eSantirez elsif filter == :avg 580c9f06a2Santirez card=ele[0] 590c9f06a2Santirez err = 0 600c9f06a2Santirez (0...numsets).each{|i| 617f9d289eSantirez err += dataset[i][index][1] 627f9d289eSantirez } 637f9d289eSantirez err /= numsets 640c9f06a2Santirez puts "#{card} #{err}" 65cf34507bSantirez elsif filter == :absavg 66cf34507bSantirez card=ele[0] 67cf34507bSantirez err = 0 68cf34507bSantirez (0...numsets).each{|i| 69cf34507bSantirez err += dataset[i][index][1].abs 70cf34507bSantirez } 71cf34507bSantirez err /= numsets 72cf34507bSantirez puts "#{card} #{err}" 730c9f06a2Santirez elsif filter == :all 740c9f06a2Santirez (0...numsets).each{|i| 750c9f06a2Santirez card,err = dataset[i][index] 760c9f06a2Santirez puts "#{card} #{err}" 770c9f06a2Santirez } 787f9d289eSantirez else 797f9d289eSantirez raise "Unknown filter #{filter}" 807f9d289eSantirez end 817f9d289eSantirez } 827f9d289eSantirezend 837f9d289eSantirez 84*b612affbSantirezif ARGV.length != 4 85*b612affbSantirez puts "Usage: hll-gnuplot-graph <samples> <max> <step> (max|avg|absavg|all)" 86*b612affbSantirez exit 1 87*b612affbSantirezend 88*b612affbSantirezfilter_samples(ARGV[0].to_i,ARGV[1].to_i,ARGV[2].to_i,ARGV[3].to_sym) 89