17f9d289eSantirez# hll-err.rb - Copyright (C) 2014 Salvatore Sanfilippo
27f9d289eSantirez# BSD license, See the COPYING file for more information.
37f9d289eSantirez#
47f9d289eSantirez# This program is suited to output average and maximum errors of
57f9d289eSantirez# the Redis HyperLogLog implementation in a format suitable to print
67f9d289eSantirez# graphs using gnuplot.
77f9d289eSantirez
87f9d289eSantirezrequire 'rubygems'
97f9d289eSantirezrequire 'redis'
107f9d289eSantirezrequire 'digest/sha1'
117f9d289eSantirez
127f9d289eSantirez# Generate an array of [cardinality,relative_error] pairs
13cf34507bSantirez# in the 0 - max range, with the specified step.
147f9d289eSantirez#
157f9d289eSantirez# 'r' is the Redis object used to perform the queries.
167f9d289eSantirez# 'seed' must be different every time you want a test performed
177f9d289eSantirez# with a different set. The function guarantees that if 'seed' is the
187f9d289eSantirez# same, exactly the same dataset is used, and when it is different,
197f9d289eSantirez# a totally unrelated different data set is used (without any common
207f9d289eSantirez# element in practice).
217f9d289eSantirezdef run_experiment(r,seed,max,step)
227f9d289eSantirez    r.del('hll')
237f9d289eSantirez    i = 0
247f9d289eSantirez    samples = []
25cf34507bSantirez    step = 1000 if step > 1000
267f9d289eSantirez    while i < max do
277f9d289eSantirez        elements = []
28cf34507bSantirez        step.times {
297f9d289eSantirez            ele = Digest::SHA1.hexdigest(i.to_s+seed.to_s)
307f9d289eSantirez            elements << ele
317f9d289eSantirez            i += 1
327f9d289eSantirez        }
335afcca34Santirez        r.pfadd('hll',*elements)
345afcca34Santirez        approx = r.pfcount('hll')
357f9d289eSantirez        err = approx-i
367f9d289eSantirez        rel_err = 100.to_f*err/i
377f9d289eSantirez        samples << [i,rel_err]
387f9d289eSantirez    end
397f9d289eSantirez    samples
407f9d289eSantirezend
417f9d289eSantirez
42cf34507bSantirezdef filter_samples(numsets,max,step,filter)
437f9d289eSantirez    r = Redis.new
447f9d289eSantirez    dataset = {}
457f9d289eSantirez    (0...numsets).each{|i|
46cf34507bSantirez        dataset[i] = run_experiment(r,i,max,step)
47cf34507bSantirez        STDERR.puts "Set #{i}"
487f9d289eSantirez    }
497f9d289eSantirez    dataset[0].each_with_index{|ele,index|
507f9d289eSantirez        if filter == :max
510c9f06a2Santirez            card=ele[0]
520c9f06a2Santirez            err=ele[1].abs
537f9d289eSantirez            (1...numsets).each{|i|
547f9d289eSantirez                err = dataset[i][index][1] if err < dataset[i][index][1]
557f9d289eSantirez            }
560c9f06a2Santirez            puts "#{card} #{err}"
577f9d289eSantirez        elsif filter == :avg
580c9f06a2Santirez            card=ele[0]
590c9f06a2Santirez            err = 0
600c9f06a2Santirez            (0...numsets).each{|i|
617f9d289eSantirez                err += dataset[i][index][1]
627f9d289eSantirez            }
637f9d289eSantirez            err /= numsets
640c9f06a2Santirez            puts "#{card} #{err}"
65cf34507bSantirez        elsif filter == :absavg
66cf34507bSantirez            card=ele[0]
67cf34507bSantirez            err = 0
68cf34507bSantirez            (0...numsets).each{|i|
69cf34507bSantirez                err += dataset[i][index][1].abs
70cf34507bSantirez            }
71cf34507bSantirez            err /= numsets
72cf34507bSantirez            puts "#{card} #{err}"
730c9f06a2Santirez        elsif filter == :all
740c9f06a2Santirez            (0...numsets).each{|i|
750c9f06a2Santirez                card,err = dataset[i][index]
760c9f06a2Santirez                puts "#{card} #{err}"
770c9f06a2Santirez            }
787f9d289eSantirez        else
797f9d289eSantirez            raise "Unknown filter #{filter}"
807f9d289eSantirez        end
817f9d289eSantirez    }
827f9d289eSantirezend
837f9d289eSantirez
84*b612affbSantirezif ARGV.length != 4
85*b612affbSantirez    puts "Usage: hll-gnuplot-graph <samples> <max> <step> (max|avg|absavg|all)"
86*b612affbSantirez    exit 1
87*b612affbSantirezend
88*b612affbSantirezfilter_samples(ARGV[0].to_i,ARGV[1].to_i,ARGV[2].to_i,ARGV[3].to_sym)
89