1*572c4311Sfengbojiang# hll-err.rb - Copyright (C) 2014 Salvatore Sanfilippo
2*572c4311Sfengbojiang# BSD license, See the COPYING file for more information.
3*572c4311Sfengbojiang#
4*572c4311Sfengbojiang# This program is suited to output average and maximum errors of
5*572c4311Sfengbojiang# the Redis HyperLogLog implementation in a format suitable to print
6*572c4311Sfengbojiang# graphs using gnuplot.
7*572c4311Sfengbojiang
8*572c4311Sfengbojiangrequire 'rubygems'
9*572c4311Sfengbojiangrequire 'redis'
10*572c4311Sfengbojiangrequire 'digest/sha1'
11*572c4311Sfengbojiang
12*572c4311Sfengbojiang# Generate an array of [cardinality,relative_error] pairs
13*572c4311Sfengbojiang# in the 0 - max range, with the specified step.
14*572c4311Sfengbojiang#
15*572c4311Sfengbojiang# 'r' is the Redis object used to perform the queries.
16*572c4311Sfengbojiang# 'seed' must be different every time you want a test performed
17*572c4311Sfengbojiang# with a different set. The function guarantees that if 'seed' is the
18*572c4311Sfengbojiang# same, exactly the same dataset is used, and when it is different,
19*572c4311Sfengbojiang# a totally unrelated different data set is used (without any common
20*572c4311Sfengbojiang# element in practice).
21*572c4311Sfengbojiangdef run_experiment(r,seed,max,step)
22*572c4311Sfengbojiang    r.del('hll')
23*572c4311Sfengbojiang    i = 0
24*572c4311Sfengbojiang    samples = []
25*572c4311Sfengbojiang    step = 1000 if step > 1000
26*572c4311Sfengbojiang    while i < max do
27*572c4311Sfengbojiang        elements = []
28*572c4311Sfengbojiang        step.times {
29*572c4311Sfengbojiang            ele = Digest::SHA1.hexdigest(i.to_s+seed.to_s)
30*572c4311Sfengbojiang            elements << ele
31*572c4311Sfengbojiang            i += 1
32*572c4311Sfengbojiang        }
33*572c4311Sfengbojiang        r.pfadd('hll',elements)
34*572c4311Sfengbojiang        approx = r.pfcount('hll')
35*572c4311Sfengbojiang        err = approx-i
36*572c4311Sfengbojiang        rel_err = 100.to_f*err/i
37*572c4311Sfengbojiang        samples << [i,rel_err]
38*572c4311Sfengbojiang    end
39*572c4311Sfengbojiang    samples
40*572c4311Sfengbojiangend
41*572c4311Sfengbojiang
42*572c4311Sfengbojiangdef filter_samples(numsets,max,step,filter)
43*572c4311Sfengbojiang    r = Redis.new
44*572c4311Sfengbojiang    dataset = {}
45*572c4311Sfengbojiang    (0...numsets).each{|i|
46*572c4311Sfengbojiang        dataset[i] = run_experiment(r,i,max,step)
47*572c4311Sfengbojiang        STDERR.puts "Set #{i}"
48*572c4311Sfengbojiang    }
49*572c4311Sfengbojiang    dataset[0].each_with_index{|ele,index|
50*572c4311Sfengbojiang        if filter == :max
51*572c4311Sfengbojiang            card=ele[0]
52*572c4311Sfengbojiang            err=ele[1].abs
53*572c4311Sfengbojiang            (1...numsets).each{|i|
54*572c4311Sfengbojiang                err = dataset[i][index][1] if err < dataset[i][index][1]
55*572c4311Sfengbojiang            }
56*572c4311Sfengbojiang            puts "#{card} #{err}"
57*572c4311Sfengbojiang        elsif filter == :avg
58*572c4311Sfengbojiang            card=ele[0]
59*572c4311Sfengbojiang            err = 0
60*572c4311Sfengbojiang            (0...numsets).each{|i|
61*572c4311Sfengbojiang                err += dataset[i][index][1]
62*572c4311Sfengbojiang            }
63*572c4311Sfengbojiang            err /= numsets
64*572c4311Sfengbojiang            puts "#{card} #{err}"
65*572c4311Sfengbojiang        elsif filter == :absavg
66*572c4311Sfengbojiang            card=ele[0]
67*572c4311Sfengbojiang            err = 0
68*572c4311Sfengbojiang            (0...numsets).each{|i|
69*572c4311Sfengbojiang                err += dataset[i][index][1].abs
70*572c4311Sfengbojiang            }
71*572c4311Sfengbojiang            err /= numsets
72*572c4311Sfengbojiang            puts "#{card} #{err}"
73*572c4311Sfengbojiang        elsif filter == :all
74*572c4311Sfengbojiang            (0...numsets).each{|i|
75*572c4311Sfengbojiang                card,err = dataset[i][index]
76*572c4311Sfengbojiang                puts "#{card} #{err}"
77*572c4311Sfengbojiang            }
78*572c4311Sfengbojiang        else
79*572c4311Sfengbojiang            raise "Unknown filter #{filter}"
80*572c4311Sfengbojiang        end
81*572c4311Sfengbojiang    }
82*572c4311Sfengbojiangend
83*572c4311Sfengbojiang
84*572c4311Sfengbojiangif ARGV.length != 4
85*572c4311Sfengbojiang    puts "Usage: hll-gnuplot-graph <samples> <max> <step> (max|avg|absavg|all)"
86*572c4311Sfengbojiang    exit 1
87*572c4311Sfengbojiangend
88*572c4311Sfengbojiangfilter_samples(ARGV[0].to_i,ARGV[1].to_i,ARGV[2].to_i,ARGV[3].to_sym)
89