1# hll-err.rb - Copyright (C) 2014 Salvatore Sanfilippo 2# BSD license, See the COPYING file for more information. 3# 4# This program is suited to output average and maximum errors of 5# the Redis HyperLogLog implementation in a format suitable to print 6# graphs using gnuplot. 7 8require 'rubygems' 9require 'redis' 10require 'digest/sha1' 11 12# Generate an array of [cardinality,relative_error] pairs 13# in the 0 - max range with step of 1000*step. 14# 15# 'r' is the Redis object used to perform the queries. 16# 'seed' must be different every time you want a test performed 17# with a different set. The function guarantees that if 'seed' is the 18# same, exactly the same dataset is used, and when it is different, 19# a totally unrelated different data set is used (without any common 20# element in practice). 21def run_experiment(r,seed,max,step) 22 r.del('hll') 23 i = 0 24 samples = [] 25 while i < max do 26 step.times { 27 elements = [] 28 1000.times { 29 ele = Digest::SHA1.hexdigest(i.to_s+seed.to_s) 30 elements << ele 31 i += 1 32 } 33 r.padd('hll',*elements) 34 } 35 approx = r.pcount('hll') 36 err = approx-i 37 rel_err = 100.to_f*err/i 38 samples << [i,rel_err] 39 end 40 samples 41end 42 43def filter_samples(numsets,filter) 44 r = Redis.new 45 dataset = {} 46 (0...numsets).each{|i| 47 dataset[i] = run_experiment(r,i,100000,1) 48 } 49 dataset[0].each_with_index{|ele,index| 50 if filter == :max 51 card=ele[0] 52 err=ele[1].abs 53 (1...numsets).each{|i| 54 err = dataset[i][index][1] if err < dataset[i][index][1] 55 } 56 puts "#{card} #{err}" 57 elsif filter == :avg 58 card=ele[0] 59 err = 0 60 (0...numsets).each{|i| 61 err += dataset[i][index][1] 62 } 63 err /= numsets 64 puts "#{card} #{err}" 65 elsif filter == :all 66 (0...numsets).each{|i| 67 card,err = dataset[i][index] 68 puts "#{card} #{err}" 69 } 70 else 71 raise "Unknown filter #{filter}" 72 end 73 } 74end 75 76filter_samples(100,:all) 77#filter_samples(100,:max) 78#filter_samples(100,:avg) 79