xref: /linux-6.15/kernel/dma/map_benchmark.c (revision 54624acf)
165789daaSBarry Song // SPDX-License-Identifier: GPL-2.0-only
265789daaSBarry Song /*
342e4eefbSHao Fang  * Copyright (C) 2020 HiSilicon Limited.
465789daaSBarry Song  */
565789daaSBarry Song 
665789daaSBarry Song #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
765789daaSBarry Song 
865789daaSBarry Song #include <linux/debugfs.h>
965789daaSBarry Song #include <linux/delay.h>
1065789daaSBarry Song #include <linux/device.h>
1165789daaSBarry Song #include <linux/dma-mapping.h>
1265789daaSBarry Song #include <linux/kernel.h>
1365789daaSBarry Song #include <linux/kthread.h>
148ddde07aSTian Tao #include <linux/map_benchmark.h>
1565789daaSBarry Song #include <linux/math64.h>
1665789daaSBarry Song #include <linux/module.h>
1765789daaSBarry Song #include <linux/pci.h>
1865789daaSBarry Song #include <linux/platform_device.h>
1965789daaSBarry Song #include <linux/slab.h>
2065789daaSBarry Song #include <linux/timekeeping.h>
2165789daaSBarry Song 
2265789daaSBarry Song struct map_benchmark_data {
2365789daaSBarry Song 	struct map_benchmark bparam;
2465789daaSBarry Song 	struct device *dev;
2565789daaSBarry Song 	struct dentry  *debugfs;
2665789daaSBarry Song 	enum dma_data_direction dir;
2765789daaSBarry Song 	atomic64_t sum_map_100ns;
2865789daaSBarry Song 	atomic64_t sum_unmap_100ns;
2965789daaSBarry Song 	atomic64_t sum_sq_map;
3065789daaSBarry Song 	atomic64_t sum_sq_unmap;
3165789daaSBarry Song 	atomic64_t loops;
3265789daaSBarry Song };
3365789daaSBarry Song 
map_benchmark_thread(void * data)3465789daaSBarry Song static int map_benchmark_thread(void *data)
3565789daaSBarry Song {
3665789daaSBarry Song 	void *buf;
3765789daaSBarry Song 	dma_addr_t dma_addr;
3865789daaSBarry Song 	struct map_benchmark_data *map = data;
39ca947482SXiang Chen 	int npages = map->bparam.granule;
40ca947482SXiang Chen 	u64 size = npages * PAGE_SIZE;
4165789daaSBarry Song 	int ret = 0;
4265789daaSBarry Song 
43ca947482SXiang Chen 	buf = alloc_pages_exact(size, GFP_KERNEL);
4465789daaSBarry Song 	if (!buf)
4565789daaSBarry Song 		return -ENOMEM;
4665789daaSBarry Song 
4765789daaSBarry Song 	while (!kthread_should_stop())  {
4865789daaSBarry Song 		u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
4965789daaSBarry Song 		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
5065789daaSBarry Song 		ktime_t map_delta, unmap_delta;
5165789daaSBarry Song 
5265789daaSBarry Song 		/*
5365789daaSBarry Song 		 * for a non-coherent device, if we don't stain them in the
5465789daaSBarry Song 		 * cache, this will give an underestimate of the real-world
5565789daaSBarry Song 		 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
5665789daaSBarry Song 		 * 66 means evertything goes well! 66 is lucky.
5765789daaSBarry Song 		 */
5865789daaSBarry Song 		if (map->dir != DMA_FROM_DEVICE)
59ca947482SXiang Chen 			memset(buf, 0x66, size);
6065789daaSBarry Song 
6165789daaSBarry Song 		map_stime = ktime_get();
62ca947482SXiang Chen 		dma_addr = dma_map_single(map->dev, buf, size, map->dir);
6365789daaSBarry Song 		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
6465789daaSBarry Song 			pr_err("dma_map_single failed on %s\n",
6565789daaSBarry Song 				dev_name(map->dev));
6665789daaSBarry Song 			ret = -ENOMEM;
6765789daaSBarry Song 			goto out;
6865789daaSBarry Song 		}
6965789daaSBarry Song 		map_etime = ktime_get();
7065789daaSBarry Song 		map_delta = ktime_sub(map_etime, map_stime);
7165789daaSBarry Song 
729dc00b25SBarry Song 		/* Pretend DMA is transmitting */
739dc00b25SBarry Song 		ndelay(map->bparam.dma_trans_ns);
749dc00b25SBarry Song 
7565789daaSBarry Song 		unmap_stime = ktime_get();
76ca947482SXiang Chen 		dma_unmap_single(map->dev, dma_addr, size, map->dir);
7765789daaSBarry Song 		unmap_etime = ktime_get();
7865789daaSBarry Song 		unmap_delta = ktime_sub(unmap_etime, unmap_stime);
7965789daaSBarry Song 
8065789daaSBarry Song 		/* calculate sum and sum of squares */
8165789daaSBarry Song 
8265789daaSBarry Song 		map_100ns = div64_ul(map_delta,  100);
8365789daaSBarry Song 		unmap_100ns = div64_ul(unmap_delta, 100);
8465789daaSBarry Song 		map_sq = map_100ns * map_100ns;
8565789daaSBarry Song 		unmap_sq = unmap_100ns * unmap_100ns;
8665789daaSBarry Song 
8765789daaSBarry Song 		atomic64_add(map_100ns, &map->sum_map_100ns);
8865789daaSBarry Song 		atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
8965789daaSBarry Song 		atomic64_add(map_sq, &map->sum_sq_map);
9065789daaSBarry Song 		atomic64_add(unmap_sq, &map->sum_sq_unmap);
9165789daaSBarry Song 		atomic64_inc(&map->loops);
92*54624acfSYicong Yang 
93*54624acfSYicong Yang 		/*
94*54624acfSYicong Yang 		 * We may test for a long time so periodically check whether
95*54624acfSYicong Yang 		 * we need to schedule to avoid starving the others. Otherwise
96*54624acfSYicong Yang 		 * we may hangup the kernel in a non-preemptible kernel when
97*54624acfSYicong Yang 		 * the test kthreads number >= CPU number, the test kthreads
98*54624acfSYicong Yang 		 * will run endless on every CPU since the thread resposible
99*54624acfSYicong Yang 		 * for notifying the kthread stop (in do_map_benchmark())
100*54624acfSYicong Yang 		 * could not be scheduled.
101*54624acfSYicong Yang 		 *
102*54624acfSYicong Yang 		 * Note this may degrade the test concurrency since the test
103*54624acfSYicong Yang 		 * threads may need to share the CPU time with other load
104*54624acfSYicong Yang 		 * in the system. So it's recommended to run this benchmark
105*54624acfSYicong Yang 		 * on an idle system.
106*54624acfSYicong Yang 		 */
107*54624acfSYicong Yang 		cond_resched();
10865789daaSBarry Song 	}
10965789daaSBarry Song 
11065789daaSBarry Song out:
111ca947482SXiang Chen 	free_pages_exact(buf, size);
11265789daaSBarry Song 	return ret;
11365789daaSBarry Song }
11465789daaSBarry Song 
do_map_benchmark(struct map_benchmark_data * map)11565789daaSBarry Song static int do_map_benchmark(struct map_benchmark_data *map)
11665789daaSBarry Song {
11765789daaSBarry Song 	struct task_struct **tsk;
11865789daaSBarry Song 	int threads = map->bparam.threads;
11965789daaSBarry Song 	int node = map->bparam.node;
12065789daaSBarry Song 	u64 loops;
12165789daaSBarry Song 	int ret = 0;
12265789daaSBarry Song 	int i;
12365789daaSBarry Song 
12465789daaSBarry Song 	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
12565789daaSBarry Song 	if (!tsk)
12665789daaSBarry Song 		return -ENOMEM;
12765789daaSBarry Song 
12865789daaSBarry Song 	get_device(map->dev);
12965789daaSBarry Song 
13065789daaSBarry Song 	for (i = 0; i < threads; i++) {
13165789daaSBarry Song 		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
13265789daaSBarry Song 				map->bparam.node, "dma-map-benchmark/%d", i);
13365789daaSBarry Song 		if (IS_ERR(tsk[i])) {
13465789daaSBarry Song 			pr_err("create dma_map thread failed\n");
13565789daaSBarry Song 			ret = PTR_ERR(tsk[i]);
136bb9025f4SFedor Pchelkin 			while (--i >= 0)
137bb9025f4SFedor Pchelkin 				kthread_stop(tsk[i]);
13865789daaSBarry Song 			goto out;
13965789daaSBarry Song 		}
14065789daaSBarry Song 
14165789daaSBarry Song 		if (node != NUMA_NO_NODE)
142e64746e7SFedor Pchelkin 			kthread_bind_mask(tsk[i], cpumask_of_node(node));
14365789daaSBarry Song 	}
14465789daaSBarry Song 
14565789daaSBarry Song 	/* clear the old value in the previous benchmark */
14665789daaSBarry Song 	atomic64_set(&map->sum_map_100ns, 0);
14765789daaSBarry Song 	atomic64_set(&map->sum_unmap_100ns, 0);
14865789daaSBarry Song 	atomic64_set(&map->sum_sq_map, 0);
14965789daaSBarry Song 	atomic64_set(&map->sum_sq_unmap, 0);
15065789daaSBarry Song 	atomic64_set(&map->loops, 0);
15165789daaSBarry Song 
152d17405d5SBarry Song 	for (i = 0; i < threads; i++) {
153d17405d5SBarry Song 		get_task_struct(tsk[i]);
15465789daaSBarry Song 		wake_up_process(tsk[i]);
155d17405d5SBarry Song 	}
15665789daaSBarry Song 
15765789daaSBarry Song 	msleep_interruptible(map->bparam.seconds * 1000);
15865789daaSBarry Song 
159bb9025f4SFedor Pchelkin 	/* wait for the completion of all started benchmark threads */
16065789daaSBarry Song 	for (i = 0; i < threads; i++) {
161bb9025f4SFedor Pchelkin 		int kthread_ret = kthread_stop_put(tsk[i]);
162bb9025f4SFedor Pchelkin 
163bb9025f4SFedor Pchelkin 		if (kthread_ret)
164bb9025f4SFedor Pchelkin 			ret = kthread_ret;
165bb9025f4SFedor Pchelkin 	}
166bb9025f4SFedor Pchelkin 
16765789daaSBarry Song 	if (ret)
16865789daaSBarry Song 		goto out;
16965789daaSBarry Song 
17065789daaSBarry Song 	loops = atomic64_read(&map->loops);
17165789daaSBarry Song 	if (likely(loops > 0)) {
17265789daaSBarry Song 		u64 map_variance, unmap_variance;
17365789daaSBarry Song 		u64 sum_map = atomic64_read(&map->sum_map_100ns);
17465789daaSBarry Song 		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
17565789daaSBarry Song 		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
17665789daaSBarry Song 		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
17765789daaSBarry Song 
17865789daaSBarry Song 		/* average latency */
17965789daaSBarry Song 		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
18065789daaSBarry Song 		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
18165789daaSBarry Song 
18265789daaSBarry Song 		/* standard deviation of latency */
18365789daaSBarry Song 		map_variance = div64_u64(sum_sq_map, loops) -
18465789daaSBarry Song 				map->bparam.avg_map_100ns *
18565789daaSBarry Song 				map->bparam.avg_map_100ns;
18665789daaSBarry Song 		unmap_variance = div64_u64(sum_sq_unmap, loops) -
18765789daaSBarry Song 				map->bparam.avg_unmap_100ns *
18865789daaSBarry Song 				map->bparam.avg_unmap_100ns;
18965789daaSBarry Song 		map->bparam.map_stddev = int_sqrt64(map_variance);
19065789daaSBarry Song 		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
19165789daaSBarry Song 	}
19265789daaSBarry Song 
19365789daaSBarry Song out:
19465789daaSBarry Song 	put_device(map->dev);
19565789daaSBarry Song 	kfree(tsk);
19665789daaSBarry Song 	return ret;
19765789daaSBarry Song }
19865789daaSBarry Song 
map_benchmark_ioctl(struct file * file,unsigned int cmd,unsigned long arg)19965789daaSBarry Song static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
20065789daaSBarry Song 		unsigned long arg)
20165789daaSBarry Song {
20265789daaSBarry Song 	struct map_benchmark_data *map = file->private_data;
20365789daaSBarry Song 	void __user *argp = (void __user *)arg;
20465789daaSBarry Song 	u64 old_dma_mask;
20565789daaSBarry Song 	int ret;
20665789daaSBarry Song 
20765789daaSBarry Song 	if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
20865789daaSBarry Song 		return -EFAULT;
20965789daaSBarry Song 
21065789daaSBarry Song 	switch (cmd) {
21165789daaSBarry Song 	case DMA_MAP_BENCHMARK:
21265789daaSBarry Song 		if (map->bparam.threads == 0 ||
21365789daaSBarry Song 		    map->bparam.threads > DMA_MAP_MAX_THREADS) {
21465789daaSBarry Song 			pr_err("invalid thread number\n");
21565789daaSBarry Song 			return -EINVAL;
21665789daaSBarry Song 		}
21765789daaSBarry Song 
21865789daaSBarry Song 		if (map->bparam.seconds == 0 ||
21965789daaSBarry Song 		    map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
22065789daaSBarry Song 			pr_err("invalid duration seconds\n");
22165789daaSBarry Song 			return -EINVAL;
22265789daaSBarry Song 		}
22365789daaSBarry Song 
2249dc00b25SBarry Song 		if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
2259dc00b25SBarry Song 			pr_err("invalid transmission delay\n");
2269dc00b25SBarry Song 			return -EINVAL;
2279dc00b25SBarry Song 		}
2289dc00b25SBarry Song 
22965789daaSBarry Song 		if (map->bparam.node != NUMA_NO_NODE &&
2301ff05e72SFedor Pchelkin 		    (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES ||
2311ff05e72SFedor Pchelkin 		     !node_possible(map->bparam.node))) {
23265789daaSBarry Song 			pr_err("invalid numa node\n");
23365789daaSBarry Song 			return -EINVAL;
23465789daaSBarry Song 		}
23565789daaSBarry Song 
236ca947482SXiang Chen 		if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
237ca947482SXiang Chen 			pr_err("invalid granule size\n");
238ca947482SXiang Chen 			return -EINVAL;
239ca947482SXiang Chen 		}
240ca947482SXiang Chen 
24165789daaSBarry Song 		switch (map->bparam.dma_dir) {
24265789daaSBarry Song 		case DMA_MAP_BIDIRECTIONAL:
24365789daaSBarry Song 			map->dir = DMA_BIDIRECTIONAL;
24465789daaSBarry Song 			break;
24565789daaSBarry Song 		case DMA_MAP_FROM_DEVICE:
24665789daaSBarry Song 			map->dir = DMA_FROM_DEVICE;
24765789daaSBarry Song 			break;
24865789daaSBarry Song 		case DMA_MAP_TO_DEVICE:
24965789daaSBarry Song 			map->dir = DMA_TO_DEVICE;
25065789daaSBarry Song 			break;
25165789daaSBarry Song 		default:
25265789daaSBarry Song 			pr_err("invalid DMA direction\n");
25365789daaSBarry Song 			return -EINVAL;
25465789daaSBarry Song 		}
25565789daaSBarry Song 
25665789daaSBarry Song 		old_dma_mask = dma_get_mask(map->dev);
25765789daaSBarry Song 
25865789daaSBarry Song 		ret = dma_set_mask(map->dev,
25965789daaSBarry Song 				   DMA_BIT_MASK(map->bparam.dma_bits));
26065789daaSBarry Song 		if (ret) {
26165789daaSBarry Song 			pr_err("failed to set dma_mask on device %s\n",
26265789daaSBarry Song 				dev_name(map->dev));
26365789daaSBarry Song 			return -EINVAL;
26465789daaSBarry Song 		}
26565789daaSBarry Song 
26665789daaSBarry Song 		ret = do_map_benchmark(map);
26765789daaSBarry Song 
26865789daaSBarry Song 		/*
26965789daaSBarry Song 		 * restore the original dma_mask as many devices' dma_mask are
27065789daaSBarry Song 		 * set by architectures, acpi, busses. When we bind them back
27165789daaSBarry Song 		 * to their original drivers, those drivers shouldn't see
27265789daaSBarry Song 		 * dma_mask changed by benchmark
27365789daaSBarry Song 		 */
27465789daaSBarry Song 		dma_set_mask(map->dev, old_dma_mask);
275f7c9ccaaSFedor Pchelkin 
276f7c9ccaaSFedor Pchelkin 		if (ret)
277f7c9ccaaSFedor Pchelkin 			return ret;
27865789daaSBarry Song 		break;
27965789daaSBarry Song 	default:
28065789daaSBarry Song 		return -EINVAL;
28165789daaSBarry Song 	}
28265789daaSBarry Song 
28365789daaSBarry Song 	if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
28465789daaSBarry Song 		return -EFAULT;
28565789daaSBarry Song 
28665789daaSBarry Song 	return ret;
28765789daaSBarry Song }
28865789daaSBarry Song 
28965789daaSBarry Song static const struct file_operations map_benchmark_fops = {
29065789daaSBarry Song 	.open			= simple_open,
29165789daaSBarry Song 	.unlocked_ioctl		= map_benchmark_ioctl,
29265789daaSBarry Song };
29365789daaSBarry Song 
map_benchmark_remove_debugfs(void * data)29465789daaSBarry Song static void map_benchmark_remove_debugfs(void *data)
29565789daaSBarry Song {
29665789daaSBarry Song 	struct map_benchmark_data *map = (struct map_benchmark_data *)data;
29765789daaSBarry Song 
29865789daaSBarry Song 	debugfs_remove(map->debugfs);
29965789daaSBarry Song }
30065789daaSBarry Song 
__map_benchmark_probe(struct device * dev)30165789daaSBarry Song static int __map_benchmark_probe(struct device *dev)
30265789daaSBarry Song {
30365789daaSBarry Song 	struct dentry *entry;
30465789daaSBarry Song 	struct map_benchmark_data *map;
30565789daaSBarry Song 	int ret;
30665789daaSBarry Song 
30765789daaSBarry Song 	map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
30865789daaSBarry Song 	if (!map)
30965789daaSBarry Song 		return -ENOMEM;
31065789daaSBarry Song 	map->dev = dev;
31165789daaSBarry Song 
31265789daaSBarry Song 	ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
31365789daaSBarry Song 	if (ret) {
31465789daaSBarry Song 		pr_err("Can't add debugfs remove action\n");
31565789daaSBarry Song 		return ret;
31665789daaSBarry Song 	}
31765789daaSBarry Song 
31865789daaSBarry Song 	/*
31965789daaSBarry Song 	 * we only permit a device bound with this driver, 2nd probe
32065789daaSBarry Song 	 * will fail
32165789daaSBarry Song 	 */
32265789daaSBarry Song 	entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
32365789daaSBarry Song 			&map_benchmark_fops);
32465789daaSBarry Song 	if (IS_ERR(entry))
32565789daaSBarry Song 		return PTR_ERR(entry);
32665789daaSBarry Song 	map->debugfs = entry;
32765789daaSBarry Song 
32865789daaSBarry Song 	return 0;
32965789daaSBarry Song }
33065789daaSBarry Song 
map_benchmark_platform_probe(struct platform_device * pdev)33165789daaSBarry Song static int map_benchmark_platform_probe(struct platform_device *pdev)
33265789daaSBarry Song {
33365789daaSBarry Song 	return __map_benchmark_probe(&pdev->dev);
33465789daaSBarry Song }
33565789daaSBarry Song 
33665789daaSBarry Song static struct platform_driver map_benchmark_platform_driver = {
33765789daaSBarry Song 	.driver		= {
33865789daaSBarry Song 		.name	= "dma_map_benchmark",
33965789daaSBarry Song 	},
34065789daaSBarry Song 	.probe = map_benchmark_platform_probe,
34165789daaSBarry Song };
34265789daaSBarry Song 
34365789daaSBarry Song static int
map_benchmark_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)34465789daaSBarry Song map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
34565789daaSBarry Song {
34665789daaSBarry Song 	return __map_benchmark_probe(&pdev->dev);
34765789daaSBarry Song }
34865789daaSBarry Song 
34965789daaSBarry Song static struct pci_driver map_benchmark_pci_driver = {
35065789daaSBarry Song 	.name	= "dma_map_benchmark",
35165789daaSBarry Song 	.probe	= map_benchmark_pci_probe,
35265789daaSBarry Song };
35365789daaSBarry Song 
map_benchmark_init(void)35465789daaSBarry Song static int __init map_benchmark_init(void)
35565789daaSBarry Song {
35665789daaSBarry Song 	int ret;
35765789daaSBarry Song 
35865789daaSBarry Song 	ret = pci_register_driver(&map_benchmark_pci_driver);
35965789daaSBarry Song 	if (ret)
36065789daaSBarry Song 		return ret;
36165789daaSBarry Song 
36265789daaSBarry Song 	ret = platform_driver_register(&map_benchmark_platform_driver);
36365789daaSBarry Song 	if (ret) {
36465789daaSBarry Song 		pci_unregister_driver(&map_benchmark_pci_driver);
36565789daaSBarry Song 		return ret;
36665789daaSBarry Song 	}
36765789daaSBarry Song 
36865789daaSBarry Song 	return 0;
36965789daaSBarry Song }
37065789daaSBarry Song 
map_benchmark_cleanup(void)37165789daaSBarry Song static void __exit map_benchmark_cleanup(void)
37265789daaSBarry Song {
37365789daaSBarry Song 	platform_driver_unregister(&map_benchmark_platform_driver);
37465789daaSBarry Song 	pci_unregister_driver(&map_benchmark_pci_driver);
37565789daaSBarry Song }
37665789daaSBarry Song 
37765789daaSBarry Song module_init(map_benchmark_init);
37865789daaSBarry Song module_exit(map_benchmark_cleanup);
37965789daaSBarry Song 
38065789daaSBarry Song MODULE_AUTHOR("Barry Song <[email protected]>");
38165789daaSBarry Song MODULE_DESCRIPTION("dma_map benchmark driver");
382