Blame - tools/perf/bench/epoll-wait.c - SHIFTPHONES/mainline/linux

blob: d1c5cb526b9ff0ea40a542b918d68194e54e35d7 [file] [log] [blame]

Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	#ifdef HAVE_EVENTFD
				3	/*
				4	* Copyright (C) 2018 Davidlohr Bueso.
				5	*
				6	* This program benchmarks concurrent epoll_wait(2) monitoring multiple
				7	* file descriptors under one or two load balancing models. The first,
				8	* and default, is the single/combined queueing (which refers to a single
				9	* epoll instance for N worker threads):
				10	*
				11	* \|---> [worker A]
				12	* \|---> [worker B]
				13	* [combined queue] .---> [worker C]
				14	* \|---> [worker D]
				15	* \|---> [worker E]
				16	*
				17	* While the second model, enabled via --multiq option, uses multiple
				18	* queueing (which refers to one epoll instance per worker). For example,
				19	* short lived tcp connections in a high throughput httpd server will
				20	* distribute the accept()'ing connections across CPUs. In this case each
				21	* worker does a limited amount of processing.
				22	*
				23	* [queue A] ---> [worker]
				24	* [queue B] ---> [worker]
				25	* [queue C] ---> [worker]
				26	* [queue D] ---> [worker]
				27	* [queue E] ---> [worker]
				28	*
				29	* Naturally, the single queue will enforce more concurrency on the epoll
				30	* instance, and can therefore scale poorly compared to multiple queues.
				31	* However, this is raw benchmark data and must be taken with a grain of
				32	* salt when choosing how to make use of sys_epoll.
				33
				34	* Each thread has a number of private, nonblocking file descriptors,
				35	* referred to as fdmap. A writer thread will constantly be writing to
				36	* the fdmaps of all threads, minimizing each thread's chances of
				37	* epoll_wait not finding any ready read events and blocking as this
				38	* is not what we want to stress. The size of the fdmap can be adjusted
				39	* by the user; enlarging the value will increase the chances of
				40	* epoll_wait(2) blocking as the lineal writer thread will take "longer",
				41	* at least at a high level.
				42	*
				43	* Note that because fds are private to each thread, this workload does
				44	* not stress scenarios where multiple tasks are awoken per ready IO; ie:
				45	* EPOLLEXCLUSIVE semantics.
				46	*
				47	* The end result/metric is throughput: number of ops/second where an
				48	* operation consists of:
				49	*
				50	* epoll_wait(2) + [others]
				51	*
				52	* ... where [others] is the cost of re-adding the fd (EPOLLET),
				53	* or rearming it (EPOLLONESHOT).
				54	*
				55	*
				56	* The purpose of this program is that it be useful for measuring
				57	* kernel related changes to the sys_epoll, and not comparing different
				58	* IO polling methods, for example. Hence everything is very adhoc and
				59	* outputs raw microbenchmark numbers. Also this uses eventfd, similar
				60	* tools tend to use pipes or sockets, but the result is the same.
				61	*/
				62
				63	/* For the CLR_() macros */
				64	#include <string.h>
				65	#include <pthread.h>
Arnaldo Carvalho de Melo	91854f9	2019-08-29 14:59:50 -0300	[diff] [blame]	66	#include <unistd.h>
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	67
				68	#include <errno.h>
				69	#include <inttypes.h>
				70	#include <signal.h>
				71	#include <stdlib.h>
				72	#include <linux/compiler.h>
				73	#include <linux/kernel.h>
				74	#include <sys/time.h>
				75	#include <sys/resource.h>
				76	#include <sys/epoll.h>
				77	#include <sys/eventfd.h>
				78	#include <sys/types.h>
Arnaldo Carvalho de Melo	87ffb6c	2019-09-10 16:29:02 +0100	[diff] [blame]	79	#include <internal/cpumap.h>
Jiri Olsa	9c3516d	2019-07-21 13:24:30 +0200	[diff] [blame]	80	#include <perf/cpumap.h>
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	81
				82	#include "../util/stat.h"
				83	#include <subcmd/parse-options.h>
				84	#include "bench.h"
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	85
				86	#include <err.h>
				87
				88	#define printinfo(fmt, arg...) \
				89	do { if (__verbose) { printf(fmt, ## arg); fflush(stdout); } } while (0)
				90
				91	static unsigned int nthreads = 0;
				92	static unsigned int nsecs = 8;
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	93	static bool wdone, done, __verbose, randomize, nonblocking;
				94
				95	/*
				96	* epoll related shared variables.
				97	*/
				98
				99	/* Maximum number of nesting allowed inside epoll sets */
				100	#define EPOLL_MAXNESTS 4
				101
				102	static int epollfd;
				103	static int *epollfdp;
				104	static bool noaffinity;
				105	static unsigned int nested = 0;
				106	static bool et; /* edge-trigger */
				107	static bool oneshot;
				108	static bool multiq; /* use an epoll instance per thread */
				109
				110	/* amount of fds to monitor, per thread */
				111	static unsigned int nfds = 64;
				112
				113	static pthread_mutex_t thread_lock;
				114	static unsigned int threads_starting;
				115	static struct stats throughput_stats;
				116	static pthread_cond_t thread_parent, thread_worker;
				117
				118	struct worker {
				119	int tid;
				120	int epollfd; /* for --multiq */
				121	pthread_t thread;
				122	unsigned long ops;
				123	int *fdmap;
				124	};
				125
				126	static const struct option options[] = {
				127	/* general benchmark options */
				128	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
				129	OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"),
				130	OPT_UINTEGER('f', "nfds", &nfds, "Specify amount of file descriptors to monitor for each thread"),
				131	OPT_BOOLEAN( 'n', "noaffinity", &noaffinity, "Disables CPU affinity"),
				132	OPT_BOOLEAN('R', "randomize", &randomize, "Enable random write behaviour (default is lineal)"),
				133	OPT_BOOLEAN( 'v', "verbose", &__verbose, "Verbose mode"),
				134
				135	/* epoll specific options */
				136	OPT_BOOLEAN( 'm', "multiq", &multiq, "Use multiple epoll instances (one per thread)"),
				137	OPT_BOOLEAN( 'B', "nonblocking", &nonblocking, "Nonblocking epoll_wait(2) behaviour"),
				138	OPT_UINTEGER( 'N', "nested", &nested, "Nesting level epoll hierarchy (default is 0, no nesting)"),
				139	OPT_BOOLEAN( 'S', "oneshot", &oneshot, "Use EPOLLONESHOT semantics"),
				140	OPT_BOOLEAN( 'E', "edge", &et, "Use Edge-triggered interface (default is LT)"),
				141
				142	OPT_END()
				143	};
				144
				145	static const char * const bench_epoll_wait_usage[] = {
				146	"perf bench epoll wait <options>",
				147	NULL
				148	};
				149
				150
				151	/*
				152	* Arrange the N elements of ARRAY in random order.
				153	* Only effective if N is much smaller than RAND_MAX;
				154	* if this may not be the case, use a better random
				155	* number generator. -- Ben Pfaff.
				156	*/
				157	static void shuffle(void *array, size_t n, size_t size)
				158	{
				159	char *carray = array;
				160	void *aux;
				161	size_t i;
				162
				163	if (n <= 1)
				164	return;
				165
				166	aux = calloc(1, size);
				167	if (!aux)
				168	err(EXIT_FAILURE, "calloc");
				169
				170	for (i = 1; i < n; ++i) {
				171	size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
				172	j *= size;
				173
				174	memcpy(aux, &carray[j], size);
				175	memcpy(&carray[j], &carray[i*size], size);
				176	memcpy(&carray[i*size], aux, size);
				177	}
				178
				179	free(aux);
				180	}
				181
				182
				183	static void workerfn(void arg)
				184	{
				185	int fd, ret, r;
				186	struct worker w = (struct worker ) arg;
				187	unsigned long ops = w->ops;
				188	struct epoll_event ev;
				189	uint64_t val;
				190	int to = nonblocking? 0 : -1;
				191	int efd = multiq ? w->epollfd : epollfd;
				192
				193	pthread_mutex_lock(&thread_lock);
				194	threads_starting--;
				195	if (!threads_starting)
				196	pthread_cond_signal(&thread_parent);
				197	pthread_cond_wait(&thread_worker, &thread_lock);
				198	pthread_mutex_unlock(&thread_lock);
				199
				200	do {
				201	/*
				202	* Block undefinitely waiting for the IN event.
				203	* In order to stress the epoll_wait(2) syscall,
				204	* call it event per event, instead of a larger
				205	* batch (max)limit.
				206	*/
				207	do {
				208	ret = epoll_wait(efd, &ev, 1, to);
				209	} while (ret < 0 && errno == EINTR);
				210	if (ret < 0)
				211	err(EXIT_FAILURE, "epoll_wait");
				212
				213	fd = ev.data.fd;
				214
				215	do {
				216	r = read(fd, &val, sizeof(val));
				217	} while (!done && (r < 0 && errno == EAGAIN));
				218
				219	if (et) {
				220	ev.events = EPOLLIN \| EPOLLET;
				221	ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev);
				222	}
				223
				224	if (oneshot) {
				225	/* rearm the file descriptor with a new event mask */
				226	ev.events \|= EPOLLIN \| EPOLLONESHOT;
				227	ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev);
				228	}
				229
				230	ops++;
				231	} while (!done);
				232
				233	if (multiq)
				234	close(w->epollfd);
				235
				236	w->ops = ops;
				237	return NULL;
				238	}
				239
				240	static void nest_epollfd(struct worker *w)
				241	{
				242	unsigned int i;
				243	struct epoll_event ev;
				244	int efd = multiq ? w->epollfd : epollfd;
				245
				246	if (nested > EPOLL_MAXNESTS)
				247	nested = EPOLL_MAXNESTS;
				248
				249	epollfdp = calloc(nested, sizeof(*epollfdp));
				250	if (!epollfdp)
				251	err(EXIT_FAILURE, "calloc");
				252
				253	for (i = 0; i < nested; i++) {
				254	epollfdp[i] = epoll_create(1);
				255	if (epollfdp[i] < 0)
				256	err(EXIT_FAILURE, "epoll_create");
				257	}
				258
				259	ev.events = EPOLLHUP; /* anything */
				260	ev.data.u64 = i; /* any number */
				261
				262	for (i = nested - 1; i; i--) {
				263	if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD,
				264	epollfdp[i], &ev) < 0)
				265	err(EXIT_FAILURE, "epoll_ctl");
				266	}
				267
				268	if (epoll_ctl(efd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0)
				269	err(EXIT_FAILURE, "epoll_ctl");
				270	}
				271
				272	static void toggle_done(int sig __maybe_unused,
				273	siginfo_t *info __maybe_unused,
				274	void *uc __maybe_unused)
				275	{
				276	/* inform all threads that we're done for the day */
				277	done = true;
Arnaldo Carvalho de Melo	e4d9b04	2020-03-02 12:09:38 -0300	[diff] [blame^]	278	gettimeofday(&bench__end, NULL);
				279	timersub(&bench__end, &bench__start, &bench__runtime);
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	280	}
				281
				282	static void print_summary(void)
				283	{
				284	unsigned long avg = avg_stats(&throughput_stats);
				285	double stddev = stddev_stats(&throughput_stats);
				286
				287	printf("\nAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
				288	avg, rel_stddev_stats(stddev, avg),
Arnaldo Carvalho de Melo	e4d9b04	2020-03-02 12:09:38 -0300	[diff] [blame^]	289	(int)bench__runtime.tv_sec);
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	290	}
				291
Jiri Olsa	f854839	2019-07-21 13:23:49 +0200	[diff] [blame]	292	static int do_threads(struct worker worker, struct perf_cpu_map cpu)
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	293	{
				294	pthread_attr_t thread_attr, *attrp = NULL;
				295	cpu_set_t cpuset;
				296	unsigned int i, j;
Changbin Du	11c1ea6	2019-03-16 16:05:43 +0800	[diff] [blame]	297	int ret = 0, events = EPOLLIN;
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	298
				299	if (oneshot)
				300	events \|= EPOLLONESHOT;
				301	if (et)
				302	events \|= EPOLLET;
				303
				304	printinfo("starting worker/consumer %sthreads%s\n",
				305	noaffinity ? "":"CPU affinity ",
				306	nonblocking ? " (nonblocking)":"");
				307	if (!noaffinity)
				308	pthread_attr_init(&thread_attr);
				309
				310	for (i = 0; i < nthreads; i++) {
				311	struct worker *w = &worker[i];
				312
				313	if (multiq) {
				314	w->epollfd = epoll_create(1);
				315	if (w->epollfd < 0)
				316	err(EXIT_FAILURE, "epoll_create");
				317
				318	if (nested)
				319	nest_epollfd(w);
				320	}
				321
				322	w->tid = i;
				323	w->fdmap = calloc(nfds, sizeof(int));
				324	if (!w->fdmap)
				325	return 1;
				326
				327	for (j = 0; j < nfds; j++) {
				328	int efd = multiq ? w->epollfd : epollfd;
				329	struct epoll_event ev;
				330
				331	w->fdmap[j] = eventfd(0, EFD_NONBLOCK);
				332	if (w->fdmap[j] < 0)
				333	err(EXIT_FAILURE, "eventfd");
				334
				335	ev.data.fd = w->fdmap[j];
				336	ev.events = events;
				337
				338	ret = epoll_ctl(efd, EPOLL_CTL_ADD,
				339	w->fdmap[j], &ev);
				340	if (ret < 0)
				341	err(EXIT_FAILURE, "epoll_ctl");
				342	}
				343
				344	if (!noaffinity) {
				345	CPU_ZERO(&cpuset);
				346	CPU_SET(cpu->map[i % cpu->nr], &cpuset);
				347
				348	ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset);
				349	if (ret)
				350	err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
				351
				352	attrp = &thread_attr;
				353	}
				354
				355	ret = pthread_create(&w->thread, attrp, workerfn,
				356	(void )(struct worker ) w);
				357	if (ret)
				358	err(EXIT_FAILURE, "pthread_create");
				359	}
				360
				361	if (!noaffinity)
				362	pthread_attr_destroy(&thread_attr);
				363
				364	return ret;
				365	}
				366
				367	static void writerfn(void p)
				368	{
				369	struct worker *worker = p;
				370	size_t i, j, iter;
				371	const uint64_t val = 1;
				372	ssize_t sz;
				373	struct timespec ts = { .tv_sec = 0,
				374	.tv_nsec = 500 };
				375
				376	printinfo("starting writer-thread: doing %s writes ...\n",
				377	randomize? "random":"lineal");
				378
				379	for (iter = 0; !wdone; iter++) {
				380	if (randomize) {
				381	shuffle((void )worker, nthreads, sizeof(worker));
				382	}
				383
				384	for (i = 0; i < nthreads; i++) {
				385	struct worker *w = &worker[i];
				386
				387	if (randomize) {
				388	shuffle((void *)w->fdmap, nfds, sizeof(int));
				389	}
				390
				391	for (j = 0; j < nfds; j++) {
				392	do {
				393	sz = write(w->fdmap[j], &val, sizeof(val));
				394	} while (!wdone && (sz < 0 && errno == EAGAIN));
				395	}
				396	}
				397
				398	nanosleep(&ts, NULL);
				399	}
				400
				401	printinfo("exiting writer-thread (total full-loops: %zd)\n", iter);
				402	return NULL;
				403	}
				404
				405	static int cmpworker(const void p1, const void p2)
				406	{
				407
				408	struct worker w1 = (struct worker ) p1;
				409	struct worker w2 = (struct worker ) p2;
				410	return w1->tid > w2->tid;
				411	}
				412
				413	int bench_epoll_wait(int argc, const char **argv)
				414	{
				415	int ret = 0;
				416	struct sigaction act;
				417	unsigned int i;
				418	struct worker *worker = NULL;
Jiri Olsa	f854839	2019-07-21 13:23:49 +0200	[diff] [blame]	419	struct perf_cpu_map *cpu;
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	420	pthread_t wthread;
				421	struct rlimit rl, prevrl;
				422
				423	argc = parse_options(argc, argv, options, bench_epoll_wait_usage, 0);
				424	if (argc) {
				425	usage_with_options(bench_epoll_wait_usage, options);
				426	exit(EXIT_FAILURE);
				427	}
				428
				429	sigfillset(&act.sa_mask);
				430	act.sa_sigaction = toggle_done;
				431	sigaction(SIGINT, &act, NULL);
				432
Jiri Olsa	9c3516d	2019-07-21 13:24:30 +0200	[diff] [blame]	433	cpu = perf_cpu_map__new(NULL);
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	434	if (!cpu)
				435	goto errmem;
				436
				437	/* a single, main epoll instance */
				438	if (!multiq) {
				439	epollfd = epoll_create(1);
				440	if (epollfd < 0)
				441	err(EXIT_FAILURE, "epoll_create");
				442
				443	/*
				444	* Deal with nested epolls, if any.
				445	*/
				446	if (nested)
				447	nest_epollfd(NULL);
				448	}
				449
				450	printinfo("Using %s queue model\n", multiq ? "multi" : "single");
				451	printinfo("Nesting level(s): %d\n", nested);
				452
				453	/* default to the number of CPUs and leave one for the writer pthread */
				454	if (!nthreads)
				455	nthreads = cpu->nr - 1;
				456
				457	worker = calloc(nthreads, sizeof(*worker));
				458	if (!worker) {
				459	goto errmem;
				460	}
				461
				462	if (getrlimit(RLIMIT_NOFILE, &prevrl))
				463	err(EXIT_FAILURE, "getrlimit");
				464	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
				465	printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n",
				466	(uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max);
				467	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
				468	err(EXIT_FAILURE, "setrlimit");
				469
				470	printf("Run summary [PID %d]: %d threads monitoring%s on "
				471	"%d file-descriptors for %d secs.\n\n",
				472	getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs);
				473
				474	init_stats(&throughput_stats);
				475	pthread_mutex_init(&thread_lock, NULL);
				476	pthread_cond_init(&thread_parent, NULL);
				477	pthread_cond_init(&thread_worker, NULL);
				478
				479	threads_starting = nthreads;
				480
Arnaldo Carvalho de Melo	e4d9b04	2020-03-02 12:09:38 -0300	[diff] [blame^]	481	gettimeofday(&bench__start, NULL);
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	482
				483	do_threads(worker, cpu);
				484
				485	pthread_mutex_lock(&thread_lock);
				486	while (threads_starting)
				487	pthread_cond_wait(&thread_parent, &thread_lock);
				488	pthread_cond_broadcast(&thread_worker);
				489	pthread_mutex_unlock(&thread_lock);
				490
				491	/*
				492	* At this point the workers should be blocked waiting for read events
				493	* to become ready. Launch the writer which will constantly be writing
				494	* to each thread's fdmap.
				495	*/
				496	ret = pthread_create(&wthread, NULL, writerfn,
				497	(void )(struct worker ) worker);
				498	if (ret)
				499	err(EXIT_FAILURE, "pthread_create");
				500
				501	sleep(nsecs);
				502	toggle_done(0, NULL, NULL);
				503	printinfo("main thread: toggling done\n");
				504
				505	sleep(1); /* meh */
				506	wdone = true;
				507	ret = pthread_join(wthread, NULL);
				508	if (ret)
				509	err(EXIT_FAILURE, "pthread_join");
				510
				511	/* cleanup & report results */
				512	pthread_cond_destroy(&thread_parent);
				513	pthread_cond_destroy(&thread_worker);
				514	pthread_mutex_destroy(&thread_lock);
				515
				516	/* sort the array back before reporting */
				517	if (randomize)
				518	qsort(worker, nthreads, sizeof(struct worker), cmpworker);
				519
				520	for (i = 0; i < nthreads; i++) {
Arnaldo Carvalho de Melo	e4d9b04	2020-03-02 12:09:38 -0300	[diff] [blame^]	521	unsigned long t = worker[i].ops / bench__runtime.tv_sec;
Davidlohr Bueso	121dd9e	2018-11-06 07:22:25 -0800	[diff] [blame]	522
				523	update_stats(&throughput_stats, t);
				524
				525	if (nfds == 1)
				526	printf("[thread %2d] fdmap: %p [ %04ld ops/sec ]\n",
				527	worker[i].tid, &worker[i].fdmap[0], t);
				528	else
				529	printf("[thread %2d] fdmap: %p ... %p [ %04ld ops/sec ]\n",
				530	worker[i].tid, &worker[i].fdmap[0],
				531	&worker[i].fdmap[nfds-1], t);
				532	}
				533
				534	print_summary();
				535
				536	close(epollfd);
				537	return ret;
				538	errmem:
				539	err(EXIT_FAILURE, "calloc");
				540	}
				541	#endif // HAVE_EVENTFD