epoll: support for disabling items, and a self-test app
Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item. If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the epoll item in a multi-threaded environment. Also added a new test_epoll self-test app to both demonstrate the need for this feature and test it. Signed-off-by: Paton J. Lewis <palewis@adobe.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Jason Baron <jbaron@redhat.com> Cc: Paul Holland <pholland@adobe.com> Cc: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
					parent
					
						
							
								a0a0a7a94c
							
						
					
				
			
			
				commit
				
					
						03a7beb55b
					
				
			
		
					 5 changed files with 392 additions and 4 deletions
				
			
		|  | @ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) | |||
| /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ | ||||
| static inline int ep_op_has_event(int op) | ||||
| { | ||||
| 	return op != EPOLL_CTL_DEL; | ||||
| 	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; | ||||
| } | ||||
| 
 | ||||
| /* Initialize the poll safe wake up structure */ | ||||
|  | @ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item | ||||
|  * had no event flags set, indicating that another thread may be currently | ||||
|  * handling that item's events (in the case that EPOLLONESHOT was being | ||||
|  * used). Otherwise a zero result indicates that the item has been disabled | ||||
|  * from receiving events. A disabled item may be re-enabled via | ||||
|  * EPOLL_CTL_MOD. Must be called with "mtx" held. | ||||
|  */ | ||||
| static int ep_disable(struct eventpoll *ep, struct epitem *epi) | ||||
| { | ||||
| 	int result = 0; | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	spin_lock_irqsave(&ep->lock, flags); | ||||
| 	if (epi->event.events & ~EP_PRIVATE_BITS) { | ||||
| 		if (ep_is_linked(&epi->rdllink)) | ||||
| 			list_del_init(&epi->rdllink); | ||||
| 		/* Ensure ep_poll_callback will not add epi back onto ready
 | ||||
| 		   list: */ | ||||
| 		epi->event.events &= EP_PRIVATE_BITS; | ||||
| 		} | ||||
| 	else | ||||
| 		result = -EBUSY; | ||||
| 	spin_unlock_irqrestore(&ep->lock, flags); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| static void ep_free(struct eventpoll *ep) | ||||
| { | ||||
| 	struct rb_node *rbp; | ||||
|  | @ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | |||
| 	rb_insert_color(&epi->rbn, &ep->rbr); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #define PATH_ARR_SIZE 5 | ||||
| /*
 | ||||
|  * These are the number paths of length 1 to 5, that we are allowing to emanate | ||||
|  | @ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
| 		} else | ||||
| 			error = -ENOENT; | ||||
| 		break; | ||||
| 	case EPOLL_CTL_DISABLE: | ||||
| 		if (epi) | ||||
| 			error = ep_disable(ep, epi); | ||||
| 		else | ||||
| 			error = -ENOENT; | ||||
| 		break; | ||||
| 	} | ||||
| 	mutex_unlock(&ep->mtx); | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,6 +25,7 @@ | |||
| #define EPOLL_CTL_ADD 1 | ||||
| #define EPOLL_CTL_DEL 2 | ||||
| #define EPOLL_CTL_MOD 3 | ||||
| #define EPOLL_CTL_DISABLE 4 | ||||
| 
 | ||||
| /*
 | ||||
|  * Request the handling of system wakeup events so as to prevent system suspends | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug | ||||
| TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll | ||||
| 
 | ||||
| all: | ||||
| 	for TARGET in $(TARGETS); do \
 | ||||
|  |  | |||
							
								
								
									
										11
									
								
								tools/testing/selftests/epoll/Makefile
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								tools/testing/selftests/epoll/Makefile
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,11 @@ | |||
# Makefile for epoll selftests

# None of these targets name a file that the rules create.
.PHONY: all run_tests clean

all: test_epoll

# Build with $(CC) rather than a hard-coded gcc so that the compiler selected
# by the user (or a cross toolchain) is honoured.
%: %.c
	$(CC) -pthread -g -o $@ $^

run_tests: all
	./test_epoll

clean:
	$(RM) test_epoll
							
								
								
									
										344
									
								
								tools/testing/selftests/epoll/test_epoll.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										344
									
								
								tools/testing/selftests/epoll/test_epoll.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,344 @@ | |||
| /*
 | ||||
|  *  tools/testing/selftests/epoll/test_epoll.c | ||||
|  * | ||||
|  *  Copyright 2012 Adobe Systems Incorporated | ||||
|  * | ||||
|  *  This program is free software; you can redistribute it and/or modify | ||||
|  *  it under the terms of the GNU General Public License as published by | ||||
|  *  the Free Software Foundation; either version 2 of the License, or | ||||
|  *  (at your option) any later version. | ||||
|  * | ||||
|  *  Paton J. Lewis <palewis@adobe.com> | ||||
|  * | ||||
|  */ | ||||
| 
 | ||||
| #include <errno.h> | ||||
| #include <fcntl.h> | ||||
| #include <pthread.h> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| #include <unistd.h> | ||||
| #include <sys/epoll.h> | ||||
| #include <sys/socket.h> | ||||
| 
 | ||||
/*
 * Per-item bookkeeping. A pointer to one of these structures is stored in the
 * epoll item's event structure, which lets us reach the private data again
 * after epoll_wait has returned the event:
 */
struct epoll_item_private {
	/* Position of this struct within the epoll_items array. */
	int index;
	int fd;
	uint32_t events;
	/* Guards the variables that follow. */
	pthread_mutex_t mutex;
	int stop;
	/* Stores any error encountered while handling the item. */
	int status;
	/*
	 * Number of times the associated event has been cancelled and
	 * deleted. It must be exactly one when the test program exits. A
	 * larger value means the test reproduced the real-world failure in
	 * which the item's private data would have been accessed after it
	 * had already been deleted.
	 */
	int deleted;
};
| 
 | ||||
| struct epoll_item_private *epoll_items; | ||||
| 
 | ||||
| /*
 | ||||
|  * Delete the specified item from the epoll set. In a real-world secneario this | ||||
|  * is where we would free the associated data structure, but in this testing | ||||
|  * environment we retain the structure so that we can test for double-deletion: | ||||
|  */ | ||||
| void delete_item(int index) | ||||
| { | ||||
| 	__sync_fetch_and_add(&epoll_items[index].deleted, 1); | ||||
| } | ||||
| 
 | ||||
/*
 * Per-thread argument block: each read thread receives a pointer to its own
 * read_thread_data structure:
 */
struct read_thread_data {
	int stop;
	/* Indicates any error encountered by the read thread. */
	int status;
	int epoll_set;
};
| 
 | ||||
| /*
 | ||||
|  * The function executed by the read threads: | ||||
|  */ | ||||
| void *read_thread_function(void *function_data) | ||||
| { | ||||
| 	struct read_thread_data *thread_data = | ||||
| 		(struct read_thread_data *)function_data; | ||||
| 	struct epoll_event event_data; | ||||
| 	struct epoll_item_private *item_data; | ||||
| 	char socket_data; | ||||
| 
 | ||||
| 	/* Handle events until we encounter an error or this thread's 'stop'
 | ||||
| 	   condition is set: */ | ||||
| 	while (1) { | ||||
| 		int result = epoll_wait(thread_data->epoll_set, | ||||
| 					&event_data, | ||||
| 					1,	/* Number of desired events */ | ||||
| 					1000);  /* Timeout in ms */ | ||||
| 		if (result < 0) { | ||||
| 			/* Breakpoints signal all threads. Ignore that while
 | ||||
| 			   debugging: */ | ||||
| 			if (errno == EINTR) | ||||
| 				continue; | ||||
| 			thread_data->status = errno; | ||||
| 			return 0; | ||||
| 		} else if (thread_data->stop) | ||||
| 			return 0; | ||||
| 		else if (result == 0)  /* Timeout */ | ||||
| 			continue; | ||||
| 
 | ||||
| 		/* We need the mutex here because checking for the stop
 | ||||
| 		   condition and re-enabling the epoll item need to be done | ||||
| 		   together as one atomic operation when EPOLL_CTL_DISABLE is | ||||
| 		   available: */ | ||||
| 		item_data = (struct epoll_item_private *)event_data.data.ptr; | ||||
| 		pthread_mutex_lock(&item_data->mutex); | ||||
| 
 | ||||
| 		/* Remove the item from the epoll set if we want to stop
 | ||||
| 		   handling that event: */ | ||||
| 		if (item_data->stop) | ||||
| 			delete_item(item_data->index); | ||||
| 		else { | ||||
| 			/* Clear the data that was written to the other end of
 | ||||
| 			   our non-blocking socket: */ | ||||
| 			do { | ||||
| 				if (read(item_data->fd, &socket_data, 1) < 1) { | ||||
| 					if ((errno == EAGAIN) || | ||||
| 					    (errno == EWOULDBLOCK)) | ||||
| 						break; | ||||
| 					else | ||||
| 						goto error_unlock; | ||||
| 				} | ||||
| 			} while (item_data->events & EPOLLET); | ||||
| 
 | ||||
| 			/* The item was one-shot, so re-enable it: */ | ||||
| 			event_data.events = item_data->events; | ||||
| 			if (epoll_ctl(thread_data->epoll_set, | ||||
| 						  EPOLL_CTL_MOD, | ||||
| 						  item_data->fd, | ||||
| 						  &event_data) < 0) | ||||
| 				goto error_unlock; | ||||
| 		} | ||||
| 
 | ||||
| 		pthread_mutex_unlock(&item_data->mutex); | ||||
| 	} | ||||
| 
 | ||||
| error_unlock: | ||||
| 	thread_data->status = item_data->status = errno; | ||||
| 	pthread_mutex_unlock(&item_data->mutex); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
/*
 * A pointer to a write_thread_data structure will be passed as the argument to
 * the write thread:
 */
struct write_thread_data {
	int stop;
	int status;  /* Indicates any error encountered by the write thread. */
	int n_fds;
	int *fds;
};

/*
 * The function executed by the write thread. It writes a single byte to each
 * socket in turn until the stop condition for this thread is set. If writing to
 * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
 * the moment and just move on to the next socket in the list. We don't care
 * about the order in which we deliver events to the epoll set. In fact we don't
 * care about the data we're writing to the pipes at all; we just want to
 * trigger epoll events.
 *
 * 'function_data' points to this thread's write_thread_data. The function
 * always returns NULL; any write failure other than EAGAIN/EWOULDBLOCK is
 * stored in the 'status' field and ends the thread.
 */
void *write_thread_function(void *function_data)
{
	const char data = 'X';
	int index;
	struct write_thread_data *thread_data = function_data;

	/*
	 * All state is reached through the local 'thread_data' pointer;
	 * 'write_thread_data' is only the struct tag, not an object.
	 */
	while (!thread_data->stop) {
		for (index = 0;
		     !thread_data->stop && (index < thread_data->n_fds);
		     ++index) {
			if ((write(thread_data->fds[index], &data, 1) < 1) &&
			    (errno != EAGAIN) &&
			    (errno != EWOULDBLOCK)) {
				thread_data->status = errno;
				return NULL;
			}
		}
	}

	return NULL;
}
| 
 | ||||
| /*
 | ||||
|  * Arguments are currently ignored: | ||||
|  */ | ||||
| int main(int argc, char **argv) | ||||
| { | ||||
| 	const int n_read_threads = 100; | ||||
| 	const int n_epoll_items = 500; | ||||
| 	int index; | ||||
| 	int epoll_set = epoll_create1(0); | ||||
| 	struct write_thread_data write_thread_data = { | ||||
| 		0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int)) | ||||
| 	}; | ||||
| 	struct read_thread_data *read_thread_data = | ||||
| 		malloc(n_read_threads * sizeof(struct read_thread_data)); | ||||
| 	pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t)); | ||||
| 	pthread_t write_thread; | ||||
| 
 | ||||
| 	printf("-----------------\n"); | ||||
| 	printf("Runing test_epoll\n"); | ||||
| 	printf("-----------------\n"); | ||||
| 
 | ||||
| 	epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private)); | ||||
| 
 | ||||
| 	if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 || | ||||
| 		read_thread_data == 0 || read_threads == 0) | ||||
| 		goto error; | ||||
| 
 | ||||
| 	if (sysconf(_SC_NPROCESSORS_ONLN) < 2) { | ||||
| 		printf("Error: please run this test on a multi-core system.\n"); | ||||
| 		goto error; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Create the socket pairs and epoll items: */ | ||||
| 	for (index = 0; index < n_epoll_items; ++index) { | ||||
| 		int socket_pair[2]; | ||||
| 		struct epoll_event event_data; | ||||
| 		if (socketpair(AF_UNIX, | ||||
| 			       SOCK_STREAM | SOCK_NONBLOCK, | ||||
| 			       0, | ||||
| 			       socket_pair) < 0) | ||||
| 			goto error; | ||||
| 		write_thread_data.fds[index] = socket_pair[0]; | ||||
| 		epoll_items[index].index = index; | ||||
| 		epoll_items[index].fd = socket_pair[1]; | ||||
| 		if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0) | ||||
| 			goto error; | ||||
| 		/* We always use EPOLLONESHOT because this test is currently
 | ||||
| 		   structured to demonstrate the need for EPOLL_CTL_DISABLE, | ||||
| 		   which only produces useful information in the EPOLLONESHOT | ||||
| 		   case (without EPOLLONESHOT, calling epoll_ctl with | ||||
| 		   EPOLL_CTL_DISABLE will never return EBUSY). If support for | ||||
| 		   testing events without EPOLLONESHOT is desired, it should | ||||
| 		   probably be implemented in a separate unit test. */ | ||||
| 		epoll_items[index].events = EPOLLIN | EPOLLONESHOT; | ||||
| 		if (index < n_epoll_items / 2) | ||||
| 			epoll_items[index].events |= EPOLLET; | ||||
| 		epoll_items[index].stop = 0; | ||||
| 		epoll_items[index].status = 0; | ||||
| 		epoll_items[index].deleted = 0; | ||||
| 		event_data.events = epoll_items[index].events; | ||||
| 		event_data.data.ptr = &epoll_items[index]; | ||||
| 		if (epoll_ctl(epoll_set, | ||||
| 			      EPOLL_CTL_ADD, | ||||
| 			      epoll_items[index].fd, | ||||
| 			      &event_data) < 0) | ||||
| 			goto error; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Create and start the read threads: */ | ||||
| 	for (index = 0; index < n_read_threads; ++index) { | ||||
| 		read_thread_data[index].stop = 0; | ||||
| 		read_thread_data[index].status = 0; | ||||
| 		read_thread_data[index].epoll_set = epoll_set; | ||||
| 		if (pthread_create(&read_threads[index], | ||||
| 				   NULL, | ||||
| 				   read_thread_function, | ||||
| 				   &read_thread_data[index]) != 0) | ||||
| 			goto error; | ||||
| 	} | ||||
| 
 | ||||
| 	if (pthread_create(&write_thread, | ||||
| 			   NULL, | ||||
| 			   write_thread_function, | ||||
| 			   &write_thread_data) != 0) | ||||
| 		goto error; | ||||
| 
 | ||||
| 	/* Cancel all event pollers: */ | ||||
| #ifdef EPOLL_CTL_DISABLE | ||||
| 	for (index = 0; index < n_epoll_items; ++index) { | ||||
| 		pthread_mutex_lock(&epoll_items[index].mutex); | ||||
| 		++epoll_items[index].stop; | ||||
| 		if (epoll_ctl(epoll_set, | ||||
| 			      EPOLL_CTL_DISABLE, | ||||
| 			      epoll_items[index].fd, | ||||
| 			      NULL) == 0) | ||||
| 			delete_item(index); | ||||
| 		else if (errno != EBUSY) { | ||||
| 			pthread_mutex_unlock(&epoll_items[index].mutex); | ||||
| 			goto error; | ||||
| 		} | ||||
| 		/* EBUSY means events were being handled; allow the other thread
 | ||||
| 		   to delete the item. */ | ||||
| 		pthread_mutex_unlock(&epoll_items[index].mutex); | ||||
| 	} | ||||
| #else | ||||
| 	for (index = 0; index < n_epoll_items; ++index) { | ||||
| 		pthread_mutex_lock(&epoll_items[index].mutex); | ||||
| 		++epoll_items[index].stop; | ||||
| 		pthread_mutex_unlock(&epoll_items[index].mutex); | ||||
| 		/* Wait in case a thread running read_thread_function is
 | ||||
| 		   currently executing code between epoll_wait and | ||||
| 		   pthread_mutex_lock with this item. Note that a longer delay | ||||
| 		   would make double-deletion less likely (at the expense of | ||||
| 		   performance), but there is no guarantee that any delay would | ||||
| 		   ever be sufficient. Note also that we delete all event | ||||
| 		   pollers at once for testing purposes, but in a real-world | ||||
| 		   environment we are likely to want to be able to cancel event | ||||
| 		   pollers at arbitrary times. Therefore we can't improve this | ||||
| 		   situation by just splitting this loop into two loops | ||||
| 		   (i.e. signal 'stop' for all items, sleep, and then delete all | ||||
| 		   items). We also can't fix the problem via EPOLL_CTL_DEL | ||||
| 		   because that command can't prevent the case where some other | ||||
| 		   thread is executing read_thread_function within the region | ||||
| 		   mentioned above: */ | ||||
| 		usleep(1); | ||||
| 		pthread_mutex_lock(&epoll_items[index].mutex); | ||||
| 		if (!epoll_items[index].deleted) | ||||
| 			delete_item(index); | ||||
| 		pthread_mutex_unlock(&epoll_items[index].mutex); | ||||
| 	} | ||||
| #endif | ||||
| 
 | ||||
| 	/* Shut down the read threads: */ | ||||
| 	for (index = 0; index < n_read_threads; ++index) | ||||
| 		__sync_fetch_and_add(&read_thread_data[index].stop, 1); | ||||
| 	for (index = 0; index < n_read_threads; ++index) { | ||||
| 		if (pthread_join(read_threads[index], NULL) != 0) | ||||
| 			goto error; | ||||
| 		if (read_thread_data[index].status) | ||||
| 			goto error; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Shut down the write thread: */ | ||||
| 	__sync_fetch_and_add(&write_thread_data.stop, 1); | ||||
| 	if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status) | ||||
| 		goto error; | ||||
| 
 | ||||
| 	/* Check for final error conditions: */ | ||||
| 	for (index = 0; index < n_epoll_items; ++index) { | ||||
| 		if (epoll_items[index].status != 0) | ||||
| 			goto error; | ||||
| 		if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0) | ||||
| 			goto error; | ||||
| 	} | ||||
| 	for (index = 0; index < n_epoll_items; ++index) | ||||
| 		if (epoll_items[index].deleted != 1) { | ||||
| 			printf("Error: item data deleted %1d times.\n", | ||||
| 				   epoll_items[index].deleted); | ||||
| 			goto error; | ||||
| 		} | ||||
| 
 | ||||
| 	printf("[PASS]\n"); | ||||
| 	return 0; | ||||
| 
 | ||||
|  error: | ||||
| 	printf("[FAIL]\n"); | ||||
| 	return errno; | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Paton J. Lewis
				Paton J. Lewis