 fcf634098c
			
		
	
	
	fcf634098c
	
	
	
		
			
			The basic idea behind cross memory attach is to allow MPI programs doing
intra-node communication to do a single copy of the message rather than a
double copy of the message via shared memory.
The following patch attempts to achieve this by allowing a destination
process, given an address and size from a source process, to copy memory
directly from the source process into its own address space via a system
call.  There is also a symmetrical ability to copy from the current
process's address space into a destination process's address space.
- Use of /proc/pid/mem has been considered, but there are issues with
  using it:
  - Does not allow for specifying iovecs for both src and dest; assuming
    preadv or pwritev were implemented, either the area read from or
    the area written to would need to be contiguous.
  - Currently mem_read allows only processes that are currently
  ptrace'ing the target and are still able to ptrace the target to read
  from the target. This check could possibly be moved to the open call,
  but it's not clear exactly what race this restriction is stopping
  (the reason appears to have been lost)
  - Having to send the fd of /proc/self/mem via SCM_RIGHTS over a unix
  domain socket is a bit ugly from a userspace point of view,
  especially when you may have hundreds if not (eventually) thousands
  of processes that all need to do this with each other
  - Doesn't allow for some uses of the interface that we would like to
  consider adding in the future (see below)
  - Interestingly reading from /proc/pid/mem currently actually
  involves two copies! (But this could be fixed pretty easily)
As mentioned previously, use of vmsplice was considered instead, but it
has problems.  Since the reader and writer need to work co-operatively,
you block if the pipe is not drained, which requires some wrapping to do
non-blocking on the send side or polling on the receive side.  In
all-to-all communication it requires ordering, otherwise you can
deadlock.  And in the example of many MPI tasks writing to one MPI task,
vmsplice serialises the copying.
There are some cases of MPI collectives where even a single copy interface
does not get us the performance gain we could achieve.  For example in an
MPI_Reduce rather than copy the data from the source we would like to
instead use it directly in a mathops (say the reduce is doing a sum) as
this would save us doing a copy.  We don't need to keep a copy of the data
from the source.  I haven't implemented this, but I think this interface
could in the future do all this through the use of the flags - eg could
specify the math operation and type and the kernel rather than just
copying the data would apply the specified operation between the source
and destination and store it in the destination.
Although we don't have a "second user" of the interface (though I've had
some nibbles from people who may be interested in using it for intra
process messaging which is not MPI), this interface is something which
hardware vendors are already doing for their custom drivers to implement
fast local communication.  And so in addition to this being useful for
OpenMPI it would mean the driver maintainers don't have to fix things up
when the mm changes.
There was some discussion about how much faster a true zero copy would
go. Here's a link back to the email with some testing I did on that:
http://marc.info/?l=linux-mm&m=130105930902915&w=2
There is a basic man page for the proposed interface here:
http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt
This has been implemented for x86 and powerpc; other architectures should
mainly (I think) just need to add syscall numbers for process_vm_readv
and process_vm_writev. There are 32 bit compatibility versions for
64-bit kernels.
For arch maintainers there are some simple tests to be able to quickly
verify that the syscalls are working correctly here:
http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz
Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: <linux-man@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			434 lines
		
	
	
	
		
			12 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			434 lines
		
	
	
	
		
			12 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _ASM_POWERPC_UNISTD_H_
 | |
| #define _ASM_POWERPC_UNISTD_H_
 | |
| 
 | |
| /*
 | |
|  * This file contains the system call numbers.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License
 | |
|  * as published by the Free Software Foundation; either version
 | |
|  * 2 of the License, or (at your option) any later version.
 | |
|  */
 | |
| 
 | |
| #define __NR_restart_syscall	  0
 | |
| #define __NR_exit		  1
 | |
| #define __NR_fork		  2
 | |
| #define __NR_read		  3
 | |
| #define __NR_write		  4
 | |
| #define __NR_open		  5
 | |
| #define __NR_close		  6
 | |
| #define __NR_waitpid		  7
 | |
| #define __NR_creat		  8
 | |
| #define __NR_link		  9
 | |
| #define __NR_unlink		 10
 | |
| #define __NR_execve		 11
 | |
| #define __NR_chdir		 12
 | |
| #define __NR_time		 13
 | |
| #define __NR_mknod		 14
 | |
| #define __NR_chmod		 15
 | |
| #define __NR_lchown		 16
 | |
| #define __NR_break		 17
 | |
| #define __NR_oldstat		 18
 | |
| #define __NR_lseek		 19
 | |
| #define __NR_getpid		 20
 | |
| #define __NR_mount		 21
 | |
| #define __NR_umount		 22
 | |
| #define __NR_setuid		 23
 | |
| #define __NR_getuid		 24
 | |
| #define __NR_stime		 25
 | |
| #define __NR_ptrace		 26
 | |
| #define __NR_alarm		 27
 | |
| #define __NR_oldfstat		 28
 | |
| #define __NR_pause		 29
 | |
| #define __NR_utime		 30
 | |
| #define __NR_stty		 31
 | |
| #define __NR_gtty		 32
 | |
| #define __NR_access		 33
 | |
| #define __NR_nice		 34
 | |
| #define __NR_ftime		 35
 | |
| #define __NR_sync		 36
 | |
| #define __NR_kill		 37
 | |
| #define __NR_rename		 38
 | |
| #define __NR_mkdir		 39
 | |
| #define __NR_rmdir		 40
 | |
| #define __NR_dup		 41
 | |
| #define __NR_pipe		 42
 | |
| #define __NR_times		 43
 | |
| #define __NR_prof		 44
 | |
| #define __NR_brk		 45
 | |
| #define __NR_setgid		 46
 | |
| #define __NR_getgid		 47
 | |
| #define __NR_signal		 48
 | |
| #define __NR_geteuid		 49
 | |
| #define __NR_getegid		 50
 | |
| #define __NR_acct		 51
 | |
| #define __NR_umount2		 52
 | |
| #define __NR_lock		 53
 | |
| #define __NR_ioctl		 54
 | |
| #define __NR_fcntl		 55
 | |
| #define __NR_mpx		 56
 | |
| #define __NR_setpgid		 57
 | |
| #define __NR_ulimit		 58
 | |
| #define __NR_oldolduname	 59
 | |
| #define __NR_umask		 60
 | |
| #define __NR_chroot		 61
 | |
| #define __NR_ustat		 62
 | |
| #define __NR_dup2		 63
 | |
| #define __NR_getppid		 64
 | |
| #define __NR_getpgrp		 65
 | |
| #define __NR_setsid		 66
 | |
| #define __NR_sigaction		 67
 | |
| #define __NR_sgetmask		 68
 | |
| #define __NR_ssetmask		 69
 | |
| #define __NR_setreuid		 70
 | |
| #define __NR_setregid		 71
 | |
| #define __NR_sigsuspend		 72
 | |
| #define __NR_sigpending		 73
 | |
| #define __NR_sethostname	 74
 | |
| #define __NR_setrlimit		 75
 | |
| #define __NR_getrlimit		 76
 | |
| #define __NR_getrusage		 77
 | |
| #define __NR_gettimeofday	 78
 | |
| #define __NR_settimeofday	 79
 | |
| #define __NR_getgroups		 80
 | |
| #define __NR_setgroups		 81
 | |
| #define __NR_select		 82
 | |
| #define __NR_symlink		 83
 | |
| #define __NR_oldlstat		 84
 | |
| #define __NR_readlink		 85
 | |
| #define __NR_uselib		 86
 | |
| #define __NR_swapon		 87
 | |
| #define __NR_reboot		 88
 | |
| #define __NR_readdir		 89
 | |
| #define __NR_mmap		 90
 | |
| #define __NR_munmap		 91
 | |
| #define __NR_truncate		 92
 | |
| #define __NR_ftruncate		 93
 | |
| #define __NR_fchmod		 94
 | |
| #define __NR_fchown		 95
 | |
| #define __NR_getpriority	 96
 | |
| #define __NR_setpriority	 97
 | |
| #define __NR_profil		 98
 | |
| #define __NR_statfs		 99
 | |
| #define __NR_fstatfs		100
 | |
| #define __NR_ioperm		101
 | |
| #define __NR_socketcall		102
 | |
| #define __NR_syslog		103
 | |
| #define __NR_setitimer		104
 | |
| #define __NR_getitimer		105
 | |
| #define __NR_stat		106
 | |
| #define __NR_lstat		107
 | |
| #define __NR_fstat		108
 | |
| #define __NR_olduname		109
 | |
| #define __NR_iopl		110
 | |
| #define __NR_vhangup		111
 | |
| #define __NR_idle		112
 | |
| #define __NR_vm86		113
 | |
| #define __NR_wait4		114
 | |
| #define __NR_swapoff		115
 | |
| #define __NR_sysinfo		116
 | |
| #define __NR_ipc		117
 | |
| #define __NR_fsync		118
 | |
| #define __NR_sigreturn		119
 | |
| #define __NR_clone		120
 | |
| #define __NR_setdomainname	121
 | |
| #define __NR_uname		122
 | |
| #define __NR_modify_ldt		123
 | |
| #define __NR_adjtimex		124
 | |
| #define __NR_mprotect		125
 | |
| #define __NR_sigprocmask	126
 | |
| #define __NR_create_module	127
 | |
| #define __NR_init_module	128
 | |
| #define __NR_delete_module	129
 | |
| #define __NR_get_kernel_syms	130
 | |
| #define __NR_quotactl		131
 | |
| #define __NR_getpgid		132
 | |
| #define __NR_fchdir		133
 | |
| #define __NR_bdflush		134
 | |
| #define __NR_sysfs		135
 | |
| #define __NR_personality	136
 | |
| #define __NR_afs_syscall	137 /* Syscall for Andrew File System */
 | |
| #define __NR_setfsuid		138
 | |
| #define __NR_setfsgid		139
 | |
| #define __NR__llseek		140
 | |
| #define __NR_getdents		141
 | |
| #define __NR__newselect		142
 | |
| #define __NR_flock		143
 | |
| #define __NR_msync		144
 | |
| #define __NR_readv		145
 | |
| #define __NR_writev		146
 | |
| #define __NR_getsid		147
 | |
| #define __NR_fdatasync		148
 | |
| #define __NR__sysctl		149
 | |
| #define __NR_mlock		150
 | |
| #define __NR_munlock		151
 | |
| #define __NR_mlockall		152
 | |
| #define __NR_munlockall		153
 | |
| #define __NR_sched_setparam		154
 | |
| #define __NR_sched_getparam		155
 | |
| #define __NR_sched_setscheduler		156
 | |
| #define __NR_sched_getscheduler		157
 | |
| #define __NR_sched_yield		158
 | |
| #define __NR_sched_get_priority_max	159
 | |
| #define __NR_sched_get_priority_min	160
 | |
| #define __NR_sched_rr_get_interval	161
 | |
| #define __NR_nanosleep		162
 | |
| #define __NR_mremap		163
 | |
| #define __NR_setresuid		164
 | |
| #define __NR_getresuid		165
 | |
| #define __NR_query_module	166
 | |
| #define __NR_poll		167
 | |
| #define __NR_nfsservctl		168
 | |
| #define __NR_setresgid		169
 | |
| #define __NR_getresgid		170
 | |
| #define __NR_prctl		171
 | |
| #define __NR_rt_sigreturn	172
 | |
| #define __NR_rt_sigaction	173
 | |
| #define __NR_rt_sigprocmask	174
 | |
| #define __NR_rt_sigpending	175
 | |
| #define __NR_rt_sigtimedwait	176
 | |
| #define __NR_rt_sigqueueinfo	177
 | |
| #define __NR_rt_sigsuspend	178
 | |
| #define __NR_pread64		179
 | |
| #define __NR_pwrite64		180
 | |
| #define __NR_chown		181
 | |
| #define __NR_getcwd		182
 | |
| #define __NR_capget		183
 | |
| #define __NR_capset		184
 | |
| #define __NR_sigaltstack	185
 | |
| #define __NR_sendfile		186
 | |
| #define __NR_getpmsg		187	/* some people actually want streams */
 | |
| #define __NR_putpmsg		188	/* some people actually want streams */
 | |
| #define __NR_vfork		189
 | |
| #define __NR_ugetrlimit		190	/* SuS compliant getrlimit */
 | |
| #define __NR_readahead		191
 | |
| #ifndef __powerpc64__			/* these are 32-bit only */
 | |
| #define __NR_mmap2		192
 | |
| #define __NR_truncate64		193
 | |
| #define __NR_ftruncate64	194
 | |
| #define __NR_stat64		195
 | |
| #define __NR_lstat64		196
 | |
| #define __NR_fstat64		197
 | |
| #endif
 | |
| #define __NR_pciconfig_read	198
 | |
| #define __NR_pciconfig_write	199
 | |
| #define __NR_pciconfig_iobase	200
 | |
| #define __NR_multiplexer	201
 | |
| #define __NR_getdents64		202
 | |
| #define __NR_pivot_root		203
 | |
| #ifndef __powerpc64__
 | |
| #define __NR_fcntl64		204
 | |
| #endif
 | |
| #define __NR_madvise		205
 | |
| #define __NR_mincore		206
 | |
| #define __NR_gettid		207
 | |
| #define __NR_tkill		208
 | |
| #define __NR_setxattr		209
 | |
| #define __NR_lsetxattr		210
 | |
| #define __NR_fsetxattr		211
 | |
| #define __NR_getxattr		212
 | |
| #define __NR_lgetxattr		213
 | |
| #define __NR_fgetxattr		214
 | |
| #define __NR_listxattr		215
 | |
| #define __NR_llistxattr		216
 | |
| #define __NR_flistxattr		217
 | |
| #define __NR_removexattr	218
 | |
| #define __NR_lremovexattr	219
 | |
| #define __NR_fremovexattr	220
 | |
| #define __NR_futex		221
 | |
| #define __NR_sched_setaffinity	222
 | |
| #define __NR_sched_getaffinity	223
 | |
| /* 224 currently unused */
 | |
| #define __NR_tuxcall		225
 | |
| #ifndef __powerpc64__
 | |
| #define __NR_sendfile64		226
 | |
| #endif
 | |
| #define __NR_io_setup		227
 | |
| #define __NR_io_destroy		228
 | |
| #define __NR_io_getevents	229
 | |
| #define __NR_io_submit		230
 | |
| #define __NR_io_cancel		231
 | |
| #define __NR_set_tid_address	232
 | |
| #define __NR_fadvise64		233
 | |
| #define __NR_exit_group		234
 | |
| #define __NR_lookup_dcookie	235
 | |
| #define __NR_epoll_create	236
 | |
| #define __NR_epoll_ctl		237
 | |
| #define __NR_epoll_wait		238
 | |
| #define __NR_remap_file_pages	239
 | |
| #define __NR_timer_create	240
 | |
| #define __NR_timer_settime	241
 | |
| #define __NR_timer_gettime	242
 | |
| #define __NR_timer_getoverrun	243
 | |
| #define __NR_timer_delete	244
 | |
| #define __NR_clock_settime	245
 | |
| #define __NR_clock_gettime	246
 | |
| #define __NR_clock_getres	247
 | |
| #define __NR_clock_nanosleep	248
 | |
| #define __NR_swapcontext	249
 | |
| #define __NR_tgkill		250
 | |
| #define __NR_utimes		251
 | |
| #define __NR_statfs64		252
 | |
| #define __NR_fstatfs64		253
 | |
| #ifndef __powerpc64__
 | |
| #define __NR_fadvise64_64	254
 | |
| #endif
 | |
| #define __NR_rtas		255
 | |
| #define __NR_sys_debug_setcontext 256
 | |
| /* Number 257 is reserved for vserver */
 | |
| #define __NR_migrate_pages	258
 | |
| #define __NR_mbind		259
 | |
| #define __NR_get_mempolicy	260
 | |
| #define __NR_set_mempolicy	261
 | |
| #define __NR_mq_open		262
 | |
| #define __NR_mq_unlink		263
 | |
| #define __NR_mq_timedsend	264
 | |
| #define __NR_mq_timedreceive	265
 | |
| #define __NR_mq_notify		266
 | |
| #define __NR_mq_getsetattr	267
 | |
| #define __NR_kexec_load		268
 | |
| #define __NR_add_key		269
 | |
| #define __NR_request_key	270
 | |
| #define __NR_keyctl		271
 | |
| #define __NR_waitid		272
 | |
| #define __NR_ioprio_set		273
 | |
| #define __NR_ioprio_get		274
 | |
| #define __NR_inotify_init	275
 | |
| #define __NR_inotify_add_watch	276
 | |
| #define __NR_inotify_rm_watch	277
 | |
| #define __NR_spu_run		278
 | |
| #define __NR_spu_create		279
 | |
| #define __NR_pselect6		280
 | |
| #define __NR_ppoll		281
 | |
| #define __NR_unshare		282
 | |
| #define __NR_splice		283
 | |
| #define __NR_tee		284
 | |
| #define __NR_vmsplice		285
 | |
| #define __NR_openat		286
 | |
| #define __NR_mkdirat		287
 | |
| #define __NR_mknodat		288
 | |
| #define __NR_fchownat		289
 | |
| #define __NR_futimesat		290
 | |
| #ifdef __powerpc64__			/* syscall 291 has a different name per word size */
 | |
| #define __NR_newfstatat		291	/* name used when __powerpc64__ is defined */
 | |
| #else					/* !__powerpc64__ */
 | |
| #define __NR_fstatat64		291	/* name used otherwise; same number */
 | |
| #endif					/* __powerpc64__ */
 | |
| #define __NR_unlinkat		292
 | |
| #define __NR_renameat		293
 | |
| #define __NR_linkat		294
 | |
| #define __NR_symlinkat		295
 | |
| #define __NR_readlinkat		296
 | |
| #define __NR_fchmodat		297
 | |
| #define __NR_faccessat		298
 | |
| #define __NR_get_robust_list	299
 | |
| #define __NR_set_robust_list	300
 | |
| #define __NR_move_pages		301
 | |
| #define __NR_getcpu		302
 | |
| #define __NR_epoll_pwait	303
 | |
| #define __NR_utimensat		304
 | |
| #define __NR_signalfd		305
 | |
| #define __NR_timerfd_create	306
 | |
| #define __NR_eventfd		307
 | |
| #define __NR_sync_file_range2	308
 | |
| #define __NR_fallocate		309
 | |
| #define __NR_subpage_prot	310
 | |
| #define __NR_timerfd_settime	311
 | |
| #define __NR_timerfd_gettime	312
 | |
| #define __NR_signalfd4		313
 | |
| #define __NR_eventfd2		314
 | |
| #define __NR_epoll_create1	315
 | |
| #define __NR_dup3		316
 | |
| #define __NR_pipe2		317
 | |
| #define __NR_inotify_init1	318
 | |
| #define __NR_perf_event_open	319
 | |
| #define __NR_preadv		320
 | |
| #define __NR_pwritev		321
 | |
| #define __NR_rt_tgsigqueueinfo	322
 | |
| #define __NR_fanotify_init	323
 | |
| #define __NR_fanotify_mark	324
 | |
| #define __NR_prlimit64		325
 | |
| #define __NR_socket		326
 | |
| #define __NR_bind		327
 | |
| #define __NR_connect		328
 | |
| #define __NR_listen		329
 | |
| #define __NR_accept		330
 | |
| #define __NR_getsockname	331
 | |
| #define __NR_getpeername	332
 | |
| #define __NR_socketpair		333
 | |
| #define __NR_send		334
 | |
| #define __NR_sendto		335
 | |
| #define __NR_recv		336
 | |
| #define __NR_recvfrom		337
 | |
| #define __NR_shutdown		338
 | |
| #define __NR_setsockopt		339
 | |
| #define __NR_getsockopt		340
 | |
| #define __NR_sendmsg		341
 | |
| #define __NR_recvmsg		342
 | |
| #define __NR_recvmmsg		343
 | |
| #define __NR_accept4		344
 | |
| #define __NR_name_to_handle_at	345
 | |
| #define __NR_open_by_handle_at	346
 | |
| #define __NR_clock_adjtime	347
 | |
| #define __NR_syncfs		348
 | |
| #define __NR_sendmmsg		349
 | |
| #define __NR_setns		350
 | |
| #define __NR_process_vm_readv	351
 | |
| #define __NR_process_vm_writev	352
 | |
| 
 | |
| #ifdef __KERNEL__
 | |
| 
 | |
| #define __NR_syscalls		353	/* one more than the highest syscall number defined above (352) */
 | |
| 
 | |
| #define __NR__exit __NR_exit	/* alias: __NR__exit expands to __NR_exit */
 | |
| #define NR_syscalls	__NR_syscalls	/* unprefixed alias for __NR_syscalls */
 | |
| 
 | |
| #ifndef __ASSEMBLY__
 | |
| 
 | |
| #include <linux/types.h>
 | |
| #include <linux/compiler.h>
 | |
| #include <linux/linkage.h>
 | |
| 
 | |
| #define __ARCH_WANT_IPC_PARSE_VERSION
 | |
| #define __ARCH_WANT_OLD_READDIR
 | |
| #define __ARCH_WANT_STAT64
 | |
| #define __ARCH_WANT_SYS_ALARM
 | |
| #define __ARCH_WANT_SYS_GETHOSTNAME
 | |
| #define __ARCH_WANT_SYS_IPC
 | |
| #define __ARCH_WANT_SYS_PAUSE
 | |
| #define __ARCH_WANT_SYS_SGETMASK
 | |
| #define __ARCH_WANT_SYS_SIGNAL
 | |
| #define __ARCH_WANT_SYS_TIME
 | |
| #define __ARCH_WANT_SYS_UTIME
 | |
| #define __ARCH_WANT_SYS_WAITPID
 | |
| #define __ARCH_WANT_SYS_SOCKETCALL
 | |
| #define __ARCH_WANT_SYS_FADVISE64
 | |
| #define __ARCH_WANT_SYS_GETPGRP
 | |
| #define __ARCH_WANT_SYS_LLSEEK
 | |
| #define __ARCH_WANT_SYS_NICE
 | |
| #define __ARCH_WANT_SYS_OLD_GETRLIMIT
 | |
| #define __ARCH_WANT_SYS_OLD_UNAME
 | |
| #define __ARCH_WANT_SYS_OLDUMOUNT
 | |
| #define __ARCH_WANT_SYS_SIGPENDING
 | |
| #define __ARCH_WANT_SYS_SIGPROCMASK
 | |
| #define __ARCH_WANT_SYS_RT_SIGACTION
 | |
| #define __ARCH_WANT_SYS_RT_SIGSUSPEND
 | |
| #ifdef CONFIG_PPC32
 | |
| #define __ARCH_WANT_OLD_STAT
 | |
| #endif
 | |
| #ifdef CONFIG_PPC64
 | |
| #define __ARCH_WANT_COMPAT_SYS_TIME
 | |
| #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 | |
| #define __ARCH_WANT_SYS_NEWFSTATAT
 | |
| #endif
 | |
| 
 | |
| /*
 | |
|  * "Conditional" syscalls
 | |
|  *
 | |
|  * cond_syscall(x) declares x as a function taking no arguments and
 | |
|  * returning long, marked with the GCC attributes weak and
 | |
|  * alias("sys_ni_syscall").  The symbol x therefore resolves to
 | |
|  * sys_ni_syscall unless a non-weak definition of x is linked in
 | |
|  * (per the GCC weak/alias attribute semantics).
 | |
|  */
 | |
| #define cond_syscall(x) \
 | |
| 	asmlinkage long x (void) __attribute__((weak,alias("sys_ni_syscall")))
 | |
| 
 | |
| #endif		/* __ASSEMBLY__ */
 | |
| #endif		/* __KERNEL__ */
 | |
| 
 | |
| #endif /* _ASM_POWERPC_UNISTD_H_ */
 |