 57dda6ef5b
			
		
	
	
	57dda6ef5b
	
	
	
		
			
			This new copy_4K_page() function was originally tuned for the best performance on the Cell processor, but after testing on more 64bit powerpc chips it was found that with a small modification it either matched the performance offered by the current mainline version or bettered it by a small amount. It was found that on a Cell-based QS22 blade the amount of system time measured when compiling a 2.6.26 pseries_defconfig decreased by 4%. Using the same test, a 4-way 970MP machine saw a decrease of 2% in system time. No noticeable change was seen on Power4, Power5 or Power6. The 4096 byte page is copied in thirty-two 128 byte strides. An initial setup loop executes dcbt instructions for the whole source page and dcbz instructions for the whole destination page. To do this, the cache line size is retrieved from ppc64_caches. A new CPU feature bit, CPU_FTR_CP_USE_DCBTZ, (introduced in the previous patch) is used to make the modification to this new copy routine - on Power4, 970 and Cell the feature bit is set so the setup loop is executed, but on all other 64bit chips the setup loop is nop'ed out. Signed-off-by: Mark Nelson <markn@au1.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
		
			
				
	
	
		
			107 lines
		
	
	
	
		
			2 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			107 lines
		
	
	
	
		
			2 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2008 Mark Nelson, IBM Corp.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License
 | |
|  * as published by the Free Software Foundation; either version
 | |
|  * 2 of the License, or (at your option) any later version.
 | |
|  */
 | |
| #include <asm/processor.h>
 | |
| #include <asm/ppc_asm.h>
 | |
| #include <asm/asm-offsets.h>
 | |
| 
 | |
|         .section        ".toc","aw"	/* the entry below is placed in the .toc section */
 | |
| PPC64_CACHES:	/* label used as PPC64_CACHES@toc(r2) by copy_4K_page */
 | |
|         .tc             ppc64_caches[TC],ppc64_caches	/* TOC entry for the ppc64_caches symbol */
 | |
|         .section        ".text"	/* switch back to the code section */
 | |
| 
 | |
| 
 | |
/*
 * copy_4K_page - copy one 4096 byte page.
 *
 * In:       r3 = destination page (written with dcbz/std)
 *           r4 = source page (read with dcbt/ld)
 *           r2 = TOC pointer (used to reach the PPC64_CACHES entry)
 * Clobbers: r3-r12 and CTR; r3 and r4 are advanced while copying.
 *
 * NOTE(review): both pages are presumably page aligned - dcbz zeroes a
 * whole cache block and the copy uses doubleword accesses; confirm
 * against the callers.
 *
 * The page is copied in thirty-two 128 byte strides.  The copy is
 * software pipelined: while the sixteen doublewords of one stride are
 * being stored, the sixteen doublewords that follow them are loaded,
 * rotating through r5-r12.  The first four doublewords are loaded
 * before the loop and the last stride is stored after it, so nothing
 * is read beyond the end of the source page.
 *
 * On CPUs with CPU_FTR_CP_USE_DCBTZ set, a setup loop first issues
 * dcbt for every cache line of the source page and dcbz for every
 * cache line of the destination page; on other CPUs the feature
 * section is patched out.
 */
_GLOBAL(copy_4K_page)
	li	r5,4096		/* 4K page size */
BEGIN_FTR_SECTION
	ld	r10,PPC64_CACHES@toc(r2)	/* r10 = base for the cache geometry fields */
	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
	lwz	r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
	li	r9,0				/* r9 = byte offset of current line */
	srd	r8,r5,r11			/* r8 = number of cache lines in the page */

	mtctr	r8
	/* .L prefix keeps this label out of the object's symbol table */
.Lsetup:
	dcbt	r9,r4		/* touch the source line */
	dcbz	r9,r3		/* zero the destination line */
	add	r9,r9,r12	/* step to the next cache line */
	bdnz	.Lsetup
END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
	addi	r3,r3,-8	/* bias dest so the first store is 8(r3) */
	srdi	r8,r5,7		/* page is copied in 128 byte strides */
	addi	r8,r8,-1	/* one stride copied outside loop */

	mtctr	r8

	/* prime the pipeline with the first four doublewords */
	ld	r5,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ldu	r8,24(r4)	/* also leaves r4 = source + 24 */

	/*
	 * Each pass stores 128 bytes and loads the next 128 bytes.
	 * The closing stdu/ldu pair advances both pointers by 128.
	 */
1:	std	r5,8(r3)
	ld	r9,8(r4)
	std	r6,16(r3)
	ld	r10,16(r4)
	std	r7,24(r3)
	ld	r11,24(r4)
	std	r8,32(r3)
	ld	r12,32(r4)
	std	r9,40(r3)
	ld	r5,40(r4)
	std	r10,48(r3)
	ld	r6,48(r4)
	std	r11,56(r3)
	ld	r7,56(r4)
	std	r12,64(r3)
	ld	r8,64(r4)
	std	r5,72(r3)
	ld	r9,72(r4)
	std	r6,80(r3)
	ld	r10,80(r4)
	std	r7,88(r3)
	ld	r11,88(r4)
	std	r8,96(r3)
	ld	r12,96(r4)
	std	r9,104(r3)
	ld	r5,104(r4)
	std	r10,112(r3)
	ld	r6,112(r4)
	std	r11,120(r3)
	ld	r7,120(r4)
	stdu	r12,128(r3)
	ldu	r8,128(r4)
	bdnz	1b

	/*
	 * Final stride: r5-r8 already hold its first four doublewords,
	 * so only twelve more loads are needed; the last one, 96(r4),
	 * reads the final doubleword of the source page.
	 */
	std	r5,8(r3)
	ld	r9,8(r4)
	std	r6,16(r3)
	ld	r10,16(r4)
	std	r7,24(r3)
	ld	r11,24(r4)
	std	r8,32(r3)
	ld	r12,32(r4)
	std	r9,40(r3)
	ld	r5,40(r4)
	std	r10,48(r3)
	ld	r6,48(r4)
	std	r11,56(r3)
	ld	r7,56(r4)
	std	r12,64(r3)
	ld	r8,64(r4)
	std	r5,72(r3)
	ld	r9,72(r4)
	std	r6,80(r3)
	ld	r10,80(r4)
	std	r7,88(r3)
	ld	r11,88(r4)
	std	r8,96(r3)
	ld	r12,96(r4)
	/* drain: nothing left to load, store the last four doublewords */
	std	r9,104(r3)
	std	r10,112(r3)
	std	r11,120(r3)
	std	r12,128(r3)
	blr
 |