 15c2d45d17
			
		
	
	
	15c2d45d17
	
	
	
		
			
			I noticed ksm spending quite a lot of time in memcmp on a large KVM box. The current memcmp loop is very unoptimised - byte at a time compares with no loop unrolling. We can do much much better. Optimise the loop in a few ways: - Unroll the byte at a time loop - For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp. A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance: baseline: 29.93 s modified: 1.70 s Just over 17x faster. v2: Incorporated some suggestions from Segher: - Use andi. instead of rldicl. - Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version. - Don't use cr5, we have plans to use that CR field for fast local atomics. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
		
			
				
	
	
		
			166 lines
		
	
	
	
		
			2.5 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			166 lines
		
	
	
	
		
			2.5 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * String handling functions for PowerPC.
 | |
|  *
 | |
|  * Copyright (C) 1996 Paul Mackerras.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License
 | |
|  * as published by the Free Software Foundation; either version
 | |
|  * 2 of the License, or (at your option) any later version.
 | |
|  */
 | |
| #include <asm/processor.h>
 | |
| #include <asm/errno.h>
 | |
| #include <asm/ppc_asm.h>
 | |
| 
 | |
| 	.section __ex_table,"a"	/* switch to the allocatable ("a") __ex_table section; its entries are emitted further down, after __clear_user */
 | |
| 	PPC_LONG_ALIGN	/* macro from an included header; presumably aligns the section to the native long size - confirm in asm/ppc_asm.h */
 | |
| 	.text	/* back to the code section for the routines that follow */
 | |
| 	
 | |
| _GLOBAL(strcpy)	/* copy the zero-terminated byte string at r4 to r3, terminator included; r3 is never written, so it is returned as passed in */
 | |
| 	addi	r5,r3,-1	/* r5 = destination - 1: write cursor, pre-biased because stbu adds 1 before storing */
 | |
| 	addi	r4,r4,-1	/* r4 = source - 1: read cursor, pre-biased because lbzu adds 1 before loading */
 | |
| 1:	lbzu	r0,1(r4)	/* r0 = *++r4, zero-extended */
 | |
| 	cmpwi	0,r0,0	/* cr0.eq set when the byte just read is the terminator */
 | |
| 	stbu	r0,1(r5)	/* *++r5 = r0; done after the compare so the terminator is copied too (the store does not touch cr0) */
 | |
| 	bne	1b	/* loop until the zero byte has been copied */
 | |
| 	blr
 | |
| 
 | |
| /* This clears out any unused part of the destination buffer,
 | |
|    just as the libc version does.  -- paulus */
 | |
| _GLOBAL(strncpy)	/* copy at most r5 bytes from r4 to r3; if a zero byte is copied, the rest of the r5 bytes are zero-filled; if none is found within r5 bytes the destination is left unterminated; r3 is returned as passed in */
 | |
| 	PPC_LCMPI 0,r5,0	/* cr0 = count vs 0 (PPC_LCMPI is a macro defined outside this file, presumably a long-sized compare-immediate) */
 | |
| 	beqlr	/* count == 0: nothing to copy */
 | |
| 	mtctr	r5	/* ctr = number of bytes that may still be written */
 | |
| 	addi	r6,r3,-1	/* r6 = destination - 1: write cursor, pre-biased for stbu */
 | |
| 	addi	r4,r4,-1	/* r4 = source - 1: read cursor, pre-biased for lbzu */
 | |
| 1:	lbzu	r0,1(r4)	/* r0 = *++r4, zero-extended */
 | |
| 	cmpwi	0,r0,0	/* cr0.eq set when the byte just read is zero */
 | |
| 	stbu	r0,1(r6)	/* *++r6 = r0 (cr0 is left intact for the branch below) */
 | |
| 	bdnzf	2,1b		/* dec ctr, branch if ctr != 0 && !cr0.eq */
 | |
| 	bnelr			/* if we didn't hit a null char, we're done */
 | |
| 	mfctr	r5	/* r5 = bytes of the budget not yet written */
 | |
| 	PPC_LCMPI 0,r5,0	/* any space left in destination buffer? */
 | |
| 	beqlr			/* we know r0 == 0 here */
 | |
| 2:	stbu	r0,1(r6)	/* clear it out if so */
 | |
| 	bdnz	2b	/* ctr still holds the remaining count, so exactly that many zero bytes are stored */
 | |
| 	blr
 | |
| 
 | |
| _GLOBAL(strcat)	/* append the zero-terminated string at r4 to the zero-terminated string at r3; r3 is returned as passed in */
 | |
| 	addi	r5,r3,-1	/* r5 = destination - 1: cursor, pre-biased for lbzu/stbu */
 | |
| 	addi	r4,r4,-1	/* r4 = source - 1: read cursor, pre-biased for lbzu */
 | |
| 1:	lbzu	r0,1(r5)	/* first loop: walk the destination looking for its terminator */
 | |
| 	cmpwi	0,r0,0
 | |
| 	bne	1b	/* on exit r5 points at the destination's zero byte */
 | |
| 	addi	r5,r5,-1	/* step back one so the first stbu below overwrites that zero byte */
 | |
| 1:	lbzu	r0,1(r4)	/* second loop: r0 = *++r4 */
 | |
| 	cmpwi	0,r0,0	/* cr0.eq set when the source terminator was read */
 | |
| 	stbu	r0,1(r5)	/* *++r5 = r0; the terminator is stored as well */
 | |
| 	bne	1b	/* loop until the source's zero byte has been copied */
 | |
| 	blr
 | |
| 
 | |
| _GLOBAL(strcmp)	/* compare the zero-terminated strings at r3 and r4; r3 = (byte of first string) - (byte of second string) at the first position where they differ or the first string ends, so 0 means equal */
 | |
| 	addi	r5,r3,-1	/* r5 = first string - 1 (r3 is reused for the result) */
 | |
| 	addi	r4,r4,-1	/* r4 = second string - 1 */
 | |
| 1:	lbzu	r3,1(r5)	/* r3 = next byte of the first string, zero-extended */
 | |
| 	cmpwi	1,r3,0	/* cr1.eq set when the first string has ended; kept in cr1 so subf. can use cr0 */
 | |
| 	lbzu	r0,1(r4)	/* r0 = next byte of the second string, zero-extended */
 | |
| 	subf.	r3,r0,r3	/* r3 = r3 - r0; cr0 reflects the difference */
 | |
| 	beqlr	1	/* first string ended: return the difference (0 if the second ended here too, otherwise negative) */
 | |
| 	beq	1b	/* bytes equal and non-zero: compare the next pair */
 | |
| 	blr	/* bytes differ: r3 holds the non-zero difference */
 | |
| 
 | |
| _GLOBAL(strncmp)	/* compare at most r5 bytes of the zero-terminated strings at r3 and r4; r3 = byte difference (first minus second) where the comparison stops, 0 if no difference was found or r5 == 0 */
 | |
| 	PPC_LCMPI 0,r5,0	/* cr0 = count vs 0 (PPC_LCMPI is a macro defined outside this file, presumably a long-sized compare-immediate) */
 | |
| 	beq-	2f	/* count == 0: return 0 (the '-' hints the branch as unlikely) */
 | |
| 	mtctr	r5	/* ctr = number of byte pairs still allowed */
 | |
| 	addi	r5,r3,-1	/* r5 is free now that the count is in ctr: r5 = first string - 1 */
 | |
| 	addi	r4,r4,-1	/* r4 = second string - 1 */
 | |
| 1:	lbzu	r3,1(r5)	/* r3 = next byte of the first string, zero-extended */
 | |
| 	cmpwi	1,r3,0	/* cr1.eq set when the first string has ended */
 | |
| 	lbzu	r0,1(r4)	/* r0 = next byte of the second string, zero-extended */
 | |
| 	subf.	r3,r0,r3	/* r3 = r3 - r0; cr0 reflects the difference */
 | |
| 	beqlr	1	/* first string ended: return the difference (0 or negative) */
 | |
| 	bdnzt	eq,1b	/* dec ctr, branch if ctr != 0 && cr0.eq (bytes were equal) */
 | |
| 	blr	/* r3 = last difference: non-zero on a mismatch, 0 if the count ran out on equal bytes */
 | |
| 2:	li	r3,0	/* zero-length comparison compares equal */
 | |
| 	blr
 | |
| 
 | |
| _GLOBAL(strlen)	/* r3 = number of bytes that precede the first zero byte of the string at r3 */
 | |
| 	addi	r4,r3,-1	/* r4 = string - 1: scan cursor, pre-biased for lbzu; r3 keeps the start address */
 | |
| 1:	lbzu	r0,1(r4)	/* r0 = *++r4, zero-extended */
 | |
| 	cmpwi	0,r0,0
 | |
| 	bne	1b	/* on exit r4 points at the terminator */
 | |
| 	subf	r3,r3,r4	/* r3 = r4 - r3: terminator address minus start address */
 | |
| 	blr
 | |
| 
 | |
| #ifdef CONFIG_PPC32	/* this byte-at-a-time memcmp is only assembled when CONFIG_PPC32 is defined */
 | |
| _GLOBAL(memcmp)	/* compare r5 bytes at r3 and r4; r3 = (byte at r3 side) - (byte at r4 side) for the first mismatching pair, 0 if all r5 pairs match or r5 == 0 */
 | |
| 	PPC_LCMPI 0,r5,0	/* cr0 = count vs 0 (PPC_LCMPI is a macro defined outside this file, presumably a long-sized compare-immediate) */
 | |
| 	beq-	2f	/* count == 0: return 0 */
 | |
| 	mtctr	r5	/* ctr = number of byte pairs left to compare */
 | |
| 	addi	r6,r3,-1	/* r6 = first buffer - 1 (r3 is reused for the result) */
 | |
| 	addi	r4,r4,-1	/* r4 = second buffer - 1 */
 | |
| 1:	lbzu	r3,1(r6)	/* r3 = next byte of the first buffer, zero-extended */
 | |
| 	lbzu	r0,1(r4)	/* r0 = next byte of the second buffer, zero-extended */
 | |
| 	subf.	r3,r0,r3	/* r3 = r3 - r0; cr0 reflects the difference */
 | |
| 	bdnzt	2,1b	/* CR bit 2 is cr0.eq: dec ctr, branch if ctr != 0 && bytes were equal */
 | |
| 	blr	/* r3 = last difference (0 if every pair matched) */
 | |
| 2:	li	r3,0	/* zero-length comparison compares equal */
 | |
| 	blr
 | |
| #endif	/* CONFIG_PPC32 */
 | |
| 
 | |
| _GLOBAL(memchr)	/* search the r5 bytes at r3 for the first byte equal to (unsigned char)r4; r3 = address of that byte, or 0 if it is absent or r5 == 0 */
 | |
| 	PPC_LCMPI 0,r5,0	/* cr0 = count vs 0 (PPC_LCMPI is a macro defined outside this file, presumably a long-sized compare-immediate) */
 | |
| 	beq-	2f	/* count == 0: not found */
 | |
| 	andi.	r4,r4,0xff	/* the search value arrives as an int: reduce it to unsigned char so values such as -1 can match the zero-extended byte from lbzu; cr0 is free to clobber here, its last reader was the beq- above */
 | |
| 	mtctr	r5	/* ctr = number of bytes left to inspect */
 | |
| 	addi	r3,r3,-1	/* r3 = buffer - 1: scan cursor, pre-biased for lbzu */
 | |
| 1:	lbzu	r0,1(r3)	/* r0 = *++r3, zero-extended */
 | |
| 	cmpw	0,r0,r4	/* cr0.eq set when this byte is the one being searched for */
 | |
| 	bdnzf	2,1b	/* CR bit 2 is cr0.eq: dec ctr, branch if ctr != 0 && no match yet */
 | |
| 	beqlr	/* match: r3 already points at the matching byte */
 | |
| 2:	li	r3,0	/* not found (or zero length): return a null pointer */
 | |
| 	blr
 | |
| 
 | |
| #ifdef CONFIG_PPC32	/* this __clear_user is only assembled when CONFIG_PPC32 is defined */
 | |
| _GLOBAL(__clear_user)	/* store zeros to the r4 bytes starting at r3; r3 = 0 if every store completes; the fixup code at 90/91/92 instead returns a count of bytes not cleared */
 | |
| 	addi	r6,r3,-4	/* r6 = address - 4: word cursor, pre-biased for stwu */
 | |
| 	li	r3,0	/* return value for the path on which no fixup runs */
 | |
| 	li	r5,0	/* r5 = the zero that is stored everywhere below */
 | |
| 	cmplwi	0,r4,4	/* cr0 = size vs 4, unsigned */
 | |
| 	blt	7f	/* fewer than 4 bytes: byte loop only */
 | |
| 	/* clear a single word */
 | |
| 11:	stwu	r5,4(r6)	/* zero the word at the original address, whatever its alignment; r6 = original address afterwards; cr0 is untouched */
 | |
| 	beqlr	/* cr0 still from the cmplwi: size was exactly 4, done */
 | |
| 	/* clear word sized chunks */
 | |
| 	andi.	r0,r6,3	/* r0 = misalignment of the address within a word */
 | |
| 	add	r4,r0,r4	/* r4 = byte count measured from the aligned-down address */
 | |
| 	subf	r6,r0,r6	/* r6 = address rounded down to a word boundary */
 | |
| 	srwi	r0,r4,2	/* r0 = whole words in that span, counting the one the store at 11: already covered */
 | |
| 	andi.	r4,r4,3	/* r4 = bytes left over after the last whole word */
 | |
| 	mtctr	r0
 | |
| 	bdz	7f	/* dec ctr to discount the word handled at 11:; if none remain go straight to the tail bytes */
 | |
| 1:	stwu	r5,4(r6)	/* aligned word stores, one per remaining whole word */
 | |
| 	bdnz	1b
 | |
| 	/* clear byte sized chunks */
 | |
| 7:	cmpwi	0,r4,0
 | |
| 	beqlr	/* no tail bytes: done, r3 == 0 */
 | |
| 	mtctr	r4	/* ctr = number of tail bytes */
 | |
| 	addi	r6,r6,3	/* on every path into 7: r6 is 4 below the first uncleared byte; +3 lets stbu's +1 land on it */
 | |
| 8:	stbu	r5,1(r6)
 | |
| 	bdnz	8b
 | |
| 	blr
 | |
| 90:	mr	r3,r4	/* paired with 11: in the table below; r4 is still the full size there, i.e. nothing was cleared */
 | |
| 	blr
 | |
| 91:	mfctr	r3	/* paired with 1: in the table below; ctr = words not yet stored */
 | |
| 	slwi	r3,r3,2	/* words -> bytes */
 | |
| 	add	r3,r3,r4	/* plus the tail bytes that were never started */
 | |
| 	blr
 | |
| 92:	mfctr	r3	/* paired with 8: in the table below; ctr = tail bytes not yet stored */
 | |
| 	blr
 | |
| 
 | |
| 	.section __ex_table,"a"	/* 90/91/92 are not branched to from the code above; they are only named in these entries. Presumably the kernel's fault handling, which is outside this file, resumes at the second address when the store at the first one faults. */
 | |
| 	PPC_LONG	11b,90b	/* first (possibly unaligned) word store -> fixup 90 */
 | |
| 	PPC_LONG	1b,91b	/* word loop store -> fixup 91 */
 | |
| 	PPC_LONG	8b,92b	/* byte loop store -> fixup 92 */
 | |
| 	.text
 | |
| #endif	/* CONFIG_PPC32 */
 |