 5210d1e688
			
		
	
	
	5210d1e688
	
	
	
		
			
			Hand optimised asm code for ARC700 pipeline. Originally written/optimized by Joern Rennecke Signed-off-by: Vineet Gupta <vgupta@synopsys.com> Cc: Joern Rennecke <joern.rennecke@embecosm.com>
		
			
				
	
	
		
			96 lines
		
	
	
	
		
			2.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			96 lines
		
	
	
	
		
			2.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License version 2 as
 | |
|  * published by the Free Software Foundation.
 | |
|  */
 | |
| 
 | |
| /* This is optimized primarily for the ARC700.
 | |
|    It would be possible to speed up the loops by one cycle / word
 | |
|    respectively one cycle / byte by forcing double source 1 alignment, unrolling
 | |
|    by a factor of two, and speculatively loading the second word / byte of
 | |
|    source 1; however, that would increase the overhead for loop setup / finish,
 | |
|    and strcmp might often terminate early.  */
 | |
| 
 | |
| #include <asm/linkage.h>
 | |
| 
 | |
| ARC_ENTRY strcmp	/* Compare the NUL-terminated strings addressed by r0 and r1.
 | |
|    The result is left in r0: zero when the strings match, negative when
 | |
|    the first differing byte of the r0 string is the smaller one (bytes
 | |
|    are compared as unsigned values), positive otherwise.  Control goes
 | |
|    back to the caller through blink.
 | |
|    Clobbers r0-r5, r12 and the flags.
 | |
|    When both pointers are 4-byte aligned the strings are compared one
 | |
|    word at a time (.Lwordloop); otherwise byte by byte (.Lcharloop).  */
 | |
| 	or	r2,r0,r1	; merge both pointers so one test covers both
 | |
| 	bmsk_s	r2,r2,1	; keep only bits 1:0 of the merged pointers
 | |
| 	brne	r2,0,.Lcharloop	; some pointer is not word aligned: go byte by byte
 | |
| 	mov_s	r12,0x01010101	; r12 = 0x01 in every byte lane
 | |
| 	ror	r5,r12	; r5 = r12 rotated right by one bit = 0x80808080
 | |
| .Lwordloop:
 | |
| 	ld.ab	r2,[r0,4]	; r2 = next word of first string, then r0 += 4
 | |
| 	ld.ab	r3,[r1,4]	; r3 = next word of second string, then r1 += 4
 | |
| 	nop_s	; NOTE(review): presumably covers load latency before r2 is used - confirm
 | |
| 	sub	r4,r2,r12	; r4 = r2 - 0x01010101
 | |
| 	bic	r4,r4,r2	; r4 &= ~r2
 | |
| 	and	r4,r4,r5	; keep bit 7 of each byte: non-zero if r2 may hold a zero byte
 | |
| 	brne	r4,0,.Lfound0	; possible terminator inside this word
 | |
| 	breq	r2,r3,.Lwordloop	; words equal and no terminator seen: keep going
 | |
| #ifdef	__LITTLE_ENDIAN__
 | |
| 	xor	r0,r2,r3	; mask for difference
 | |
| 	sub_s	r1,r0,1	; r1 is free as scratch: no more loads on this path
 | |
| 	bic_s	r0,r0,r1	; mask for least significant difference bit
 | |
| 	sub	r1,r5,r0	; borrow runs from that bit up to bit 7 of its byte
 | |
| 	xor	r0,r5,r1	; mask for least significant difference byte
 | |
| 	and_s	r2,r2,r0	; keep only the masked part of the first word
 | |
| 	and_s	r3,r3,r0	; keep only the masked part of the second word
 | |
| #endif /* LITTLE ENDIAN */
 | |
| 	cmp_s	r2,r3	; unsigned compare; flags are consumed by bset.lo below
 | |
| 	mov_s	r0,1	; provisional result: first string is the greater one
 | |
| 	j_s.d	[blink]	; return; the next instruction runs in the delay slot
 | |
| 	bset.lo	r0,r0,31	; if r2 < r3 (unsigned) set bit 31: result is negative
 | |
| 
 | |
| 	.balign	4
 | |
| #ifdef __LITTLE_ENDIAN__
 | |
| .Lfound0:	; reached when the zero-byte test flagged the word in r2
 | |
| 	xor	r0,r2,r3	; mask for difference
 | |
| 	or	r0,r0,r4	; or in zero indicator
 | |
| 	sub_s	r1,r0,1	; r1 is free as scratch: no more loads on this path
 | |
| 	bic_s	r0,r0,r1	; mask for least significant difference bit
 | |
| 	sub	r1,r5,r0	; borrow runs from that bit up to bit 7 of its byte
 | |
| 	xor	r0,r5,r1	; mask for least significant difference byte
 | |
| 	and_s	r2,r2,r0	; keep only the masked part of the first word
 | |
| 	and_s	r3,r3,r0	; keep only the masked part of the second word
 | |
| 	sub.f	r0,r2,r3	; r0 = difference (0 when equal) and set the flags
 | |
| 	mov.hi	r0,1	; r2 > r3 (unsigned): result is 1
 | |
| 	j_s.d	[blink]	; return; the next instruction runs in the delay slot
 | |
| 	bset.lo	r0,r0,31	; r2 < r3 (unsigned): set bit 31 so the result is negative
 | |
| #else /* BIG ENDIAN */
 | |
| 	/* The zero-detection above can mis-detect 0x01 bytes as zeroes
 | |
| 	   because of carry-propagation from a less significant zero byte.
 | |
| 	   We can compensate for this by checking that bit0 is zero.
 | |
| 	   This compensation is not necessary in the step where we
 | |
| 	   get a low estimate for r2, because in any affected bytes
 | |
| 	   we already have 0x00 or 0x01, which will remain unchanged
 | |
| 	   when bit 7 is cleared.  */
 | |
| 	.balign	4
 | |
| .Lfound0:	; reached when the zero-byte test flagged the word in r2
 | |
| 	lsr	r0,r4,8	; r0 = zero indicator moved down by one byte lane
 | |
| 	lsr_s	r1,r2	; r1 = r2 >> 1: bit0 of each byte lands on bit 7 of the byte below
 | |
| 	bic_s	r2,r2,r0	; get low estimate for r2 and get ...
 | |
| 	bic_s	r0,r0,r1	; <this is the adjusted mask for zeros>
 | |
| 	or_s	r3,r3,r0	; ... high estimate r3 so that r2 > r3 will ...
 | |
| 	cmp_s	r3,r2		; ... be independent of trailing garbage
 | |
| 	or_s	r2,r2,r0	; likewise for r3 > r2
 | |
| 	bic_s	r3,r3,r0
 | |
| 	rlc	r0,0		; r0 := r2 > r3 ? 1 : 0
 | |
| 	cmp_s	r2,r3	; flags for the r3 > r2 case, consumed by bset.lo below
 | |
| 	j_s.d	[blink]	; return; the next instruction runs in the delay slot
 | |
| 	bset.lo	r0,r0,31	; r2 < r3 (unsigned): set bit 31 so the result is negative
 | |
| #endif /* ENDIAN */
 | |
| 
 | |
| 	.balign	4
 | |
| .Lcharloop:	; byte-at-a-time path for pointers that are not both word aligned
 | |
| 	ldb.ab	r2,[r0,1]	; r2 = next byte of first string, then r0 += 1
 | |
| 	ldb.ab	r3,[r1,1]	; r3 = next byte of second string, then r1 += 1
 | |
| 	nop_s	; NOTE(review): presumably covers load latency before r2 is used - confirm
 | |
| 	breq	r2,0,.Lcmpend	; reached the terminator of the first string
 | |
| 	breq	r2,r3,.Lcharloop	; bytes match: keep going
 | |
| .Lcmpend:
 | |
| 	j_s.d	[blink]	; return; the next instruction runs in the delay slot
 | |
| 	sub	r0,r2,r3	; result = difference of the last two bytes read
 | |
| ARC_EXIT strcmp
 |