 6ebbf2ce43
			
		
	
	
	6ebbf2ce43
	
	
	
		
			
			ARMv6 and greater introduced a new instruction ("bx") which can be used
to return from function calls.  Recent CPUs perform better when the
"bx lr" instruction is used rather than the "mov pc, lr" instruction,
and the ARM architecture manual (section A.4.1.1) strongly recommends
using this sequence.
We provide a new macro "ret" with all its variants for the condition
code which will resolve to the appropriate instruction.
Rather than doing this piecemeal, and missing some instances, change all
the "mov pc" instances to use the new macro, with the exception of
the "movs" instruction and the kprobes code.  This allows us to detect
the "mov pc, lr" case and fix it up - and also gives us the possibility
of deploying this for other registers depending on the CPU selection.
Reported-by: Will Deacon <will.deacon@arm.com>
Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1
Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S
Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood
Tested-by: Shawn Guo <shawn.guo@freescale.com>
Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs
Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385
Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci
Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp
Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen
Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M
Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
		
	
			
		
			
				
	
	
		
			212 lines
		
	
	
	
		
			4 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			212 lines
		
	
	
	
		
			4 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  *  linux/arch/arm/lib/div64.S
 | |
|  *
 | |
|  *  Optimized computation of 64-bit dividend / 32-bit divisor
 | |
|  *
 | |
|  *  Author:	Nicolas Pitre
 | |
|  *  Created:	Oct 5, 2003
 | |
|  *  Copyright:	Monta Vista Software, Inc.
 | |
|  *
 | |
|  *  This program is free software; you can redistribute it and/or modify
 | |
|  *  it under the terms of the GNU General Public License version 2 as
 | |
|  *  published by the Free Software Foundation.
 | |
|  */
 | |
| 
 | |
| #include <linux/linkage.h>
 | |
| #include <asm/assembler.h>
 | |
| #include <asm/unwind.h>
 | |
| 
 | |
| #ifdef __ARMEB__
 | |
| #define xh r0
 | |
| #define xl r1
 | |
| #define yh r2
 | |
| #define yl r3
 | |
| #else
 | |
| #define xl r0
 | |
| #define xh r1
 | |
| #define yl r2
 | |
| #define yh r3
 | |
| #endif
 | |
| 
 | |
| /*
 | |
|  * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
 | |
|  *
 | |
|  * Note: Calling convention is totally non standard for optimal code.
 | |
|  *       This is meant to be used by do_div() from include/asm/div64.h only.
 | |
|  *
 | |
|  * Input parameters:
 | |
|  * 	xh-xl	= dividend (clobbered)
 | |
|  * 	r4	= divisor (preserved)
 | |
|  *
 | |
|  * Output values:
 | |
|  * 	yh-yl	= result
 | |
|  * 	xh	= remainder
 | |
|  *
 | |
|  * Clobbered regs: xl, ip
 | |
|  */
 | |
| 
 | |
| ENTRY(__do_div64)
 | |
| UNWIND(.fnstart)
 | |
| 
 | |
| 	@ Test for easy paths first.
 | |
| 	subs	ip, r4, #1		@ ip = divisor - 1
 | |
| 	bls	9f			@ divisor is 0 or 1
 | |
| 	tst	ip, r4			@ divisor & (divisor - 1)
 | |
| 	beq	8f			@ divisor is power of 2
 | |
| 
 | |
| 	@ See if we need to handle upper 32-bit result.
 | |
| 	cmp	xh, r4
 | |
| 	mov	yh, #0
 | |
| 	blo	3f			@ xh < divisor: upper result word stays 0
 | |
| 
 | |
| 	@ Align divisor with upper part of dividend.
 | |
| 	@ The aligned divisor is stored in yl preserving the original.
 | |
| 	@ The bit position is stored in ip.
 | |
| 
 | |
| #if __LINUX_ARM_ARCH__ >= 5
 | |
| 
 | |
| 	clz	yl, r4
 | |
| 	clz	ip, xh
 | |
| 	sub	yl, yl, ip		@ yl = clz(divisor) - clz(xh)
 | |
| 	mov	ip, #1
 | |
| 	mov	ip, ip, lsl yl		@ ip = 1 << yl
 | |
| 	mov	yl, r4, lsl yl		@ yl = divisor << yl
 | |
| 
 | |
| #else
 | |
| 
 | |
| 	mov	yl, r4
 | |
| 	mov	ip, #1
 | |
| 1:	cmp	yl, #0x80000000
 | |
| 	cmpcc	yl, xh
 | |
| 	movcc	yl, yl, lsl #1
 | |
| 	movcc	ip, ip, lsl #1
 | |
| 	bcc	1b
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 	@ The division loop for needed upper bit positions.
 | |
|  	@ Break out early if dividend reaches 0.
 | |
| 2:	cmp	xh, yl
 | |
| 	orrcs	yh, yh, ip		@ set quotient bit when xh >= yl
 | |
| 	subcss	xh, xh, yl		@ ...and subtract; Z set if xh became 0
 | |
| 	movnes	ip, ip, lsr #1		@ next bit position; Z set when none left
 | |
| 	mov	yl, yl, lsr #1		@ flags untouched here, bne tests the above
 | |
| 	bne	2b
 | |
| 
 | |
| 	@ See if we need to handle lower 32-bit result.
 | |
| 3:	cmp	xh, #0
 | |
| 	mov	yl, #0
 | |
| 	cmpeq	xl, r4
 | |
| 	movlo	xh, xl			@ xh == 0 and xl < divisor: remainder is xl
 | |
| 	retlo	lr
 | |
| 
 | |
| 	@ The division loop for lower bit positions.
 | |
| 	@ Here we shift remainder bits leftwards rather than moving the
 | |
| 	@ divisor for comparisons, considering the carry-out bit as well.
 | |
| 	mov	ip, #0x80000000		@ start at the top bit of yl
 | |
| 4:	movs	xl, xl, lsl #1
 | |
| 	adcs	xh, xh, xh		@ xh = (xh << 1) + bit shifted out of xl
 | |
| 	beq	6f
 | |
| 	cmpcc	xh, r4			@ only compare if no carry out of xh
 | |
| 5:	orrcs	yl, yl, ip
 | |
| 	subcs	xh, xh, r4
 | |
| 	movs	ip, ip, lsr #1
 | |
| 	bne	4b
 | |
| 	ret	lr
 | |
| 
 | |
| 	@ The top part of remainder became zero.  If carry is set
 | |
| 	@ (the 33rd bit) this is a false positive so resume the loop.
 | |
| 	@ Otherwise, if lower part is also null then we are done.
 | |
| 6:	bcs	5b
 | |
| 	cmp	xl, #0
 | |
| 	reteq	lr
 | |
| 
 | |
| 	@ We still have remainder bits in the low part.  Bring them up.
 | |
| 
 | |
| #if __LINUX_ARM_ARCH__ >= 5
 | |
| 
 | |
| 	clz	xh, xl			@ we know xh is zero here so...
 | |
| 	add	xh, xh, #1		@ shift count that moves the top set bit out of xl
 | |
| 	mov	xl, xl, lsl xh
 | |
| 	mov	ip, ip, lsr xh		@ skip the same number of quotient bit positions
 | |
| 
 | |
| #else
 | |
| 
 | |
| 7:	movs	xl, xl, lsl #1
 | |
| 	mov	ip, ip, lsr #1
 | |
| 	bcc	7b
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 	@ Current remainder is now 1.  It is worthless to compare with
 | |
| 	@ divisor at this point since divisor can not be smaller than 3 here.
 | |
| 	@ If possible, branch for another shift in the division loop.
 | |
| 	@ If no bit position left then we are done.
 | |
| 	movs	ip, ip, lsr #1
 | |
| 	mov	xh, #1
 | |
| 	bne	4b
 | |
| 	ret	lr
 | |
| 
 | |
| 8:	@ Division by a power of 2: determine what that divisor order is
 | |
| 	@ then simply shift values around
 | |
| 
 | |
| #if __LINUX_ARM_ARCH__ >= 5
 | |
| 
 | |
| 	clz	ip, r4
 | |
| 	rsb	ip, ip, #31		@ ip = 31 - clz(divisor), the shift count
 | |
| 
 | |
| #else
 | |
| 
 | |
| 	mov	yl, r4
 | |
| 	cmp	r4, #(1 << 16)
 | |
| 	mov	ip, #0
 | |
| 	movhs	yl, yl, lsr #16
 | |
| 	movhs	ip, #16
 | |
| 
 | |
| 	cmp	yl, #(1 << 8)
 | |
| 	movhs	yl, yl, lsr #8
 | |
| 	addhs	ip, ip, #8
 | |
| 
 | |
| 	cmp	yl, #(1 << 4)
 | |
| 	movhs	yl, yl, lsr #4
 | |
| 	addhs	ip, ip, #4
 | |
| 
 | |
| 	cmp	yl, #(1 << 2)
 | |
| 	addhi	ip, ip, #3
 | |
| 	addls	ip, ip, yl, lsr #1
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 	mov	yh, xh, lsr ip
 | |
| 	mov	yl, xl, lsr ip
 | |
| 	rsb	ip, ip, #32
 | |
|  ARM(	orr	yl, yl, xh, lsl ip	)
 | |
|  THUMB(	lsl	xh, xh, ip		)
 | |
|  THUMB(	orr	yl, yl, xh		)
 | |
| 	mov	xh, xl, lsl ip
 | |
| 	mov	xh, xh, lsr ip		@ xh = low bits of xl, i.e. the remainder
 | |
| 	ret	lr
 | |
| 
 | |
| 	@ eq -> division by 1: obvious enough...
 | |
| 9:	moveq	yl, xl
 | |
| 	moveq	yh, xh
 | |
| 	moveq	xh, #0
 | |
| 	reteq	lr			@ otherwise (divisor 0) execution continues below
 | |
| UNWIND(.fnend)
 | |
| 
 | |
| UNWIND(.fnstart)
 | |
| UNWIND(.pad #4)
 | |
| UNWIND(.save {lr})
 | |
| Ldiv0_64:
 | |
| 	@ Division by 0:
 | |
| 	str	lr, [sp, #-8]!		@ save lr, moving sp down by 8
 | |
| 	bl	__div0
 | |
| 
 | |
| 	@ as wrong as it could be...
 | |
| 	mov	yl, #0
 | |
| 	mov	yh, #0
 | |
| 	mov	xh, #0
 | |
| 	ldr	pc, [sp], #8		@ return via saved lr and move sp back up by 8
 | |
| 
 | |
| UNWIND(.fnend)
 | |
| ENDPROC(__do_div64)
 |