 6ebbf2ce43
			
		
	
	
	6ebbf2ce43
	
	
	
		
			
			ARMv6 and greater introduced a new instruction ("bx") which can be used
to return from function calls.  Recent CPUs perform better when the
"bx lr" instruction is used rather than the "mov pc, lr" instruction,
and this sequence is strongly recommended to be used by the ARM
architecture manual (section A.4.1.1).
We provide a new macro "ret" with all its variants for the condition
code which will resolve to the appropriate instruction.
Rather than doing this piecemeal and missing some instances, change all
the "mov pc" instances to use the new macro, with the exception of
the "movs" instruction and the kprobes code.  This allows us to detect
the "mov pc, lr" case and fix it up - and also gives us the possibility
of deploying this for other registers depending on the CPU selection.
Reported-by: Will Deacon <will.deacon@arm.com>
Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1
Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S
Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood
Tested-by: Shawn Guo <shawn.guo@freescale.com>
Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs
Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385
Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci
Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp
Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen
Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M
Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
		
	
			
		
			
				
	
	
		
			333 lines
		
	
	
	
		
			6.8 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			333 lines
		
	
	
	
		
			6.8 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  *  linux/arch/arm/lib/csumpartialcopygeneric.S
 | |
|  *
 | |
|  *  Copyright (C) 1995-2001 Russell King
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License version 2 as
 | |
|  * published by the Free Software Foundation.
 | |
|  */
 | |
| #include <asm/assembler.h>
 | |
| 
 | |
| /*
 | |
|  * unsigned int
 | |
|  * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum, )
 | |
|  *  r0 = src, r1 = dst, r2 = len, r3 = sum
 | |
|  *  Returns : r0 = checksum
 | |
|  *
 | |
|  * Note that 'tst' and 'teq' preserve the carry flag.
 | |
|  */
 | |
| 
 | |
| src	.req	r0			@ r0: source pointer
 | |
| dst	.req	r1			@ r1: destination pointer
 | |
| len	.req	r2			@ r2: byte count
 | |
| sum	.req	r3			@ r3: running checksum, chained with adcs
 | |
| 
 | |
| .Lzero:		mov	r0, sum			@ len == 0: the result is the sum passed in
 | |
| 		load_regs			@ macro not defined in this file; presumably restores registers and returns - confirm
 | |
| 
 | |
| 		/*
 | |
| 		 * Align an unaligned destination pointer.  We know that
 | |
| 		 * we have >= 8 bytes here, so we don't need to check
 | |
| 		 * the length.  Note that the source pointer hasn't been
 | |
| 		 * aligned yet.
 | |
| 		 *
 | |
| 		 * Reached with "blne" from the main entry path and returns
 | |
| 		 * through lr.  Depending on dst it copies 1, 2 or 3 bytes,
 | |
| 		 * updating dst, len and sum, and leaves the carry of the
 | |
| 		 * last adcs in C so the caller can continue the chain.
 | |
| 		 * load1b and load2b are macros defined outside this file;
 | |
| 		 * presumably they fetch one or two source bytes - confirm.
 | |
| 		 */
 | |
| .Ldst_unaligned:
 | |
| 		tst	dst, #1			@ is dst odd?
 | |
| 		beq	.Ldst_16bit		@ even: only a halfword is needed
 | |
| 
 | |
| 		load1b	ip
 | |
| 		sub	len, len, #1		@ plain sub, so C survives for adcs
 | |
| 		adcs	sum, sum, ip, put_byte_1	@ update checksum
 | |
| 		strb	ip, [dst], #1		@ dst is now 16bit aligned
 | |
| 		tst	dst, #2
 | |
| 		reteq	lr			@ dst is now 32bit aligned
 | |
| 
 | |
| .Ldst_16bit:	load2b	r8, ip
 | |
| 		sub	len, len, #2		@ plain sub again, C is kept
 | |
| 		adcs	sum, sum, r8, put_byte_0
 | |
| 		strb	r8, [dst], #1
 | |
| 		adcs	sum, sum, ip, put_byte_1
 | |
| 		strb	ip, [dst], #1
 | |
| 		ret	lr			@ dst is now 32bit aligned
 | |
| 
 | |
| 		/*
 | |
| 		 * Handle 0 to 7 bytes, with any alignment of source and
 | |
| 		 * destination pointers.  Note that when we get here, C = 0
 | |
| 		 * (we arrive through the "blo" that follows "cmp len, #8",
 | |
| 		 * and blo is taken only when that compare left C clear).
 | |
| 		 * Everything is copied bytewise; both exits go to .Ldone
 | |
| 		 * except the zero-length case, which goes to .Lzero.
 | |
| 		 */
 | |
| .Lless8:	teq	len, #0			@ check for zero count
 | |
| 		beq	.Lzero
 | |
| 
 | |
| 		/* we must have at least one byte. */
 | |
| 		tst	dst, #1			@ dst 16-bit aligned
 | |
| 		beq	.Lless8_aligned
 | |
| 
 | |
| 		/* Align dst */
 | |
| 		load1b	ip
 | |
| 		sub	len, len, #1		@ plain sub, so C survives for adcs
 | |
| 		adcs	sum, sum, ip, put_byte_1	@ update checksum
 | |
| 		strb	ip, [dst], #1
 | |
| 		tst	len, #6			@ any byte pairs left?
 | |
| 		beq	.Lless8_byteonly
 | |
| 
 | |
| 1:		load2b	r8, ip			@ two bytes are copied and summed per pass
 | |
| 		sub	len, len, #2
 | |
| 		adcs	sum, sum, r8, put_byte_0
 | |
| 		strb	r8, [dst], #1
 | |
| 		adcs	sum, sum, ip, put_byte_1
 | |
| 		strb	ip, [dst], #1
 | |
| .Lless8_aligned:
 | |
| 		tst	len, #6			@ loop until len is down to 0 or 1
 | |
| 		bne	1b
 | |
| .Lless8_byteonly:
 | |
| 		tst	len, #1			@ one trailing byte?
 | |
| 		beq	.Ldone
 | |
| 		load1b	r8
 | |
| 		adcs	sum, sum, r8, put_byte_0	@ update checksum
 | |
| 		strb	r8, [dst], #1
 | |
| 		b	.Ldone
 | |
| 
 | |
| FN_ENTRY
 | |
| 		save_regs			@ macro not defined in this file; .Ldone later reloads dst from [sp, #0]
 | |
| 
 | |
| 		cmp	len, #8			@ Ensure that we have at least
 | |
| 		blo	.Lless8			@ 8 bytes to copy.
 | |
| 
 | |
| 		adds	sum, sum, #0		@ C = 0
 | |
| 		tst	dst, #3			@ Test destination alignment
 | |
| 		blne	.Ldst_unaligned		@ align destination, return here
 | |
| 
 | |
| 		/*
 | |
| 		 * Ok, the dst pointer is now 32bit aligned, and we know
 | |
| 		 * that we must have more than 4 bytes to copy.  Note
 | |
| 		 * that C contains the carry from the dst alignment above.
 | |
| 		 */
 | |
| 
 | |
| 		tst	src, #3			@ Test source alignment
 | |
| 		bne	.Lsrc_not_aligned
 | |
| 
 | |
| 		/* Routine for src & dst aligned */
 | |
| 
 | |
| 		bics	ip, len, #15		@ ip = len rounded down to a multiple of 16
 | |
| 		beq	2f			@ no whole 16-byte chunk: go to the word tail
 | |
| 
 | |
| 1:		load4l	r4, r5, r6, r7		@ four words are copied and summed per pass
 | |
| 		stmia	dst!, {r4, r5, r6, r7}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		adcs	sum, sum, r6
 | |
| 		adcs	sum, sum, r7
 | |
| 		sub	ip, ip, #16		@ sub + teq rather than subs, so the
 | |
| 		teq	ip, #0			@ loop test leaves the adcs carry in C
 | |
| 		bne	1b
 | |
| 
 | |
| 2:		ands	ip, len, #12		@ 4, 8 or 12 bytes of whole words left?
 | |
| 		beq	4f
 | |
| 		tst	ip, #8			@ a pair of words?
 | |
| 		beq	3f
 | |
| 		load2l	r4, r5
 | |
| 		stmia	dst!, {r4, r5}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		tst	ip, #4			@ one more single word?
 | |
| 		beq	4f
 | |
| 
 | |
| 3:		load1l	r4
 | |
| 		str	r4, [dst], #4
 | |
| 		adcs	sum, sum, r4
 | |
| 
 | |
| 4:		ands	len, len, #3		@ 1 to 3 trailing bytes?
 | |
| 		beq	.Ldone
 | |
| 		load1l	r4			@ the trailing bytes are taken from this register
 | |
| 		tst	len, #2
 | |
| 		mov	r5, r4, get_byte_0	@ plain mov, the tst result is still valid
 | |
| 		beq	.Lexit			@ exactly one byte: it is already in r5
 | |
| 		adcs	sum, sum, r4, lspush #16
 | |
| 		strb	r5, [dst], #1
 | |
| 		mov	r5, r4, get_byte_1
 | |
| 		strb	r5, [dst], #1
 | |
| 		mov	r5, r4, get_byte_2	@ candidate third byte for .Lexit
 | |
| .Lexit:		tst	len, #1			@ odd byte still to store?
 | |
| 		strneb	r5, [dst], #1
 | |
| 		andne	r5, r5, #255		@ keep only the byte just stored
 | |
| 		adcnes	sum, sum, r5, put_byte_0
 | |
| 
 | |
| 		/*
 | |
| 		 * If the dst pointer was not 16-bit aligned, we
 | |
| 		 * need to rotate the checksum here to get around
 | |
| 		 * the inefficient byte manipulations in the
 | |
| 		 * architecture independent code.
 | |
| 		 */
 | |
| .Ldone:		adc	r0, sum, #0		@ fold the last carry into the result
 | |
| 		ldr	sum, [sp, #0]		@ dst
 | |
| 		tst	sum, #1			@ was the reloaded dst odd?
 | |
| 		movne	r0, r0, ror #8		@ yes: rotate the result by 8 bits
 | |
| 		load_regs			@ macro not defined in this file; presumably restores registers and returns - confirm
 | |
| 
 | |
| .Lsrc_not_aligned:
 | |
| 		adc	sum, sum, #0		@ include C from dst alignment
 | |
| 		and	ip, src, #3		@ ip = offset of src within its word (non-zero here)
 | |
| 		bic	src, src, #3		@ round src down to a word boundary
 | |
| 		load1l	r5			@ presumably the aligned word holding the first wanted bytes - confirm
 | |
| 		cmp	ip, #2
 | |
| 		beq	.Lsrc2_aligned		@ offset 2
 | |
| 		bhi	.Lsrc3_aligned		@ offset 3
 | |
| 		mov	r4, r5, lspull #8		@ C = 0 (offset 1: the cmp above left C clear)
 | |
| 		bics	ip, len, #15		@ ip = len rounded down to a multiple of 16
 | |
| 		beq	2f
 | |
| 1:		load4l	r5, r6, r7, r8
 | |
| 		orr	r4, r4, r5, lspush #24	@ leftover in r4 merged with the start of the next word
 | |
| 		mov	r5, r5, lspull #8
 | |
| 		orr	r5, r5, r6, lspush #24
 | |
| 		mov	r6, r6, lspull #8
 | |
| 		orr	r6, r6, r7, lspush #24
 | |
| 		mov	r7, r7, lspull #8
 | |
| 		orr	r7, r7, r8, lspush #24
 | |
| 		stmia	dst!, {r4, r5, r6, r7}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		adcs	sum, sum, r6
 | |
| 		adcs	sum, sum, r7
 | |
| 		mov	r4, r8, lspull #8		@ leftover of r8 for the next output word
 | |
| 		sub	ip, ip, #16		@ sub + teq rather than subs, so the
 | |
| 		teq	ip, #0			@ loop test leaves the adcs carry in C
 | |
| 		bne	1b
 | |
| 2:		ands	ip, len, #12		@ 4, 8 or 12 bytes of whole words left?
 | |
| 		beq	4f
 | |
| 		tst	ip, #8			@ a pair of words?
 | |
| 		beq	3f
 | |
| 		load2l	r5, r6
 | |
| 		orr	r4, r4, r5, lspush #24
 | |
| 		mov	r5, r5, lspull #8
 | |
| 		orr	r5, r5, r6, lspush #24
 | |
| 		stmia	dst!, {r4, r5}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		mov	r4, r6, lspull #8
 | |
| 		tst	ip, #4			@ one more single word?
 | |
| 		beq	4f
 | |
| 3:		load1l	r5
 | |
| 		orr	r4, r4, r5, lspush #24
 | |
| 		str	r4, [dst], #4
 | |
| 		adcs	sum, sum, r4
 | |
| 		mov	r4, r5, lspull #8
 | |
| 4:		ands	len, len, #3		@ 1 to 3 trailing bytes?
 | |
| 		beq	.Ldone
 | |
| 		mov	r5, r4, get_byte_0	@ the tail is taken from the leftover in r4, no further load
 | |
| 		tst	len, #2
 | |
| 		beq	.Lexit			@ exactly one byte: it is already in r5
 | |
| 		adcs	sum, sum, r4, lspush #16
 | |
| 		strb	r5, [dst], #1
 | |
| 		mov	r5, r4, get_byte_1
 | |
| 		strb	r5, [dst], #1
 | |
| 		mov	r5, r4, get_byte_2	@ candidate third byte for .Lexit
 | |
| 		b	.Lexit
 | |
| 
 | |
| .Lsrc2_aligned:	mov	r4, r5, lspull #16	@ src offset 2: r4 = leftover of the first word
 | |
| 		adds	sum, sum, #0		@ clear C (the cmp ip, #2 that led here set it)
 | |
| 		bics	ip, len, #15		@ ip = len rounded down to a multiple of 16
 | |
| 		beq	2f
 | |
| 1:		load4l	r5, r6, r7, r8
 | |
| 		orr	r4, r4, r5, lspush #16	@ leftover in r4 merged with the start of the next word
 | |
| 		mov	r5, r5, lspull #16
 | |
| 		orr	r5, r5, r6, lspush #16
 | |
| 		mov	r6, r6, lspull #16
 | |
| 		orr	r6, r6, r7, lspush #16
 | |
| 		mov	r7, r7, lspull #16
 | |
| 		orr	r7, r7, r8, lspush #16
 | |
| 		stmia	dst!, {r4, r5, r6, r7}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		adcs	sum, sum, r6
 | |
| 		adcs	sum, sum, r7
 | |
| 		mov	r4, r8, lspull #16	@ leftover of r8 for the next output word
 | |
| 		sub	ip, ip, #16		@ sub + teq rather than subs, so the
 | |
| 		teq	ip, #0			@ loop test leaves the adcs carry in C
 | |
| 		bne	1b
 | |
| 2:		ands	ip, len, #12		@ 4, 8 or 12 bytes of whole words left?
 | |
| 		beq	4f
 | |
| 		tst	ip, #8			@ a pair of words?
 | |
| 		beq	3f
 | |
| 		load2l	r5, r6
 | |
| 		orr	r4, r4, r5, lspush #16
 | |
| 		mov	r5, r5, lspull #16
 | |
| 		orr	r5, r5, r6, lspush #16
 | |
| 		stmia	dst!, {r4, r5}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		mov	r4, r6, lspull #16
 | |
| 		tst	ip, #4			@ one more single word?
 | |
| 		beq	4f
 | |
| 3:		load1l	r5
 | |
| 		orr	r4, r4, r5, lspush #16
 | |
| 		str	r4, [dst], #4
 | |
| 		adcs	sum, sum, r4
 | |
| 		mov	r4, r5, lspull #16
 | |
| 4:		ands	len, len, #3		@ 1 to 3 trailing bytes?
 | |
| 		beq	.Ldone
 | |
| 		mov	r5, r4, get_byte_0
 | |
| 		tst	len, #2
 | |
| 		beq	.Lexit			@ exactly one byte: it is already in r5
 | |
| 		adcs	sum, sum, r4		@ add the leftover held in r4
 | |
| 		strb	r5, [dst], #1
 | |
| 		mov	r5, r4, get_byte_1
 | |
| 		strb	r5, [dst], #1
 | |
| 		tst	len, #1			@ is a third byte wanted?
 | |
| 		beq	.Ldone
 | |
| 		load1b	r5			@ not taken from r4: fetched with a fresh load1b
 | |
| 		b	.Lexit
 | |
| 
 | |
| .Lsrc3_aligned:	mov	r4, r5, lspull #24	@ src offset 3: r4 = leftover of the first word
 | |
| 		adds	sum, sum, #0		@ clear C (the cmp ip, #2 that led here set it)
 | |
| 		bics	ip, len, #15		@ ip = len rounded down to a multiple of 16
 | |
| 		beq	2f
 | |
| 1:		load4l	r5, r6, r7, r8
 | |
| 		orr	r4, r4, r5, lspush #8	@ leftover in r4 merged with the start of the next word
 | |
| 		mov	r5, r5, lspull #24
 | |
| 		orr	r5, r5, r6, lspush #8
 | |
| 		mov	r6, r6, lspull #24
 | |
| 		orr	r6, r6, r7, lspush #8
 | |
| 		mov	r7, r7, lspull #24
 | |
| 		orr	r7, r7, r8, lspush #8
 | |
| 		stmia	dst!, {r4, r5, r6, r7}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		adcs	sum, sum, r6
 | |
| 		adcs	sum, sum, r7
 | |
| 		mov	r4, r8, lspull #24	@ leftover of r8 for the next output word
 | |
| 		sub	ip, ip, #16		@ sub + teq rather than subs, so the
 | |
| 		teq	ip, #0			@ loop test leaves the adcs carry in C
 | |
| 		bne	1b
 | |
| 2:		ands	ip, len, #12		@ 4, 8 or 12 bytes of whole words left?
 | |
| 		beq	4f
 | |
| 		tst	ip, #8			@ a pair of words?
 | |
| 		beq	3f
 | |
| 		load2l	r5, r6
 | |
| 		orr	r4, r4, r5, lspush #8
 | |
| 		mov	r5, r5, lspull #24
 | |
| 		orr	r5, r5, r6, lspush #8
 | |
| 		stmia	dst!, {r4, r5}
 | |
| 		adcs	sum, sum, r4
 | |
| 		adcs	sum, sum, r5
 | |
| 		mov	r4, r6, lspull #24
 | |
| 		tst	ip, #4			@ one more single word?
 | |
| 		beq	4f
 | |
| 3:		load1l	r5
 | |
| 		orr	r4, r4, r5, lspush #8
 | |
| 		str	r4, [dst], #4
 | |
| 		adcs	sum, sum, r4
 | |
| 		mov	r4, r5, lspull #24
 | |
| 4:		ands	len, len, #3		@ 1 to 3 trailing bytes?
 | |
| 		beq	.Ldone
 | |
| 		mov	r5, r4, get_byte_0
 | |
| 		tst	len, #2
 | |
| 		beq	.Lexit			@ exactly one byte: it is already in r5
 | |
| 		strb	r5, [dst], #1		@ store the byte taken from the leftover in r4
 | |
| 		adcs	sum, sum, r4
 | |
| 		load1l	r4			@ a further word is loaded for the remaining tail bytes
 | |
| 		mov	r5, r4, get_byte_0
 | |
| 		strb	r5, [dst], #1
 | |
| 		adcs	sum, sum, r4, lspush #24
 | |
| 		mov	r5, r4, get_byte_1	@ candidate third byte for .Lexit
 | |
| 		b	.Lexit
 | |
| FN_EXIT
 |