237 lines
		
	
	
	
		
			5 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
		
		
			
		
	
	
			237 lines
		
	
	
	
		
			5 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
|  | 
 | ||
|  | #include <linux/linkage.h> | ||
|  | 
 | ||
/*
 * Endian-dependent building blocks for the paths of memcpy that handle a
 * source pointer which is not 32-bit aligned.
 *
 *   SHIFT_1 / SHIFT_2     opposite-direction shifts used to splice two
 *                         consecutive source words into one output word
 *   MERGE_1 / MERGE_2     position the first 3 source bytes in the carry
 *                         register before the word loop starts
 *   EXTRACT_1 / EXTRACT_2 pull the pending halfword / byte back out of the
 *                         carry register once the word loop is done
 *
 * asl is a left shift (<<), lsr a logical right shift (>>).
 * Some variants deliberately ignore IMM; this is noted on the line.
 */
#ifndef __LITTLE_ENDIAN__
/* Anything that is not little-endian */
#define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	/* RX = RY >> IMM */
#define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	/* RX = RY << IMM */
#define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	/* RX = RY << IMM */
#define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	/* RX = RY << IMM */
#define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM	/* RX = RY >> IMM */
#define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08	/* IMM unused: always >> 8 */
#else
/* Little-endian */
#define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	/* RX = RY << IMM */
#define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	/* RX = RY >> IMM */
#define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	/* RX = RY << IMM */
#define MERGE_2(RX,RY,IMM)				/* expands to nothing */
#define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF	/* IMM unused: keep low 16 bits */
#define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM	/* RX = RY >> IMM */
#endif
|  | 
 | ||
/*
 * Width-dependent helpers for the fully aligned copy loop.
 *
 *   LOADX / STOREX   load / store with post-increment of the address
 *                    register: 8 bytes per access (ldd/std) when
 *                    CONFIG_ARC_HAS_LL64 is set, 4 bytes (ld/st) otherwise
 *   PREFETCH_READ    read prefetch at a fixed offset from RX
 *   PREFETCH_WRITE   write-intent prefetch at a fixed offset from RX
 *   ZOLSHFT          log2 of the bytes moved by four LOADX/STOREX pairs
 *                    (32 or 16); turns a byte count into a loop count
 *   ZOLAND           mask giving the bytes left over after that loop
 */
#ifndef CONFIG_ARC_HAS_LL64
/* 32-bit accesses: 4 x 4 = 16 bytes per loop iteration */
#define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
#define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
#define PREFETCH_READ(RX)	prefetch    [RX, 28]
#define PREFETCH_WRITE(RX)	prefetchw   [RX, 32]
#define ZOLSHFT			4
#define ZOLAND			0xF
#else
/* 64-bit accesses: 4 x 8 = 32 bytes per loop iteration */
#define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
#define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
#define PREFETCH_READ(RX)	prefetch    [RX, 56]
#define PREFETCH_WRITE(RX)	prefetchw   [RX, 64]
#define ZOLSHFT			5
#define ZOLAND			0x1F
#endif
|  | 
 | ||
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:    r0 = dest, r1 = src, r2 = n
 * Out:   r0 = dest (r0 is never written; r3 is the running store pointer)
 * Uses:  r3-r10 are written directly, plus lp_count and the status flags.
 *        NOTE(review): when LOADX/STOREX expand to ldd/std the odd partner
 *        registers (up to r11) are presumably used as well - confirm
 *        against the ISA manual.
 *
 * Strategy:
 *   n == 0        return at once
 *   n <= 8        plain byte loop
 *   otherwise     byte-copy until the destination is 32-bit aligned, then
 *                 dispatch on (src & 3):
 *                   CASE 0  src aligned too: unrolled word/dword loop
 *                   CASE 1-3  src off by 1/2/3: read aligned words and
 *                             splice them with shifts, carrying the
 *                             leftover bytes in r5 between iterations
 *                 each case ends with a byte loop for the tail.
 *
 * All loops are zero-overhead loops (lpnz): the loop body runs lp_count
 * times and is skipped entirely when the preceding flag-setting
 * instruction produced zero.
 *
 * The destination is deliberately NOT prefetched with write intent.
 * A prefetchw issued ahead of the store pointer can land on a cache line
 * beyond dest + n; taking ownership of (and dirtying) a line that belongs
 * to a neighbouring buffer can corrupt data if that buffer is the target
 * of DMA. Only read prefetches of the source are kept.
 *
 * Loop/branch targets use the .L prefix so they stay out of the symbol
 * table and do not show up as bogus function names.
 */
ENTRY(memcpy)
	prefetch [r1]		; Prefetch the read location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; delay slot: do not clobber ret val

;;; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2	; delay slot: byte count for the small loop

;;; Byte-copy (4 - (dest & 3)) bytes so that the destination is aligned;
;;; skipped when dest & 3 is already zero
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT	; also the delay slot of the bnz.d above
	lpnz	@.Lcopy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND	; Bytes left over after the unrolled loop
.Lsmallchunk:
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

;;; Source is not 32bit aligned: r4 = src & 3 (1, 2 or 3).
;;; Neither delay-slot instruction below sets flags, so both branches
;;; test the result of the single cmp.
.Lsourceunaligned:
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1	; delay slot: runs for all three cases

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]	; delay slot: runs for off-by-1 and off-by-3

;;; CASE 1: The source is unaligned, off by 1
	;; One byte (already read above) gives 16bit alignment,
	;; two more bytes give 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	;; r5 = the 3 bytes read so far, carried into the word loop
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6

	;; Both src and dst are aligned
	lpnz	@.Lcopy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	;; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Remaining 0-7 tail bytes
	lpnz	@.Lcopybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
;;; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1	; together with the delay slot: 2 bytes consumed

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the word loop will run: move the carry to the top half
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	;; Flags are unchanged since the lsr.f above, so this undoes the
	;; positioning exactly when it was applied
	lsr.nz	r5, r5, 16
#endif
	;; Write back the pending 16bits
	sth.ab	r5, [r3, 2]

	and.f	lp_count, r2, 0x07 ;Remaining 0-7 tail bytes
	lpnz	@.Lcopybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
;;; CASE 3: The source is unaligned, off by 3
;;; The single byte needed to reach 32bit alignment was already read
;;; into r5 by the delay slot of the bhi.d

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the word loop will run: move the carry to the top byte
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	;; Flags are unchanged since the lsr.f above, so this undoes the
	;; positioning exactly when it was applied
	lsr.nz	r5, r5, 24
#endif
	;; Write back the pending 8bits
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Remaining 0-7 tail bytes
	lpnz	@.Lcopybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

END(memcpy)