/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

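	(That figure follows from the EV6 cache geometry: the 64KB, 2-way
	set-associative dcache gives each way 32KB of index space, i.e. four
	8KB page-sized slots, so two randomly placed pages share dcache
	indices about one time in four.)
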
	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

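	/* Incoming arguments (Alpha calling convention): $16 = destination
	   page, $17 = source page.  $18 serves as the loop counter and $19
	   as the next destination line to write-hint; loads into $31 (the
	   zero register) below are pure read prefetches.  */
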
	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

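	/* Each iteration of the loop below copies one 64-byte cache line,
	   read-prefetches the source line 5 lines ahead (320($17)) and
	   write-hints the destination line 10 lines ahead (via $19).  118
	   iterations here plus 10 in the cleanup loop cover all 128 lines
	   of an 8KB page.  The body is padded with unops into 11 aligned
	   groups of 4 instruction slots, giving the 11-cycle fetch rate
	   described above.  */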
	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

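	/* By this point every destination line has already been write-hinted
	   and every source line prefetched, so the tail copy needs neither.  */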
	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page