121 lines
		
	
	
	
		
			3.2 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
		
		
			
		
	
	
			121 lines
		
	
	
	
		
			3.2 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| 
								 | 
							
									.section	.text..SHmedia32,"ax" // allocatable, executable section
							 | 
						||
| 
								 | 
							
									.align	2
							 | 
						||
| 
								 | 
							
									.global	__udivdi3 // export the entry point
							 | 
						||
| 
								 | 
							
								/* __udivdi3: going by its name, the unsigned 64-bit division helper.
								   NOTE(review): r2 looks like the dividend on entry and the quotient
								   on return, r3 the divisor (see the divide-by-zero remark before the
								   first blink), and r18 the return address consumed by ptabs -
								   confirm against the ABI.  A zero divisor is not checked for.
								   r63 is used as a constant zero throughout.
								   Registers written on either path: r0, r1, r2, r4-r9, r19-r22, r25
								   and tr0; r3 is only read.
								   The instruction order is hand-scheduled; keep it when editing.  */
								__udivdi3:
							 | 
						||
| 
								 | 
							
									shlri r3,1,r4 // r4 = r3 >> 1 (logical)
							 | 
						||
| 
								 | 
							
									nsb r4,r22 // r22 = nsb(r4); used as the left-shift count for r3 on the next line
							 | 
						||
| 
								 | 
							
									shlld r3,r22,r6 // r6 = r3 << r22
							 | 
						||
| 
								 | 
							
									shlri r6,49,r5 // r5 = top 15 bits of r6
							 | 
						||
| 
								 | 
							
									movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
							 | 
						||
| 
								 | 
							
									sub r21,r5,r1 // r1 = r21 - r5; presumably the first reciprocal estimate
							 | 
						||
| 
								 | 
							
									mmulfx.w r1,r1,r4 // r4 = fixed-point product r1 * r1 (16-bit lanes)
							 | 
						||
| 
								 | 
							
									mshflo.w r1,r63,r1 // shuffle the low 16-bit words of r1 with zeros from r63 - TODO confirm lane order
							 | 
						||
| 
								 | 
							
									sub r63,r22,r20 // r63 == 64 % 64
							 | 
						||
| 
								 | 
							
									mmulfx.w r5,r4,r4 // r4 = fixed-point product r5 * r4
							 | 
						||
| 
								 | 
							
									pta large_divisor,tr0 // tr0 = large_divisor, target of the bgt/u below
							 | 
						||
| 
								 | 
							
									addi r20,32,r9 // r9 = r20 + 32, i.e. 32 - r22
							 | 
						||
| 
								 | 
							
									msub.w r1,r4,r1 // r1 = r1 - r4 (16-bit lanes)
							 | 
						||
| 
								 | 
							
									madd.w r1,r1,r1 // r1 = r1 + r1 (16-bit lanes)
							 | 
						||
| 
								 | 
							
									mmulfx.w r1,r1,r4 // r4 = fixed-point product r1 * r1
							 | 
						||
| 
								 | 
							
									shlri r6,32,r7 // r7 = upper 32 bits of the shifted divisor r6
							 | 
						||
| 
								 | 
							
									bgt/u r9,r63,tr0 // large_divisor
							 | 
						||
| 
								 | 
							
									mmulfx.w r5,r4,r4 // r4 = fixed-point product r5 * r4
							 | 
						||
| 
								 | 
							
									shlri r2,32+14,r19 // r19 = r2 >> 46
							 | 
						||
| 
								 | 
							
									addi r22,-31,r0 // r0 = r22 - 31, shift count for the two shlld below
							 | 
						||
| 
								 | 
							
									msub.w r1,r4,r1 // r1 = r1 - r4 (16-bit lanes)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									mulu.l r1,r7,r4 // r4 = unsigned product of the low 32 bits of r1 and r7
							 | 
						||
| 
								 | 
							
									addi r1,-3,r5 // r5 = r1 - 3
							 | 
						||
| 
								 | 
							
									mulu.l r5,r19,r5 // r5 = r5 * r19 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
							 | 
						||
| 
								 | 
							
									shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
							 | 
						||
| 
								 | 
							
									                 the case may be, %0000000000000000 000.11111111111, still */
							 | 
						||
| 
								 | 
							
									muls.l r1,r4,r4 /* leaving at least one sign bit.  */
							 | 
						||
| 
								 | 
							
									mulu.l r5,r3,r8 // r8 = r5 * r3 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									mshalds.l r1,r21,r1 // saturating left shift of r1 by the count in r21 (17, per the movi comment)
							 | 
						||
| 
								 | 
							
									shari r4,26,r4 // r4 = r4 >> 26 (arithmetic)
							 | 
						||
| 
								 | 
							
									shlld r8,r0,r8 // r8 = r8 << r0
							 | 
						||
| 
								 | 
							
									add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
							 | 
						||
| 
								 | 
							
									sub r2,r8,r2 // r2 = r2 - r8
							 | 
						||
| 
								 | 
							
									/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									shlri r2,22,r21 // r21 = r2 >> 22
							 | 
						||
| 
								 | 
							
									mulu.l r21,r1,r21 // r21 = r21 * r1 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									shlld r5,r0,r8 // r8 = r5 << r0; r8 is added to by the later steps
							 | 
						||
| 
								 | 
							
									addi r20,30-22,r0 // r0 = r20 + 8
							 | 
						||
| 
								 | 
							
									shlrd r21,r0,r21 // r21 = r21 >> r0 (logical)
							 | 
						||
| 
								 | 
							
									mulu.l r21,r3,r5 // r5 = r21 * r3 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									add r8,r21,r8 // r8 = r8 + r21
							 | 
						||
| 
								 | 
							
									mcmpgt.l r21,r63,r21 // See Note 1
							 | 
						||
| 
								 | 
							
									addi r20,30,r0 // r0 = r20 + 30, shift count for the third step
							 | 
						||
| 
								 | 
							
									mshfhi.l r63,r21,r21 // form the mask for the andc below from the compare result (Note 1)
							 | 
						||
| 
								 | 
							
									sub r2,r5,r2 // r2 = r2 - r5
							 | 
						||
| 
								 | 
							
									andc r2,r21,r2 // r2 = r2 & ~r21
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									/* small divisor: need a third divide step */
							 | 
						||
| 
								 | 
							
									mulu.l r2,r1,r7 // r7 = r2 * r1 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									ptabs r18,tr0 // tr0 = address held in r18, target of the blink below
							 | 
						||
| 
								 | 
							
									addi r2,1,r2 // r2 = r2 + 1
							 | 
						||
| 
								 | 
							
									shlrd r7,r0,r7 // r7 = r7 >> r0 (logical)
							 | 
						||
| 
								 | 
							
									mulu.l r7,r3,r5 // r5 = r7 * r3 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									add r8,r7,r8 // r8 = r8 + r7
							 | 
						||
| 
								 | 
							
									sub r2,r3,r2 // r2 = r2 - r3
							 | 
						||
| 
								 | 
							
									cmpgt r2,r5,r5 // r5 = 1 if r2 > r5 (signed), else 0
							 | 
						||
| 
								 | 
							
									add r8,r5,r2 // r2 = r8 + r5, the value left in r2 at the blink
							 | 
						||
| 
								 | 
							
									/* could test r3 here to check for divide by zero.  */
							 | 
						||
| 
								 | 
							
									blink tr0,r63 // branch to tr0; the link value is written to r63
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Reached from the bgt/u in __udivdi3, taken when r9 (32 - r22) is
								   greater than r63.  On entry r1, r4, r5, r6, r7, r9, r21 and r22 hold
								   the values computed before that branch.  Execution falls through
								   into no_lo_adj.  */
								large_divisor:
							 | 
						||
| 
								 | 
							
									mmulfx.w r5,r4,r4 // r4 = fixed-point product r5 * r4 (16-bit lanes)
							 | 
						||
| 
								 | 
							
									shlrd r2,r9,r25 // r25 = r2 >> r9 (logical)
							 | 
						||
| 
								 | 
							
									shlri r25,32,r8 // r8 = upper 32 bits of r25
							 | 
						||
| 
								 | 
							
									msub.w r1,r4,r1 // r1 = r1 - r4 (16-bit lanes)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									mulu.l r1,r7,r4 // r4 = unsigned product of the low 32 bits of r1 and r7
							 | 
						||
| 
								 | 
							
									addi r1,-3,r5 // r5 = r1 - 3
							 | 
						||
| 
								 | 
							
									mulu.l r5,r8,r5 // r5 = r5 * r8 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
							 | 
						||
| 
								 | 
							
									shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
							 | 
						||
| 
								 | 
							
									                 the case may be, %0000000000000000 000.11111111111, still */
							 | 
						||
| 
								 | 
							
									muls.l r1,r4,r4 /* leaving at least one sign bit.  */
							 | 
						||
| 
								 | 
							
									shlri r5,14-1,r8 // r8 = r5 >> 13
							 | 
						||
| 
								 | 
							
									mulu.l r8,r7,r5 // r5 = r8 * r7 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									mshalds.l r1,r21,r1 // saturating left shift of r1 by the count in r21
							 | 
						||
| 
								 | 
							
									shari r4,26,r4 // r4 = r4 >> 26 (arithmetic)
							 | 
						||
| 
								 | 
							
									add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
							 | 
						||
| 
								 | 
							
									sub r25,r5,r25 // r25 = r25 - r5
							 | 
						||
| 
								 | 
							
									/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									shlri r25,22,r21 // r21 = r25 >> 22
							 | 
						||
| 
								 | 
							
									mulu.l r21,r1,r21 // r21 = r21 * r1 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									pta no_lo_adj,tr0 // tr0 = no_lo_adj, target of the bgtu/u below
							 | 
						||
| 
								 | 
							
									addi r22,32,r0 // r0 = r22 + 32, shift count for the shlld below
							 | 
						||
| 
								 | 
							
									shlri r21,40,r21 // r21 = r21 >> 40
							 | 
						||
| 
								 | 
							
									mulu.l r21,r7,r5 // r5 = r21 * r7 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									add r8,r21,r8 // r8 = r8 + r21
							 | 
						||
| 
								 | 
							
									shlld r2,r0,r2 // r2 = r2 << r0
							 | 
						||
| 
								 | 
							
									sub r25,r5,r25 // r25 = r25 - r5
							 | 
						||
| 
								 | 
							
									bgtu/u r7,r25,tr0 // no_lo_adj
							 | 
						||
| 
								 | 
							
									addi r8,1,r8 // not skipped: r8 = r8 + 1 ...
							 | 
						||
| 
								 | 
							
									sub r25,r7,r25 // ... and r25 = r25 - r7
							 | 
						||
| 
								 | 
							
								/* Final adjustment and return for the large_divisor path.  Entered by
								   the bgtu/u in large_divisor or by falling through from it; uses r2,
								   r6, r8, r18 and r25 as left by that code.  */
								no_lo_adj:
							 | 
						||
| 
								 | 
							
									mextr4 r2,r25,r2 // r2 = 8 bytes extracted at byte offset 4 from the r25/r2 pair - TODO confirm operand order
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									/* large_divisor: only needs a few adjustments.  */
							 | 
						||
| 
								 | 
							
									mulu.l r8,r6,r5 // r5 = r8 * r6 (unsigned, low 32 bits of each)
							 | 
						||
| 
								 | 
							
									ptabs r18,tr0 // tr0 = address held in r18, target of the blink below
							 | 
						||
| 
								 | 
							
									/* bubble */
							 | 
						||
| 
								 | 
							
									cmpgtu r5,r2,r5 // r5 = 1 if r5 > r2 (unsigned), else 0
							 | 
						||
| 
								 | 
							
									sub r8,r5,r2 // r2 = r8 - r5, the value left in r2 at the blink
							 | 
						||
| 
								 | 
							
									blink tr0,r63 // branch to tr0; the link value is written to r63
							 | 
						||
| 
								 | 
							
									
							 | 
						||
| 
								 | 
							
								/* Note 1: To shift the result of the second divide stage so that the result
							 | 
						||
| 
								 | 
							
								   always fits into 32 bits, yet we still reduce the rest sufficiently
							 | 
						||
| 
								 | 
							
								   would require a lot of instructions to do the shifts just right.  Using
							 | 
						||
| 
								 | 
							
								   the full 64 bit shift result to multiply with the divisor would require
							 | 
						||
| 
								 | 
							
								   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
							 | 
						||
| 
								 | 
							
								   Fortunately, if the upper 32 bits of the shift result are nonzero, we
							 | 
						||
| 
								 | 
							
								   know that the rest after taking this partial result into account will
							 | 
						||
| 
								 | 
							
								   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
							 | 
						||
| 
								 | 
							
								   upper 32 bits of the partial result are nonzero.  */
							 |