With CPU_DADDI_WORKAROUNDS enabled, __delay assembles with a macro in a
branch delay slot:
{standard input}: Assembler messages:
{standard input}:18: Warning: Macro instruction expanded into multiple
instructions in a branch delay slot
and broken code results:
0000000000000000 <__delay>:
   0:	1480ffff 	bnez	a0,0 <__delay>
   4:	24010001 	li	at,1
   8:	0081202f 	dsubu	a0,a0,at
   c:	03e00008 	jr	ra
  10:	00000000 	nop
  14:	00000000 	nop
Only the first instruction of the expanded macro, the `li', makes it into
the delay slot; the `dsubu' that actually decrements the count sits at
0x8, outside the two-instruction loop formed by the branch and its delay
slot, and never executes.  Consequently the function loops indefinitely,
showing up prominently as a hang in the delay loop calibration at
bootstrap.
This change corrects the problem by forcing the immediate 1 into a
register, while keeping the code produced identical where
CPU_DADDI_WORKAROUNDS is disabled.
Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/6669/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
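
For illustration, the pattern the fix relies on: GCC's MIPS "I" constraint
accepts a signed 16-bit immediate operand, while "r" forces the operand
into a general register, so the assembler is never handed an immediate it
would have to synthesize with a multi-instruction macro.  A minimal
stand-alone sketch of the same idiom, assuming a 64-bit MIPS target (the
macro and function names here are illustrative, not the kernel's):

/* Select the asm operand constraint at build time: with FORCE_REG_OPND
 * defined, the compiler materializes the constant 1 into a register
 * itself, so the delay slot holds exactly one instruction. */
#ifdef FORCE_REG_OPND			/* hypothetical stand-in for */
#define IMM_OR_REG() "r"		/* CONFIG_CPU_DADDI_WORKAROUNDS */
#else
#define IMM_OR_REG() "I"		/* signed 16-bit immediate */
#endif

static void spin_loops(unsigned long loops)
{
	__asm__ __volatile__ (
	"	.set	noreorder	\n"
	"1:	bnez	%0, 1b		\n"
	"	dsubu	%0, %1		\n"	/* branch delay slot */
	"	.set	reorder		\n"
	: "=r" (loops)
	: IMM_OR_REG() (1), "0" (loops));
}

With the register form, GCC emits the load of 1 ahead of the loop and the
delay slot holds nothing but the single dsubu.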
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 1994 by Waldorf Electronics
 * Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle
 * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
 * Copyright (C) 2007, 2014 Maciej W. Rozycki
 */
#include <linux/module.h>
#include <linux/param.h>
#include <linux/smp.h>

#include <asm/compiler.h>
#include <asm/war.h>

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
#define GCC_DADDI_IMM_ASM() "I"
#else
#define GCC_DADDI_IMM_ASM() "r"
#endif

void __delay(unsigned long loops)
{
	__asm__ __volatile__ (
	"	.set	noreorder				\n"
	"	.align	3					\n"
	"1:	bnez	%0, 1b					\n"
#if BITS_PER_LONG == 32
	"	subu	%0, %1					\n"
#else
	"	dsubu	%0, %1					\n"
#endif
	"	.set	reorder					\n"
	: "=r" (loops)
	: GCC_DADDI_IMM_ASM() (1), "0" (loops));
}
EXPORT_SYMBOL(__delay);
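
/*
 * Added commentary, not part of the original file: a MIPS branch delay
 * slot executes exactly one instruction, whether or not the branch is
 * taken, so the decrement above must assemble to a single machine
 * instruction.  Under DADDI workarounds the assembler expands the
 * immediate form "dsubu %0, 1" into "li $at, 1; dsubu" (see the
 * disassembly in the commit message); only the "li" then occupies the
 * slot and the decrement drops out of the two-instruction loop.
 */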

/*
 * Division by multiplication: you don't have to worry about
 * loss of precision.
 *
 * Use only for very small delays ( < 1 msec).	Should probably use a
 * lookup table, really, as the multiplications take much too long with
 * short delays.  This is a "reasonable" implementation, though (and the
 * first constant multiplication gets optimized away if the delay is
 * a constant)
 */

void __udelay(unsigned long us)
{
	unsigned int lpj = raw_current_cpu_data.udelay_val;

	__delay((us * 0x000010c7ull * HZ * lpj) >> 32);
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long ns)
{
	unsigned int lpj = raw_current_cpu_data.udelay_val;

	__delay((ns * 0x00000005ull * HZ * lpj) >> 32);
}
EXPORT_SYMBOL(__ndelay);
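
The 0x000010c7 and 0x00000005 multipliers implement the
division-by-multiplication the comment above refers to: 0x10c7 = 4295 is
2^32/10^6 rounded up, and 5 is 2^32/10^9 rounded up, so the final ">> 32"
divides by 10^6 (respectively 10^9) while the rounded-up reciprocal errs
toward a slightly longer, never shorter, delay.  A small stand-alone
check, with a hypothetical lpj calibration value for illustration:

#include <stdio.h>

int main(void)
{
	/* 2^32 / 10^6 = 4294.967296 -> rounded up to 0x10c7 (4295);
	 * 2^32 / 10^9 = 4.294967296 -> rounded up to 0x5. */
	printf("2^32/1e6 = %.6f, 0x10c7 = %d\n", 4294967296.0 / 1e6, 0x10c7);
	printf("2^32/1e9 = %.9f, 0x5 = %d\n", 4294967296.0 / 1e9, 0x5);

	/* Loop count for a 10 us delay with HZ = 100 and a hypothetical
	 * calibration of lpj = 4961280 loops per jiffy; the multiply-shift
	 * yields 4961, tracking the exact us * hz * lpj / 1e6 = 4961.28. */
	unsigned long us = 10, hz = 100, lpj = 4961280;
	unsigned long loops = (us * 0x000010c7ull * hz * lpj) >> 32;
	printf("loops for %lu us: %lu (exact: %.2f)\n",
	       us, loops, us * hz * lpj / 1e6);
	return 0;
}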