This may fix a reported bug where an R_TILEGX_64 in a module was not pointing to an aligned address. Reported-by: Simon Marchi <simon.marchi@polymtl.ca> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
		
			
				
	
	
		
			197 lines
		
	
	
	
		
			6.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			197 lines
		
	
	
	
		
			6.6 KiB
			
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*
 | 
						|
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 | 
						|
 *
 | 
						|
 *   This program is free software; you can redistribute it and/or
 | 
						|
 *   modify it under the terms of the GNU General Public License
 | 
						|
 *   as published by the Free Software Foundation, version 2.
 | 
						|
 *
 | 
						|
 *   This program is distributed in the hope that it will be useful, but
 | 
						|
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 | 
						|
 *   NON INFRINGEMENT.  See the GNU General Public License for
 | 
						|
 *   more details.
 | 
						|
 *
 | 
						|
 * Support routines for atomic operations.  Each function takes:
 | 
						|
 *
 | 
						|
 * r0: address to manipulate
 | 
						|
 * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
 | 
						|
 * r2: new value to write, or for cmpxchg/add_unless, value to compare against
 | 
						|
 * r3: (cmpxchg/xchg_add_unless) new value to write or add;
 | 
						|
 *     (atomic64 ops) high word of value to write
 | 
						|
 * r4/r5: (cmpxchg64/add_unless64) new value to write or add
 | 
						|
 *
 | 
						|
 * The 32-bit routines return a "struct __get_user" so that the futex code
 | 
						|
 * has an opportunity to return -EFAULT to the user if needed.
 | 
						|
 * The 64-bit routines just return a "long long" with the value,
 | 
						|
 * since they are only used from kernel space and don't expect to fault.
 | 
						|
 * Support for 16-bit ops is included in the framework but we don't provide
 | 
						|
 * any (x86_64 has an atomic_inc_short(), so we might want to some day).
 | 
						|
 *
 | 
						|
 * Note that the caller is advised to issue a suitable L1 or L2
 | 
						|
 * prefetch on the address being manipulated to avoid extra stalls.
 | 
						|
 * In addition, the hot path is on two icache lines, and we start with
 | 
						|
 * a jump to the second line to make sure they are both in cache so
 | 
						|
 * that we never stall waiting on icache fill while holding the lock.
 | 
						|
 * (This doesn't work out with most 64-bit ops, since they consume
 | 
						|
 * too many bundles, so may take an extra i-cache stall.)
 | 
						|
 *
 | 
						|
 * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
 | 
						|
 * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
 | 
						|
 * the code, just page faults.
 | 
						|
 *
 | 
						|
 * If the load or store faults in a way that can be directly fixed in
 | 
						|
 * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
 | 
						|
 * directly, return to the instruction that faulted, and retry it.
 | 
						|
 *
 | 
						|
 * If the load or store faults in a way that potentially requires us
 | 
						|
 * to release the atomic lock, then retry (e.g. a migrating PTE), we
 | 
						|
 * reset the PC in do_page_fault_ics() to the "tns" instruction so
 | 
						|
 * that on return we will reacquire the lock and restart the op.  We
 | 
						|
 * are somewhat overloading the exception_table_entry notion by doing
 | 
						|
 * this, since those entries are not normally used for migrating PTEs.
 | 
						|
 *
 | 
						|
 * If the main page fault handler discovers a bad address, it will see
 | 
						|
 * the PC pointing to the "tns" instruction (due to the earlier
 | 
						|
 * exception_table_entry processing in do_page_fault_ics), and
 | 
						|
 * re-reset the PC to the fault handler, atomic_bad_address(), which
 | 
						|
 * effectively takes over from the atomic op and can either return a
 | 
						|
 * bad "struct __get_user" (for user addresses) or can just panic (for
 | 
						|
 * bad kernel addresses).
 | 
						|
 *
 | 
						|
 * Note that if the value we would store is the same as what we
 | 
						|
 * loaded, we bypass the store.  Other platforms with true atomics can
 | 
						|
 * make the guarantee that a non-atomic __clear_bit(), for example,
 | 
						|
 * can safely race with an atomic test_and_set_bit(); this example is
 | 
						|
 * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
 | 
						|
 * that on Tile since the "atomic" op is really just a
 | 
						|
 * read/modify/write, and can race with the non-atomic
 | 
						|
 * read/modify/write.  However, if we can short-circuit the write when
 | 
						|
 * it is not needed, in the atomic case, we avoid the race.
 | 
						|
 */
 | 
						|
 | 
						|
#include <linux/linkage.h>
 | 
						|
#include <asm/atomic_32.h>
 | 
						|
#include <asm/page.h>
 | 
						|
#include <asm/processor.h>
 | 
						|
 | 
						|
/*
 * Everything from here to __end_atomic_asm_code is placed in the
 * .text.atomic section (allocatable, executable).
 * NOTE(review): __start_atomic_asm_code presumably exists so other code
 * can recognize a PC inside these routines -- confirm against its users.
 */
	.section .text.atomic,"ax"
 | 
						|
ENTRY(__start_atomic_asm_code)
 | 
						|
 | 
						|
/*
 * atomic_op name, bitwidth, body
 *
 * Emits one routine, __atomic<name>, in .text.atomic, aligned to 64 bytes.
 *
 *   name:     suffix appended to "__atomic" to form the symbol.
 *   bitwidth: compared against 16, 32 and 64 below; 16 selects lh/sh
 *             instead of lw/sw, 64 adds a second word at r0 + 4, and 32
 *             additionally emits the __ex_table entries.
 *   body:     instruction string expanded between the load and the store.
 *             It must set r24 (and r25 for 64-bit) to the value to store,
 *             or branch to 3f to skip the store and just unlock/return.
 *
 * Register usage inside the routine:
 *   r0       in: address to operate on; out: old value (low word)
 *   r1       in: lock pointer, copied to ATOMIC_LOCK_REG_NAME;
 *            out: old high word (64-bit) or zero (otherwise)
 *   r21      result of tns on the lock word
 *   r22/r23  old low/high word loaded from memory
 *   r24/r25  new low/high word to store; r24 also holds 1 from entry
 *            until the body runs, and is what gets written to
 *            INTERRUPT_CRITICAL_SECTION
 *   r26/r27  scratch compare results
 *   r28      r0 + 4 (address of the high word)
 *   While spinning for the lock, r23 and r25 temporarily hold the
 *   maximum and current backoff, and r22/r26 hold cycle counts; all are
 *   rewritten after the jump back to 1.
 *
 * Local labels: 1 = load, 2 = store, 3 = unlock and return,
 *               4 = lock acquisition, 5/6 = backoff loop.
 */
	.macro  atomic_op, name, bitwidth, body
 | 
						|
	.align  64
 | 
						|
STD_ENTRY_SECTION(__atomic\name, .text.atomic)
 | 
						|
	{
 | 
						|
	 movei  r24, 1
 | 
						|
	 j      4f		/* branch to second cache line */
 | 
						|
	}
 | 
						|
/* 1: lock is held (or not needed); load the current value into r22. */
1:	{
 | 
						|
	 .ifc \bitwidth,16
 | 
						|
	 lh     r22, r0
 | 
						|
	 .else
 | 
						|
	 lw     r22, r0
 | 
						|
	 addi   r28, r0, 4
 | 
						|
	 .endif
 | 
						|
	}
 | 
						|
/* 64-bit only: load the second word, at r0 + 4, into r23. */
	.ifc \bitwidth,64
 | 
						|
	lw      r23, r28
 | 
						|
	.endif
 | 
						|
	\body /* set r24, and r25 if 64-bit */
 | 
						|
/* Compare old and new words; r27 is only consulted in the 64-bit case. */
	{
 | 
						|
	 seq    r26, r22, r24
 | 
						|
	 seq    r27, r23, r25
 | 
						|
	}
 | 
						|
/* 64-bit only: if the high words differ the store is needed regardless. */
	.ifc \bitwidth,64
 | 
						|
	bbnst   r27, 2f
 | 
						|
	.endif
 | 
						|
	bbs     r26, 3f		/* skip write-back if it's the same value */
 | 
						|
/* 2: write the new value back (r24 to r0, and r25 to r28 for 64-bit). */
2:	{
 | 
						|
	 .ifc \bitwidth,16
 | 
						|
	 sh     r0, r24
 | 
						|
	 .else
 | 
						|
	 sw     r0, r24
 | 
						|
	 .endif
 | 
						|
	}
 | 
						|
	.ifc \bitwidth,64
 | 
						|
	sw      r28, r25
 | 
						|
	.endif
 | 
						|
/* Fence after the store(s), before the lock word is cleared below. */
	mf
 | 
						|
/*
 * 3: common exit.  Return the old value in r0 (and r1), store zero to the
 * lock word, clear INTERRUPT_CRITICAL_SECTION and return to the caller.
 */
3:	{
 | 
						|
	 move   r0, r22
 | 
						|
	 .ifc \bitwidth,64
 | 
						|
	 move   r1, r23
 | 
						|
	 .else
 | 
						|
	 move   r1, zero
 | 
						|
	 .endif
 | 
						|
	 sw     ATOMIC_LOCK_REG_NAME, zero
 | 
						|
	}
 | 
						|
	mtspr   INTERRUPT_CRITICAL_SECTION, zero
 | 
						|
	jrp     lr
 | 
						|
/*
 * 4: reached from the entry bundle.  Latch the lock pointer and set
 * INTERRUPT_CRITICAL_SECTION to r24 (1, from the movei at entry).
 */
4:	{
 | 
						|
	 move   ATOMIC_LOCK_REG_NAME, r1
 | 
						|
	 mtspr  INTERRUPT_CRITICAL_SECTION, r24
 | 
						|
	}
 | 
						|
#ifndef CONFIG_SMP
 | 
						|
	j       1b		/* no atomic locks */
 | 
						|
#else
 | 
						|
/* First attempt on the lock; also preload the backoff limits. */
	{
 | 
						|
	 tns    r21, ATOMIC_LOCK_REG_NAME
 | 
						|
	 moveli r23, 2048       /* maximum backoff time in cycles */
 | 
						|
	}
 | 
						|
	{
 | 
						|
	 bzt    r21, 1b		/* branch if lock acquired */
 | 
						|
	 moveli r25, 32         /* starting backoff time in cycles */
 | 
						|
	}
 | 
						|
/*
 * 5: lock was busy.  Clear INTERRUPT_CRITICAL_SECTION while waiting, then
 * spin at 6 until CYCLE_LOW has advanced by at least r25 cycles.
 */
5:	mtspr   INTERRUPT_CRITICAL_SECTION, zero
 | 
						|
	mfspr   r26, CYCLE_LOW  /* get start point for this backoff */
 | 
						|
6:	mfspr   r22, CYCLE_LOW  /* test to see if we've backed off enough */
 | 
						|
	sub     r22, r22, r26
 | 
						|
	slt     r22, r22, r25
 | 
						|
	bbst    r22, 6b
 | 
						|
/* Re-enter the critical section and retry the lock with doubled backoff. */
	{
 | 
						|
	 mtspr  INTERRUPT_CRITICAL_SECTION, r24
 | 
						|
	 shli   r25, r25, 1     /* double the backoff; retry the tns */
 | 
						|
	}
 | 
						|
	{
 | 
						|
	 tns    r21, ATOMIC_LOCK_REG_NAME
 | 
						|
	 slt    r26, r23, r25   /* is the proposed backoff too big? */
 | 
						|
	}
 | 
						|
/* If the doubled backoff exceeds the maximum (r26 set), clamp r25 to r23. */
	{
 | 
						|
	 bzt    r21, 1b		/* branch if lock acquired */
 | 
						|
	 mvnz   r25, r26, r23
 | 
						|
	}
 | 
						|
	j       5b
 | 
						|
#endif
 | 
						|
	STD_ENDPROC(__atomic\name)
 | 
						|
/*
 * 32-bit routines only: exception table pairs.  A fault at the load (1)
 * or the store (2) is paired with the routine's entry point, so the op
 * is restarted from the top; the entry point itself is paired with
 * __atomic_bad_address.  See the file header for how do_page_fault_ics()
 * uses these.
 */
	.ifc \bitwidth,32
 | 
						|
	.pushsection __ex_table,"a"
 | 
						|
	.align  4
 | 
						|
	.word   1b, __atomic\name
 | 
						|
	.word   2b, __atomic\name
 | 
						|
	.word   __atomic\name, __atomic_bad_address
 | 
						|
	.popsection
 | 
						|
	.endif
 | 
						|
	.endm
 | 
						|
 | 
						|
/*
 * 32-bit operations.
 *
 * Each body string is expanded by atomic_op after the old value has been
 * loaded into r22.  It must leave the value to store in r24, or branch
 * to 3f to skip the store and return the old value unchanged.
 *   r2: operand (new value, addend, mask, or value to compare against)
 *   r3: (cmpxchg/xchg_add_unless) new value to write or add
 *
 * A body must not modify r2 or r3.  The __ex_table entries emitted by
 * atomic_op restart the whole routine from its entry point when the
 * store at label 2 faults, so the body can run a second time with
 * whatever register contents the first pass left behind.  Use r26 for
 * temporaries; atomic_op overwrites it right after the body.
 */
atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
atomic_op _xchg, 32, "move r24, r2"
atomic_op _xchg_add, 32, "add r24, r22, r2"
atomic_op _xchg_add_unless, 32, \
	"sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
atomic_op _or, 32, "or r24, r22, r2"
/*
 * Invert the mask into scratch r26 rather than in place: inverting r2
 * itself would be undone by a restart after a faulting store, turning
 * the retried operation into a plain "and".
 */
atomic_op _andn, 32, "nor r26, r2, zero; and r24, r22, r26"
atomic_op _xor, 32, "xor r24, r22, r2"
 | 
						|
 | 
						|
/*
 * 64-bit operations.
 *
 * Each body string is expanded by atomic_op after the old value has been
 * loaded into r22 (low word) and r23 (high word).  It must leave the
 * value to store in r24 (low) and r25 (high), or branch to 3f to skip
 * the store and return the old value unchanged.
 *   r2/r3: low/high word of the operand (new value, addend, or value to
 *          compare against)
 *   r4/r5: (cmpxchg64/add_unless64) low/high word to write or add
 *
 * 64-bit adds are done as two 32-bit adds plus a carry: slt_u sets r26
 * when the low-word sum wrapped below the old low word.
 */

/* Store r5:r4 only if both words match r3:r2; bail out on either mismatch. */
atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
	{ bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
	slt_u r26, r24, r22; add r25, r25, r26"
/*
 * Add r5:r4 unless the old value equals r3:r2.  The add may be skipped
 * only when BOTH words are equal, so the two "not equal" results are
 * OR-ed together and a single branch tests the combined flag.  Testing
 * each word separately would also skip the add when just one half
 * happens to match (e.g. old = 0x100000000, unless = 0).
 */
atomic_op 64_xchg_add_unless, 64, \
	"{ sne r26, r22, r2; sne r27, r23, r3 }; \
	{ or r26, r26, r27; add r24, r22, r4 }; \
	{ bbns r26, 3f; add r25, r23, r5 }; \
	slt_u r26, r24, r22; add r25, r25, r26"
 | 
						|
 | 
						|
/*
 * Trailing return placed after the last generated routine.  Each routine
 * above ends in an unconditional jump, so nothing falls through to this
 * instruction; per the original note it is here for the backtracer.
 */
	jrp     lr              /* happy backtracer */
 | 
						|
 | 
						|
/*
 * End of the atomic routines.
 * NOTE(review): presumably paired with __start_atomic_asm_code to bound
 * the PC range of this code -- confirm against its users.
 */
ENTRY(__end_atomic_asm_code)
 |