 9678cdaae9
			
		
	
	
	9678cdaae9
	
	
	
		
			
			The POWER8 processor has a Micro Partition Prefetch Engine, which is a fancy way of saying "has way to store and load contents of L2 or L2+MRU way of L3 cache". We initiate the storing of the log (list of addresses) using the logmpp instruction and start restore by writing to a SPR. The logmpp instruction takes parameters in a single 64bit register: - starting address of the table to store log of L2/L2+L3 cache contents - 32kb for L2 - 128kb for L2+L3 - Aligned relative to maximum size of the table (32kb or 128kb) - Log control (no-op, L2 only, L2 and L3, abort logout) We should abort any ongoing logging before initiating one. To initiate restore, we write to the MPPR SPR. The format of what to write to the SPR is similar to the logmpp instruction parameter: - starting address of the table to read from (same alignment requirements) - table size (no data, until end of table) - prefetch rate (from fastest possible to slower. about every 8, 16, 24 or 32 cycles) The idea behind loading and storing the contents of L2/L3 cache is to reduce memory latency in a system that is frequently swapping vcores on a physical CPU. The best case scenario for doing this is when some vcores are doing very cache heavy workloads. The worst case is when they have about 0 cache hits, so we just generate needless memory operations. This implementation just does L2 store/load. In my benchmarks this proves to be useful. Benchmark 1: - 16 core POWER8 - 3x Ubuntu 14.04LTS guests (LE) with 8 VCPUs each - No split core/SMT - two guests running sysbench memory test. sysbench --test=memory --num-threads=8 run - one guest running apache bench (of default HTML page) ab -n 490000 -c 400 http://localhost/ This benchmark aims to measure performance of real world application (apache) where other guests are cache hot with their own workloads. The sysbench memory benchmark does pointer sized writes to a (small) memory buffer in a loop. 
In this benchmark with this patch I can see an improvement both in requests per second (~5%) and in mean and median response times (again, about 5%). The spread of minimum and maximum response times was largely unchanged. benchmark 2: - Same VM config as benchmark 1 - all three guests running sysbench memory benchmark This benchmark aims to see if there is a positive or negative effect on this cache heavy benchmark. Although due to the nature of the benchmark (stores) we may not see a difference in performance, but rather hopefully an improvement in consistency of performance (when vcore switched in, don't have to wait many times for cachelines to be pulled in) The results of this benchmark are improvements in consistency of performance rather than performance itself. With this patch, the few outliers in duration go away and we get more consistent performance in each guest. benchmark 3: - same 3 guests and CPU configuration as benchmark 1 and 2. - two idle guests - 1 guest running STREAM benchmark This scenario also saw performance improvement with this patch. On Copy and Scale workloads from STREAM, I got 5-6% improvement with this patch. For Add and triad, it was around 10% (or more). benchmark 4: - same 3 guests as previous benchmarks - two guests running sysbench --memory, distinctly different cache heavy workload - one guest running STREAM benchmark. Similar improvements to benchmark 3. benchmark 5: - 1 guest, 8 VCPUs, Ubuntu 14.04 - Host configured with split core (SMT8, subcores-per-core=4) - STREAM benchmark In this benchmark, we see a 10-20% performance improvement across the board of STREAM benchmark results with this patch. Based on preliminary investigation and microbenchmarks by Prerna Saxena <prerna@linux.vnet.ibm.com> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com> Acked-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Alexander Graf <agraf@suse.de>
		
			
				
	
	
		
			84 lines
		
	
	
	
		
			2 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			84 lines
		
	
	
	
		
			2 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _ASM_POWERPC_CACHE_H
 | |
| #define _ASM_POWERPC_CACHE_H
 | |
| 
 | |
| #ifdef __KERNEL__
 | |
| 
 | |
| #include <asm/reg.h>
 | |
| 
 | |
| /* bytes per L1 cache line */
 | |
| #if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
 | |
| #define L1_CACHE_SHIFT		4
 | |
| #define MAX_COPY_PREFETCH	1
 | |
| #elif defined(CONFIG_PPC_E500MC)
 | |
| #define L1_CACHE_SHIFT		6
 | |
| #define MAX_COPY_PREFETCH	4
 | |
| #elif defined(CONFIG_PPC32)
 | |
| #define MAX_COPY_PREFETCH	4
 | |
| #if defined(CONFIG_PPC_47x)
 | |
| #define L1_CACHE_SHIFT		7
 | |
| #else
 | |
| #define L1_CACHE_SHIFT		5
 | |
| #endif
 | |
| #else /* CONFIG_PPC64 */
 | |
| #define L1_CACHE_SHIFT		7
 | |
| #endif
 | |
| 
 | |
| #define	L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 | |
| 
 | |
| #define	SMP_CACHE_BYTES		L1_CACHE_BYTES
 | |
| 
 | |
| #if defined(__powerpc64__) && !defined(__ASSEMBLY__)
 | |
| struct ppc64_caches {	/* L1 cache geometry: four d-cache fields, then the matching four i-cache fields */
 | |
| 	u32	dsize;			/* L1 d-cache size */
 | |
| 	u32	dline_size;		/* L1 d-cache line size	*/
 | |
| 	u32	log_dline_size;		/* presumably log2 of dline_size -- TODO confirm */
 | |
| 	u32	dlines_per_page;	/* presumably d-cache lines per page -- TODO confirm */
 | |
| 	u32	isize;			/* L1 i-cache size */
 | |
| 	u32	iline_size;		/* L1 i-cache line size	*/
 | |
| 	u32	log_iline_size;		/* presumably log2 of iline_size -- TODO confirm */
 | |
| 	u32	ilines_per_page;	/* presumably i-cache lines per page -- TODO confirm */
 | |
| };
 | |
| 
 | |
| extern struct ppc64_caches ppc64_caches;
 | |
| 
 | |
| static inline void logmpp(u64 x)
 | |
| {
 | |
| 	asm volatile(PPC_LOGMPP(R1) : : "r" (x));
 | |
| }
 | |
| 
 | |
| #endif /* __powerpc64__ && ! __ASSEMBLY__ */
 | |
| 
 | |
| #if defined(__ASSEMBLY__)
 | |
| /*
 | |
|  * For a snooping icache, we still need a dummy icbi to purge all the
 | |
|  * prefetched instructions from the ifetch buffers. We also need a sync
 | |
|  * before the icbi to order the actual stores to memory that might
 | |
|  * have modified instructions with the icbi.
 | |
|  */
 | |
| #define PURGE_PREFETCHED_INS	\
 | |
| 	sync;			\
 | |
| 	icbi	0,r3;		\
 | |
| 	sync;			\
 | |
| 	isync
 | |
| 
 | |
| #else
 | |
| #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 | |
| 
 | |
| #ifdef CONFIG_6xx	/* L2CR/L3CR accessors are real external functions only in this configuration */
 | |
| extern long _get_L2CR(void);
 | |
| extern long _get_L3CR(void);
 | |
| extern void _set_L2CR(unsigned long);
 | |
| extern void _set_L3CR(unsigned long);
 | |
| #else	/* otherwise: getters evaluate to 0L, setters expand to an empty statement */
 | |
| #define _get_L2CR()	0L
 | |
| #define _get_L3CR()	0L
 | |
| #define _set_L2CR(val)	do { } while(0)	/* val is discarded, never evaluated */
 | |
| #define _set_L3CR(val)	do { } while(0)	/* val is discarded, never evaluated */
 | |
| #endif
 | |
| 
 | |
| extern void cacheable_memzero(void *p, unsigned int nb);
 | |
| extern void *cacheable_memcpy(void *, const void *, unsigned int);
 | |
| 
 | |
| #endif /* !__ASSEMBLY__ */
 | |
| #endif /* __KERNEL__ */
 | |
| #endif /* _ASM_POWERPC_CACHE_H */
 |