mirror of
https://github.com/gnif/LookingGlass.git
synced 2025-01-03 11:17:10 +00:00
[common] add runtime detection and selection of AVX/AVX2 support
This commit is contained in:
parent
5d4c1d348c
commit
3330f83af6
6 changed files with 245 additions and 14 deletions
|
@ -29,4 +29,19 @@ bool cpuInfo_get(char * model, size_t modelSize, int * procs, int * cores,
|
||||||
|
|
||||||
void cpuInfo_log(void);
|
void cpuInfo_log(void);
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
bool sse, sse2, sse3, ssse3;
|
||||||
|
bool fma;
|
||||||
|
bool sse4_1, sse4_2;
|
||||||
|
bool popcnt;
|
||||||
|
bool aes;
|
||||||
|
bool xsave, osxsave;
|
||||||
|
bool avx, avx2;
|
||||||
|
bool bmi1, bmi2;
|
||||||
|
}
|
||||||
|
CPUInfoFeatures;
|
||||||
|
|
||||||
|
const CPUInfoFeatures * cpuInfo_getFeatures(void);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -70,7 +70,8 @@ void framebuffer_prepare(FrameBuffer * frame);
|
||||||
/**
|
/**
|
||||||
* Write data from the src buffer into the KVMFRFrame
|
* Write data from the src buffer into the KVMFRFrame
|
||||||
*/
|
*/
|
||||||
bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size);
|
extern bool (*framebuffer_write)(FrameBuffer * frame,
|
||||||
|
const void * restrict src, size_t size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the underlying data buffer of the framebuffer.
|
* Gets the underlying data buffer of the framebuffer.
|
||||||
|
|
|
@ -27,18 +27,8 @@
|
||||||
#include "common/framebuffer.h"
|
#include "common/framebuffer.h"
|
||||||
#include "common/types.h"
|
#include "common/types.h"
|
||||||
|
|
||||||
inline static void rectCopyUnaligned(uint8_t * dst, const uint8_t * src,
|
extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src,
|
||||||
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width);
|
||||||
{
|
|
||||||
src += ystart * srcPitch + dx;
|
|
||||||
dst += ystart * dstPitch + dx;
|
|
||||||
for (int i = ystart; i < yend; ++i)
|
|
||||||
{
|
|
||||||
memcpy(dst, src, width);
|
|
||||||
src += srcPitch;
|
|
||||||
dst += dstPitch;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
|
void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
|
||||||
FrameBuffer * frame, int dstPitch, int height,
|
FrameBuffer * frame, int dstPitch, int height,
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
|
|
||||||
#include "common/cpuinfo.h"
|
#include "common/cpuinfo.h"
|
||||||
#include "common/debug.h"
|
#include "common/debug.h"
|
||||||
|
#include "common/util.h"
|
||||||
|
|
||||||
void cpuInfo_log(void)
|
void cpuInfo_log(void)
|
||||||
{
|
{
|
||||||
|
@ -37,3 +38,73 @@ void cpuInfo_log(void)
|
||||||
DEBUG_INFO("CPU Model: %s", model);
|
DEBUG_INFO("CPU Model: %s", model);
|
||||||
DEBUG_INFO("CPU: %d sockets, %d cores, %d threads", sockets, cores, procs);
|
DEBUG_INFO("CPU: %d sockets, %d cores, %d threads", sockets, cores, procs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const CPUInfoFeatures * cpuInfo_getFeatures(void)
|
||||||
|
{
|
||||||
|
static bool initialized = false;
|
||||||
|
static CPUInfoFeatures features;
|
||||||
|
|
||||||
|
if (likely(initialized))
|
||||||
|
return &features;
|
||||||
|
|
||||||
|
int cpuid[4] = {0};
|
||||||
|
|
||||||
|
// leaf1
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"cpuid;"
|
||||||
|
: "=a" (cpuid[0]),
|
||||||
|
"=b" (cpuid[1]),
|
||||||
|
"=c" (cpuid[2]),
|
||||||
|
"=d" (cpuid[3])
|
||||||
|
: "a" (1)
|
||||||
|
);
|
||||||
|
|
||||||
|
features.sse = cpuid[3] & (1 << 25);
|
||||||
|
features.sse2 = cpuid[3] & (1 << 26);
|
||||||
|
features.sse3 = cpuid[2] & (1 << 0);
|
||||||
|
features.ssse3 = cpuid[2] & (1 << 9);
|
||||||
|
features.fma = cpuid[2] & (1 << 12);
|
||||||
|
features.sse4_1 = cpuid[2] & (1 << 19);
|
||||||
|
features.sse4_2 = cpuid[2] & (1 << 20);
|
||||||
|
features.popcnt = cpuid[2] & (1 << 23);
|
||||||
|
features.aes = cpuid[2] & (1 << 25);
|
||||||
|
features.xsave = cpuid[2] & (1 << 26);
|
||||||
|
features.osxsave = cpuid[2] & (1 << 27);
|
||||||
|
features.avx = cpuid[2] & (1 << 28);
|
||||||
|
|
||||||
|
// leaf7
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"cpuid;"
|
||||||
|
: "=a" (cpuid[0]),
|
||||||
|
"=b" (cpuid[1]),
|
||||||
|
"=c" (cpuid[2]),
|
||||||
|
"=d" (cpuid[3])
|
||||||
|
: "a" (7), "c" (0)
|
||||||
|
);
|
||||||
|
|
||||||
|
features.avx2 = cpuid[1] & (1 << 5);
|
||||||
|
features.bmi1 = cpuid[2] & (1 << 3);
|
||||||
|
features.bmi2 = cpuid[2] & (1 << 8);
|
||||||
|
|
||||||
|
if (features.osxsave && features.avx)
|
||||||
|
{
|
||||||
|
int xgetbv = 0;
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"xgetbv;"
|
||||||
|
: "=a" (xgetbv)
|
||||||
|
: "c" (0)
|
||||||
|
: "edx"
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!(xgetbv & 0x6))
|
||||||
|
{
|
||||||
|
features.avx = false;
|
||||||
|
features.avx2 = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &features;
|
||||||
|
};
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "common/framebuffer.h"
|
#include "common/framebuffer.h"
|
||||||
|
#include "common/cpuinfo.h"
|
||||||
#include "common/debug.h"
|
#include "common/debug.h"
|
||||||
|
|
||||||
//#define FB_PROFILE
|
//#define FB_PROFILE
|
||||||
|
@ -29,6 +30,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#include <smmintrin.h>
|
#include <smmintrin.h>
|
||||||
|
#include <immintrin.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
bool framebuffer_wait(const FrameBuffer * frame, size_t size)
|
bool framebuffer_wait(const FrameBuffer * frame, size_t size)
|
||||||
|
@ -165,7 +167,8 @@ void framebuffer_prepare(FrameBuffer * frame)
|
||||||
atomic_store_explicit(&frame->wp, 0, memory_order_release);
|
atomic_store_explicit(&frame->wp, 0, memory_order_release);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t size)
|
static bool framebuffer_write_sse4_1(FrameBuffer * frame,
|
||||||
|
const void * restrict src, size_t size)
|
||||||
{
|
{
|
||||||
#ifdef FB_PROFILE
|
#ifdef FB_PROFILE
|
||||||
static RunningAvg ra = NULL;
|
static RunningAvg ra = NULL;
|
||||||
|
@ -222,6 +225,100 @@ bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t si
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma GCC push_options
|
||||||
|
#pragma GCC target ("avx2")
|
||||||
|
bool framebuffer_write_avx2(FrameBuffer * frame,
|
||||||
|
const void * restrict src, size_t size)
|
||||||
|
{
|
||||||
|
#ifdef FB_PROFILE
|
||||||
|
static RunningAvg ra = NULL;
|
||||||
|
static int raCount = 0;
|
||||||
|
const uint64_t ts = microtime();
|
||||||
|
if (!ra)
|
||||||
|
ra = runningavg_new(100);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__m256i *restrict s = (__m256i *)src;
|
||||||
|
__m256i *restrict d = (__m256i *)frame->data;
|
||||||
|
size_t wp = 0;
|
||||||
|
|
||||||
|
_mm_mfence();
|
||||||
|
|
||||||
|
/* copy in chunks */
|
||||||
|
while (size > 127)
|
||||||
|
{
|
||||||
|
__m256i *_d = (__m256i *)d;
|
||||||
|
__m256i *_s = (__m256i *)s;
|
||||||
|
__m256i v1 = _mm256_stream_load_si256(_s + 0);
|
||||||
|
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
||||||
|
__m256i v3 = _mm256_stream_load_si256(_s + 2);
|
||||||
|
__m256i v4 = _mm256_stream_load_si256(_s + 3);
|
||||||
|
|
||||||
|
_mm256_stream_si256(_d + 0, v1);
|
||||||
|
_mm256_stream_si256(_d + 1, v2);
|
||||||
|
_mm256_stream_si256(_d + 2, v3);
|
||||||
|
_mm256_stream_si256(_d + 3, v4);
|
||||||
|
|
||||||
|
s += 4;
|
||||||
|
d += 4;
|
||||||
|
size -= 128;
|
||||||
|
wp += 128;
|
||||||
|
|
||||||
|
if (wp % FB_CHUNK_SIZE == 0)
|
||||||
|
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (size > 63)
|
||||||
|
{
|
||||||
|
__m256i *_d = (__m256i *)d;
|
||||||
|
__m256i *_s = (__m256i *)s;
|
||||||
|
__m256i v1 = _mm256_stream_load_si256(_s);
|
||||||
|
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
||||||
|
|
||||||
|
_mm256_stream_si256(_d, v1);
|
||||||
|
_mm256_stream_si256(_d + 1, v2);
|
||||||
|
|
||||||
|
s += 2;
|
||||||
|
d += 2;
|
||||||
|
size -= 64;
|
||||||
|
wp += 64;
|
||||||
|
|
||||||
|
if (wp % FB_CHUNK_SIZE == 0)
|
||||||
|
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (size)
|
||||||
|
{
|
||||||
|
memcpy(frame->data + wp, s, size);
|
||||||
|
wp += size;
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||||
|
|
||||||
|
#ifdef FB_PROFILE
|
||||||
|
runningavg_push(ra, microtime() - ts);
|
||||||
|
if (++raCount % 100 == 0)
|
||||||
|
DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#pragma GCC pop_options
|
||||||
|
|
||||||
|
static bool _framebuffer_write(FrameBuffer * frame,
|
||||||
|
const void * restrict src, size_t size)
|
||||||
|
{
|
||||||
|
if (cpuInfo_getFeatures()->avx2)
|
||||||
|
framebuffer_write = &framebuffer_write_avx2;
|
||||||
|
else
|
||||||
|
framebuffer_write = &framebuffer_write_sse4_1;
|
||||||
|
|
||||||
|
return framebuffer_write(frame, src, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool (*framebuffer_write)(FrameBuffer * frame,
|
||||||
|
const void * restrict src, size_t size) = &_framebuffer_write;
|
||||||
|
|
||||||
const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame)
|
const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame)
|
||||||
{
|
{
|
||||||
return frame->data;
|
return frame->data;
|
||||||
|
|
|
@ -20,8 +20,10 @@
|
||||||
|
|
||||||
#include "common/rects.h"
|
#include "common/rects.h"
|
||||||
#include "common/util.h"
|
#include "common/util.h"
|
||||||
|
#include "common/cpuinfo.h"
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
struct Corner
|
struct Corner
|
||||||
{
|
{
|
||||||
|
@ -298,3 +300,58 @@ int rectsRejectContained(FrameDamageRect * rects, int count)
|
||||||
|
|
||||||
return removeRects(rects, count, removed);
|
return removeRects(rects, count, removed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
|
||||||
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
|
{
|
||||||
|
src += ystart * srcPitch + dx;
|
||||||
|
dst += ystart * dstPitch + dx;
|
||||||
|
for (int i = ystart; i < yend; ++i)
|
||||||
|
{
|
||||||
|
memcpy(dst, src, width);
|
||||||
|
src += srcPitch;
|
||||||
|
dst += dstPitch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma GCC push_options
|
||||||
|
#pragma GCC target ("avx2")
|
||||||
|
static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
|
||||||
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
|
{
|
||||||
|
src += ystart * srcPitch + dx;
|
||||||
|
dst += ystart * dstPitch + dx;
|
||||||
|
for (int i = ystart; i < yend; ++i)
|
||||||
|
{
|
||||||
|
int col;
|
||||||
|
for(col = 0; col <= width - 32; col += 32)
|
||||||
|
{
|
||||||
|
_mm_prefetch(src + col + 256, _MM_HINT_T0);
|
||||||
|
__m256i srcData = _mm256_loadu_si256((__m256i*)(src + col));
|
||||||
|
_mm256_storeu_si256((__m256i*)(dst + col), srcData);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(; col < width; ++col)
|
||||||
|
dst[col] = src[col];
|
||||||
|
|
||||||
|
src += srcPitch;
|
||||||
|
dst += dstPitch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#pragma GCC pop_options
|
||||||
|
|
||||||
|
static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src,
|
||||||
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
|
{
|
||||||
|
if (cpuInfo_getFeatures()->avx)
|
||||||
|
rectCopyUnaligned = &rectCopyUnaligned_avx;
|
||||||
|
else
|
||||||
|
rectCopyUnaligned = &rectCopyUnaligned_memcpy;
|
||||||
|
|
||||||
|
return rectCopyUnaligned(
|
||||||
|
dst, src, ystart, yend, dx, dstPitch, srcPitch, width);
|
||||||
|
}
|
||||||
|
|
||||||
|
void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src,
|
||||||
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) =
|
||||||
|
&_rectCopyUnaligned;
|
||||||
|
|
Loading…
Reference in a new issue