From 660b4b8ec8cbc7ba3bf787dc54b8141b9dba0d65 Mon Sep 17 00:00:00 2001
From: Geoffrey McRae <geoff@hostfission.com>
Date: Sun, 19 Nov 2023 17:15:44 +1100
Subject: [PATCH] [common] rects: fix avx implementation for unaligned accesses

---
 common/src/rects.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/common/src/rects.c b/common/src/rects.c
index d86ab7fd..07c6d68b 100644
--- a/common/src/rects.c
+++ b/common/src/rects.c
@@ -328,29 +328,35 @@ static void rectCopyUnaligned_avx(
   src += ystart * srcPitch + dx;
   dst += ystart * dstPitch + dx;
 
-  const int nvec = width / sizeof(__m256i);
-  const int rem  = width % sizeof(__m256i);
+  const int nvec  = width / sizeof(__m256i);
+  const int rem   = width % sizeof(__m256i);
+  const int align = (uintptr_t)dst & 31;
 
   for (int i = ystart; i < yend; ++i)
   {
-    const __m256i *restrict s = (__m256i*)src;
-          __m256i *restrict d = (__m256i*)dst;
+    // copy the unaligned bytes
+    for(int col = align; col > 0; --col)
+      dst[col] = src[col];
+
+    const __m256i *restrict s = (__m256i*)(src + align);
+          __m256i *restrict d = (__m256i*)ALIGN_TO((uintptr_t)dst, 32);
 
     int vec;
     for(vec = nvec; vec > 3; vec -= 4)
     {
-      _mm256_stream_si256(d + 0, _mm256_load_si256(s + 0));
-      _mm256_stream_si256(d + 1, _mm256_load_si256(s + 1));
-      _mm256_stream_si256(d + 2, _mm256_load_si256(s + 2));
-      _mm256_stream_si256(d + 3, _mm256_load_si256(s + 3));
+      _mm256_stream_si256(d + 0, _mm256_loadu_si256(s + 0));
+      _mm256_stream_si256(d + 1, _mm256_loadu_si256(s + 1));
+      _mm256_stream_si256(d + 2, _mm256_loadu_si256(s + 2));
+      _mm256_stream_si256(d + 3, _mm256_loadu_si256(s + 3));
 
       s += 4;
       d += 4;
     }
 
     for(; vec > 0; --vec, ++d, ++s)
-      _mm256_stream_si256(d, _mm256_load_si256(s));
+      _mm256_stream_si256(d, _mm256_loadu_si256(s));
 
+    // copy any remaining bytes
     for(int col = width - rem; col < width; ++col)
       dst[col] = src[col];