Here's the code:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | static f32 HorizontalAdd(__m128 PackedSingle) { f32* PackedSinglePtr = (f32*)&PackedSingle; f32 Result = (PackedSinglePtr[0] + PackedSinglePtr[1] + PackedSinglePtr[2] + PackedSinglePtr[3]); return(Result); } static v4 GetMeanColor(loaded_raster* Raster) { __m128i MaskFF_4x = _mm_set1_epi32(0xFF); __m128 Inv255_4x = _mm_set1_ps(1.0f / 255.0f); __m128 Accumulator = _mm_set1_ps(0.0f); u32* SourceDest = (u32*)Raster->Address; for(s32 Y = 0; Y < Raster->Height; Y++) { for(s32 X = 0; X < Raster->Width; X += 4) { __m128i C = _mm_loadu_si128((__m128i*)SourceDest); __m128 Texelb = _mm_cvtepi32_ps(_mm_and_si128(C, MaskFF_4x)); __m128 Texelg = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 8), MaskFF_4x)); __m128 Texelr = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 16), MaskFF_4x)); __m128 Texela = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 24), MaskFF_4x)); Texelb = _mm_mul_ps(Texelb, Inv255_4x); Texelg = _mm_mul_ps(Texelg, Inv255_4x); Texelr = _mm_mul_ps(Texelr, Inv255_4x); Texela = _mm_mul_ps(Texela, Inv255_4x); Accumulator = _mm_add_ps(Accumulator, _mm_set_ps(HorizontalAdd(Texela), HorizontalAdd(Texelb), HorizontalAdd(Texelg), HorizontalAdd(Texelr))); SourceDest += 4; } } __m128 InvPixelCount = _mm_set1_ps(1.0f / (Raster->Width*Raster->Height)); Accumulator = _mm_mul_ps(Accumulator, InvPixelCount); v4 Result; _mm_storeu_ps((f32*)&Result, Accumulator); return(Result); } |