Web lists-archives.com

[MPlayer-dev-eng] [PATCH 3/3] vf_eq.c, vf_eq2.c: Add SSE2 support.




The code in both files is identical; it would be
even nicer if it were actually shared.
---
 libmpcodecs/vf_eq.c  | 42 ++++++++++++++++++++++++++++++++++++++++++
 libmpcodecs/vf_eq2.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/libmpcodecs/vf_eq.c b/libmpcodecs/vf_eq.c
index 1ee3a5970..eff657290 100644
--- a/libmpcodecs/vf_eq.c
+++ b/libmpcodecs/vf_eq.c
@@ -104,6 +104,45 @@ static void process_MMX(unsigned char *dest, int dstride, unsigned char *src, in
 }
 #endif

+#if HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+ATTR_TARGET_SSE2 /* SSE2: dest = clip(src * contrast + brightness), 16 pixels per step */
+static void process_SSE2(unsigned char *dest, int dstride, unsigned char *src, int sstride,
+                    int w, int h, int brightness, int contrast)
+{
+    int scaled_contrast = ((contrast+100)*256*16)/100; /* gain with 12 fractional bits (see >>12 below) */
+    int scaled_brightness = ((brightness+100)*511)/200-128 - scaled_contrast/32; /* offset added after scaling */
+    __m128i mmcontrast = _mm_set1_epi16(scaled_contrast); /* gain replicated into all eight 16-bit lanes */
+    __m128i mmbrightness = _mm_set1_epi16(scaled_brightness); /* offset replicated likewise */
+    __m128i zero = _mm_setzero_si128();
+    while (h--) { /* one iteration per row; h is consumed */
+        int i;
+        for (i = 0; i < w - 15; i += 16) /* vector part: only full groups of 16 pixels */
+        {
+            __m128i mmsrc = _mm_loadu_si128((const __m128i *)(src + i)); /* unaligned 16-byte load */
+            __m128i srclo = _mm_unpacklo_epi8(mmsrc, zero); /* zero-extend bytes 0..7 to 16 bits */
+            __m128i srchi = _mm_unpackhi_epi8(mmsrc, zero); /* zero-extend bytes 8..15 to 16 bits */
+            srclo = _mm_slli_epi16(srclo, 4); /* <<4 so mulhi's >>16 equals the scalar >>12 */
+            srchi = _mm_slli_epi16(srchi, 4);
+            srclo = _mm_mulhi_epu16(srclo, mmcontrast); /* high 16 bits of the unsigned product */
+            srchi = _mm_mulhi_epu16(srchi, mmcontrast);
+            srclo = _mm_add_epi16(srclo, mmbrightness);
+            srchi = _mm_add_epi16(srchi, mmbrightness);
+            _mm_storeu_si128((__m128i *)(dest + i), _mm_packus_epi16(srclo, srchi)); /* pack with saturation to 0..255 */
+        }
+        for (; i < w; i++) /* scalar tail for the pixels the vector loop left over */
+        {
+            int pel = ((src[i] * scaled_contrast)>>12) + scaled_brightness;
+            if(pel&768) pel = (-pel)>>31; /* bit 8 or 9 set: negative -> 0, too large -> -1 (stored as 255); relies on arithmetic >> */
+            dest[i] = pel;
+        }
+        src += sstride; /* advance both pointers to the next row */
+        dest += dstride;
+    }
+}
+#endif
+
 static void process_C(unsigned char *dest, int dstride, unsigned char *src, int sstride,
                     int w, int h, int brightness, int contrast)
 {
@@ -230,6 +269,9 @@ static int vf_open(vf_instance_t *vf, char *args)
 #if HAVE_MMX_INLINE
         if(gCpuCaps.hasMMX) process = process_MMX;
 #endif
+#if HAVE_EMMINTRIN_H
+        if(gCpuCaps.hasSSE2) process = process_SSE2;
+#endif

         return 1;
 }
diff --git a/libmpcodecs/vf_eq2.c b/libmpcodecs/vf_eq2.c
index e02a59c88..d32e23bd3 100644
--- a/libmpcodecs/vf_eq2.c
+++ b/libmpcodecs/vf_eq2.c
@@ -188,6 +188,45 @@ void affine_1d_MMX (eq2_param_t *par, unsigned char *dst, unsigned char *src,
 }
 #endif

+#if HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+ATTR_TARGET_SSE2 /* SSE2: dst = clip(src * contrast + brightness), 16 pixels per step */
+static void affine_1d_SSE2(eq2_param_t *par, unsigned char *dst, unsigned char *src,
+  unsigned w, unsigned h, unsigned dstride, unsigned sstride)
+{
+    int scaled_contrast = par->c * 256 * 16; /* gain with 12 fractional bits (see >>12 below) */
+    int scaled_brightness = ((par->b+1.0)*511)/2-128 - scaled_contrast/32; /* offset added after scaling */
+    __m128i mmcontrast = _mm_set1_epi16(scaled_contrast); /* gain replicated into all eight 16-bit lanes */
+    __m128i mmbrightness = _mm_set1_epi16(scaled_brightness); /* offset replicated likewise */
+    __m128i zero = _mm_setzero_si128();
+    while (h--) { /* one iteration per row; h is consumed */
+        unsigned i; /* unsigned like w, so no mixed-sign comparisons */
+        for (i = 0; i + 15 < w; i += 16) /* not "w - 15": w is unsigned and that wraps when w < 15 */
+        {
+            __m128i mmsrc = _mm_loadu_si128((const __m128i *)(src + i)); /* unaligned 16-byte load */
+            __m128i srclo = _mm_unpacklo_epi8(mmsrc, zero); /* zero-extend bytes 0..7 to 16 bits */
+            __m128i srchi = _mm_unpackhi_epi8(mmsrc, zero); /* zero-extend bytes 8..15 to 16 bits */
+            srclo = _mm_slli_epi16(srclo, 4); /* <<4 so mulhi's >>16 equals the scalar >>12 */
+            srchi = _mm_slli_epi16(srchi, 4);
+            srclo = _mm_mulhi_epi16(srclo, mmcontrast); /* signed multiply: agrees with the scalar tail for a negative gain */
+            srchi = _mm_mulhi_epi16(srchi, mmcontrast);
+            srclo = _mm_add_epi16(srclo, mmbrightness);
+            srchi = _mm_add_epi16(srchi, mmbrightness);
+            _mm_storeu_si128((__m128i *)(dst + i), _mm_packus_epi16(srclo, srchi)); /* pack with saturation to 0..255 */
+        }
+        for (; i < w; i++) /* scalar tail for the last w % 16 pixels */
+        {
+            int pel = ((src[i] * scaled_contrast)>>12) + scaled_brightness;
+            if(pel&768) pel = (-pel)>>31; /* bit 8 or 9 set: negative -> 0, too large -> -1 (stored as 255); relies on arithmetic >> */
+            dst[i] = pel;
+        }
+        src += sstride; /* advance both pointers to the next row */
+        dst += dstride;
+    }
+}
+#endif
+
 static
 void apply_lut (eq2_param_t *par, unsigned char *dst, unsigned char *src,
   unsigned w, unsigned h, unsigned dstride, unsigned sstride)
@@ -289,6 +328,11 @@ void check_values (eq2_param_t *par)
   if ((par->c == 1.0) && (par->b == 0.0) && (par->g == 1.0)) {
     par->adjust = NULL;
   }
+#if HAVE_EMMINTRIN_H
+  else if (par->g == 1.0 && gCpuCaps.hasSSE2) {
+    par->adjust = &affine_1d_SSE2;
+  }
+#endif
 #if HAVE_MMX_INLINE
   else if (par->g == 1.0 && gCpuCaps.hasMMX) {
     par->adjust = &affine_1d_MMX;
--
2.20.1

_______________________________________________
MPlayer-dev-eng mailing list
MPlayer-dev-eng@xxxxxxxxxxxx
https://lists.mplayerhq.hu/mailman/listinfo/mplayer-dev-eng