So icc is smart enough to know that B is already in memory, so it doesn't load
it and store it back to the stack.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+ return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+It is unclear whether unpckhpd is faster, but its encoding is shorter (it has
+no immediate byte).
+
+//===---------------------------------------------------------------------===//
+
+typedef short v8i16 __attribute__ ((__vector_size__ (16)));
+v8i16 test(v8i16 x, v8i16 y) {
+ return x + y;
+}
+
+compiles to
+
+_test:
+ paddw %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ ret
+
+It should be
+
+ paddw %xmm1, %xmm0
+ ret
+
+since paddw is commutative.