; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE: # BB#0:
; SSE-NEXT: movq 24(%rdi), %rax
; SSE-NEXT: movdqu 8(%rdi), %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, (%rax)
-; SSE-NEXT: movaps %xmm1, 16(%rax)
+; SSE-NEXT: movaps %xmm0, 16(%rax)
+; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: