-; SSSE3-LABEL: @stress_test2
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = zero,zero,xmm0[2],zero,zero,zero,xmm0[11],zero,zero,xmm0[3,4,5],zero,zero,xmm0[15,5]
-; SSSE3-NEXT: movdqa %xmm1, %[[X1:xmm[0-9]+]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X1]] = [[X1]][13,14],zero,[[X1]][0,10,5],zero,[[X1]][10,10],zero,zero,zero,[[X1]][14,12],zero,zero
-; SSSE3-NEXT: por %xmm0, %[[X1]]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = zero,zero,zero,zero,zero,zero,xmm0[1,6,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %xmm1, %[[X2:xmm[0-9]+]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X2]] = [[X2]][1,12,5,9,1,5],zero,zero,[[X2]][u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: por %xmm0, %[[X2]]
-; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,xmm1[2],zero,zero,xmm1[6,15,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pshufb {{.*}} # xmm2 = xmm2[15,8,12],zero,xmm2[13,15],zero,zero,xmm2[u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*}} # xmm2 = xmm2[0],[[X2]][0],xmm2[1],[[X2]][1],xmm2[2],[[X2]][2],xmm2[3],[[X2]][3],xmm2[4],[[X2]][4],xmm2[5],[[X2]][5],xmm2[6],[[X2]][6],xmm2[7],[[X2]][7]
-; SSSE3-NEXT: movdqa %xmm2, %[[X3:xmm[0-9]+]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X3]] = [[X3]][6],zero,[[X3]][14,14],zero,zero,[[X3]][11,0,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %[[X1]], %xmm0
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = zero,xmm0[12],zero,zero,xmm0[1,14],zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: por %[[X3]], %xmm0
-; SSSE3-NEXT: pshufb {{.*}} # xmm2 = zero,xmm2[u,2,3,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %[[X1]], %[[X3]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X3]] = [[X3]][3,u],zero,zero,[[X3]][u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: por %xmm2, %[[X3]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X1]] = [[X1]][1,4,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: punpcklbw {{.*}} # [[X1]] = [[X1]][0],[[X3]][0],[[X1]][1],[[X3]][1],[[X1]][2],[[X3]][2],[[X1]][3],[[X3]][3],[[X1]][4],[[X3]][4],[[X1]][5],[[X3]][5],[[X1]][6],[[X3]][6],[[X1]][7],[[X3]][7]
-; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSE2-LABEL: stress_test2:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
+; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm7[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: packuswb %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,1,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,0,0,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: stress_test2:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,zero,zero,xmm0[11],zero,zero,xmm0[3,4,5],zero,zero,xmm0[15,5]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[13,14],zero,xmm3[0,10,5],zero,xmm3[10,10],zero,zero,zero,xmm3[14,12],zero,zero
+; SSSE3-NEXT: por %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[1,6,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,12,5,9,1,5],zero,zero,xmm4[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2],zero,zero,xmm1[6,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,8,12],zero,xmm2[13,15],zero,zero,xmm2[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[14,14],zero,zero,xmm1[11,0,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[12],zero,zero,xmm0[1,14],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[u,2,3,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[3,u],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]