// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

We should add support for the "movbe" instruction, which does a byte-swapping
copy (3-addr bswap + memory support?). This is available on Atom processors.
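
For reference, a minimal C sketch of the kind of code movbe could turn into a
single instruction (the function name and the open-coded swap are our own
illustration, assuming 32-bit unsigned):

unsigned load_be32(const unsigned *p) {
  unsigned x = *p;                          /* plain little-endian load */
  return (x >> 24) | ((x >> 8) & 0xff00) |  /* byte-swap to big-endian  */
         ((x << 8) & 0xff0000) | (x << 24);
}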

//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to:
  pmuludq (%eax), %xmm0
  movl 8(%esp), %eax
  movdqa (%eax), %xmm1
  pmulhuw %xmm0, %xmm1

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:
_test:
//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
	%N = cast int %N to uint
	%tmp.24 = setgt int %N, 0
	br bool %tmp.24, label %no_exit, label %return

no_exit:
	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
	%i.0.0 = cast uint %indvar to int
	volatile store int %i.0.0, int* %X
	%indvar.next = add uint %indvar, 1
	%exitcond = seteq uint %indvar.next, %N
	br bool %exitcond, label %return, label %no_exit

return:
	ret void
}

compiles into:

	.text
	.align	4
	.globl	_foo
_foo:
	movl	4(%esp), %eax
	cmpl	$1, %eax
	jl	LBB_foo_4	# return
LBB_foo_1:	# no_exit.preheader
	xorl	%ecx, %ecx
LBB_foo_2:	# no_exit
	movl	L_X$non_lazy_ptr, %edx
	movl	%ecx, (%edx)
	incl	%ecx
	cmpl	%eax, %ecx
	jne	LBB_foo_2	# no_exit
LBB_foo_3:	# return.loopexit
LBB_foo_4:	# return
	ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:
lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
//===---------------------------------------------------------------------===//

We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
  int i;
  for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

LBB1_1:	# entry.bb_crit_edge
	xorl	%ecx, %ecx
	xorw	%dx, %dx
LBB1_2:	# bb
	movl	L_X$non_lazy_ptr, %esi
	movw	%cx, (%esi)
	movl	L_Y$non_lazy_ptr, %esi
	movw	%dx, (%esi)
	addw	$4, %dx
	incl	%ecx
	cmpl	%eax, %ecx
	jne	LBB1_2	# bb

vs.

	xorl	%edx, %edx
	movl	L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
	movl	L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
L4:
	movw	%dx, (%esi)
	leal	0(,%edx,4), %eax
	movw	%ax, (%ecx)
	addl	$1, %edx
	cmpl	%edx, %edi
	jne	L4

This is due to the lack of post regalloc LICM.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g. FR32 /
FR64 to VR128.

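A sketch of where this shows up (our example, not from the original note): %f
below lives in FR32 while the insert wants a VR128 lane, so keeping the two
vregs in separate register classes forces a copy even though both live in XMM
registers:

define <4 x float> @set_lane0(float %f, <4 x float> %v) {
	; %f (FR32) becomes lane 0 of the VR128 result
	%r = insertelement <4 x float> %v, float %f, i32 0
	ret <4 x float> %r
}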

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations. Consider
the code:

int %main() {
entry:
	call fastcc void %test1( )
	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
	ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)

This currently compiles to:

	subl $16, %esp
	call _test5
	addl $12, %esp
	subl $16, %esp
	movl $_test5, (%esp)
	call _test6
	addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

Consider the expansion of:
define i32 @test3(i32 %X) {
//===---------------------------------------------------------------------===//

Use the FLAGS values from arithmetic instructions more. For example, compile:
int add_zf(int *x, int y, int a, int b) {
if ((*x += y) == 0)
movl %ecx, %eax
ret

and:

int add_zf(int *x, int y, int a, int b) {
     if ((*x + y) < 0)
          return a;
     else
          return b;
}

to:

add_zf:
	addl	(%rdi), %esi
	movl	%edx, %eax
	cmovns	%ecx, %eax
	ret

instead of:

_add_zf:
	addl	(%rdi), %esi
	testl	%esi, %esi
	cmovs	%edx, %ecx
	movl	%ecx, %eax
	ret

As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
without a test instruction.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}

to:

_foo:
	leal	1(%edi), %eax
	ret

instead of:

_foo:
	incl	%edi
	movl	%edi, %eax
	ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y.  Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered.  However, this copy is not needed if the register
;; allocator turns the shift into an LEA.  This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
; RUN:   not grep {mov E.X, E.X}

@G = external global i32		; <i32*> [#uses=3]

define i32 @test1(i32 %X, i32 %Y) {
	%Z = add i32 %X, %Y		; <i32> [#uses=1]
	volatile store i32 %Y, i32* @G
	volatile store i32 %Z, i32* @G
	ret i32 %X
}

define i32 @test2(i32 %X) {
	%Z = add i32 %X, 1		; <i32> [#uses=1]
	volatile store i32 %Z, i32* @G
	ret i32 %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:
//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. "xor reg, reg" is shorter than "mov reg, 0".

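For reference, the encodings (sizes per the Intel manuals):

	xorl	%eax, %eax	# 31 C0           -- 2 bytes, clobbers EFLAGS
	movl	$0, %eax	# B8 00 00 00 00  -- 5 bytes, EFLAGS preserved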

//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively enough. Here's a reduced
testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
	%A = load i32* %huge_ptr		; <i32> [#uses=1]
	%B = load i32* %target_ptr		; <i32> [#uses=1]
	%C = or i32 %A, %B		; <i32> [#uses=1]
	store i32 %C, i32* %target_ptr
	ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
	movl	(%rdi), %eax
	orl	(%rsi), %eax
	movl	%eax, (%rsi)
	ret

That should be something like:

_test:
	movl	(%rdi), %eax
	orl	%eax, (%rsi)
	ret

//===---------------------------------------------------------------------===//

The following code:
bb114.preheader: ; preds = %cond_next94
//===---------------------------------------------------------------------===//

This testcase misses a read/modify/write opportunity (from PR1425):

void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
}

We compile it down to:

LBB1_2:	# bb
	movl	(%esi,%edi,4), %ebx
	addl	(%ecx,%edi,4), %ebx
	addl	(%edx,%edi,4), %ebx
	movl	%ebx, (%ecx,%edi,4)
	incl	%edi
	cmpl	%eax, %edi
	jne	LBB1_2	# bb

the inner loop should add to the memory location (%ecx,%edi,4), saving
a mov. Something like:

	movl	(%esi,%edi,4), %ebx
	addl	(%edx,%edi,4), %ebx
	addl	%ebx, (%ecx,%edi,4)

Here is another interesting example:

void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
}

We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:

LBB9_2:	# bb
	movl	(%ecx,%edi,4), %ebx
	subl	(%esi,%edi,4), %ebx
	subl	(%edx,%edi,4), %ebx
	movl	%ebx, (%ecx,%edi,4)
	incl	%edi
	cmpl	%eax, %edi
	jne	LBB9_2	# bb

Additionally, LSR should rewrite the exit condition of these loops to use
a stride-4 IV, which would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.

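A sketch of the first loop with a stride-4 IV (our illustration, assuming the
trip-count comparison is rewritten against width*4, kept in %eax):

	movl	(%esi,%edi), %ebx
	addl	(%edx,%edi), %ebx
	addl	%ebx, (%ecx,%edi)
	addl	$4, %edi	# IV steps by 4, so no scales are needed
	cmpl	%eax, %edi
	jne	LBB1_2	# bb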

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:
//===---------------------------------------------------------------------===//

Handling llvm.memory.barrier on pre-SSE2 CPUs should generate:

	lock ; mov %esp, %esp

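For reference, a barrier call of that era looks roughly like this (hedged: we
assume the old five-flag form of the intrinsic here):

declare void @llvm.memory.barrier(i1, i1, i1, i1, i1)

define void @fence() {
	; all five flags set: full load/store barrier, device memory included
	call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
	ret void
}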

//===---------------------------------------------------------------------===//

The generated code on x86 for checking for signed overflow on a multiply, done
the obvious way, is much longer than it needs to be.
xorl %eax, %eax
ret

There are a few possible improvements here:
1. We should be able to eliminate the dead load into %ecx.
2. We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax";
   this lets us change the cmpl into a testl, which is shorter, and
   eliminate the shift.

We could also in theory eliminate the branch by using a conditional
for the address of the load, but that seems unlikely to be worthwhile
in general.

//===---------------------------------------------------------------------===//
to:

foo:                                    # @foo
# BB#0:                                 # %entry
	movl	4(%esp), %ecx
	cmpb	$0, 16(%esp)
	je	.LBB0_2
# BB#1:                                 # %bb
	movl	8(%esp), %eax
	addl	%ecx, %eax
	ret
.LBB0_2:                                # %bb7
	movl	12(%esp), %edx
	movl	%ecx, %eax
	subl	%edx, %eax
	ret

There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.

//===---------------------------------------------------------------------===//

Take the following:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
}
declare void @exit(i32) noreturn nounwind

This compiles into:

_abort_gzip:                            ## @abort_gzip
## BB#0:                                ## %entry
subl $12, %esp
movb _in_exit.4870.b, %al
	cmpb	$1, %al
	jne	LBB0_2

We somehow miss folding the movb into the cmpb.

//===---------------------------------------------------------------------===//
it would be nice to produce "into" someday.
//===---------------------------------------------------------------------===//

This code:

void vec_mpys1(int y[], const int x[], int scaler) {
    int i;
    for (i = 0; i < 150; i++)
        y[i] += (((long long)scaler * (long long)x[i]) >> 31);
}

Compiles to this loop with GCC 3.x:

.L5:
	movl	%ebx, %eax
	imull	(%edi,%ecx,4)
	shrdl	$31, %edx, %eax
	addl	%eax, (%esi,%ecx,4)
	incl	%ecx
	cmpl	$149, %ecx
	jle	.L5

llvm-gcc compiles it to the much uglier:

LBB1_1:	## bb1
	movl	24(%esp), %eax
	movl	(%eax,%edi,4), %ebx
	movl	%ebx, %ebp
	imull	%esi, %ebp
	movl	%ebx, %eax
	mull	%ecx
	addl	%ebp, %edx
	sarl	$31, %ebx
	imull	%ecx, %ebx
	addl	%edx, %ebx
	shldl	$1, %eax, %ebx
	movl	20(%esp), %eax
	addl	%ebx, (%eax,%edi,4)
	incl	%edi
	cmpl	$150, %edi
	jne	LBB1_1	## bb1

The issue is that we hoist the cast of "scaler" to long long outside of the
loop, the value comes into the loop as two values, and
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
constructed BUILD_PAIR which represents the cast value.

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
OF or CF flags are needed.
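
For example (our illustration, not from the original note), in the function
below the andl already sets ZF, so a separate test of the result against zero
adds nothing:

int any_common_bits(int x, int y) {
  return (x & y) != 0;  /* the and already computed ZF; a testl is redundant */
}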

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero. Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF. In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible. A silly example is:

define <16 x float> @foo(<16 x float> %A) nounwind {
	%tmp = alloca <16 x float>, align 16
	%tmp2 = alloca <16 x float>, align 16
	store <16 x float> %A, <16 x float>* %tmp
	%s = bitcast <16 x float>* %tmp to i8*
	%s2 = bitcast <16 x float>* %tmp2 to i8*
	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
	%R = load <16 x float>* %tmp2
	ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
	subl	$140, %esp
	movaps	%xmm3, 112(%esp)
	movaps	%xmm2, 96(%esp)
	movaps	%xmm1, 80(%esp)
	movaps	%xmm0, 64(%esp)
	movl	60(%esp), %eax
	movl	%eax, 124(%esp)
	movl	56(%esp), %eax
	movl	%eax, 120(%esp)
	movl	52(%esp), %eax
	<many many more 32-bit copies>
	movaps	(%esp), %xmm0
	movaps	16(%esp), %xmm1
	movaps	32(%esp), %xmm2
	movaps	48(%esp), %xmm3
	addl	$140, %esp
	ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors. GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions if they are
   immediately preceded by a conditional branch or are the target of a jump
   (see the sketch after this list).
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".
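
The shape of the first transformation, as we understand it (our sketch, not
GCC's actual output):

	jne	.L2		# the ret would otherwise directly follow a branch...
	nop			# ...so a one-byte pad is inserted ahead of it
	ret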

//===---------------------------------------------------------------------===//

Testcase:
int a(int x) { return (x & 127) > 31; }

Current output:
	movl	4(%esp), %eax
	andl	$127, %eax
	cmpl	$31, %eax
	seta	%al
	movzbl	%al, %eax
	ret

Ideal output:
	xorl	%eax, %eax
	testl	$96, 4(%esp)
	setne	%al
	ret

This should definitely be done in instcombine, canonicalizing the range
condition into a != condition. We get this IR:

define i32 @a(i32 %x) nounwind readnone {
entry:
	%0 = and i32 %x, 127		; <i32> [#uses=1]
	%1 = icmp ugt i32 %0, 31	; <i1> [#uses=1]
	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
	ret i32 %2
}
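
Since (x & 127) > 31 holds exactly when bit 5 or bit 6 of %x is set, the
canonicalized form we would want instcombine to produce is (our sketch):

define i32 @a(i32 %x) nounwind readnone {
entry:
	%0 = and i32 %x, 96		; (x & 127) > 31  <=>  (x & 96) != 0
	%1 = icmp ne i32 %0, 0
	%2 = zext i1 %1 to i32
	ret i32 %2
}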

Instcombine prefers to strength-reduce relational comparisons to equality
comparisons when possible; this should be another case of that. This could
be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
be redesigned to use ComputeMaskedBits and friends.

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
	movl	4(%esp), %eax
	shrl	$4, %eax
	andl	$15, %eax
	ret

Ideal output:
	movzbl	4(%esp), %eax
	shrl	$4, %eax
	ret

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a & 0x80) ? 0x100 : 0; }
int y(int a) { return (a & 0x80) * 2; }

Current:
	testl	$128, 4(%esp)
	setne	%al
	movzbl	%al, %eax
	shll	$8, %eax
	ret

Better:
	movl	4(%esp), %eax
	addl	%eax, %eax
	andl	$256, %eax
	ret

This is another general instcombine transformation that is profitable on all
targets. In LLVM IR, these functions look like this:

define i32 @x(i32 %a) nounwind readnone {
entry:
	%0 = and i32 %a, 128
	%1 = icmp eq i32 %0, 0
	%iftmp.0.0 = select i1 %1, i32 0, i32 256
	ret i32 %iftmp.0.0
}

define i32 @y(i32 %a) nounwind readnone {
entry:
	%0 = shl i32 %a, 1
	%1 = and i32 %0, 256
	ret i32 %1
}

Replacing an icmp+select with a shift should always be considered profitable in
instcombine.

//===---------------------------------------------------------------------===//

Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch()
properly.

When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to return the new value in a register. Instead, it
can use add, sub, inc, or dec instructions with the "lock" prefix.
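
For example (our illustration, using the GCC-style builtin named above), the
call below should lower to a single "lock addl $1, (mem)" since its result is
discarded:

void atomic_bump(volatile int *p) {
  __sync_add_and_fetch(p, 1);  /* result unused: a plain lock add suffices */
}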

This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain. The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def. The proper solution is to add new ISD opcodes for the no-output
variant. The DAG combiner can then transform the node before it gets to target
node selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions. The asm printer (or JIT) can
use this information to add the "lock" prefix.

//===---------------------------------------------------------------------===//

_Bool bar(int *x) { return *x & 1; }

define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
entry:
	%tmp1 = load i32* %x			; <i32> [#uses=1]
	%and = and i32 %tmp1, 1			; <i32> [#uses=1]
	%tobool = icmp ne i32 %and, 0		; <i1> [#uses=1]
	ret i1 %tobool
}

bar:                                    # @bar
# BB#0:                                 # %entry
	movl	4(%esp), %eax
	movb	(%eax), %al
	andb	$1, %al
	movzbl	%al, %eax
	ret

Missed optimization: should be movl+andl.
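
That is, something like (our sketch):

	movl	4(%esp), %eax
	movl	(%eax), %eax
	andl	$1, %eax	# the and also zero-extends the i1 result
	ret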

//===---------------------------------------------------------------------===//

Consider the following two functions compiled with clang:

_Bool foo(int *x) { return !(*x & 4); }
unsigned bar(int *x) { return !(*x & 4); }

foo:
	movl	4(%esp), %eax
	testb	$4, (%eax)
	sete	%al
	movzbl	%al, %eax
	ret

bar:
	movl	4(%esp), %eax
	movl	(%eax), %eax
	shrl	$2, %eax
	andl	$1, %eax
	xorl	$1, %eax
	ret

The second function generates more code even though the two functions are
functionally identical.

//===---------------------------------------------------------------------===//

Take the following C code:
int x(int y) { return (y & 63) << 14; }

Code produced by gcc:
	andl	$63, %edi
	sall	$14, %edi
	movl	%edi, %eax
	ret

Code produced by clang:
	shll	$14, %edi
	movl	%edi, %eax
	andl	$1032192, %eax
	ret

The code produced by gcc is 3 bytes shorter. This sort of construct often
shows up with bitfields.
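
For instance (a hypothetical struct of our own), storing into the bitfield
below produces exactly the (y & 63) << 14 shape in the underlying word:

struct S { unsigned a : 14; unsigned b : 6; };
void set_b(struct S *s, int y) { s->b = y; }  /* masks to 6 bits, shifts by 14 */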

//===---------------------------------------------------------------------===//