cmovb %rcx, %rax
ret
-Seems like the jb branch has high likelyhood of being taken. It would have
+Seems like the jb branch has a high likelihood of being taken. It would have
saved a few instructions.
//===---------------------------------------------------------------------===//
-Poor codegen:
-
-int X[2];
-int b;
-void test(void) {
- memset(X, b, 2*sizeof(X[0]));
-}
-
-llc:
- movq _b@GOTPCREL(%rip), %rax
- movzbq (%rax), %rax
- movq %rax, %rcx
- shlq $8, %rcx
- orq %rax, %rcx
- movq %rcx, %rax
- shlq $16, %rax
- orq %rcx, %rax
- movq %rax, %rcx
- shlq $32, %rcx
- movq _X@GOTPCREL(%rip), %rdx
- orq %rax, %rcx
- movq %rcx, (%rdx)
- ret
-
-gcc:
- movq _b@GOTPCREL(%rip), %rax
- movabsq $72340172838076673, %rdx
- movzbq (%rax), %rax
- imulq %rdx, %rax
- movq _X@GOTPCREL(%rip), %rdx
- movq %rax, (%rdx)
- ret
-
-And the codegen is even worse for the following
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103):
- void fill1(char *s, int a)
- {
- __builtin_memset(s, a, 15);
- }
-
-For this version, we duplicate the computation of the constant to store.
-
-//===---------------------------------------------------------------------===//
-
It's not possible to reference AH, BH, CH, and DH registers in an instruction
requiring REX prefix. However, divb and mulb both produce results in AH. If isel
emits a CopyFromReg which gets turned into a movb and that can be allocated a
//===---------------------------------------------------------------------===//
-Take the following C code
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640):
-
-struct u1
-{
- float x;
- float y;
-};
-
-float foo(struct u1 u)
-{
- return u.x + u.y;
-}
-
-Optimizes to the following IR:
-define float @foo(double %u.0) nounwind readnone {
-entry:
- %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2]
- %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1]
- %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1]
- %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1]
- %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1]
- %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1]
- %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1]
- ret float %0
-}
-
-And current llvm-gcc/clang output:
- movd %xmm0, %rax
- movd %eax, %xmm1
- shrq $32, %rax
- movd %eax, %xmm0
- addss %xmm1, %xmm0
- ret
-
-We really shouldn't move the floats to RAX, only to immediately move them
-straight back to the XMM registers.
-
-There really isn't any good way to handle this purely in IR optimizers; it
-could possibly be handled by changing the output of the fronted, though. It
-would also be feasible to add a x86-specific DAGCombine to optimize the
-bitcast+trunc+(lshr+)bitcast combination.
-
-//===---------------------------------------------------------------------===//
-
Take the following code
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
extern unsigned long table[];
expensive addressing mode.
//===---------------------------------------------------------------------===//
+
+Consider the following (a contrived testcase, but it contains common factors):
+
+#include <stdarg.h>
+int test(int x, ...) {
+  int sum = 0, i;
+  va_list l;
+  va_start(l, x);
+  for (i = 0; i < x; i++)
+    sum += va_arg(l, int);
+  va_end(l);
+  return sum;
+}
+
+The testcase is given in C because fixing it will likely involve changing the
+IR generated for it. The primary issue with the resulting code is that it
+doesn't do any of the optimizations which are possible when we know the
+address of the va_list in the current function is never taken:
+1. We shouldn't spill the XMM registers because we only call va_arg with "int".
+2. It would be nice if we could scalarrepl the va_list (see the layout sketch
+   below).
+3. Probably overkill, but it'd be cool if we could peel off the first five
+   iterations of the loop (the five va_arg values that still arrive in the
+   remaining GP argument registers).
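+
+For reference, a sketch of the va_list representation used by the SysV x86-64
+ABI (field names per the psABI; the typedef name below is made up for
+illustration). If the address of this struct never escapes, its fields can be
+promoted to scalars and the unused parts of the register save area need not
+be written:
+
+typedef struct {
+  unsigned int gp_offset;    /* offset of next GP register slot in save area  */
+  unsigned int fp_offset;    /* offset of next XMM register slot in save area */
+  void *overflow_arg_area;   /* arguments that were passed on the stack       */
+  void *reg_save_area;       /* where the prologue spilled the argument regs  */
+} sketch_va_list[1];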
+
+Other optimizations involving functions which use va_arg on floating-point
+values and don't have the address of a va_list taken:
+1. Conversely to the above, we shouldn't spill general registers if we only
+ call va_arg on "double".
+2. If we know nothing wider than 64 bits is read from the XMM registers, we
+   can change the spilling code to halve the stack used for the XMM spills
+   (spill only the low 64 bits of each register).
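+
+A companion testcase (made up here for illustration, not taken from a bug
+report) exercising the "double" case from (1) above:
+
+#include <stdarg.h>
+double test_fp(int x, ...) {
+  double sum = 0.0;
+  int i;
+  va_list l;
+  va_start(l, x);
+  for (i = 0; i < x; i++)
+    sum += va_arg(l, double);
+  va_end(l);
+  return sum;
+}
+
+Since nothing is ever read back out of the GP portion of the register save
+area here, the GP register spills emitted by the prologue are dead stores.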
+
+//===---------------------------------------------------------------------===//