From: Chris Lattner Date: Mon, 17 Apr 2006 00:30:41 +0000 (+0000) Subject: Initial checking of a perfect shuffle generation program for 4-element X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=27e98aa07c836276fc7f546da62c791ad63863d1;hp=f0fc7fed66d724d05eac0c518f0a573bccf26aed;p=oota-llvm.git Initial checking of a perfect shuffle generation program for 4-element Altivec vectors. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27736 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/utils/PerfectShuffle/Makefile b/utils/PerfectShuffle/Makefile new file mode 100644 index 00000000000..332c580b98b --- /dev/null +++ b/utils/PerfectShuffle/Makefile @@ -0,0 +1,13 @@ +##===- utils/BuildShuffleTable/Makefile --------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by Chris Lattner and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../.. +TOOLNAME = llvm-BuildShuffleTable +include $(LEVEL)/Makefile.common + diff --git a/utils/PerfectShuffle/PerfectShuffle.cpp b/utils/PerfectShuffle/PerfectShuffle.cpp new file mode 100644 index 00000000000..ae2e05f1d0a --- /dev/null +++ b/utils/PerfectShuffle/PerfectShuffle.cpp @@ -0,0 +1,474 @@ +//===-- BuildShuffleTable.cpp - Perfect Shuffle Generator -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file computes an optimal sequence of instructions for doing all shuffles +// of two 4-element vectors. With a release build and when configured to emit +// an altivec instruction table, this takes about 30s to run on a 2.7Ghz +// PowerPC G5. +// +//===----------------------------------------------------------------------===// + +#include +#include + +struct Operator; + +// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes +// an element from that value of the input vectors. A value of 8 means the +// entry is undefined. + +// Mask manipulation functions. +static inline unsigned short MakeMask(unsigned V0, unsigned V1, + unsigned V2, unsigned V3) { + return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4)); +} + +/// getMaskElt - Return element N of the specified mask. +static unsigned getMaskElt(unsigned Mask, unsigned Elt) { + return (Mask >> ((3-Elt)*4)) & 0xF; +} + +static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) { + unsigned FieldShift = ((3-Elt)*4); + return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift); +} + +// Reject elements where the values are 9-15. +static bool isValidMask(unsigned short Mask) { + unsigned short UndefBits = Mask & 0x8888; + return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0; +} + +/// hasUndefElements - Return true if any of the elements in the mask are undefs +/// +static bool hasUndefElements(unsigned short Mask) { + return (Mask & 0x8888) != 0; +} + +/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not +/// including undef values.. +static bool isOnlyLHSMask(unsigned short Mask) { + return (Mask & 0x4444) == 0; +} + +/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to +/// refer to the LHS only (for when one argument value is passed into the same +/// function twice). +static unsigned short getLHSOnlyMask(unsigned short Mask) { + return Mask & 0xBBBB; // Keep only LHS and Undefs. +} + +/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4 +/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9. +static unsigned getCompressedMask(unsigned short Mask) { + return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 + + getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3); +} + +static void PrintMask(unsigned i, std::ostream &OS) { + OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0))) + << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1))) + << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2))) + << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3))) + << ">"; +} + +/// ShuffleVal - This represents a shufflevector operation. +struct ShuffleVal { + unsigned Cost; // Number of instrs used to generate this value. + Operator *Op; // The Operation used to generate this value. + unsigned short Arg0, Arg1; // Input operands for this value. + + ShuffleVal() : Cost(1000000) {} +}; + + +/// ShufTab - This is the actual shuffle table that we are trying to generate. +/// +static ShuffleVal ShufTab[65536]; + +/// TheOperators - All of the operators that this target supports. +static std::vector TheOperators; + +/// Operator - This is a vector operation that is available for use. +struct Operator { + unsigned short ShuffleMask; + unsigned short OpNum; + const char *Name; + + Operator(unsigned short shufflemask, const char *name) + : ShuffleMask(shufflemask), Name(name) { + OpNum = TheOperators.size(); + TheOperators.push_back(this); + } + ~Operator() { + assert(TheOperators.back() == this); + TheOperators.pop_back(); + } + + bool isOnlyLHSOperator() const { + return isOnlyLHSMask(ShuffleMask); + } + + const char *getName() const { return Name; } + + unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) { + // Extract the elements from LHSMask and RHSMask, as appropriate. + unsigned Result = 0; + for (unsigned i = 0; i != 4; ++i) { + unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF; + unsigned ResElt; + if (SrcElt < 4) + ResElt = getMaskElt(LHSMask, SrcElt); + else if (SrcElt < 8) + ResElt = getMaskElt(RHSMask, SrcElt-4); + else { + assert(SrcElt == 8 && "Bad src elt!"); + ResElt = 8; + } + Result |= ResElt << (4*i); + } + return Result; + } +}; + +static const char *getZeroCostOpName(unsigned short Op) { + if (ShufTab[Op].Arg0 == 0x0123) + return "LHS"; + else if (ShufTab[Op].Arg0 == 0x4567) + return "RHS"; + else { + assert(0 && "bad zero cost operation"); + abort(); + } +} + +static void PrintOperation(unsigned ValNo, unsigned short Vals[]) { + unsigned short ThisOp = Vals[ValNo]; + std::cerr << "t" << ValNo; + PrintMask(ThisOp, std::cerr); + std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "("; + + if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) { + std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0); + PrintMask(ShufTab[ThisOp].Arg0, std::cerr); + } else { + // Figure out what tmp # it is. + for (unsigned i = 0; ; ++i) + if (Vals[i] == ShufTab[ThisOp].Arg0) { + std::cerr << "t" << i; + break; + } + } + + if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) { + std::cerr << ", "; + if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) { + std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1); + PrintMask(ShufTab[ThisOp].Arg1, std::cerr); + } else { + // Figure out what tmp # it is. + for (unsigned i = 0; ; ++i) + if (Vals[i] == ShufTab[ThisOp].Arg1) { + std::cerr << "t" << i; + break; + } + } + } + std::cerr << ") "; +} + +static unsigned getNumEntered() { + unsigned Count = 0; + for (unsigned i = 0; i != 65536; ++i) + Count += ShufTab[i].Cost < 100; + return Count; +} + +static void EvaluateOps(unsigned short Elt, unsigned short Vals[], + unsigned &NumVals) { + if (ShufTab[Elt].Cost == 0) return; + + // If this value has already been evaluated, it is free. FIXME: match undefs. + for (unsigned i = 0, e = NumVals; i != e; ++i) + if (Vals[i] == Elt) return; + + // Otherwise, get the operands of the value, then add it. + unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1; + if (ShufTab[Arg0].Cost) + EvaluateOps(Arg0, Vals, NumVals); + if (Arg0 != Arg1 && ShufTab[Arg1].Cost) + EvaluateOps(Arg1, Vals, NumVals); + + Vals[NumVals++] = Elt; +} + + +int main() { + // Seed the table with accesses to the LHS and RHS. + ShufTab[0x0123].Cost = 0; + ShufTab[0x0123].Op = 0; + ShufTab[0x0123].Arg0 = 0x0123; + ShufTab[0x4567].Cost = 0; + ShufTab[0x4567].Op = 0; + ShufTab[0x4567].Arg0 = 0x4567; + + // Seed the first-level of shuffles, shuffles whose inputs are the input to + // the vectorshuffle operation. + bool MadeChange = true; + unsigned OpCount = 0; + while (MadeChange) { + MadeChange = false; + ++OpCount; + std::cerr << "Starting iteration #" << OpCount << " with " + << getNumEntered() << " entries established.\n"; + + // Scan the table for two reasons: First, compute the maximum cost of any + // operation left in the table. Second, make sure that values with undefs + // have the cheapest alternative that they match. + unsigned MaxCost = ShufTab[0].Cost; + for (unsigned i = 1; i != 0x8889; ++i) { + if (!isValidMask(i)) continue; + if (ShufTab[i].Cost > MaxCost) + MaxCost = ShufTab[i].Cost; + + // If this value has an undef, make it be computed the cheapest possible + // way of any of the things that it matches. + if (hasUndefElements(i)) { + // This code is a little bit tricky, so here's the idea: consider some + // permutation, like 7u4u. To compute the lowest cost for 7u4u, we + // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If + // there are 3 undefs, the number rises to 729 entries we have to scan, + // and for the 4 undef case, we have to scan the whole table. + // + // Instead of doing this huge amount of scanning, we process the table + // entries *in order*, and use the fact that 'u' is 8, larger than any + // valid index. Given an entry like 7u4u then, we only need to scan + // 7[0-7]4u - 8 entries. We can get away with this, because we already + // know that each of 704u, 714u, 724u, etc contain the minimum value of + // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively. + unsigned UndefIdx; + if (i & 0x8000) + UndefIdx = 0; + else if (i & 0x0800) + UndefIdx = 1; + else if (i & 0x0080) + UndefIdx = 2; + else if (i & 0x0008) + UndefIdx = 3; + else + abort(); + + unsigned MinVal = i; + unsigned MinCost = ShufTab[i].Cost; + + // Scan the 8 entries. + for (unsigned j = 0; j != 8; ++j) { + unsigned NewElt = setMaskElt(i, UndefIdx, j); + if (ShufTab[NewElt].Cost < MinCost) { + MinCost = ShufTab[NewElt].Cost; + MinVal = NewElt; + } + } + + // If we found something cheaper than what was here before, use it. + if (i != MinVal) { + MadeChange = true; + ShufTab[i] = ShufTab[MinVal]; + } + } + } + + for (unsigned LHS = 0; LHS != 0x8889; ++LHS) { + if (!isValidMask(LHS)) continue; + if (ShufTab[LHS].Cost > 1000) continue; + + // If nothing involving this operand could possibly be cheaper than what + // we already have, don't consider it. + if (ShufTab[LHS].Cost + 1 >= MaxCost) + continue; + + for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) { + Operator *Op = TheOperators[opnum]; + unsigned short Mask = Op->ShuffleMask; + + // Evaluate op(LHS,LHS) + unsigned ResultMask = Op->getTransformedMask(LHS, LHS); + + unsigned Cost = ShufTab[LHS].Cost + 1; + if (Cost < ShufTab[ResultMask].Cost) { + ShufTab[ResultMask].Cost = Cost; + ShufTab[ResultMask].Op = Op; + ShufTab[ResultMask].Arg0 = LHS; + ShufTab[ResultMask].Arg1 = LHS; + MadeChange = true; + } + + // If this is a two input instruction, include the op(x,y) cases. If + // this is a one input instruction, skip this. + if (Op->isOnlyLHSOperator()) continue; + + for (unsigned RHS = 0; RHS != 0x8889; ++RHS) { + if (!isValidMask(RHS)) continue; + if (ShufTab[RHS].Cost > 1000) continue; + + // If nothing involving this operand could possibly be cheaper than + // what we already have, don't consider it. + if (ShufTab[RHS].Cost + 1 >= MaxCost) + continue; + + + // Evaluate op(LHS,RHS) + unsigned ResultMask = Op->getTransformedMask(LHS, RHS); + + if (ShufTab[ResultMask].Cost <= OpCount || + ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost || + ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost) + continue; + + // Figure out the cost to evaluate this, knowing that CSE's only need + // to be evaluated once. + unsigned short Vals[30]; + unsigned NumVals = 0; + EvaluateOps(LHS, Vals, NumVals); + EvaluateOps(RHS, Vals, NumVals); + + unsigned Cost = NumVals + 1; + if (Cost < ShufTab[ResultMask].Cost) { + ShufTab[ResultMask].Cost = Cost; + ShufTab[ResultMask].Op = Op; + ShufTab[ResultMask].Arg0 = LHS; + ShufTab[ResultMask].Arg1 = RHS; + MadeChange = true; + } + } + } + } + } + + std::cerr << "Finished Table has " << getNumEntered() + << " entries established.\n"; + + unsigned CostArray[10] = { 0 }; + + // Compute a cost histogram. + for (unsigned i = 0; i != 65536; ++i) { + if (!isValidMask(i)) continue; + if (ShufTab[i].Cost > 9) + ++CostArray[9]; + else + ++CostArray[ShufTab[i].Cost]; + } + + for (unsigned i = 0; i != 9; ++i) + if (CostArray[i]) + std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n"; + if (CostArray[9]) + std::cout << "// " << CostArray[9] << " entries have higher cost!\n"; + + + // Build up the table to emit. + std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n"; + std::cout << "static const unsigned InstrTab[6561+1] = {\n"; + + for (unsigned i = 0; i != 0x8889; ++i) { + if (!isValidMask(i)) continue; + + // CostSat - The cost of this operation saturated to two bits. + unsigned CostSat = ShufTab[i].Cost; + if (CostSat > 3) CostSat = 3; + + unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0; + assert(OpNum < 16 && "Too few bits to encode operation!"); + + unsigned LHS = getCompressedMask(ShufTab[i].Arg0); + unsigned RHS = getCompressedMask(ShufTab[i].Arg1); + + // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of + // LHS, and 13 bits of RHS = 32 bits. + unsigned Val = (CostSat << 30) | (OpNum << 27) | (LHS << 13) | RHS; + + std::cout << " " << Val << "U,\t// "; + PrintMask(i, std::cout); + std::cout << ": Cost " << ShufTab[i].Cost; + std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy"); + std::cout << " "; + if (ShufTab[ShufTab[i].Arg0].Cost == 0) { + std::cout << getZeroCostOpName(ShufTab[i].Arg0); + } else { + PrintMask(ShufTab[i].Arg0, std::cout); + } + + if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) { + std::cout << ", "; + if (ShufTab[ShufTab[i].Arg1].Cost == 0) { + std::cout << getZeroCostOpName(ShufTab[i].Arg1); + } else { + PrintMask(ShufTab[i].Arg1, std::cout); + } + } + std::cout << "\n"; + } + std::cout << " 0\n};\n"; + + if (0) { + // Print out the table. + for (unsigned i = 0; i != 0x8889; ++i) { + if (!isValidMask(i)) continue; + if (ShufTab[i].Cost < 1000) { + PrintMask(i, std::cerr); + std::cerr << " - Cost " << ShufTab[i].Cost << " - "; + + unsigned short Vals[30]; + unsigned NumVals = 0; + EvaluateOps(i, Vals, NumVals); + + for (unsigned j = 0, e = NumVals; j != e; ++j) + PrintOperation(j, Vals); + std::cerr << "\n"; + } + } + } +} + + + +///===---------------------------------------------------------------------===// +/// The altivec instruction definitions. This is the altivec-specific part of +/// this file. +///===---------------------------------------------------------------------===// + +struct vmrghw : public Operator { + vmrghw() : Operator(0x0415, "vmrghw") {} +} the_vmrghw; + +struct vmrglw : public Operator { + vmrglw() : Operator(0x2637, "vmrglw") {} +} the_vmrglw; + +template +struct vspltisw : public Operator { + vspltisw(const char *N) : Operator(MakeMask(Elt, Elt, Elt, Elt), N) {} +}; + +vspltisw<0> the_vspltisw0("vspltisw0"); +vspltisw<1> the_vspltisw1("vspltisw1"); +vspltisw<2> the_vspltisw2("vspltisw2"); +vspltisw<3> the_vspltisw3("vspltisw3"); + +template +struct vsldoi : public Operator { + vsldoi(const char *n) : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), n){ + } +}; + +vsldoi<1> the_vsldoi1("vsldoi4"); +vsldoi<2> the_vsldoi2("vsldoi8"); +vsldoi<3> the_vsldoi3("vsldoi12"); +