lib/Support/StringRef.cpp

   1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9
  10 #include "llvm/ADT/StringRef.h"
  11 #include "llvm/ADT/APInt.h"
  12 #include "llvm/ADT/OwningPtr.h"
  13 #include <bitset>
  14
  15 using namespace llvm;
  16
  17 // MSVC emits references to this into the translation units which reference it.
  18 #ifndef _MSC_VER
  19 const size_t StringRef::npos;
  20 #endif
  21
  22 static char ascii_tolower(char x) {
  23   if (x >= 'A' && x <= 'Z')
  24     return x - 'A' + 'a';
  25   return x;
  26 }
  27
  28 static bool ascii_isdigit(char x) {
  29   return x >= '0' && x <= '9';
  30 }
  31
  32 /// compare_lower - Compare strings, ignoring case.
  33 int StringRef::compare_lower(StringRef RHS) const {
  34   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
  35     unsigned char LHC = ascii_tolower(Data[I]);
  36     unsigned char RHC = ascii_tolower(RHS.Data[I]);
  37     if (LHC != RHC)
  38       return LHC < RHC ? -1 : 1;
  39   }
  40
  41   if (Length == RHS.Length)
  42     return 0;
  43   return Length < RHS.Length ? -1 : 1;
  44 }
  45
  46 /// compare_numeric - Compare strings, handle embedded numbers.
  47 int StringRef::compare_numeric(StringRef RHS) const {
  48   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
  49     if (Data[I] == RHS.Data[I])
  50       continue;
  51     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
  52       // The longer sequence of numbers is larger. This doesn't really handle
  53       // prefixed zeros well.
  54       for (size_t J = I+1; J != E+1; ++J) {
  55         bool ld = J < Length && ascii_isdigit(Data[J]);
  56         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
  57         if (ld != rd)
  58           return rd ? -1 : 1;
  59         if (!rd)
  60           break;
  61       }
  62     }
  63     return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
  64   }
  65   if (Length == RHS.Length)
  66     return 0;
  67   return Length < RHS.Length ? -1 : 1;
  68 }
  69
  70 // Compute the edit distance between the two given strings.
  71 unsigned StringRef::edit_distance(llvm::StringRef Other,
  72                                   bool AllowReplacements,
  73                                   unsigned MaxEditDistance) {
  74   // The algorithm implemented below is the "classic"
  75   // dynamic-programming algorithm for computing the Levenshtein
  76   // distance, which is described here:
  77   //
  78   //   http://en.wikipedia.org/wiki/Levenshtein_distance
  79   //
  80   // Although the algorithm is typically described using an m x n
  81   // array, only two rows are used at a time, so this implemenation
  82   // just keeps two separate vectors for those two rows.
  83   size_type m = size();
  84   size_type n = Other.size();
  85
  86   const unsigned SmallBufferSize = 64;
  87   unsigned SmallBuffer[SmallBufferSize];
  88   llvm::OwningArrayPtr<unsigned> Allocated;
  89   unsigned *previous = SmallBuffer;
  90   if (2*(n + 1) > SmallBufferSize) {
  91     previous = new unsigned [2*(n+1)];
  92     Allocated.reset(previous);
  93   }
  94   unsigned *current = previous + (n + 1);
  95
  96   for (unsigned i = 0; i <= n; ++i)
  97     previous[i] = i;
  98
  99   for (size_type y = 1; y <= m; ++y) {
 100     current[0] = y;
 101     unsigned BestThisRow = current[0];
 102
 103     for (size_type x = 1; x <= n; ++x) {
 104       if (AllowReplacements) {
 105         current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
 106                          min(current[x-1], previous[x])+1);
 107       }
 108       else {
 109         if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
 110         else current[x] = min(current[x-1], previous[x]) + 1;
 111       }
 112       BestThisRow = min(BestThisRow, current[x]);
 113     }
 114
 115     if (MaxEditDistance && BestThisRow > MaxEditDistance)
 116       return MaxEditDistance + 1;
 117
 118     unsigned *tmp = current;
 119     current = previous;
 120     previous = tmp;
 121   }
 122
 123   unsigned Result = previous[n];
 124   return Result;
 125 }
 126
 127 //===----------------------------------------------------------------------===//
 128 // String Searching
 129 //===----------------------------------------------------------------------===//
 130
 131
 132 /// find - Search for the first string \arg Str in the string.
 133 ///
 134 /// \return - The index of the first occurence of \arg Str, or npos if not
 135 /// found.
 136 size_t StringRef::find(StringRef Str, size_t From) const {
 137   size_t N = Str.size();
 138   if (N > Length)
 139     return npos;
 140   for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
 141     if (substr(i, N).equals(Str))
 142       return i;
 143   return npos;
 144 }
 145
 146 /// rfind - Search for the last string \arg Str in the string.
 147 ///
 148 /// \return - The index of the last occurence of \arg Str, or npos if not
 149 /// found.
 150 size_t StringRef::rfind(StringRef Str) const {
 151   size_t N = Str.size();
 152   if (N > Length)
 153     return npos;
 154   for (size_t i = Length - N + 1, e = 0; i != e;) {
 155     --i;
 156     if (substr(i, N).equals(Str))
 157       return i;
 158   }
 159   return npos;
 160 }
 161
 162 /// find_first_of - Find the first character in the string that is in \arg
 163 /// Chars, or npos if not found.
 164 ///
 165 /// Note: O(size() + Chars.size())
 166 StringRef::size_type StringRef::find_first_of(StringRef Chars,
 167                                               size_t From) const {
 168   std::bitset<1 << CHAR_BIT> CharBits;
 169   for (size_type i = 0; i != Chars.size(); ++i)
 170     CharBits.set((unsigned char)Chars[i]);
 171
 172   for (size_type i = min(From, Length), e = Length; i != e; ++i)
 173     if (CharBits.test((unsigned char)Data[i]))
 174       return i;
 175   return npos;
 176 }
 177
 178 /// find_first_not_of - Find the first character in the string that is not
 179 /// \arg C or npos if not found.
 180 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
 181   for (size_type i = min(From, Length), e = Length; i != e; ++i)
 182     if (Data[i] != C)
 183       return i;
 184   return npos;
 185 }
 186
 187 /// find_first_not_of - Find the first character in the string that is not
 188 /// in the string \arg Chars, or npos if not found.
 189 ///
 190 /// Note: O(size() + Chars.size())
 191 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
 192                                                   size_t From) const {
 193   std::bitset<1 << CHAR_BIT> CharBits;
 194   for (size_type i = 0; i != Chars.size(); ++i)
 195     CharBits.set((unsigned char)Chars[i]);
 196
 197   for (size_type i = min(From, Length), e = Length; i != e; ++i)
 198     if (!CharBits.test((unsigned char)Data[i]))
 199       return i;
 200   return npos;
 201 }
 202
 203 /// find_last_of - Find the last character in the string that is in \arg C,
 204 /// or npos if not found.
 205 ///
 206 /// Note: O(size() + Chars.size())
 207 StringRef::size_type StringRef::find_last_of(StringRef Chars,
 208                                              size_t From) const {
 209   std::bitset<1 << CHAR_BIT> CharBits;
 210   for (size_type i = 0; i != Chars.size(); ++i)
 211     CharBits.set((unsigned char)Chars[i]);
 212
 213   for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
 214     if (CharBits.test((unsigned char)Data[i]))
 215       return i;
 216   return npos;
 217 }
 218
 219 //===----------------------------------------------------------------------===//
 220 // Helpful Algorithms
 221 //===----------------------------------------------------------------------===//
 222
 223 /// count - Return the number of non-overlapped occurrences of \arg Str in
 224 /// the string.
 225 size_t StringRef::count(StringRef Str) const {
 226   size_t Count = 0;
 227   size_t N = Str.size();
 228   if (N > Length)
 229     return 0;
 230   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
 231     if (substr(i, N).equals(Str))
 232       ++Count;
 233   return Count;
 234 }
 235
 236 static unsigned GetAutoSenseRadix(StringRef &Str) {
 237   if (Str.startswith("0x")) {
 238     Str = Str.substr(2);
 239     return 16;
 240   } else if (Str.startswith("0b")) {
 241     Str = Str.substr(2);
 242     return 2;
 243   } else if (Str.startswith("0")) {
 244     return 8;
 245   } else {
 246     return 10;
 247   }
 248 }
 249
 250
 251 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
 252 /// sequence of radix up to 36 to an unsigned long long value.
 253 static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
 254                                  unsigned long long &Result) {
 255   // Autosense radix if not specified.
 256   if (Radix == 0)
 257     Radix = GetAutoSenseRadix(Str);
 258
 259   // Empty strings (after the radix autosense) are invalid.
 260   if (Str.empty()) return true;
 261
 262   // Parse all the bytes of the string given this radix.  Watch for overflow.
 263   Result = 0;
 264   while (!Str.empty()) {
 265     unsigned CharVal;
 266     if (Str[0] >= '0' && Str[0] <= '9')
 267       CharVal = Str[0]-'0';
 268     else if (Str[0] >= 'a' && Str[0] <= 'z')
 269       CharVal = Str[0]-'a'+10;
 270     else if (Str[0] >= 'A' && Str[0] <= 'Z')
 271       CharVal = Str[0]-'A'+10;
 272     else
 273       return true;
 274
 275     // If the parsed value is larger than the integer radix, the string is
 276     // invalid.
 277     if (CharVal >= Radix)
 278       return true;
 279
 280     // Add in this character.
 281     unsigned long long PrevResult = Result;
 282     Result = Result*Radix+CharVal;
 283
 284     // Check for overflow.
 285     if (Result < PrevResult)
 286       return true;
 287
 288     Str = Str.substr(1);
 289   }
 290
 291   return false;
 292 }
 293
 294 bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
 295   return GetAsUnsignedInteger(*this, Radix, Result);
 296 }
 297
 298
 299 bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
 300   unsigned long long ULLVal;
 301
 302   // Handle positive strings first.
 303   if (empty() || front() != '-') {
 304     if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
 305         // Check for value so large it overflows a signed value.
 306         (long long)ULLVal < 0)
 307       return true;
 308     Result = ULLVal;
 309     return false;
 310   }
 311
 312   // Get the positive part of the value.
 313   if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
 314       // Reject values so large they'd overflow as negative signed, but allow
 315       // "-0".  This negates the unsigned so that the negative isn't undefined
 316       // on signed overflow.
 317       (long long)-ULLVal > 0)
 318     return true;
 319
 320   Result = -ULLVal;
 321   return false;
 322 }
 323
 324 bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
 325   long long Val;
 326   if (getAsInteger(Radix, Val) ||
 327       (int)Val != Val)
 328     return true;
 329   Result = Val;
 330   return false;
 331 }
 332
 333 bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
 334   unsigned long long Val;
 335   if (getAsInteger(Radix, Val) ||
 336       (unsigned)Val != Val)
 337     return true;
 338   Result = Val;
 339   return false;
 340 }
 341
 342 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
 343   StringRef Str = *this;
 344
 345   // Autosense radix if not specified.
 346   if (Radix == 0)
 347     Radix = GetAutoSenseRadix(Str);
 348
 349   assert(Radix > 1 && Radix <= 36);
 350
 351   // Empty strings (after the radix autosense) are invalid.
 352   if (Str.empty()) return true;
 353
 354   // Skip leading zeroes.  This can be a significant improvement if
 355   // it means we don't need > 64 bits.
 356   while (!Str.empty() && Str.front() == '0')
 357     Str = Str.substr(1);
 358
 359   // If it was nothing but zeroes....
 360   if (Str.empty()) {
 361     Result = APInt(64, 0);
 362     return false;
 363   }
 364
 365   // (Over-)estimate the required number of bits.
 366   unsigned Log2Radix = 0;
 367   while ((1U << Log2Radix) < Radix) Log2Radix++;
 368   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
 369
 370   unsigned BitWidth = Log2Radix * Str.size();
 371   if (BitWidth < Result.getBitWidth())
 372     BitWidth = Result.getBitWidth(); // don't shrink the result
 373   else
 374     Result = Result.zext(BitWidth);
 375
 376   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
 377   if (!IsPowerOf2Radix) {
 378     // These must have the same bit-width as Result.
 379     RadixAP = APInt(BitWidth, Radix);
 380     CharAP = APInt(BitWidth, 0);
 381   }
 382
 383   // Parse all the bytes of the string given this radix.
 384   Result = 0;
 385   while (!Str.empty()) {
 386     unsigned CharVal;
 387     if (Str[0] >= '0' && Str[0] <= '9')
 388       CharVal = Str[0]-'0';
 389     else if (Str[0] >= 'a' && Str[0] <= 'z')
 390       CharVal = Str[0]-'a'+10;
 391     else if (Str[0] >= 'A' && Str[0] <= 'Z')
 392       CharVal = Str[0]-'A'+10;
 393     else
 394       return true;
 395
 396     // If the parsed value is larger than the integer radix, the string is
 397     // invalid.
 398     if (CharVal >= Radix)
 399       return true;
 400
 401     // Add in this character.
 402     if (IsPowerOf2Radix) {
 403       Result <<= Log2Radix;
 404       Result |= CharVal;
 405     } else {
 406       Result *= RadixAP;
 407       CharAP = CharVal;
 408       Result += CharAP;
 409     }
 410
 411     Str = Str.substr(1);
 412   }
 413
 414   return false;
 415 }