X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FSupport%2FStringRef.cpp;h=ddece087a9e7114a1ab8fdaa01eadef9096ebc84;hb=46f5c11bedb8e6ef1f9e540e99cf1495557d99da;hp=53980519645022f58e01bfc09361c3b0d7e32085;hpb=40f8f6264d5af2c38e797e0dc59827cd231e8ff7;p=oota-llvm.git diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index 53980519645..ddece087a9e 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -9,7 +9,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/edit_distance.h" #include using namespace llvm; @@ -25,33 +26,58 @@ static char ascii_tolower(char x) { return x; } +static char ascii_toupper(char x) { + if (x >= 'a' && x <= 'z') + return x - 'a' + 'A'; + return x; +} + static bool ascii_isdigit(char x) { return x >= '0' && x <= '9'; } -/// compare_lower - Compare strings, ignoring case. -int StringRef::compare_lower(StringRef RHS) const { - for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { - unsigned char LHC = ascii_tolower(Data[I]); - unsigned char RHC = ascii_tolower(RHS.Data[I]); +// strncasecmp() is not available on non-POSIX systems, so define an +// alternative function here. +static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { + for (size_t I = 0; I < Length; ++I) { + unsigned char LHC = ascii_tolower(LHS[I]); + unsigned char RHC = ascii_tolower(RHS[I]); if (LHC != RHC) return LHC < RHC ? -1 : 1; } + return 0; +} +/// compare_lower - Compare strings, ignoring case. +int StringRef::compare_lower(StringRef RHS) const { + if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) + return Res; if (Length == RHS.Length) return 0; return Length < RHS.Length ? -1 : 1; } +/// Check if this string starts with the given \p Prefix, ignoring case. +bool StringRef::startswith_lower(StringRef Prefix) const { + return Length >= Prefix.Length && + ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; +} + +/// Check if this string ends with the given \p Suffix, ignoring case. +bool StringRef::endswith_lower(StringRef Suffix) const { + return Length >= Suffix.Length && + ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; +} + /// compare_numeric - Compare strings, handle embedded numbers. int StringRef::compare_numeric(StringRef RHS) const { - for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { - if (Data[I] == RHS.Data[I]) - continue; + for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { + // Check for sequences of digits. if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { - // The longer sequence of numbers is larger. This doesn't really handle - // prefixed zeros well. - for (size_t J = I+1; J != E+1; ++J) { + // The longer sequence of numbers is considered larger. + // This doesn't really handle prefixed zeros well. + size_t J; + for (J = I + 1; J != E + 1; ++J) { bool ld = J < Length && ascii_isdigit(Data[J]); bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); if (ld != rd) @@ -59,8 +85,15 @@ int StringRef::compare_numeric(StringRef RHS) const { if (!rd) break; } + // The two number sequences have the same length (J-I), just memcmp them. + if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) + return Res < 0 ? -1 : 1; + // Identical number sequences, continue search after the numbers. + I = J - 1; + continue; } - return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; + if (Data[I] != RHS.Data[I]) + return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; } if (Length == RHS.Length) return 0; @@ -70,57 +103,30 @@ int StringRef::compare_numeric(StringRef RHS) const { // Compute the edit distance between the two given strings. unsigned StringRef::edit_distance(llvm::StringRef Other, bool AllowReplacements, - unsigned MaxEditDistance) { - // The algorithm implemented below is the "classic" - // dynamic-programming algorithm for computing the Levenshtein - // distance, which is described here: - // - // http://en.wikipedia.org/wiki/Levenshtein_distance - // - // Although the algorithm is typically described using an m x n - // array, only two rows are used at a time, so this implemenation - // just keeps two separate vectors for those two rows. - size_type m = size(); - size_type n = Other.size(); - - const unsigned SmallBufferSize = 64; - unsigned SmallBuffer[SmallBufferSize]; - llvm::OwningArrayPtr Allocated; - unsigned *previous = SmallBuffer; - if (2*(n + 1) > SmallBufferSize) { - previous = new unsigned [2*(n+1)]; - Allocated.reset(previous); - } - unsigned *current = previous + (n + 1); - - for (unsigned i = 0; i <= n; ++i) - previous[i] = i; - - for (size_type y = 1; y <= m; ++y) { - current[0] = y; - unsigned BestThisRow = current[0]; - - for (size_type x = 1; x <= n; ++x) { - if (AllowReplacements) { - current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u), - min(current[x-1], previous[x])+1); - } - else { - if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1]; - else current[x] = min(current[x-1], previous[x]) + 1; - } - BestThisRow = min(BestThisRow, current[x]); - } + unsigned MaxEditDistance) const { + return llvm::ComputeEditDistance( + makeArrayRef(data(), size()), + makeArrayRef(Other.data(), Other.size()), + AllowReplacements, MaxEditDistance); +} - if (MaxEditDistance && BestThisRow > MaxEditDistance) - return MaxEditDistance + 1; +//===----------------------------------------------------------------------===// +// String Operations +//===----------------------------------------------------------------------===// - unsigned *tmp = current; - current = previous; - previous = tmp; +std::string StringRef::lower() const { + std::string Result(size(), char()); + for (size_type i = 0, e = size(); i != e; ++i) { + Result[i] = ascii_tolower(Data[i]); } + return Result; +} - unsigned Result = previous[n]; +std::string StringRef::upper() const { + std::string Result(size(), char()); + for (size_type i = 0, e = size(); i != e; ++i) { + Result[i] = ascii_toupper(Data[i]); + } return Result; } @@ -131,21 +137,47 @@ unsigned StringRef::edit_distance(llvm::StringRef Other, /// find - Search for the first string \arg Str in the string. /// -/// \return - The index of the first occurence of \arg Str, or npos if not +/// \return - The index of the first occurrence of \arg Str, or npos if not /// found. size_t StringRef::find(StringRef Str, size_t From) const { size_t N = Str.size(); if (N > Length) return npos; - for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) - if (substr(i, N).equals(Str)) - return i; + + // For short haystacks or unsupported needles fall back to the naive algorithm + if (Length < 16 || N > 255 || N == 0) { + for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i) + if (substr(i, N).equals(Str)) + return i; + return npos; + } + + if (From >= Length) + return npos; + + // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. + uint8_t BadCharSkip[256]; + std::memset(BadCharSkip, N, 256); + for (unsigned i = 0; i != N-1; ++i) + BadCharSkip[(uint8_t)Str[i]] = N-1-i; + + unsigned Len = Length-From, Pos = From; + while (Len >= N) { + if (substr(Pos, N).equals(Str)) // See if this is the correct substring. + return Pos; + + // Otherwise skip the appropriate number of bytes. + uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; + Len -= Skip; + Pos += Skip; + } + return npos; } /// rfind - Search for the last string \arg Str in the string. /// -/// \return - The index of the last occurence of \arg Str, or npos if not +/// \return - The index of the last occurrence of \arg Str, or npos if not /// found. size_t StringRef::rfind(StringRef Str) const { size_t N = Str.size(); @@ -169,7 +201,7 @@ StringRef::size_type StringRef::find_first_of(StringRef Chars, for (size_type i = 0; i != Chars.size(); ++i) CharBits.set((unsigned char)Chars[i]); - for (size_type i = min(From, Length), e = Length; i != e; ++i) + for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (CharBits.test((unsigned char)Data[i])) return i; return npos; @@ -178,7 +210,7 @@ StringRef::size_type StringRef::find_first_of(StringRef Chars, /// find_first_not_of - Find the first character in the string that is not /// \arg C or npos if not found. StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { - for (size_type i = min(From, Length), e = Length; i != e; ++i) + for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (Data[i] != C) return i; return npos; @@ -194,7 +226,7 @@ StringRef::size_type StringRef::find_first_not_of(StringRef Chars, for (size_type i = 0; i != Chars.size(); ++i) CharBits.set((unsigned char)Chars[i]); - for (size_type i = min(From, Length), e = Length; i != e; ++i) + for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (!CharBits.test((unsigned char)Data[i])) return i; return npos; @@ -210,12 +242,58 @@ StringRef::size_type StringRef::find_last_of(StringRef Chars, for (size_type i = 0; i != Chars.size(); ++i) CharBits.set((unsigned char)Chars[i]); - for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) + for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) if (CharBits.test((unsigned char)Data[i])) return i; return npos; } +/// find_last_not_of - Find the last character in the string that is not +/// \arg C, or npos if not found. +StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { + for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) + if (Data[i] != C) + return i; + return npos; +} + +/// find_last_not_of - Find the last character in the string that is not in +/// \arg Chars, or npos if not found. +/// +/// Note: O(size() + Chars.size()) +StringRef::size_type StringRef::find_last_not_of(StringRef Chars, + size_t From) const { + std::bitset<1 << CHAR_BIT> CharBits; + for (size_type i = 0, e = Chars.size(); i != e; ++i) + CharBits.set((unsigned char)Chars[i]); + + for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) + if (!CharBits.test((unsigned char)Data[i])) + return i; + return npos; +} + +void StringRef::split(SmallVectorImpl &A, + StringRef Separators, int MaxSplit, + bool KeepEmpty) const { + StringRef rest = *this; + + // rest.data() is used to distinguish cases like "a," that splits into + // "a" + "" and "a" that splits into "a" + 0. + for (int splits = 0; + rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); + ++splits) { + std::pair p = rest.split(Separators); + + if (KeepEmpty || p.first.size() != 0) + A.push_back(p.first); + rest = p.second; + } + // If we have a tail left, add it. + if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) + A.push_back(rest); +} + //===----------------------------------------------------------------------===// // Helpful Algorithms //===----------------------------------------------------------------------===// @@ -237,21 +315,29 @@ static unsigned GetAutoSenseRadix(StringRef &Str) { if (Str.startswith("0x")) { Str = Str.substr(2); return 16; - } else if (Str.startswith("0b")) { + } + + if (Str.startswith("0b")) { Str = Str.substr(2); return 2; - } else if (Str.startswith("0")) { + } + + if (Str.startswith("0o")) { + Str = Str.substr(2); return 8; - } else { - return 10; } + + if (Str.startswith("0")) + return 8; + + return 10; } /// GetAsUnsignedInteger - Workhorse method that converts a integer character /// sequence of radix up to 36 to an unsigned long long value. -static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, - unsigned long long &Result) { +bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, + unsigned long long &Result) { // Autosense radix if not specified. if (Radix == 0) Radix = GetAutoSenseRadix(Str); @@ -281,8 +367,8 @@ static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, unsigned long long PrevResult = Result; Result = Result*Radix+CharVal; - // Check for overflow. - if (Result < PrevResult) + // Check for overflow by shifting back and seeing if bits were lost. + if (Result/Radix < PrevResult) return true; Str = Str.substr(1); @@ -291,17 +377,13 @@ static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, return false; } -bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { - return GetAsUnsignedInteger(*this, Radix, Result); -} - - -bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { +bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, + long long &Result) { unsigned long long ULLVal; // Handle positive strings first. - if (empty() || front() != '-') { - if (GetAsUnsignedInteger(*this, Radix, ULLVal) || + if (Str.empty() || Str.front() != '-') { + if (getAsUnsignedInteger(Str, Radix, ULLVal) || // Check for value so large it overflows a signed value. (long long)ULLVal < 0) return true; @@ -310,7 +392,7 @@ bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { } // Get the positive part of the value. - if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || + if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || // Reject values so large they'd overflow as negative signed, but allow // "-0". This negates the unsigned so that the negative isn't undefined // on signed overflow. @@ -321,24 +403,6 @@ bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { return false; } -bool StringRef::getAsInteger(unsigned Radix, int &Result) const { - long long Val; - if (getAsInteger(Radix, Val) || - (int)Val != Val) - return true; - Result = Val; - return false; -} - -bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { - unsigned long long Val; - if (getAsInteger(Radix, Val) || - (unsigned)Val != Val) - return true; - Result = Val; - return false; -} - bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { StringRef Str = *this; @@ -370,7 +434,7 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { unsigned BitWidth = Log2Radix * Str.size(); if (BitWidth < Result.getBitWidth()) BitWidth = Result.getBitWidth(); // don't shrink the result - else + else if (BitWidth > Result.getBitWidth()) Result = Result.zext(BitWidth); APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix @@ -413,3 +477,9 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { return false; } + + +// Implementation of StringRef hashing. +hash_code llvm::hash_value(StringRef S) { + return hash_combine_range(S.begin(), S.end()); +}