| // Copyright 2017-2020 The Verible Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef VERIBLE_COMMON_TEXT_TOKEN_INFO_H_ |
| #define VERIBLE_COMMON_TEXT_TOKEN_INFO_H_ |
| |
#include <algorithm>   // for std::copy
#include <cstddef>     // for std::size_t
#include <functional>  // for std::function
#include <iosfwd>
#include <iterator>  // for std::distance
#include <string>
#include <utility>  // for std::move
#include <vector>

#include "absl/strings/string_view.h"
#include "common/text/constants.h"
#include "common/util/iterator_range.h"
| |
| namespace verible { |
| |
| // TokenInfo describes the text and location of a lexed token. |
| // TokenInfo is a unit returned by the FlexLexerAdapter. |
| // Reminder: The text string_view doesn't own its memory, so the owner must |
| // always out-live the token. |
| // |
| class TokenInfo { |
| public: |
| // Construct an EOF token. |
| // Note, however, that the bounds of the internal string_view in this case |
| // do not correspond to any subrange of valid string. |
| // If you need the string_view range to refer to (the end of) another string, |
| // use the following overload that accepts a string. |
| static TokenInfo EOFToken(); |
| |
| // Construct an EOF token that points to the end of a string buffer. |
| static TokenInfo EOFToken(absl::string_view); |
| |
| // Hide default constructor, force explicit initialization or call to |
| // EOFToken(). |
| TokenInfo() = delete; |
| |
| TokenInfo(int token_enum, absl::string_view text) |
| : token_enum_(token_enum), text_(text) {} |
| |
| TokenInfo(const TokenInfo&) = default; |
| TokenInfo(TokenInfo&&) = default; |
| TokenInfo& operator=(const TokenInfo&) = default; |
| |
| // Context contains the information needed to display meaningful information |
| // about a TokenInfo. |
| struct Context { |
| // Full range of text in which a token appears. |
| // This is used to calculate byte offsets. |
| absl::string_view base; |
| |
| // Prints a human-readable interpretation form of a token enumeration. |
| std::function<void(std::ostream&, int)> token_enum_translator; |
| |
| explicit Context(absl::string_view b); |
| |
| Context(absl::string_view b, |
| std::function<void(std::ostream&, int)> translator) |
| : base(b), token_enum_translator(translator) {} |
| |
| Context(const Context&) = default; |
| }; |
| |
| int token_enum() const { return token_enum_; } |
| void set_token_enum(int t) { token_enum_ = t; } |
| absl::string_view text() const { return text_; } |
| void set_text(absl::string_view t) { text_ = t; } |
| |
| // Return position of this token's text start relative to a base buffer. |
| int left(absl::string_view base) const { |
| return std::distance(base.begin(), text_.begin()); |
| } |
| |
| // Return position of this token's text end relative to a base buffer. |
| int right(absl::string_view base) const { |
| return std::distance(base.begin(), text_.end()); |
| } |
| |
| // Advances the text range along the same memory buffer to span the |
| // next token of size token_length. Successive calls to this yield |
| // a series of abutting substring ranges. Useful for lexer operation. |
| void AdvanceText(int token_length) { |
| // The end of the previous token is the beginning of the next. |
| text_ = absl::string_view(text_.end(), token_length); |
| } |
| |
| // Writes a human-readable string representation of the token. |
| std::ostream& ToStream(std::ostream&, const Context& context) const; |
| |
| // Prints token representation without byte offsets. |
| std::ostream& ToStream(std::ostream&) const; |
| |
| // Returns a human-readable string representation of the token. |
| std::string ToString(const Context&) const; |
| |
| // Prints token representation without byte offsets. |
| std::string ToString() const; |
| |
| // 'Moves' text string_view to point to another buffer, where the |
| // contents still matches. This is useful for analyzing different copies |
| // of text, and transplanting to buffers that belong to different |
| // memory owners. |
| // This is a potentially dangerous operation, which can be validated |
| // using a combination of object lifetime management and range-checking. |
| // It is the caller's responsibility that it points to valid memory. |
| void RebaseStringView(absl::string_view new_text); |
| |
| // This overload assumes that the string of interest from other has the |
| // same length as the current string_view. |
| // string_view::iterator happens to be const char*, but don't rely on that |
| // fact as it can be implementation-dependent. |
| void RebaseStringView(const char* new_text) { |
| RebaseStringView(absl::string_view(new_text, text_.length())); |
| } |
| |
| // Joins the text from a sequence of (text-disjoint) tokens, and also |
| // transforms the sequence of tokens in-place to point to corresponding |
| // substrings of the newly concatenated string (out). The updated tokens' |
| // string_views will be abutting *subranges* of 'out', and their left/right |
| // offsets will be updated to be relative to out->begin(). |
| // This is very useful for lexer test case construction. |
| static void Concatenate(std::string* out, std::vector<TokenInfo>* tokens); |
| |
| // The default comparison operator requires that not only the contents |
| // of the internal string_view be equal, but that they point to the |
| // same buffer range. See EquivalentWithoutLocation() for the variant that |
| // doesn't require range equality. |
| bool operator==(const TokenInfo& token) const; |
| bool operator!=(const TokenInfo& token) const { return !(*this == token); } |
| |
| // Returns true if tokens are considered equivalent, ignoring location. |
| bool EquivalentWithoutLocation(const TokenInfo& token) const { |
| return token_enum_ == token.token_enum_ && |
| (token_enum_ == TK_EOF || text_ == token.text_); |
| } |
| |
| // Returns true if tokens have equal enum and equal string length (but |
| // otherwise ignoring string contents). This is useful for verifying |
| // space-preserving obfuscation transformations. |
| bool EquivalentBySpace(const TokenInfo& token) const { |
| return token_enum_ == token.token_enum_ && |
| (token_enum_ == TK_EOF || text_.length() == token.text_.length()); |
| } |
| |
| bool isEOF() const { return token_enum_ == TK_EOF; } |
| |
| protected: // protected, as ExpectedTokenInfo accesses it. |
| int token_enum_; |
| |
| // The substring of a larger text that this token represents. |
| absl::string_view text_; |
| }; |
| |
| std::ostream& operator<<(std::ostream&, const TokenInfo&); |
| |
| // Streamable structure that combines a token with its detailed context. |
| struct TokenWithContext { |
| TokenInfo token; |
| TokenInfo::Context context; |
| }; |
| |
| std::ostream& operator<<(std::ostream&, const TokenWithContext&); |
| |
| // Joins a range of TokenInfo-like objects to form a string whose contents |
| // match those of the elements's ranges, and also points the elements |
| // to the corresponding matching substrings of the new string (rebase). |
| // TokenInfo must be a write-able iterator. |
| // TokenInfo's element type must have the same interface as TokenInfo, |
| // e.g. a (public) subclass of TokenInfo. |
| template <class TokenIter> |
| void ConcatenateTokenInfos(std::string* out, TokenIter begin, TokenIter end) { |
| // Inspired by absl::StrCat implementation details. |
| |
| // Calculate total string length, used to allocate one-time. |
| const auto token_range = make_range(begin, end); |
| size_t total_length = 0; |
| for (const auto& token : token_range) { |
| total_length += token.text().length(); |
| } |
| out->resize(total_length); |
| const absl::string_view out_view(*out); |
| |
| // Copy text into new buffer. |
| auto code_iter = out->begin(); // writeable iterator (like char*) |
| int offset = 0; |
| for (auto& token : token_range) { |
| // Expect library/compiler to optimize this to a strcpy()/memcpy(). |
| code_iter = std::copy(token.text().begin(), token.text().end(), code_iter); |
| const auto new_text = out_view.substr(offset, token.text().length()); |
| // Adjust locations relative to newly concatenated string. |
| token.RebaseStringView(new_text); |
| offset += token.text().length(); |
| } |
| } |
| |
| } // namespace verible |
| |
| #endif // VERIBLE_COMMON_TEXT_TOKEN_INFO_H_ |