| // Copyright 2017-2020 The Verible Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // TextStructure is a class responsible for managing the structural |
| // information of a block of text: tokenized view, syntax tree. |
| // It retains a shared pointer to the backing memory referenced by its |
| // string_views, and is suitable for breaking out substring analyses. |
| // |
| // See test_structure_test_utils.h for utilities for constructing fake |
| // (valid) TextStructures without a lexer or parser. |
| // |
| // TODO(fangism): object serialization/deserialization for TextStructure or |
| // TextStructureView. Could also be related to protocol-buffer-ization. |
| |
| #ifndef VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_ |
| #define VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_ |
| |
| #include <cstddef> |
| #include <map> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "absl/status/status.h" |
| #include "absl/strings/string_view.h" |
| #include "common/strings/line_column_map.h" |
| #include "common/text/concrete_syntax_tree.h" |
| #include "common/text/symbol.h" |
| #include "common/text/token_stream_view.h" |
| #include "common/text/tree_utils.h" |
| |
| namespace verible { |
| |
| class TextStructure; |
| |
| // TextStructureView contains sequences of tokens and a tree, but all |
| // string_views in this structure rely on string memory owned elsewhere, |
| // for example in TextStructure. |
| class TextStructureView { |
| public: |
| // Deferred in-place expansion of the syntax tree. |
| // TODO(b/136014603): Replace with expandable token stream view abstraction. |
| struct DeferredExpansion { |
| // Position in the syntax tree to expand (leaf or node). |
| std::unique_ptr<Symbol>* expansion_point; |
| |
| // Analysis of the substring that corresponds to the expansion_point. |
| std::unique_ptr<TextStructure> subanalysis; |
| }; |
| |
| // NodeExpansionMap is a map of offsets to substring analysis results |
| // that are to be expanded. The rationale is that it is more efficient to |
| // collect expansions and process them in bulk rather than as each |
| // expansion is encountered. |
| using NodeExpansionMap = std::map<int, DeferredExpansion>; |
| |
| explicit TextStructureView(absl::string_view contents); |
| |
| ~TextStructureView(); |
| |
| // Do not copy/assign. This contains pointers/iterators to internals. |
| TextStructureView(const TextStructureView&) = delete; |
| TextStructureView& operator=(const TextStructureView&) = delete; |
| |
| absl::string_view Contents() const { return contents_; } |
| |
| const std::vector<absl::string_view>& Lines() const { return lines_; } |
| |
| const ConcreteSyntaxTree& SyntaxTree() const { return syntax_tree_; } |
| |
| ConcreteSyntaxTree& MutableSyntaxTree() { return syntax_tree_; } |
| |
| const TokenSequence& TokenStream() const { return tokens_; } |
| |
| TokenSequence& MutableTokenStream() { return tokens_; } |
| |
| const TokenStreamView& GetTokenStreamView() const { return tokens_view_; } |
| |
| TokenStreamView& MutableTokenStreamView() { return tokens_view_; } |
| |
| // Creates a stream of modifiable iterators to the filtered tokens. |
| // Uses tokens_view_ to create the iterators. |
| TokenStreamReferenceView MakeTokenStreamReferenceView(); |
| |
| const LineColumnMap& GetLineColumnMap() const { return line_column_map_; } |
| |
| void RecalculateLineColumnMap() { |
| line_column_map_ = LineColumnMap(contents_); |
| } |
| |
| const std::vector<TokenSequence::const_iterator>& GetLineTokenMap() const { |
| return line_token_map_; |
| } |
| |
| // Create the EOF token given the contents. |
| TokenInfo EOFToken() const; |
| |
| // Resets all fields. |
| void Clear(); |
| |
| // Compute the token stream iterators that start each line. |
| // This populates the line_token_map_ array with token iterators. |
| void CalculateFirstTokensPerLine(); |
| |
| // Returns iterator range of tokens that span the given file offsets. |
| // The second iterator points 1-past-the-end of the range. |
| TokenRange TokenRangeSpanningOffsets(size_t lower, size_t upper) const; |
| |
| // Returns an iterator range of tokens that start on the given line number. |
| // The lineno index is 0-based. The last token spanned by the returned |
| // range is the newline token that terminates the line. |
| // Precondition: CalculateFirstTokensPerLine() has already been called. |
| TokenRange TokenRangeOnLine(size_t lineno) const; |
| |
| // Filter out tokens from token stream view before parsing. |
| // Can be called successively with different predicates. |
| void FilterTokens(const TokenFilterPredicate&); |
| |
| // Apply the same transformation to the token sequence, and the tokens |
| // that were copied into the syntax tree. |
| void MutateTokens(const LeafMutator& mutator); |
| |
| // Update tokens to point their text into new (superstring) owner. |
| // This is done to prepare for transfer of ownership of syntax_tree_ |
| // to a new owner. |
| void RebaseTokensToSuperstring(absl::string_view superstring, |
| absl::string_view src_base, int offset); |
| |
| // Narrows the view of text, tokens, and syntax tree to the node that starts |
| // at left_offset. The resulting state looks as if only a snippet of |
| // text were parsed as a particular construct of the larger grammar. |
| // The contents will be pared down to a substring, and irrelevant tokens will |
| // be pruned from the token sequence and syntax tree. |
| void FocusOnSubtreeSpanningSubstring(int left_offset, int length); |
| |
| // ExpandSubtrees performs bulk substitution of syntax tree leaves to |
| // subtrees that result from other analyses. Memory ownership of the |
| // analysis results passed through the expansions is transferred (consumed) |
| // by this function. |
| void ExpandSubtrees(NodeExpansionMap* expansions); |
| |
| // All of this class's consistency checks combined. |
| absl::Status InternalConsistencyCheck() const; |
| |
| protected: |
| // This is the text that is spanned by the token sequence and syntax tree. |
| // This is required for calculating byte offsets to substrings contained |
| // within this structure. Pass this (via Contents()) to TokenInfo::left() and |
| // TokenInfo::right() to calculate byte offsets, useful for diagnostics. |
| absl::string_view contents_; |
| |
| // Line-by-line view of contents_. |
| std::vector<absl::string_view> lines_; |
| |
| // Tokens that constitute the original file (contents_). |
| // This should always be terminated with a sentinel EOF token. |
| TokenSequence tokens_; |
| |
| // Possibly modified view of the tokens_ token sequence. |
| TokenStreamView tokens_view_; |
| |
| // Map to translate byte-offsets to line and column for diagnostics. |
| LineColumnMap line_column_map_; |
| |
| // Index of token iterators that mark the beginnings of each line. |
| std::vector<TokenSequence::const_iterator> line_token_map_; |
| |
| // Tree representation of file contents. |
| ConcreteSyntaxTree syntax_tree_; |
| |
| void TrimSyntaxTree(int first_token_offset, int last_token_offset); |
| |
| void TrimTokensToSubstring(int left_offset, int right_offset); |
| |
| void TrimContents(int left_offset, int length); |
| void SplitLines(); |
| |
| void ConsumeDeferredExpansion( |
| TokenSequence::const_iterator* next_token_iter, |
| TokenStreamView::const_iterator* next_token_view_iter, |
| DeferredExpansion* expansion, TokenSequence* combined_tokens, |
| std::vector<int>* token_view_indices, const char* offset); |
| |
| // Verify that internal iterators point to locations owned by this object, |
| // and that all string_views in the tokens_view_ are substring views of the |
| // contents_ string view. |
| absl::Status FastTokenRangeConsistencyCheck() const; |
| |
| // Verify that line-based view of contents_ is consistent with the |
| // contents_ text itself. |
| absl::Status FastLineRangeConsistencyCheck() const; |
| |
| // Verify that the string views in the syntax tree are contained within |
| // the contents_ string view. |
| absl::Status SyntaxTreeConsistencyCheck() const; |
| }; |
| |
| // TextStructure holds the results of lexing and parsing. |
| // This contains rather than inherits from TextStructureView because |
| // the same owned memory can be used for multiple analysis views. |
| class TextStructure { |
| public: |
| explicit TextStructure(absl::string_view contents); |
| |
| TextStructure(const TextStructure&) = delete; |
| TextStructure& operator=(const TextStructure&) = delete; |
| TextStructure(TextStructure&&) = delete; |
| TextStructure& operator=(TextStructure&&) = delete; |
| |
| // DeferredExpansion::subanalysis requires this destructor to be virtual. |
| virtual ~TextStructure(); |
| |
| const TextStructureView& Data() const { return data_; } |
| |
| TextStructureView& MutableData() { return data_; } |
| |
| const ConcreteSyntaxTree& SyntaxTree() const { return data_.SyntaxTree(); } |
| |
| // Verify that string_views are inside memory owned by owned_contents_. |
| absl::Status StringViewConsistencyCheck() const; |
| |
| // Verify that internal data structures have valid ranges. |
| absl::Status InternalConsistencyCheck() const; |
| |
| protected: |
| // This string owns the memory referenced by all substring string_views |
| // in this object. |
| // TODO(hzeller): avoid local copy and just require that the string-view |
| // we are called with outlives this object? |
| const std::string owned_contents_; |
| |
| // The data_ object's string_views are owned by owned_contents_. |
| TextStructureView data_; |
| }; |
| |
| } // namespace verible |
| |
| #endif // VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_ |