// common/text/text_structure.h - third_party/verible - Git at Google

 // Copyright 2017-2020 The Verible Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // TextStructure is a class responsible for managing the structural
 // information of a block of text: tokenized view, syntax tree.
 // It retains a shared pointer to the backing memory referenced by its
 // string_views, and is suitable for breaking out substring analyses.
 //
 // See test_structure_test_utils.h for utilities for constructing fake
 // (valid) TextStructures without a lexer or parser.
 //
 // TODO(fangism): object serialization/deserialization for TextStructure or
 // TextStructureView.  Could also be related to protocol-buffer-ization.

 #ifndef VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_
 #define VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_

 #include <cstddef>
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>

 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
 #include "common/strings/line_column_map.h"
 #include "common/strings/mem_block.h"
 #include "common/text/concrete_syntax_tree.h"
 #include "common/text/symbol.h"
 #include "common/text/token_stream_view.h"
 #include "common/text/tree_utils.h"

 // Forward declaration only: TextStructure names verilog::VerilogPreprocess as
 // a friend, and a qualified friend declaration needs the class to have been
 // declared in its namespace beforehand.
 namespace verilog {
 class VerilogPreprocess;
 }  // namespace verilog

 namespace verible {

 // Forward declaration: TextStructureView::DeferredExpansion holds a
 // std::unique_ptr<TextStructure> before TextStructure is defined further down.
 class TextStructure;

 // TextStructureView contains sequences of tokens and a tree, but all
 // string_views in this structure rely on string memory owned elsewhere.
 //
 // TODO(hzeller): This is a kitchen sink and should be split into multiple
 // aspects; tokens, concrete syntax tree or line number mapping are different
 // aspects not needed everywhere.
 class TextStructureView {
  public:
   // Deferred in-place expansion of the syntax tree.
   // TODO(b/136014603): Replace with expandable token stream view abstraction.
   struct DeferredExpansion {
     // Position in the syntax tree to expand (leaf or node).
     // This is a raw pointer to a std::unique_ptr<Symbol> slot; it has no
     // default initializer, so it must be set explicitly by whoever creates
     // the DeferredExpansion.
     std::unique_ptr<Symbol>* expansion_point;

     // Analysis of the substring that corresponds to the expansion_point.
     // Owned by this struct until it is consumed (see ExpandSubtrees()).
     std::unique_ptr<TextStructure> subanalysis;
   };

   // NodeExpansionMap is a map of offsets to substring analysis results
   // that are to be expanded.  The rationale is that it is more efficient to
   // collect expansions and process them in bulk rather than as each
   // expansion is encountered.
   // Being a std::map, entries are kept ordered by their integer offset key.
   using NodeExpansionMap = std::map<int, DeferredExpansion>;

   // Creates a view over `contents`.  The view does not own the text: the
   // memory behind `contents` is owned elsewhere and has to stay valid for as
   // long as the string_views in this structure are used.
   explicit TextStructureView(absl::string_view contents);

   ~TextStructureView();

   // Do not copy/assign.  This contains pointers/iterators to internals.
   // (With the copy operations user-declared, no implicit move operations are
   // generated either, so the type is neither copyable nor movable.)
   TextStructureView(const TextStructureView&) = delete;
   TextStructureView& operator=(const TextStructureView&) = delete;

   // Returns the text spanned by this view (not owned by this object).
   absl::string_view Contents() const { return contents_; }

   // Returns the line-by-line view of Contents().  The result comes from the
   // lazily filled lazy_lines_info_ member, which is why this works on a const
   // object.
   const std::vector<absl::string_view>& Lines() const {
     return lazy_lines_info_.Get(contents_).lines;
   }

   // Read-only access to the syntax tree.
   const ConcreteSyntaxTree& SyntaxTree() const { return syntax_tree_; }

   // Mutable access to the syntax tree.
   ConcreteSyntaxTree& MutableSyntaxTree() { return syntax_tree_; }

   // Read-only access to the full token sequence (tokens_).
   const TokenSequence& TokenStream() const { return tokens_; }

   // Mutable access to the full token sequence (tokens_).
   TokenSequence& MutableTokenStream() { return tokens_; }

   // Read-only access to the (possibly filtered) view of the token sequence.
   const TokenStreamView& GetTokenStreamView() const { return tokens_view_; }

   // Mutable access to the (possibly filtered) view of the token sequence.
   TokenStreamView& MutableTokenStreamView() { return tokens_view_; }

   // Creates a stream of modifiable iterators to the filtered tokens.
   // Uses tokens_view_ to create the iterators.
   TokenStreamReferenceView MakeTokenStreamReferenceView();

   // Returns the map that translates byte offsets to line/column positions.
   // This dereferences LinesInfo::line_column_map, so it relies on
   // LinesInfo::Get() having populated that pointer (Get() is defined outside
   // this header).
   const LineColumnMap& GetLineColumnMap() const {
     return *lazy_lines_info_.Get(contents_).line_column_map;
   }

   // Given a byte offset, return the line/column
   // Forwards to GetLineColumnMap() together with the current contents_.
   LineColumn GetLineColAtOffset(int bytes_offset) const {
     return GetLineColumnMap().GetLineColAtOffset(contents_, bytes_offset);
   }

   // Convenience function: Given the token, return the range it covers.
   LineColumnRange GetRangeForToken(const TokenInfo& token) const;

   // Convenience function: Given a text snippet, that needs to be a substring
   // of Contents(), return the range it covers.
   LineColumnRange GetRangeForText(absl::string_view text) const;

   // Returns the per-line token index.
   // NOTE(review): presumably this returns lazy_line_token_map_, computing it
   // first when it is empty -- the definition is not in this header.
   const std::vector<TokenSequence::const_iterator>& GetLineTokenMap() const;

   // Given line/column, find token that is available there. If this is out of
   // range, returns EOF.
   TokenInfo FindTokenAt(const LineColumn& pos) const;

   // Create the EOF token given the contents.
   TokenInfo EOFToken() const;

   // Trigger line token map re-calculation on next request.
   // Despite its name, this computes nothing itself: it only clears the cached
   // lazy_line_token_map_.
   void CalculateFirstTokensPerLine() { lazy_line_token_map_.clear(); }

   // Returns iterator range of tokens that span the given file offsets.
   // The second iterator points 1-past-the-end of the range.
   TokenRange TokenRangeSpanningOffsets(size_t lower, size_t upper) const;

   // Returns an iterator range of tokens that start on the given line number.
   // The lineno index is 0-based.  The last token spanned by the returned
   // range is the newline token that terminates the line.
   // Precondition: CalculateFirstTokensPerLine() has already been called.
   // NOTE(review): as declared in this header, CalculateFirstTokensPerLine()
   // only clears the cached map, so this precondition may be stale -- confirm
   // against the implementation.
   TokenRange TokenRangeOnLine(size_t lineno) const;

   // Filter out tokens from token stream view before parsing.
   // Can be called successively with different predicates.
   void FilterTokens(const TokenFilterPredicate&);

   // Apply the same transformation to the token sequence, and the tokens
   // that were copied into the syntax tree.
   void MutateTokens(const LeafMutator& mutator);

   // Update tokens to point their text into new (superstring) owner.
   // This is done to prepare for transfer of ownership of syntax_tree_
   // to a new owner.
   // NOTE(review): presumably `src_base` is the text the tokens currently
   // point into and `offset` is its position within `superstring` -- confirm
   // against the implementation.
   void RebaseTokensToSuperstring(absl::string_view superstring,
                                  absl::string_view src_base, int offset);

   // Narrows the view of text, tokens, and syntax tree to the node that starts
   // at left_offset.  The resulting state looks as if only a snippet of
   // text were parsed as a particular construct of the larger grammar.
   // The contents will be pared down to a substring, and irrelevant tokens will
   // be pruned from the token sequence and syntax tree.
   void FocusOnSubtreeSpanningSubstring(int left_offset, int length);

   // ExpandSubtrees performs bulk substitution of syntax tree leaves to
   // subtrees that result from other analyses.  Memory ownership of the
   // analysis results passed through the expansions is transferred (consumed)
   // by this function.
   void ExpandSubtrees(NodeExpansionMap* expansions);

   // All of this class's consistency checks combined.
   // Returns an absl::Status describing the outcome.
   absl::Status InternalConsistencyCheck() const;

  protected:
   // This is the text that is spanned by the token sequence and syntax tree.
   // This is required for calculating byte offsets to substrings contained
   // within this structure.  Pass this (via Contents()) to TokenInfo::left() and
   // TokenInfo::right() to calculate byte offsets, useful for diagnostics.
   absl::string_view contents_;

   // TODO(hzeller): These lazily generated elements are good candidates
   // for breaking out into their own abstraction.
   struct LinesInfo {
     // NOTE(review): presumably set by Get() once `lines` and
     // `line_column_map` have been computed -- confirm in the implementation.
     bool valid = false;

     // Line-by-line view of contents_.
     std::vector<absl::string_view> lines;

     // Map to translate byte-offsets to line and column for diagnostics.
     std::unique_ptr<LineColumnMap> line_column_map;

     // Returns this LinesInfo for the given contents.  Non-const because it
     // may fill in the fields above; defined outside this header.
     const LinesInfo& Get(absl::string_view contents);
   };
   // Mutable as we fill it lazily on request; conceptually the data is const.
   mutable LinesInfo lazy_lines_info_;

   // Tokens that constitute the original file (contents_).
   // This should always be terminated with a sentinel EOF token.
   TokenSequence tokens_;

   // Possibly modified view of the tokens_ token sequence.
   TokenStreamView tokens_view_;

   // Index of token iterators that mark the beginnings of each line.
   // Lazily calculated on request.
   mutable std::vector<TokenSequence::const_iterator> lazy_line_token_map_;

   // Tree representation of file contents.
   ConcreteSyntaxTree syntax_tree_;

   // The three Trim* helpers below are undocumented in this header.
   // NOTE(review): presumably they implement the individual steps of
   // FocusOnSubtreeSpanningSubstring() (syntax tree, tokens, contents) --
   // confirm against the implementation.
   void TrimSyntaxTree(int first_token_offset, int last_token_offset);

   void TrimTokensToSubstring(int left_offset, int right_offset);

   void TrimContents(int left_offset, int length);

   // NOTE(review): presumably processes a single DeferredExpansion on behalf
   // of ExpandSubtrees(), advancing the two iterators and appending to
   // `combined_tokens` / `token_view_indices` -- confirm against the
   // implementation.
   void ConsumeDeferredExpansion(
       TokenSequence::const_iterator* next_token_iter,
       TokenStreamView::const_iterator* next_token_view_iter,
       DeferredExpansion* expansion, TokenSequence* combined_tokens,
       std::vector<int>* token_view_indices, const char* offset);

   // Resets all fields. Only needed in tests.
   void Clear();

   // Verify that internal iterators point to locations owned by this object,
   // and that all string_views in the tokens_view_ are substring views of the
   // contents_ string view.
   absl::Status FastTokenRangeConsistencyCheck() const;

   // Verify that line-based view of contents_ is consistent with the
   // contents_ text itself.
   absl::Status FastLineRangeConsistencyCheck() const;

   // Verify that the string views in the syntax tree are contained within
   // the contents_ string view.
   absl::Status SyntaxTreeConsistencyCheck() const;
 };

 // TextStructure holds the text and the results of lexing, parsing, and other
 // analysis in the corresponding TextStructureView.
 //
 // This class is not providing much benefit as ownership of memory and the
 // parse result are only slightly related; but combining them here makes it
 // harder to actually handle memory ownership and views independently. For
 // instance the FileAnalyzer should keep track of the file content itself and
 // then choose to generate a view (or multiple) on top of that (e.g for
 // fallback parsing). Similar for VerilogSourceFile.
 // The language server already has an in-memory representation which is
 // unnecessarily copied into a TextStructure just to do the rest of the
 // analysis, etc. Long story short: it is beneficial to separate ownership and
 // views.
 //
 // So: this class is eventually to be removed. For now, all constructors are
 // private and every user is explicitly listed as a friend class, so that a
 // future refactoring is easier.
 // (If, in the meantime, TextStructure is needed in other classes, not to
 // worry: just add them here as friend classes. The friend list merely
 // documents current usage.)
 class TextStructure {
  private:
   // The constructors below are private, so only the classes befriended here
   // can create a TextStructure.
   // NOTE(review): the two *_Test names look like googletest-generated test
   // class names (suite TextStructureViewPublicTest) -- confirm against the
   // test file.
   friend class FileAnalyzer;
   friend class TextStructureTokenized;
   friend class TextStructureViewPublicTest_ExpandSubtreesOneLeaf_Test;
   friend class TextStructureViewPublicTest_ExpandSubtreesMultipleLeaves_Test;
   friend class verilog::VerilogPreprocess;

   // Constructs from a (possibly shared) memory block.
   // NOTE(review): presumably stores the block in contents_ and lets data_
   // view its text -- the definition is not in this header.
   explicit TextStructure(std::shared_ptr<MemBlock> contents);

   // Convenience constructor in case our input is a string.
   explicit TextStructure(absl::string_view contents);

  public:
   // Neither copyable nor movable.
   TextStructure(const TextStructure&) = delete;
   TextStructure& operator=(const TextStructure&) = delete;
   TextStructure(TextStructure&&) = delete;
   TextStructure& operator=(TextStructure&&) = delete;

   // DeferredExpansion::subanalysis requires this destructor to be virtual.
   virtual ~TextStructure();

   // Read-only access to the view (tokens, syntax tree, line information).
   const TextStructureView& Data() const { return data_; }

   // Mutable access to the same view.
   TextStructureView& MutableData() { return data_; }

   // Shortcut for Data().SyntaxTree().
   const ConcreteSyntaxTree& SyntaxTree() const { return data_.SyntaxTree(); }

   // Verify that string_views are inside memory owned by contents_.
   absl::Status StringViewConsistencyCheck() const;

   // Verify that internal data structures have valid ranges.
   absl::Status InternalConsistencyCheck() const;

  protected:
   // The content of this memblock is referenced in the TextStructureView.
   // The data itself might be shared between multiple entities
   // (the heavyweight shared_ptr may well be only an intermediate step while
   // the details are refactored; see
   // https://github.com/chipsalliance/verible/issues/1502 )
   // Declaration order matters: members are destroyed in reverse declaration
   // order, so data_ (the views) goes away before contents_ (the memory).
   // Keep contents_ declared ahead of data_.
   std::shared_ptr<MemBlock> contents_;

   // The data_ object's string_views point into memory owned by contents_.
   TextStructureView data_;
 };

 }  // namespace verible

 #endif  // VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_
	// Copyright 2017-2020 The Verible Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// TextStructure is a class responsible for managing the structural
	// information of a block of text: tokenized view, syntax tree.
	// It retains a shared pointer to the backing memory referenced by its
	// string_views, and is suitable for breaking out substring analyses.
	//
	// See test_structure_test_utils.h for utilities for constructing fake
	// (valid) TextStructures without a lexer or parser.
	//
	// TODO(fangism): object serialization/deserialization for TextStructure or
	// TextStructureView. Could also be related to protocol-buffer-ization.

	#ifndef VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_
	#define VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_

	#include <cstddef>
	#include <map>
	#include <memory>
	#include <string>
	#include <vector>

	#include "absl/status/status.h"
	#include "absl/strings/string_view.h"
	#include "common/strings/line_column_map.h"
	#include "common/strings/mem_block.h"
	#include "common/text/concrete_syntax_tree.h"
	#include "common/text/symbol.h"
	#include "common/text/token_stream_view.h"
	#include "common/text/tree_utils.h"

	// Forward declaration only: TextStructure names verilog::VerilogPreprocess as
	// a friend, and a qualified friend declaration needs the class to have been
	// declared in its namespace beforehand.
	namespace verilog {
	class VerilogPreprocess;
	} // namespace verilog

	namespace verible {

	// Forward declaration: TextStructureView::DeferredExpansion holds a
	// std::unique_ptr<TextStructure> before TextStructure is defined further down.
	class TextStructure;

	// TextStructureView contains sequences of tokens and a tree, but all
	// string_views in this structure rely on string memory owned elsewhere.
	//
	// TODO(hzeller): This is a kitchen sink and should be split into multiple
	// aspects; tokens, concrete syntax tree or line number mapping are different
	// aspects not needed everywhere.
	class TextStructureView {
	public:
	// Deferred in-place expansion of the syntax tree.
	// TODO(b/136014603): Replace with expandable token stream view abstraction.
	struct DeferredExpansion {
	// Position in the syntax tree to expand (leaf or node).
	// This is a raw pointer to a std::unique_ptr<Symbol> slot; it has no
	// default initializer, so it must be set explicitly by whoever creates
	// the DeferredExpansion.
	std::unique_ptr<Symbol>* expansion_point;

	// Analysis of the substring that corresponds to the expansion_point.
	// Owned by this struct until it is consumed (see ExpandSubtrees()).
	std::unique_ptr<TextStructure> subanalysis;
	};

	// NodeExpansionMap is a map of offsets to substring analysis results
	// that are to be expanded. The rationale is that it is more efficient to
	// collect expansions and process them in bulk rather than as each
	// expansion is encountered.
	// Being a std::map, entries are kept ordered by their integer offset key.
	using NodeExpansionMap = std::map<int, DeferredExpansion>;

	// Creates a view over `contents`. The view does not own the text: the
	// memory behind `contents` is owned elsewhere and has to stay valid for as
	// long as the string_views in this structure are used.
	explicit TextStructureView(absl::string_view contents);

	~TextStructureView();

	// Do not copy/assign. This contains pointers/iterators to internals.
	// (With the copy operations user-declared, no implicit move operations are
	// generated either, so the type is neither copyable nor movable.)
	TextStructureView(const TextStructureView&) = delete;
	TextStructureView& operator=(const TextStructureView&) = delete;

	// Returns the text spanned by this view (not owned by this object).
	absl::string_view Contents() const { return contents_; }

	// Returns the line-by-line view of Contents(). The result comes from the
	// lazily filled lazy_lines_info_ member, which is why this works on a const
	// object.
	const std::vector<absl::string_view>& Lines() const {
	return lazy_lines_info_.Get(contents_).lines;
	}

	// Read-only access to the syntax tree.
	const ConcreteSyntaxTree& SyntaxTree() const { return syntax_tree_; }

	// Mutable access to the syntax tree.
	ConcreteSyntaxTree& MutableSyntaxTree() { return syntax_tree_; }

	// Read-only access to the full token sequence (tokens_).
	const TokenSequence& TokenStream() const { return tokens_; }

	// Mutable access to the full token sequence (tokens_).
	TokenSequence& MutableTokenStream() { return tokens_; }

	// Read-only access to the (possibly filtered) view of the token sequence.
	const TokenStreamView& GetTokenStreamView() const { return tokens_view_; }

	// Mutable access to the (possibly filtered) view of the token sequence.
	TokenStreamView& MutableTokenStreamView() { return tokens_view_; }

	// Creates a stream of modifiable iterators to the filtered tokens.
	// Uses tokens_view_ to create the iterators.
	TokenStreamReferenceView MakeTokenStreamReferenceView();

	// Returns the map that translates byte offsets to line/column positions.
	// This dereferences LinesInfo::line_column_map, so it relies on
	// LinesInfo::Get() having populated that pointer (Get() is defined outside
	// this header).
	const LineColumnMap& GetLineColumnMap() const {
	return *lazy_lines_info_.Get(contents_).line_column_map;
	}

	// Given a byte offset, return the line/column
	// Forwards to GetLineColumnMap() together with the current contents_.
	LineColumn GetLineColAtOffset(int bytes_offset) const {
	return GetLineColumnMap().GetLineColAtOffset(contents_, bytes_offset);
	}

	// Convenience function: Given the token, return the range it covers.
	LineColumnRange GetRangeForToken(const TokenInfo& token) const;

	// Convenience function: Given a text snippet, that needs to be a substring
	// of Contents(), return the range it covers.
	LineColumnRange GetRangeForText(absl::string_view text) const;

	// Returns the per-line token index.
	// NOTE(review): presumably this returns lazy_line_token_map_, computing it
	// first when it is empty -- the definition is not in this header.
	const std::vector<TokenSequence::const_iterator>& GetLineTokenMap() const;

	// Given line/column, find token that is available there. If this is out of
	// range, returns EOF.
	TokenInfo FindTokenAt(const LineColumn& pos) const;

	// Create the EOF token given the contents.
	TokenInfo EOFToken() const;

	// Trigger line token map re-calculation on next request.
	// Despite its name, this computes nothing itself: it only clears the cached
	// lazy_line_token_map_.
	void CalculateFirstTokensPerLine() { lazy_line_token_map_.clear(); }

	// Returns iterator range of tokens that span the given file offsets.
	// The second iterator points 1-past-the-end of the range.
	TokenRange TokenRangeSpanningOffsets(size_t lower, size_t upper) const;

	// Returns an iterator range of tokens that start on the given line number.
	// The lineno index is 0-based. The last token spanned by the returned
	// range is the newline token that terminates the line.
	// Precondition: CalculateFirstTokensPerLine() has already been called.
	// NOTE(review): as declared in this header, CalculateFirstTokensPerLine()
	// only clears the cached map, so this precondition may be stale -- confirm
	// against the implementation.
	TokenRange TokenRangeOnLine(size_t lineno) const;

	// Filter out tokens from token stream view before parsing.
	// Can be called successively with different predicates.
	void FilterTokens(const TokenFilterPredicate&);

	// Apply the same transformation to the token sequence, and the tokens
	// that were copied into the syntax tree.
	void MutateTokens(const LeafMutator& mutator);

	// Update tokens to point their text into new (superstring) owner.
	// This is done to prepare for transfer of ownership of syntax_tree_
	// to a new owner.
	// NOTE(review): presumably `src_base` is the text the tokens currently
	// point into and `offset` is its position within `superstring` -- confirm
	// against the implementation.
	void RebaseTokensToSuperstring(absl::string_view superstring,
	absl::string_view src_base, int offset);

	// Narrows the view of text, tokens, and syntax tree to the node that starts
	// at left_offset. The resulting state looks as if only a snippet of
	// text were parsed as a particular construct of the larger grammar.
	// The contents will be pared down to a substring, and irrelevant tokens will
	// be pruned from the token sequence and syntax tree.
	void FocusOnSubtreeSpanningSubstring(int left_offset, int length);

	// ExpandSubtrees performs bulk substitution of syntax tree leaves to
	// subtrees that result from other analyses. Memory ownership of the
	// analysis results passed through the expansions is transferred (consumed)
	// by this function.
	void ExpandSubtrees(NodeExpansionMap* expansions);

	// All of this class's consistency checks combined.
	// Returns an absl::Status describing the outcome.
	absl::Status InternalConsistencyCheck() const;

	protected:
	// This is the text that is spanned by the token sequence and syntax tree.
	// This is required for calculating byte offsets to substrings contained
	// within this structure. Pass this (via Contents()) to TokenInfo::left() and
	// TokenInfo::right() to calculate byte offsets, useful for diagnostics.
	absl::string_view contents_;

	// TODO(hzeller): These lazily generated elements are good candidates
	// for breaking out into their own abstraction.
	struct LinesInfo {
	// NOTE(review): presumably set by Get() once `lines` and
	// `line_column_map` have been computed -- confirm in the implementation.
	bool valid = false;

	// Line-by-line view of contents_.
	std::vector<absl::string_view> lines;

	// Map to translate byte-offsets to line and column for diagnostics.
	std::unique_ptr<LineColumnMap> line_column_map;

	// Returns this LinesInfo for the given contents. Non-const because it
	// may fill in the fields above; defined outside this header.
	const LinesInfo& Get(absl::string_view contents);
	};
	// Mutable as we fill it lazily on request; conceptually the data is const.
	mutable LinesInfo lazy_lines_info_;

	// Tokens that constitute the original file (contents_).
	// This should always be terminated with a sentinel EOF token.
	TokenSequence tokens_;

	// Possibly modified view of the tokens_ token sequence.
	TokenStreamView tokens_view_;

	// Index of token iterators that mark the beginnings of each line.
	// Lazily calculated on request.
	mutable std::vector<TokenSequence::const_iterator> lazy_line_token_map_;

	// Tree representation of file contents.
	ConcreteSyntaxTree syntax_tree_;

	// The three Trim* helpers below are undocumented in this header.
	// NOTE(review): presumably they implement the individual steps of
	// FocusOnSubtreeSpanningSubstring() (syntax tree, tokens, contents) --
	// confirm against the implementation.
	void TrimSyntaxTree(int first_token_offset, int last_token_offset);

	void TrimTokensToSubstring(int left_offset, int right_offset);

	void TrimContents(int left_offset, int length);

	// NOTE(review): presumably processes a single DeferredExpansion on behalf
	// of ExpandSubtrees(), advancing the two iterators and appending to
	// `combined_tokens` / `token_view_indices` -- confirm against the
	// implementation.
	void ConsumeDeferredExpansion(
	TokenSequence::const_iterator* next_token_iter,
	TokenStreamView::const_iterator* next_token_view_iter,
	DeferredExpansion* expansion, TokenSequence* combined_tokens,
	std::vector<int>* token_view_indices, const char* offset);

	// Resets all fields. Only needed in tests.
	void Clear();

	// Verify that internal iterators point to locations owned by this object,
	// and that all string_views in the tokens_view_ are substring views of the
	// contents_ string view.
	absl::Status FastTokenRangeConsistencyCheck() const;

	// Verify that line-based view of contents_ is consistent with the
	// contents_ text itself.
	absl::Status FastLineRangeConsistencyCheck() const;

	// Verify that the string views in the syntax tree are contained within
	// the contents_ string view.
	absl::Status SyntaxTreeConsistencyCheck() const;
	};

	// TextStructure holds the text and the results of lexing, parsing, and other
	// analysis in the corresponding TextStructureView.
	//
	// This class is not providing much benefit as ownership of memory and the
	// parse result are only slightly related; but combining them here makes it
	// harder to actually handle memory ownership and views independently. For
	// instance the FileAnalyzer should keep track of the file content itself and
	// then choose to generate a view (or multiple) on top of that (e.g for
	// fallback parsing). Similar for VerilogSourceFile.
	// The language server already has an in-memory representation which is
	// unnecessarily copied into a TextStructure just to do the rest of the
	// analysis, etc. Long story short: it is beneficial to separate ownership and
	// views.
	//
	// So: this class is eventually to be removed. For now, all constructors are
	// private and every user is explicitly listed as a friend class, so that a
	// future refactoring is easier.
	// (If, in the meantime, TextStructure is needed in other classes, not to
	// worry: just add them here as friend classes. The friend list merely
	// documents current usage.)
	class TextStructure {
	private:
	// The constructors below are private, so only the classes befriended here
	// can create a TextStructure.
	// NOTE(review): the two *_Test names look like googletest-generated test
	// class names (suite TextStructureViewPublicTest) -- confirm against the
	// test file.
	friend class FileAnalyzer;
	friend class TextStructureTokenized;
	friend class TextStructureViewPublicTest_ExpandSubtreesOneLeaf_Test;
	friend class TextStructureViewPublicTest_ExpandSubtreesMultipleLeaves_Test;
	friend class verilog::VerilogPreprocess;

	// Constructs from a (possibly shared) memory block.
	// NOTE(review): presumably stores the block in contents_ and lets data_
	// view its text -- the definition is not in this header.
	explicit TextStructure(std::shared_ptr<MemBlock> contents);

	// Convenience constructor in case our input is a string.
	explicit TextStructure(absl::string_view contents);

	public:
	// Neither copyable nor movable.
	TextStructure(const TextStructure&) = delete;
	TextStructure& operator=(const TextStructure&) = delete;
	TextStructure(TextStructure&&) = delete;
	TextStructure& operator=(TextStructure&&) = delete;

	// DeferredExpansion::subanalysis requires this destructor to be virtual.
	virtual ~TextStructure();

	// Read-only access to the view (tokens, syntax tree, line information).
	const TextStructureView& Data() const { return data_; }

	// Mutable access to the same view.
	TextStructureView& MutableData() { return data_; }

	// Shortcut for Data().SyntaxTree().
	const ConcreteSyntaxTree& SyntaxTree() const { return data_.SyntaxTree(); }

	// Verify that string_views are inside memory owned by contents_.
	absl::Status StringViewConsistencyCheck() const;

	// Verify that internal data structures have valid ranges.
	absl::Status InternalConsistencyCheck() const;

	protected:
	// The content of this memblock is referenced in the TextStructureView.
	// The data itself might be shared between multiple entities
	// (the heavyweight shared_ptr may well be only an intermediate step while
	// the details are refactored; see
	// https://github.com/chipsalliance/verible/issues/1502 )
	// Declaration order matters: members are destroyed in reverse declaration
	// order, so data_ (the views) goes away before contents_ (the memory).
	// Keep contents_ declared ahead of data_.
	std::shared_ptr<MemBlock> contents_;

	// The data_ object's string_views point into memory owned by contents_.
	TextStructureView data_;
	};

	} // namespace verible

	#endif // VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_