blob: 481056f5cb328abcd2f3e8a06bce305a34cd50aa [file] [log] [blame]
// Copyright 2017-2020 The Verible Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// TextStructure is a class responsible for managing the structural
// information of a block of text: tokenized view, syntax tree.
// It retains a shared pointer to the backing memory referenced by its
// string_views, and is suitable for breaking out substring analyses.
//
// See test_structure_test_utils.h for utilities for constructing fake
// (valid) TextStructures without a lexer or parser.
//
// TODO(fangism): object serialization/deserialization for TextStructure or
// TextStructureView. Could also be related to protocol-buffer-ization.
#ifndef VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_
#define VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_
#include <cstddef>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "absl/status/status.h"
#include "absl/strings/string_view.h"
#include "common/strings/line_column_map.h"
#include "common/strings/mem_block.h"
#include "common/text/concrete_syntax_tree.h"
#include "common/text/symbol.h"
#include "common/text/token_stream_view.h"
#include "common/text/tree_utils.h"
namespace verilog {
class VerilogPreprocess;
} // namespace verilog
namespace verible {
class TextStructure;
// TextStructureView contains sequences of tokens and a tree, but all
// string_views in this structure rely on string memory owned elsewhere.
//
// TODO(hzeller): This is a kitchen sink and should be split into multiple
// aspects; tokens, concrete syntax tree or line number mapping are different
// aspects not needed everywhere.
class TextStructureView {
public:
// Deferred in-place expansion of the syntax tree.
// TODO(b/136014603): Replace with expandable token stream view abstraction.
struct DeferredExpansion {
// Position in the syntax tree to expand (leaf or node).
std::unique_ptr<Symbol>* expansion_point;
// Analysis of the substring that corresponds to the expansion_point.
std::unique_ptr<TextStructure> subanalysis;
};
// NodeExpansionMap is a map of offsets to substring analysis results
// that are to be expanded. The rationale is that it is more efficient to
// collect expansions and process them in bulk rather than as each
// expansion is encountered.
using NodeExpansionMap = std::map<int, DeferredExpansion>;
explicit TextStructureView(absl::string_view contents);
~TextStructureView();
// Do not copy/assign. This contains pointers/iterators to internals.
TextStructureView(const TextStructureView&) = delete;
TextStructureView& operator=(const TextStructureView&) = delete;
absl::string_view Contents() const { return contents_; }
const std::vector<absl::string_view>& Lines() const {
return lazy_lines_info_.Get(contents_).lines;
}
const ConcreteSyntaxTree& SyntaxTree() const { return syntax_tree_; }
ConcreteSyntaxTree& MutableSyntaxTree() { return syntax_tree_; }
const TokenSequence& TokenStream() const { return tokens_; }
TokenSequence& MutableTokenStream() { return tokens_; }
const TokenStreamView& GetTokenStreamView() const { return tokens_view_; }
TokenStreamView& MutableTokenStreamView() { return tokens_view_; }
// Creates a stream of modifiable iterators to the filtered tokens.
// Uses tokens_view_ to create the iterators.
TokenStreamReferenceView MakeTokenStreamReferenceView();
const LineColumnMap& GetLineColumnMap() const {
return *lazy_lines_info_.Get(contents_).line_column_map;
}
// Given a byte offset, return the line/column
LineColumn GetLineColAtOffset(int bytes_offset) const {
return GetLineColumnMap().GetLineColAtOffset(contents_, bytes_offset);
}
// Convenience function: Given the token, return the range it covers.
LineColumnRange GetRangeForToken(const TokenInfo& token) const;
// Convenience function: Given a text snippet, that needs to be a substring
// of Contents(), return the range it covers.
LineColumnRange GetRangeForText(absl::string_view text) const;
const std::vector<TokenSequence::const_iterator>& GetLineTokenMap() const;
// Given line/column, find token that is available there. If this is out of
// range, returns EOF.
TokenInfo FindTokenAt(const LineColumn& pos) const;
// Create the EOF token given the contents.
TokenInfo EOFToken() const;
// Trigger line token map re-calculation on next request.
void CalculateFirstTokensPerLine() { lazy_line_token_map_.clear(); }
// Returns iterator range of tokens that span the given file offsets.
// The second iterator points 1-past-the-end of the range.
TokenRange TokenRangeSpanningOffsets(size_t lower, size_t upper) const;
// Returns an iterator range of tokens that start on the given line number.
// The lineno index is 0-based. The last token spanned by the returned
// range is the newline token that terminates the line.
// Precondition: CalculateFirstTokensPerLine() has already been called.
TokenRange TokenRangeOnLine(size_t lineno) const;
// Filter out tokens from token stream view before parsing.
// Can be called successively with different predicates.
void FilterTokens(const TokenFilterPredicate&);
// Apply the same transformation to the token sequence, and the tokens
// that were copied into the syntax tree.
void MutateTokens(const LeafMutator& mutator);
// Update tokens to point their text into new (superstring) owner.
// This is done to prepare for transfer of ownership of syntax_tree_
// to a new owner.
void RebaseTokensToSuperstring(absl::string_view superstring,
absl::string_view src_base, int offset);
// Narrows the view of text, tokens, and syntax tree to the node that starts
// at left_offset. The resulting state looks as if only a snippet of
// text were parsed as a particular construct of the larger grammar.
// The contents will be pared down to a substring, and irrelevant tokens will
// be pruned from the token sequence and syntax tree.
void FocusOnSubtreeSpanningSubstring(int left_offset, int length);
// ExpandSubtrees performs bulk substitution of syntax tree leaves to
// subtrees that result from other analyses. Memory ownership of the
// analysis results passed through the expansions is transferred (consumed)
// by this function.
void ExpandSubtrees(NodeExpansionMap* expansions);
// All of this class's consistency checks combined.
absl::Status InternalConsistencyCheck() const;
protected:
// This is the text that is spanned by the token sequence and syntax tree.
// This is required for calculating byte offsets to substrings contained
// within this structure. Pass this (via Contents()) to TokenInfo::left() and
// TokenInfo::right() to calculate byte offsets, useful for diagnostics.
absl::string_view contents_;
// TODO(hzeller): These lazily generated elements are good candidates
// for breaking out into their own abstraction.
struct LinesInfo {
bool valid = false;
// Line-by-line view of contents_.
std::vector<absl::string_view> lines;
// Map to translate byte-offsets to line and column for diagnostics.
std::unique_ptr<LineColumnMap> line_column_map;
const LinesInfo& Get(absl::string_view contents);
};
// Mutable as we fill it lazily on request; conceptually the data is const.
mutable LinesInfo lazy_lines_info_;
// Tokens that constitute the original file (contents_).
// This should always be terminated with a sentinel EOF token.
TokenSequence tokens_;
// Possibly modified view of the tokens_ token sequence.
TokenStreamView tokens_view_;
// Index of token iterators that mark the beginnings of each line.
// Lazily calculated on request.
mutable std::vector<TokenSequence::const_iterator> lazy_line_token_map_;
// Tree representation of file contents.
ConcreteSyntaxTree syntax_tree_;
void TrimSyntaxTree(int first_token_offset, int last_token_offset);
void TrimTokensToSubstring(int left_offset, int right_offset);
void TrimContents(int left_offset, int length);
void ConsumeDeferredExpansion(
TokenSequence::const_iterator* next_token_iter,
TokenStreamView::const_iterator* next_token_view_iter,
DeferredExpansion* expansion, TokenSequence* combined_tokens,
std::vector<int>* token_view_indices, const char* offset);
// Resets all fields. Only needed in tests.
void Clear();
// Verify that internal iterators point to locations owned by this object,
// and that all string_views in the tokens_view_ are substring views of the
// contents_ string view.
absl::Status FastTokenRangeConsistencyCheck() const;
// Verify that line-based view of contents_ is consistent with the
// contents_ text itself.
absl::Status FastLineRangeConsistencyCheck() const;
// Verify that the string views in the syntax tree are contained within
// the contents_ string view.
absl::Status SyntaxTreeConsistencyCheck() const;
};
// TextStructure holds the text and the results of lexing, parsing, and other
// analysis in the corresponding TextStructureView.
//
// This class is not providing much benefit as ownership of memory and the
// parse result are only slightly related; but combining them here makes it
// harder to actually handle memory ownership and views independently. For
// instance the FileAnalyzer should keep track of the file content itself and
// then choose to generate a view (or multiple) on top of that (e.g for
// fallback parsing). Similar for VerilogSourceFile.
// The language server already has an in-memory representation which is
// unnecessarily copied into a TextStructure just to do the rest of the
// analysis, etc.. Long story short: it is beneficial to separate ownership and
// views.
//
// So: this class is eventually to be removed. For now, make all the
// constructors private and add explicitly mention all uses as friend classes,
// so a future refactoring is easier.
// (If, in the meantime, more TextStructure use is needed in other classes,
// not to worry: just add them here as friend class. This is merely
// documentation of use currently).
class TextStructure {
private:
friend class FileAnalyzer;
friend class TextStructureTokenized;
friend class TextStructureViewPublicTest_ExpandSubtreesOneLeaf_Test;
friend class TextStructureViewPublicTest_ExpandSubtreesMultipleLeaves_Test;
friend class verilog::VerilogPreprocess;
explicit TextStructure(std::shared_ptr<MemBlock> contents);
// Convenience constructor in case our input is a string.
explicit TextStructure(absl::string_view contents);
public:
TextStructure(const TextStructure&) = delete;
TextStructure& operator=(const TextStructure&) = delete;
TextStructure(TextStructure&&) = delete;
TextStructure& operator=(TextStructure&&) = delete;
// DeferredExpansion::subanalysis requires this destructor to be virtual.
virtual ~TextStructure();
const TextStructureView& Data() const { return data_; }
TextStructureView& MutableData() { return data_; }
const ConcreteSyntaxTree& SyntaxTree() const { return data_.SyntaxTree(); }
// Verify that string_views are inside memory owned by owned_contents_.
absl::Status StringViewConsistencyCheck() const;
// Verify that internal data structures have valid ranges.
absl::Status InternalConsistencyCheck() const;
protected:
// The content of this memblock is referenced in the TextStructureView.
// The data itself might be shared between multiple entitites
// (using a heavy shared_ptr might very well intermediate while refactoring
// the details. https://github.com/chipsalliance/verible/issues/1502 )
std::shared_ptr<MemBlock> contents_;
// The data_ object's string_views are owned by owned_contents_.
TextStructureView data_;
};
} // namespace verible
#endif // VERIBLE_COMMON_TEXT_TEXT_STRUCTURE_H_