blob: 6cd7aae1c361d763dd172a267305ef2d0d86037a [file] [log] [blame]
// Copyright 2017-2020 The Verible Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Implementation of TextStructure methods.
#include "common/text/text_structure.h"
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <memory>
#include <sstream> // IWYU pragma: keep // for ostringstream
#include <string>
#include <utility>
#include <vector>
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "common/strings/line_column_map.h"
#include "common/text/concrete_syntax_leaf.h"
#include "common/text/concrete_syntax_tree.h"
#include "common/text/symbol.h"
#include "common/text/token_info.h"
#include "common/text/token_stream_view.h"
#include "common/text/tree_utils.h"
#include "common/util/iterator_range.h"
#include "common/util/logging.h"
#include "common/util/range.h"
#include "common/util/status_macros.h"
namespace verible {
// Constructs a view over 'contents'.  No tokens or syntax tree exist yet;
// those are populated later by analysis passes.
TextStructureView::TextStructureView(absl::string_view contents)
    : contents_(contents) {
  // more than sufficient memory as number-of-tokens <= bytes-in-file,
  // push_back() should never re-alloc because size <= initial capacity.
  tokens_.reserve(contents.length());
  // Die immediately if iterator/string_view invariants do not hold at birth.
  const absl::Status status = InternalConsistencyCheck();
  CHECK(status.ok())
      << "Failed internal iterator/string_view consistency check in ctor:\n "
      << status.message();
}
// Re-verifies invariants on destruction; a failure here means some mutation
// (trim/rebase/expand) left tokens, lines, or tree pointing outside contents_.
TextStructureView::~TextStructureView() {
  const absl::Status status = InternalConsistencyCheck();
  CHECK(status.ok())
      << "Failed internal iterator/string_view consistency check in dtor:\n "
      << status.message();
}
// Resets this view to an empty state: drops the syntax tree, invalidates the
// cached line information, and discards all token storage and views.
// Derived structures are cleared before the text range is emptied so that no
// stale iterators outlive the data they reference.
void TextStructureView::Clear() {
  syntax_tree_ = nullptr;
  lazy_lines_info_.valid = false;
  lazy_line_token_map_.clear();
  tokens_view_.clear();
  tokens_.clear();
  // Keep the base pointer but shrink the viewed range to zero length.
  contents_.remove_suffix(contents_.length());
}
// Binary-search comparator: a token orders before a raw character position
// when its text begins strictly to the left of that position.
static bool TokenLocationLess(const TokenInfo& token, const char* offset) {
  return token.text().data() < offset;
}
// Makes an iterator-writable copy of items_view without using const_cast.
// Precondition: every iterator in items_view points into 'items'.
// Postcondition: result[i] designates the same element as items_view[i],
// but through a mutable iterator.
template <class V>
std::vector<typename V::iterator> _CopyWriteableIterators(
    V& items, const std::vector<typename V::const_iterator>& items_view) {
  std::vector<typename V::iterator> result;
  result.reserve(items_view.size());
  const typename V::iterator mutable_begin(items.begin());
  const typename V::const_iterator const_begin(items.begin());
  std::transform(items_view.begin(), items_view.end(),
                 std::back_inserter(result),
                 [&](const typename V::const_iterator& view_iter) {
                   // Same element, reached via the mutable base iterator.
                   return mutable_begin + std::distance(const_begin, view_iter);
                 });
  return result;
}
// Returns mutable iterators mirroring tokens_view_'s const iterators, all
// pointing into this object's own tokens_ sequence.
TokenStreamReferenceView TextStructureView::MakeTokenStreamReferenceView() {
  return _CopyWriteableIterators(tokens_, tokens_view_);
}
// Returns a map from line number to the first token at-or-after that line's
// start offset; entry N+1 is tokens_.cend() so ranges can be formed by
// adjacent pairs.  Computed lazily and cached in lazy_line_token_map_.
const std::vector<TokenSequence::const_iterator>&
TextStructureView::GetLineTokenMap() const {
  // Lazily calculate the map. It is mutable, so we can modify it here.
  if (lazy_line_token_map_.empty()) {
    auto token_iter = tokens_.cbegin();
    const auto& offset_map = GetLineColumnMap().GetBeginningOfLineOffsets();
    // Line-start offsets are sorted, so each search resumes from the result
    // of the previous one instead of restarting at tokens_.cbegin().
    for (const auto offset : offset_map) {
      // TODO(fangism): linear search might be as competitive as binary search
      token_iter =
          std::lower_bound(token_iter, tokens_.cend(),
                           Contents().begin() + offset, &TokenLocationLess);
      lazy_line_token_map_.push_back(token_iter);
    }
    // Add an end() iterator so map has N+1 entries.
    // When there is no newline on the last line, this value will be different
    // from the previous iterator's index.
    lazy_line_token_map_.push_back(tokens_.cend());
  }
  return lazy_line_token_map_;
}
// Returns the range of tokens whose text starts within [lower, upper),
// measured as byte offsets into Contents().
TokenRange TextStructureView::TokenRangeSpanningOffsets(size_t lower,
                                                        size_t upper) const {
  const auto text_base = Contents().begin();
  // Two binary searches; the second resumes from the first's result because
  // lower <= upper implies the end bound cannot precede the begin bound.
  // TODO(fangism): Is there a way to make this work with a single call
  // to std::equal_range()?
  const auto begin_iter =
      std::lower_bound(tokens_.cbegin(), tokens_.cend(), text_base + lower,
                       &TokenLocationLess);
  const auto end_iter = std::lower_bound(begin_iter, tokens_.cend(),
                                         text_base + upper, &TokenLocationLess);
  return make_range(begin_iter, end_iter);
}
// Returns the [start, end) line/column range covered by 'token' within this
// view's contents.
LineColumnRange TextStructureView::GetRangeForToken(
    const TokenInfo& token) const {
  if (token.isEOF()) {
    // In particular some unit tests pass in an artificial EOF token, not a
    // EOF token generated from this view. So handle this directly.
    const LineColumn eofPos = GetLineColAtOffset(Contents().length());
    return {eofPos, eofPos};
  }
  // TODO(hzeller): This should simply be GetRangeForText(token.text()),
  // but the more thorough error checking in GetRangeForText()
  // exposes a token overrun in verilog_analyzer_test.cc
  // Defer to fix in separate change.
  return {GetLineColAtOffset(token.left(Contents())),
          GetLineColAtOffset(token.right(Contents()))};
}
// Maps a substring of Contents() to its [start, end) line/column range.
// 'text' must lie entirely inside Contents(); violations abort via CHECK.
LineColumnRange TextStructureView::GetRangeForText(
    absl::string_view text) const {
  const int start_offset = std::distance(Contents().begin(), text.begin());
  const int end_offset = std::distance(Contents().begin(), text.end());
  CHECK_GE(start_offset, 0) << '"' << text << '"';
  CHECK_LE(end_offset, static_cast<int>(Contents().length()))
      << '"' << text << '"';
  return {GetLineColAtOffset(start_offset), GetLineColAtOffset(end_offset)};
}
// Returns the range of tokens that begin on line 'lineno' (0-based).
// Out-of-range line numbers yield an empty range at tokens_.cend().
TokenRange TextStructureView::TokenRangeOnLine(size_t lineno) const {
  const auto& line_token_map = GetLineTokenMap();
  // The map holds N+1 entries, so a valid line needs lineno+1 in bounds.
  if (lineno + 1 >= line_token_map.size()) {
    return make_range(tokens_.cend(), tokens_.cend());
  }
  return make_range(line_token_map[lineno], line_token_map[lineno + 1]);
}
// Returns the token whose line/column span contains 'pos', or the EOF token
// if no such token exists (including for negative/invalid positions).
TokenInfo TextStructureView::FindTokenAt(const LineColumn& pos) const {
  if (pos.line < 0 || pos.column < 0) return EOFToken();
  // Linear scan over the tokens of one line; could become a binary search
  // if lines ever carry a huge number of tokens.
  for (const TokenInfo& candidate : TokenRangeOnLine(pos.line)) {
    if (GetRangeForToken(candidate).PositionInRange(pos)) return candidate;
  }
  return EOFToken();
}
// Returns an EOF token positioned at the end of this view's contents.
TokenInfo TextStructureView::EOFToken() const {
  return TokenInfo::EOFToken(Contents());
}
// Removes tokens from the TokenStreamView that do not satisfy the keep
// predicate.  Only the view is filtered; the underlying tokens_ sequence
// is left untouched.
void TextStructureView::FilterTokens(const TokenFilterPredicate& keep) {
  FilterTokenStreamViewInPlace(keep, &tokens_view_);
}
// Appends an EOF sentinel to a non-empty token sequence that does not
// already end with one.  No-op on empty or already-terminated streams.
static void TerminateTokenStream(TokenSequence* tokens) {
  if (tokens->empty() || tokens->back().isEOF()) return;
  // push_back might cause re-alloc.
  tokens->push_back(TokenInfo::EOFToken(tokens->back().text()));
}
// Narrows this view to the substring [left_offset, left_offset + length).
// Order matters: the syntax tree and tokens are trimmed against offsets into
// the *original* contents_ before TrimContents() rebases contents_ itself.
void TextStructureView::FocusOnSubtreeSpanningSubstring(int left_offset,
                                                        int length) {
  VLOG(2) << __FUNCTION__ << " at " << left_offset << " +" << length;
  const int right_offset = left_offset + length;
  TrimSyntaxTree(left_offset, right_offset);
  TrimTokensToSubstring(left_offset, right_offset);
  TrimContents(left_offset, length);
  // Cached line info and per-line token map are now stale; recompute.
  lazy_lines_info_.valid = false;
  CalculateFirstTokensPerLine();
  const absl::Status status = InternalConsistencyCheck();
  CHECK(status.ok())
      << "Failed internal iterator/string_view consistency check:\n "
      << status.message();
  VLOG(2) << "end of " << __FUNCTION__;
}
// Replace the syntax_tree_ field with the largest subtree wholly contained
// inside the offset bounds.
// Discards nodes outside of the referenced subtree.
void TextStructureView::TrimSyntaxTree(int first_token_offset,
int last_token_offset) {
const absl::string_view text_range(Contents().substr(
first_token_offset, last_token_offset - first_token_offset));
verible::TrimSyntaxTree(&syntax_tree_, text_range);
}
// Reduces the set of tokens to that spanned by [left_offset, right_offset).
// The resulting token stream is terminated with an EOF token, whose range
// reflects the right_offset.
void TextStructureView::TrimTokensToSubstring(int left_offset,
                                              int right_offset) {
  VLOG(2) << __FUNCTION__ << " [" << left_offset << ',' << right_offset << ')';
  // Find first token that starts at or after the offset. (binary_search)
  // Find first token that starts beyond the syntax tree. (binary_search)
  const auto view_trim_range =
      TokenRangeSpanningOffsets(left_offset, right_offset);
  CHECK(tokens_.begin() <= view_trim_range.begin());
  CHECK(view_trim_range.begin() <= view_trim_range.end());
  CHECK(view_trim_range.end() <= tokens_.end());
  // Find the view iterators that fall within this range.
  // (tokens_view_ holds iterators into tokens_, so iterator values themselves
  // are ordered and binary-searchable.)
  const auto iter_trim_begin = std::lower_bound(
      tokens_view_.begin(), tokens_view_.end(), view_trim_range.begin());
  const auto iter_trim_end = std::lower_bound(
      iter_trim_begin, tokens_view_.end(), view_trim_range.end());
  // Copy subset of tokens to new token sequence.
  TokenSequence trimmed_stream(view_trim_range.begin(), view_trim_range.end());
  // If the last token straddles the end-of-range, (possibly due to lexical
  // error), then trim its tail, bounded by right_offset.
  if (!trimmed_stream.empty()) {
    const absl::string_view substr(
        contents_.substr(left_offset, right_offset - left_offset));
    TokenInfo& last(trimmed_stream.back());
    // 'overhang' is how far the last token's text extends past the substring;
    // it is only meaningful (non-negative) when the sub-range check fails.
    const int overhang = std::distance(substr.end(), last.text().end());
    if (!IsSubRange(last.text(), substr)) {
      VLOG(2) << "last token overhangs end by " << overhang << ": " << last;
      absl::string_view trimmed_tail_token(last.text());
      trimmed_tail_token.remove_suffix(overhang);
      last.set_text(trimmed_tail_token);
      // TODO(fangism): Should the token enum be set to some error value,
      // if it is not already an error value?
    }
  }
  TerminateTokenStream(&trimmed_stream);  // Append EOF token.
  // Recalculate iterators for new token stream view, pointing into new
  // token sequence.
  const int index_difference =
      std::distance(tokens_.cbegin(), view_trim_range.begin());
  TokenStreamView trimmed_view;
  trimmed_view.reserve(std::distance(iter_trim_begin, iter_trim_end));
  for (auto token_iterator : make_range(iter_trim_begin, iter_trim_end)) {
    // Each new index is the old index shifted down by the number of tokens
    // dropped at the front of the sequence.
    const int old_index = std::distance(tokens_.cbegin(), token_iterator);
    const int new_index = old_index - index_difference;
    trimmed_view.push_back(trimmed_stream.begin() + new_index);
  }
  // Swap new-old arrays, which will cause old arrays to be deleted.
  tokens_view_.swap(trimmed_view);
  tokens_.swap(trimmed_stream);
}
// Shrinks contents_ to the subrange [left_offset, left_offset + length).
// Callers must have already trimmed tokens/tree to fit within this range.
void TextStructureView::TrimContents(int left_offset, int length) {
  contents_ = contents_.substr(left_offset, length);
}
// Lazily computes and caches per-line information for 'contents': the list
// of line string_views (split on '\n') and the derived LineColumnMap.
// Returns the cached result unchanged on subsequent calls.
const TextStructureView::LinesInfo& TextStructureView::LinesInfo::Get(
    absl::string_view contents) {
  if (valid) return *this;
  lines = absl::StrSplit(contents, '\n');
  // make_unique instead of raw new: exception-safe, no naked allocation.
  line_column_map = std::make_unique<LineColumnMap>(lines);
  valid = true;
  return *this;
}
// Re-points every token's string_view from 'src_base' into 'superstring',
// preserving each token's relative offset.  'offset' is the position within
// superstring where src_base's text begins.
void TextStructureView::RebaseTokensToSuperstring(absl::string_view superstring,
                                                  absl::string_view src_base,
                                                  int offset) {
  MutateTokens([&](TokenInfo* token) {
    // Relative position of this token within src_base.
    const int delta = token->left(src_base);
    // Superstring must point to separate memory space.
    token->RebaseStringView(superstring.begin() + offset + delta);
  });
  // Assigning superstring for the sake of maintaining range invariants.
  contents_ = superstring;
  // Line info derived from the old contents_ is now stale.
  lazy_lines_info_.valid = false;
}
void TextStructureView::MutateTokens(const LeafMutator& mutator) {
for (auto& token : tokens_) {
mutator(&token);
}
// No need to touch tokens_view_, all transformations are in-place.
if (syntax_tree_ != nullptr) {
// The tokens at the leaves of the tree are their own copies, and thus
// need to re-apply the same transformation.
MutateLeaves(&syntax_tree_, mutator);
}
}
// Returns a pointer to the last token that is not an EOF sentinel, or
// nullptr if every token is EOF (or the sequence is empty).  Scans from the
// back, so in practice this inspects at most a couple of tokens.
static const TokenInfo* FindLastNonEOFToken(const TokenSequence& tokens) {
  const auto is_real_token = [](const TokenInfo& t) { return !t.isEOF(); };
  const auto iter = std::find_if(tokens.rbegin(), tokens.rend(), is_real_token);
  if (iter == tokens.rend()) return nullptr;
  return &*iter;
}
// Spot-checks that tokens_, tokens_view_, and lazy_line_token_map_ reference
// memory/iterators within this view's bounds.  Only the extreme (first/last)
// elements of each structure are examined, so this is cheap, not a full scan.
absl::Status TextStructureView::FastTokenRangeConsistencyCheck() const {
  VLOG(2) << __FUNCTION__;
  // Check the ranges of the first and last element of the critical arrays.
  // A more thorough full-check would scan every single token.
  const auto lower_bound = contents_.begin();
  const auto upper_bound = contents_.end();
  if (!tokens_.empty()) {
    // Check that extremities of first and last token lie inside contents_.
    const TokenInfo& first = tokens_.front();
    if (!first.isEOF() && lower_bound > first.text().cbegin()) {
      return absl::InternalError(absl::StrCat(
          "Token offset points before beginning of string contents. delta=",
          std::distance(first.text().cbegin(), lower_bound)));
    }
    // EOF sentinels are skipped when locating the last meaningful token.
    const TokenInfo* last = FindLastNonEOFToken(tokens_);
    if (last != nullptr && last->text().cend() > upper_bound) {
      return absl::InternalError(absl::StrCat(
          "Token offset points past end of string contents. delta=",
          std::distance(upper_bound, last->text().cend())));
    }
    if (!tokens_view_.empty()) {
      // Check that TokenSequence iterators point into tokens_.
      if (tokens_.begin() > tokens_view_.front()) {
        return absl::InternalError(
            "First token iterator points before beginning of array.");
      }
      if (tokens_view_.front() >= tokens_.end()) {
        return absl::InternalError(
            "First token iterator points past end of array.");
      }
      if (tokens_.begin() > tokens_view_.back()) {
        return absl::InternalError(
            "Last token iterator points before beginning of array.");
      }
      if (tokens_view_.back() >= tokens_.end()) {
        return absl::InternalError(
            "Last token iterator points past end of array.");
      }
    }
    if (!lazy_line_token_map_.empty()) {
      // The lazily-built per-line map must span the whole token sequence.
      if (lazy_line_token_map_.front() != tokens_.begin()) {
        return absl::InternalError(
            "Per-line token iterator map does not start with the beginning of "
            "the token sequence.");
      }
      if (lazy_line_token_map_.back() != tokens_.end()) {
        return absl::InternalError(
            "Per-line token iterator map does not end with to the end of the "
            "token sequence.");
      }
    }
  }
  return absl::OkStatus();
}
// Verifies that the cached line views exactly tile contents_: the first line
// starts where the text starts and the last line ends where the text ends.
// An empty line set is trivially consistent.
absl::Status TextStructureView::FastLineRangeConsistencyCheck() const {
  VLOG(2) << __FUNCTION__;
  const auto& lines = Lines();
  if (lines.empty()) return absl::OkStatus();
  if (lines.front().cbegin() != contents_.cbegin()) {
    return absl::InternalError("First line does not match beginning of text.");
  }
  if (lines.back().cend() != contents_.cend()) {
    return absl::InternalError("Last line does not match end of text.");
  }
  return absl::OkStatus();
}
// Verifies that the syntax tree's extremal leaves reference text lying
// within contents_.  A null tree or a leafless tree is trivially consistent.
absl::Status TextStructureView::SyntaxTreeConsistencyCheck() const {
  VLOG(2) << __FUNCTION__;
  if (syntax_tree_ == nullptr) return absl::OkStatus();
  const char* const lower_bound = contents_.data();
  const char* const upper_bound = lower_bound + contents_.length();
  const SyntaxTreeLeaf* left = GetLeftmostLeaf(*syntax_tree_);
  if (!left) return absl::OkStatus();  // Tree has no leaves.
  // A non-null leftmost leaf implies a non-null rightmost leaf.
  const SyntaxTreeLeaf* right = GetRightmostLeaf(*syntax_tree_);
  if (lower_bound > left->get().text().cbegin()) {
    return absl::InternalError(
        "Left-most tree leaf points before beginning of contents.");
  }
  if (right->get().text().cend() > upper_bound) {
    return absl::InternalError(
        "Right-most tree leaf points past end of contents.");
  }
  return absl::OkStatus();
}
// Runs all consistency checks, cheapest first; returns the first failure.
absl::Status TextStructureView::InternalConsistencyCheck() const {
  RETURN_IF_ERROR(FastLineRangeConsistencyCheck());
  RETURN_IF_ERROR(FastTokenRangeConsistencyCheck());
  return SyntaxTreeConsistencyCheck();
}
// TokenRange can be a container reference or iterator range.
// TokenViewRange can be a container reference or iterator range.
//
// Appends token_source onto *destination and records, for each view iterator
// in view_source, the index its token will occupy in *destination.
template <typename TokenRange, typename TokenViewRange>
static void CopyTokensAndView(TokenSequence* destination,
                              std::vector<int>* view_indices,
                              const TokenRange& token_source,
                              const TokenViewRange& view_source) {
  // Indices must be computed before appending, using the pre-append size of
  // the destination as the base offset.
  const int base_index = destination->size();
  for (const auto& token_iter : view_source) {
    view_indices->push_back(base_index +
                            std::distance(token_source.begin(), token_iter));
  }
  // Bulk-append the source tokens after this expansion point.
  destination->insert(destination->end(), token_source.begin(),
                      token_source.end());
}
// Incrementally copies a slice of tokens and expands a single subtree.
// This advances the next_token_iter and next_token_view_iter iterators.
// The subtree from the expansion is transferred into this objects's syntax
// tree. Indices into the final token stream view are collected in
// token_view_indices. Offset is the location of each expansion point.
void TextStructureView::ConsumeDeferredExpansion(
    TokenSequence::const_iterator* next_token_iter,
    TokenStreamView::const_iterator* next_token_view_iter,
    DeferredExpansion* expansion, TokenSequence* combined_tokens,
    std::vector<int>* token_view_indices, const char* offset) {
  auto token_iter = *next_token_iter;
  auto token_view_iter = *next_token_view_iter;
  // Find the position up to each expansion point.
  // (Comparators order tokens by the starting address of their text.)
  *next_token_iter =
      std::lower_bound(token_iter, tokens_.cend(), offset,
                       [](const TokenInfo& token, const char* target) {
                         return std::distance(target, token.text().begin()) < 0;
                       });
  CHECK(*next_token_iter != tokens_.cend());
  *next_token_view_iter = std::lower_bound(
      token_view_iter, tokens_view_.cend(), offset,
      [](TokenStreamView::const_reference token_ref, const char* target) {
        return std::distance(target, token_ref->text().begin()) < 0;
      });
  CHECK(*next_token_view_iter != tokens_view_.cend());
  // Copy tokens and partial view into output.
  CopyTokensAndView(combined_tokens, token_view_indices,
                    make_range(token_iter, *next_token_iter),
                    make_range(token_view_iter, *next_token_view_iter));
  // Adjust locations of tokens in the expanded tree by pointing them
  // into the original text (contents_).
  std::unique_ptr<TextStructure>& subanalysis = expansion->subanalysis;
  TextStructureView& sub_data = ABSL_DIE_IF_NULL(subanalysis)->MutableData();
  const absl::string_view sub_data_text(sub_data.Contents());
  // The subanalysis must own separate memory whose text equals the text at
  // 'offset' in this view; rebasing below re-points it into contents_.
  CHECK(!IsSubRange(sub_data_text, contents_));
  CHECK_EQ(sub_data_text, absl::string_view(offset, sub_data_text.length()));
  CHECK_GE(offset, contents_.begin());
  sub_data.RebaseTokensToSuperstring(contents_, sub_data_text,
                                     std::distance(contents_.begin(), offset));
  // Translate token_view's iterators into array indices.
  if (!sub_data.tokens_.empty() && sub_data.tokens_.back().isEOF()) {
    // Remove auxiliary data's end-token sentinel before copying.
    // Don't want to splice it into result.
    sub_data.tokens_.pop_back();
  }
  CopyTokensAndView(combined_tokens, token_view_indices, sub_data.tokens_,
                    sub_data.tokens_view_);
  // Transfer ownership of transformed syntax tree to this object's tree.
  *expansion->expansion_point = std::move(sub_data.MutableSyntaxTree());
  subanalysis->MutableData().Clear();
  // Advance one past expansion point to skip over expanded token.
  ++*next_token_iter;
  ++*next_token_view_iter;
}
// Takes shared ownership of 'contents'; data_ views memory owned by it.
// NOTE(review): the init list reads contents_ when constructing data_, which
// assumes contents_ is declared before data_ in the class — confirm in header.
TextStructure::TextStructure(std::shared_ptr<MemBlock> contents)
    : contents_(std::move(contents)), data_(contents_->AsStringView()) {
  // Internal string_view must point to memory owned by contents_.
  const absl::Status status = InternalConsistencyCheck();
  CHECK(status.ok()) << status.message() << " (in ctor)";
}
// Convenience constructor: copies 'contents' into an owned StringMemBlock
// and delegates to the shared_ptr<MemBlock> constructor.
TextStructure::TextStructure(absl::string_view contents)
    : TextStructure(std::make_shared<StringMemBlock>(contents)) {}
// Verifies on destruction that data_ still views memory owned by contents_.
// Note this runs only the string_view check, not the full consistency check
// done in the constructor.
TextStructure::~TextStructure() {
  const absl::Status status = StringViewConsistencyCheck();
  CHECK(status.ok()) << status.message() << " (in dtor)";
}
// Splices each deferred expansion's tokens and subtree into this view at its
// recorded offset, producing a single combined token sequence, view, and
// syntax tree.
void TextStructureView::ExpandSubtrees(NodeExpansionMap* expansions) {
  TokenSequence combined_tokens;
  // Gather indices and reconstruct iterators after there are no more
  // reallocations due to growing combined_tokens.
  std::vector<int> combined_token_view_indices;
  auto token_iter = tokens_.cbegin();
  auto token_view_iter = tokens_view_.cbegin();
  // Expansion map keys are byte offsets into Contents().
  for (auto& expansion_entry : *expansions) {
    const auto offset = Contents().begin() + expansion_entry.first;
    ConsumeDeferredExpansion(&token_iter, &token_view_iter,
                             &expansion_entry.second, &combined_tokens,
                             &combined_token_view_indices, offset);
  }
  // Copy the remaining tokens beyond the last expansion point.
  CopyTokensAndView(&combined_tokens, &combined_token_view_indices,
                    make_range(token_iter, tokens_.cend()),
                    make_range(token_view_iter, tokens_view_.cend()));
  // Commit the newly expanded sequence of tokens.
  tokens_.swap(combined_tokens);
  // Reconstruct view iterators from indices into the new sequence.
  tokens_view_.clear();
  tokens_view_.reserve(combined_token_view_indices.size());
  for (const auto index : combined_token_view_indices) {
    tokens_view_.push_back(tokens_.cbegin() + index);
  }
  // Recalculate line-by-line token ranges.
  // TODO(fangism): Should be possible to update line_token_map_ incrementally
  // as well.
  CalculateFirstTokensPerLine();
}
// Checks that the view's contents is still a sub-range of the owned memory
// block.  An empty view is always acceptable.
absl::Status TextStructure::StringViewConsistencyCheck() const {
  const absl::string_view contents = data_.Contents();
  if (contents.empty() || IsSubRange(contents, contents_->AsStringView())) {
    return absl::OkStatus();
  }
  return absl::InternalError(
      "string_view contents_ is not a substring of contents_, "
      "contents_ might reference deallocated memory!");
}
// Full validation: ownership check of the string_view, then the view's own
// line/token/tree range checks.
absl::Status TextStructure::InternalConsistencyCheck() const {
  RETURN_IF_ERROR(StringViewConsistencyCheck());
  return data_.InternalConsistencyCheck();
}
} // namespace verible