common/text/text_structure.cc - third_party/verible - Git at Google

 // Copyright 2017-2020 The Verible Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Implementation of TextStructure methods.

 #include "common/text/text_structure.h"

 #include <algorithm>
 #include <cstddef>
 #include <iterator>
 #include <memory>
 #include <sstream>  // IWYU pragma: keep  // for ostringstream
 #include <string>
 #include <utility>
 #include <vector>

 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "common/strings/line_column_map.h"
 #include "common/text/concrete_syntax_leaf.h"
 #include "common/text/concrete_syntax_tree.h"
 #include "common/text/symbol.h"
 #include "common/text/token_info.h"
 #include "common/text/token_stream_view.h"
 #include "common/text/tree_utils.h"
 #include "common/util/iterator_range.h"
 #include "common/util/logging.h"
 #include "common/util/range.h"
 #include "common/util/status_macros.h"

 namespace verible {

 TextStructureView::TextStructureView(absl::string_view contents)
     : contents_(contents) {
   // more than sufficient memory as number-of-tokens <= bytes-in-file,
   // push_back() should never re-alloc because size <= initial capacity.
   tokens_.reserve(contents.length());
   const absl::Status status = InternalConsistencyCheck();
   CHECK(status.ok())
       << "Failed internal iterator/string_view consistency check in ctor:\n  "
       << status.message();
 }

 TextStructureView::~TextStructureView() {
   const absl::Status status = InternalConsistencyCheck();
   CHECK(status.ok())
       << "Failed internal iterator/string_view consistency check in dtor:\n  "
       << status.message();
 }

 void TextStructureView::Clear() {
   syntax_tree_ = nullptr;
   lazy_lines_info_.valid = false;
   lazy_line_token_map_.clear();
   tokens_view_.clear();
   tokens_.clear();
   contents_ = contents_.substr(0, 0);  // clear
 }

 static bool TokenLocationLess(const TokenInfo& token, const char* offset) {
   return token.text().begin() < offset;
 }

 // Makes an iterator-writable copy of items_view without using const_cast.
 template <class V>
 std::vector<typename V::iterator> _CopyWriteableIterators(
     V& items, const std::vector<typename V::const_iterator>& items_view) {
   // precondition: items_view's iterators all point into items array.
   // postcondition: results's iterators point to the same items as items_view.
   std::vector<typename V::iterator> result;
   result.reserve(items_view.size());
   typename V::iterator iter(items.begin());
   const typename V::const_iterator const_iter(items.begin());
   for (auto view_iter : items_view) {
     result.push_back(iter + std::distance(const_iter, view_iter));
   }
   return result;
 }

 TokenStreamReferenceView TextStructureView::MakeTokenStreamReferenceView() {
   return _CopyWriteableIterators(tokens_, tokens_view_);
 }

 const std::vector<TokenSequence::const_iterator>&
 TextStructureView::GetLineTokenMap() const {
   // Lazily calculate the map. It is mutable, so we can modify it here.
   if (lazy_line_token_map_.empty()) {
     auto token_iter = tokens_.cbegin();
     const auto& offset_map = GetLineColumnMap().GetBeginningOfLineOffsets();
     for (const auto offset : offset_map) {
       // TODO(fangism): linear search might be as competitive as binary search
       token_iter =
           std::lower_bound(token_iter, tokens_.cend(),
                            Contents().begin() + offset, &TokenLocationLess);
       lazy_line_token_map_.push_back(token_iter);
     }
     // Add an end() iterator so map has N+1 entries.
     // When there is no newline on the last line, this value will be different
     // from the previous iterator's index.
     lazy_line_token_map_.push_back(tokens_.cend());
   }

   return lazy_line_token_map_;
 }

 TokenRange TextStructureView::TokenRangeSpanningOffsets(size_t lower,
                                                         size_t upper) const {
   const auto text_base = Contents().begin();
   // Range is found with two binary searches.
   // TODO(fangism): Is there a way to make this work with a single call
   // to std::equal_range()?
   const auto left = std::lower_bound(tokens_.cbegin(), tokens_.cend(),
                                      text_base + lower, &TokenLocationLess);
   const auto right = std::lower_bound(left, tokens_.cend(), text_base + upper,
                                       &TokenLocationLess);
   return make_range(left, right);
 }

 LineColumnRange TextStructureView::GetRangeForToken(
     const TokenInfo& token) const {
   if (token.isEOF()) {
     // In particular some unit tests pass in an artificial EOF token, not a
     // EOF token generated from this view. So handle this directly.
     const LineColumn eofPos = GetLineColAtOffset(Contents().length());
     return {eofPos, eofPos};
   }
   // TODO(hzeller): This should simply be GetRangeForText(token.text()),
   // but the more thorough error checking in GetRangeForText()
   // exposes a token overrun in verilog_analyzer_test.cc
   // Defer to fix in separate change.
   return {GetLineColAtOffset(token.left(Contents())),
           GetLineColAtOffset(token.right(Contents()))};
 }

 LineColumnRange TextStructureView::GetRangeForText(
     absl::string_view text) const {
   const int from = std::distance(Contents().begin(), text.begin());
   const int to = std::distance(Contents().begin(), text.end());
   CHECK_GE(from, 0) << '"' << text << '"';
   CHECK_LE(to, static_cast<int>(Contents().length())) << '"' << text << '"';
   return {GetLineColAtOffset(from), GetLineColAtOffset(to)};
 }

 TokenRange TextStructureView::TokenRangeOnLine(size_t lineno) const {
   const auto& line_token_map = GetLineTokenMap();
   if (lineno + 1 < line_token_map.size()) {
     return make_range(line_token_map[lineno], line_token_map[lineno + 1]);
   } else {
     return make_range(tokens_.cend(), tokens_.cend());
   }
 }

 TokenInfo TextStructureView::FindTokenAt(const LineColumn& pos) const {
   if (pos.line < 0 || pos.column < 0) return EOFToken();
   // Maybe do binary search here if we have a huge amount tokens per line.
   for (const TokenInfo& token : TokenRangeOnLine(pos.line)) {
     if (GetRangeForToken(token).PositionInRange(pos)) return token;
   }
   return EOFToken();
 }

 TokenInfo TextStructureView::EOFToken() const {
   return TokenInfo::EOFToken(Contents());
 }

 // Removes tokens from the TokenStreamView that do not satisfy the keep
 // predicate.
 void TextStructureView::FilterTokens(const TokenFilterPredicate& keep) {
   FilterTokenStreamViewInPlace(keep, &tokens_view_);
 }

 static void TerminateTokenStream(TokenSequence* tokens) {
   if (tokens->empty()) return;
   if (tokens->back().isEOF()) return;
   // push_back might cause re-alloc.
   tokens->push_back(TokenInfo::EOFToken(tokens->back().text()));
 }

 void TextStructureView::FocusOnSubtreeSpanningSubstring(int left_offset,
                                                         int length) {
   VLOG(2) << __FUNCTION__ << " at " << left_offset << " +" << length;
   const int right_offset = left_offset + length;
   TrimSyntaxTree(left_offset, right_offset);
   TrimTokensToSubstring(left_offset, right_offset);
   TrimContents(left_offset, length);
   lazy_lines_info_.valid = false;
   CalculateFirstTokensPerLine();
   const absl::Status status = InternalConsistencyCheck();
   CHECK(status.ok())
       << "Failed internal iterator/string_view consistency check:\n  "
       << status.message();
   VLOG(2) << "end of " << __FUNCTION__;
 }

 // Replace the syntax_tree_ field with the largest subtree wholly contained
 // inside the offset bounds.
 // Discards nodes outside of the referenced subtree.
 void TextStructureView::TrimSyntaxTree(int first_token_offset,
                                        int last_token_offset) {
   const absl::string_view text_range(Contents().substr(
       first_token_offset, last_token_offset - first_token_offset));
   verible::TrimSyntaxTree(&syntax_tree_, text_range);
 }

 // Reduces the set of tokens to that spanned by [left_offset, right_offset).
 // The resulting token stream is terminated with an EOF token, whose range
 // reflects the right_offset.
 void TextStructureView::TrimTokensToSubstring(int left_offset,
                                               int right_offset) {
   VLOG(2) << __FUNCTION__ << " [" << left_offset << ',' << right_offset << ')';
   // Find first token that starts at or after the offset.  (binary_search)
   // Find first token that starts beyond the syntax tree.  (binary_search)
   const auto view_trim_range =
       TokenRangeSpanningOffsets(left_offset, right_offset);
   CHECK(tokens_.begin() <= view_trim_range.begin());
   CHECK(view_trim_range.begin() <= view_trim_range.end());
   CHECK(view_trim_range.end() <= tokens_.end());

   // Find the view iterators that fall within this range.
   const auto iter_trim_begin = std::lower_bound(
       tokens_view_.begin(), tokens_view_.end(), view_trim_range.begin());
   const auto iter_trim_end = std::lower_bound(
       iter_trim_begin, tokens_view_.end(), view_trim_range.end());

   // Copy subset of tokens to new token sequence.
   TokenSequence trimmed_stream(view_trim_range.begin(), view_trim_range.end());

   // If the last token straddles the end-of-range, (possibly due to lexical
   // error), then trim its tail, bounded by right_offset.
   if (!trimmed_stream.empty()) {
     const absl::string_view substr(
         contents_.substr(left_offset, right_offset - left_offset));
     TokenInfo& last(trimmed_stream.back());
     const int overhang = std::distance(substr.end(), last.text().end());
     if (!IsSubRange(last.text(), substr)) {
       VLOG(2) << "last token overhangs end by " << overhang << ": " << last;
       absl::string_view trimmed_tail_token(last.text());
       trimmed_tail_token.remove_suffix(overhang);
       last.set_text(trimmed_tail_token);
       // TODO(fangism): Should the token enum be set to some error value,
       // if it is not already an error value?
     }
   }

   TerminateTokenStream(&trimmed_stream);  // Append EOF token.

   // Recalculate iterators for new token stream view, pointing into new
   // token sequence.
   const int index_difference =
       std::distance(tokens_.cbegin(), view_trim_range.begin());
   TokenStreamView trimmed_view;
   trimmed_view.reserve(std::distance(iter_trim_begin, iter_trim_end));
   for (auto token_iterator : make_range(iter_trim_begin, iter_trim_end)) {
     const int old_index = std::distance(tokens_.cbegin(), token_iterator);
     const int new_index = old_index - index_difference;
     trimmed_view.push_back(trimmed_stream.begin() + new_index);
   }

   // Swap new-old arrays, which will cause old arrays to be deleted.
   tokens_view_.swap(trimmed_view);
   tokens_.swap(trimmed_stream);
 }

 void TextStructureView::TrimContents(int left_offset, int length) {
   contents_ = contents_.substr(left_offset, length);
 }

 const TextStructureView::LinesInfo& TextStructureView::LinesInfo::Get(
     absl::string_view contents) {
   if (valid) return *this;

   lines = absl::StrSplit(contents, '\n');
   line_column_map.reset(new LineColumnMap(lines));
   valid = true;

   return *this;
 }

 void TextStructureView::RebaseTokensToSuperstring(absl::string_view superstring,
                                                   absl::string_view src_base,
                                                   int offset) {
   MutateTokens([&](TokenInfo* token) {
     const int delta = token->left(src_base);
     // Superstring must point to separate memory space.
     token->RebaseStringView(superstring.begin() + offset + delta);
   });
   // Assigning superstring for the sake of maintaining range invariants.
   contents_ = superstring;
   lazy_lines_info_.valid = false;
 }

 void TextStructureView::MutateTokens(const LeafMutator& mutator) {
   for (auto& token : tokens_) {
     mutator(&token);
   }
   // No need to touch tokens_view_, all transformations are in-place.

   if (syntax_tree_ != nullptr) {
     // The tokens at the leaves of the tree are their own copies, and thus
     // need to re-apply the same transformation.
     MutateLeaves(&syntax_tree_, mutator);
   }
 }

 // Find the last non-EOF token.  Usually searches at most 2 tokens.
 static const TokenInfo* FindLastNonEOFToken(const TokenSequence& tokens) {
   const auto iter =
       std::find_if(tokens.rbegin(), tokens.rend(),
                    [](const TokenInfo& token) { return !token.isEOF(); });
   return iter != tokens.rend() ? &*iter : nullptr;
 }

 absl::Status TextStructureView::FastTokenRangeConsistencyCheck() const {
   VLOG(2) << __FUNCTION__;
   // Check the ranges of the first and last element of the critical arrays.
   // A more thorough full-check would scan every single token.
   const auto lower_bound = contents_.begin();
   const auto upper_bound = contents_.end();
   if (!tokens_.empty()) {
     // Check that extremities of first and last token lie inside contents_.
     const TokenInfo& first = tokens_.front();
     if (!first.isEOF() && lower_bound > first.text().cbegin()) {
       return absl::InternalError(absl::StrCat(
           "Token offset points before beginning of string contents.  delta=",
           std::distance(first.text().cbegin(), lower_bound)));
     }
     const TokenInfo* last = FindLastNonEOFToken(tokens_);
     if (last != nullptr && last->text().cend() > upper_bound) {
       return absl::InternalError(absl::StrCat(
           "Token offset points past end of string contents.  delta=",
           std::distance(upper_bound, last->text().cend())));
     }
     if (!tokens_view_.empty()) {
       // Check that TokenSequence iterators point into tokens_.
       if (tokens_.begin() > tokens_view_.front()) {
         return absl::InternalError(
             "First token iterator points before beginning of array.");
       }
       if (tokens_view_.front() >= tokens_.end()) {
         return absl::InternalError(
             "First token iterator points past end of array.");
       }
       if (tokens_.begin() > tokens_view_.back()) {
         return absl::InternalError(
             "Last token iterator points before beginning of array.");
       }
       if (tokens_view_.back() >= tokens_.end()) {
         return absl::InternalError(
             "Last token iterator points past end of array.");
       }
     }
     if (!lazy_line_token_map_.empty()) {
       if (lazy_line_token_map_.front() != tokens_.begin()) {
         return absl::InternalError(
             "Per-line token iterator map does not start with the beginning of "
             "the token sequence.");
       }
       if (lazy_line_token_map_.back() != tokens_.end()) {
         return absl::InternalError(
             "Per-line token iterator map does not end with to the end of the "
             "token sequence.");
       }
     }
   }
   return absl::OkStatus();
 }

 absl::Status TextStructureView::FastLineRangeConsistencyCheck() const {
   VLOG(2) << __FUNCTION__;
   const auto& lines = Lines();
   if (!lines.empty()) {
     if (lines.front().cbegin() != contents_.cbegin()) {
       return absl::InternalError(
           "First line does not match beginning of text.");
     }
     if (lines.back().cend() != contents_.cend()) {
       return absl::InternalError("Last line does not match end of text.");
     }
   }
   return absl::OkStatus();
 }

 absl::Status TextStructureView::SyntaxTreeConsistencyCheck() const {
   VLOG(2) << __FUNCTION__;
   // Check that first and last token in syntax_tree_ point to text
   // inside contents_.
   const char* const lower_bound = contents_.data();
   const char* const upper_bound = lower_bound + contents_.length();
   if (syntax_tree_ != nullptr) {
     const SyntaxTreeLeaf* left = GetLeftmostLeaf(*syntax_tree_);
     if (!left) return absl::OkStatus();
     const SyntaxTreeLeaf* right = GetRightmostLeaf(*syntax_tree_);
     if (lower_bound > left->get().text().cbegin()) {
       return absl::InternalError(
           "Left-most tree leaf points before beginning of contents.");
     }
     if (right->get().text().cend() > upper_bound) {
       return absl::InternalError(
           "Right-most tree leaf points past end of contents.");
     }
   }
   return absl::OkStatus();
 }

 absl::Status TextStructureView::InternalConsistencyCheck() const {
   RETURN_IF_ERROR(FastLineRangeConsistencyCheck());
   RETURN_IF_ERROR(FastTokenRangeConsistencyCheck());
   return SyntaxTreeConsistencyCheck();
 }

 // TokenRange can be a container reference or iterator range.
 // TokenViewRange can be a container reference or iterator range.
 template <typename TokenRange, typename TokenViewRange>
 static void CopyTokensAndView(TokenSequence* destination,
                               std::vector<int>* view_indices,
                               const TokenRange& token_source,
                               const TokenViewRange& view_source) {
   // Translate token_view's iterators into array indices, adjusting for the
   // number of pre-existing tokens.
   for (const auto& token_iter : view_source) {
     view_indices->push_back(destination->size() +
                             std::distance(token_source.begin(), token_iter));
   }
   // Copy tokens up to this expansion point.
   for (const auto& token : token_source) {
     destination->push_back(token);
   }
 }

 // Incrementally copies a slice of tokens and expands a single subtree.
 // This advances the next_token_iter and next_token_view_iter iterators.
 // The subtree from the expansion is transferred into this objects's syntax
 // tree.  Indices into the final token stream view are collected in
 // token_view_indices.  Offset is the location of each expansion point.
 void TextStructureView::ConsumeDeferredExpansion(
     TokenSequence::const_iterator* next_token_iter,
     TokenStreamView::const_iterator* next_token_view_iter,
     DeferredExpansion* expansion, TokenSequence* combined_tokens,
     std::vector<int>* token_view_indices, const char* offset) {
   auto token_iter = *next_token_iter;
   auto token_view_iter = *next_token_view_iter;
   // Find the position up to each expansion point.
   *next_token_iter =
       std::lower_bound(token_iter, tokens_.cend(), offset,
                        [](const TokenInfo& token, const char* target) {
                          return std::distance(target, token.text().begin()) < 0;
                        });
   CHECK(*next_token_iter != tokens_.cend());
   *next_token_view_iter = std::lower_bound(
       token_view_iter, tokens_view_.cend(), offset,
       [](TokenStreamView::const_reference token_ref, const char* target) {
         return std::distance(target, token_ref->text().begin()) < 0;
       });
   CHECK(*next_token_view_iter != tokens_view_.cend());

   // Copy tokens and partial view into output.
   CopyTokensAndView(combined_tokens, token_view_indices,
                     make_range(token_iter, *next_token_iter),
                     make_range(token_view_iter, *next_token_view_iter));

   // Adjust locations of tokens in the expanded tree by pointing them
   // into the original text (contents_).
   std::unique_ptr<TextStructure>& subanalysis = expansion->subanalysis;
   TextStructureView& sub_data = ABSL_DIE_IF_NULL(subanalysis)->MutableData();
   const absl::string_view sub_data_text(sub_data.Contents());
   CHECK(!IsSubRange(sub_data_text, contents_));
   CHECK_EQ(sub_data_text, absl::string_view(offset, sub_data_text.length()));
   CHECK_GE(offset, contents_.begin());
   sub_data.RebaseTokensToSuperstring(contents_, sub_data_text,
                                      std::distance(contents_.begin(), offset));

   // Translate token_view's iterators into array indices.
   if (!sub_data.tokens_.empty() && sub_data.tokens_.back().isEOF()) {
     // Remove auxiliary data's end-token sentinel before copying.
     // Don't want to splice it into result.
     sub_data.tokens_.pop_back();
   }
   CopyTokensAndView(combined_tokens, token_view_indices, sub_data.tokens_,
                     sub_data.tokens_view_);

   // Transfer ownership of transformed syntax tree to this object's tree.
   *expansion->expansion_point = std::move(sub_data.MutableSyntaxTree());
   subanalysis->MutableData().Clear();

   // Advance one past expansion point to skip over expanded token.
   ++*next_token_iter;
   ++*next_token_view_iter;
 }

 TextStructure::TextStructure(std::shared_ptr<MemBlock> contents)
     : contents_(std::move(contents)), data_(contents_->AsStringView()) {
   // Internal string_view must point to memory owned by contents_.
   const absl::Status status = InternalConsistencyCheck();
   CHECK(status.ok()) << status.message() << " (in ctor)";
 }

 TextStructure::TextStructure(absl::string_view contents)
     : TextStructure(std::make_shared<StringMemBlock>(contents)) {}

 TextStructure::~TextStructure() {
   const absl::Status status = StringViewConsistencyCheck();
   CHECK(status.ok()) << status.message() << " (in dtor)";
 }

 void TextStructureView::ExpandSubtrees(NodeExpansionMap* expansions) {
   TokenSequence combined_tokens;
   // Gather indices and reconstruct iterators after there are no more
   // reallocations due to growing combined_tokens.
   std::vector<int> combined_token_view_indices;
   auto token_iter = tokens_.cbegin();
   auto token_view_iter = tokens_view_.cbegin();
   for (auto& expansion_entry : *expansions) {
     const auto offset = Contents().begin() + expansion_entry.first;
     ConsumeDeferredExpansion(&token_iter, &token_view_iter,
                              &expansion_entry.second, &combined_tokens,
                              &combined_token_view_indices, offset);
   }

   // Copy the remaining tokens beyond the last expansion point.
   CopyTokensAndView(&combined_tokens, &combined_token_view_indices,
                     make_range(token_iter, tokens_.cend()),
                     make_range(token_view_iter, tokens_view_.cend()));

   // Commit the newly expanded sequence of tokens.
   tokens_.swap(combined_tokens);

   // Reconstruct view iterators from indices into the new sequence.
   tokens_view_.clear();
   tokens_view_.reserve(combined_token_view_indices.size());
   for (const auto index : combined_token_view_indices) {
     tokens_view_.push_back(tokens_.cbegin() + index);
   }

   // Recalculate line-by-line token ranges.
   // TODO(fangism): Should be possible to update line_token_map_ incrementally
   // as well.
   CalculateFirstTokensPerLine();
 }

 absl::Status TextStructure::StringViewConsistencyCheck() const {
   const absl::string_view contents = data_.Contents();
   if (!contents.empty() && !IsSubRange(contents, contents_->AsStringView())) {
     return absl::InternalError(
         "string_view contents_ is not a substring of contents_, "
         "contents_ might reference deallocated memory!");
   }
   return absl::OkStatus();
 }

 absl::Status TextStructure::InternalConsistencyCheck() const {
   RETURN_IF_ERROR(StringViewConsistencyCheck());
   return data_.InternalConsistencyCheck();
 }

 }  // namespace verible