verilog/formatting/token_annotator.cc - third_party/verible - Git at Google

 // Copyright 2017-2020 The Verible Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "verilog/formatting/token_annotator.h"

 #include <iterator>
 #include <vector>

 #include "absl/strings/string_view.h"
 #include "common/formatting/format_token.h"
 #include "common/formatting/tree_annotator.h"
 #include "common/strings/range.h"
 #include "common/text/syntax_tree_context.h"
 #include "common/text/text_structure.h"
 #include "common/text/token_info.h"
 #include "common/util/iterator_range.h"
 #include "common/util/logging.h"
 #include "common/util/with_reason.h"
 #include "verilog/CST/verilog_nonterminals.h"
 #include "verilog/formatting/format_style.h"
 #include "verilog/formatting/verilog_token.h"
 #include "verilog/parser/verilog_parser.h"
 #include "verilog/parser/verilog_token_classifications.h"
 #include "verilog/parser/verilog_token_enum.h"

 namespace verilog {
 namespace formatter {

 using ::verible::PreFormatToken;
 using ::verible::SpacingOptions;
 using ::verible::SyntaxTreeContext;
 using ::verible::WithReason;
 using FTT = FormatTokenType;

 // Signal that spacing was not explicitly handled in case logic.
 // The token/context overload of SpacesRequiredBetween() returns this value
 // when none of its rules apply, and the FormatStyle overload compares against
 // it to detect that situation.
 // This value must be negative (every explicitly handled case in this file
 // yields a space count >= 0, so a negative value cannot collide with one).
 static constexpr int kUnhandledSpacesRequired = -1;

 // Returns true if 'left' acts as a unary prefix operator in front of the
 // token that follows it.  This holds when 'left' is a unary operator token
 // and 'context' is inside a kUnaryPrefixExpression (with kExpression as the
 // excluding node), or when 'left' is the '##' token.
 static bool IsUnaryPrefixExpressionOperand(const PreFormatToken& left,
                                            const SyntaxTreeContext& context) {
   const bool unary_in_prefix_expression =
       IsUnaryOperator(verilog_tokentype(left.TokenEnum())) &&
       context.IsInsideFirst({NodeEnum::kUnaryPrefixExpression},
                             {NodeEnum::kExpression});
   if (unary_in_prefix_expression) {
     return true;
   }
   // Treat '##' like a unary prefix operator.
   return left.TokenEnum() == verilog_tokentype::TK_POUNDPOUND;
 }

 // Returns true if the boundary between 'left' and 'right' lies inside a based
 // numeric literal: either 'left' is a numeric base (anything may follow), or
 // 'left' is a numeric literal that is directly followed by a numeric base.
 static bool IsInsideNumericLiteral(const PreFormatToken& left,
                                    const PreFormatToken& right) {
   if (left.format_token_enum == FormatTokenType::numeric_base) {
     return true;
   }
   if (left.format_token_enum != FormatTokenType::numeric_literal) {
     return false;
   }
   return right.format_token_enum == FormatTokenType::numeric_base;
 }

 // Tells whether the keyword token 'e' may be used like a function or method
 // call.  The set of keywords is based on various LRM sections mentioning
 // subroutine calls.
 static bool IsKeywordCallable(verilog_tokentype e) {
   switch (e) {
     // Keywords that double as array methods.
     case TK_and:
     case TK_or:
     case TK_xor:
     case TK_unique:
     // Locator-style methods.
     case TK_find:
     case TK_find_index:
     case TK_find_first:
     case TK_find_first_index:
     case TK_find_last:
     case TK_find_last_index:
     case TK_min:
     case TK_max:
     // Ordering-style methods.
     case TK_reverse:
     case TK_rsort:
     case TK_shuffle:
     case TK_sort:
     // Reduction-style methods.
     case TK_product:
     case TK_sum:
     // Other callable keywords.
     case TK_new:
     case TK_randomize:
     case TK_wait:  // wait statement
       // TODO(fangism): Verilog-AMS functions, like sin, cos, ...
       return true;
     default:
       return false;
   }
 }

 // Returns true if 'ftoken' is a decimal number, an identifier, or a keyword.
 // Two adjacent tokens of these kinds cannot be joined without a space:
 //   number number : would result in one different number
 //   number id/kw : would result in a bad identifier (lexer)
 //   id/kw number : would result in a (different) identifier
 //   id/kw id/kw : would result in a (different) identifier
 static bool PairwiseNonmergeable(const PreFormatToken& ftoken) {
   if (ftoken.TokenEnum() == TK_DecNumber) {
     return true;
   }
   if (ftoken.format_token_enum == FormatTokenType::identifier) {
     return true;
   }
   return ftoken.format_token_enum == FormatTokenType::keyword;
 }

 // Returns true if 'context' is inside a kDimensionRange or kDimensionScalar
 // node (no excluding node types are given).
 static bool InDeclaredDimensions(const SyntaxTreeContext& context) {
   const bool inside_declared_dimension = context.IsInsideFirst(
       {NodeEnum::kDimensionRange, NodeEnum::kDimensionScalar}, {});
   return inside_declared_dimension;
 }

 // Returns true if 'context' is inside one of the range-like nodes:
 // kSelectVariableDimension, kDimensionRange, kDimensionSlice, or
 // kCycleDelayRange (no excluding node types are given).
 static bool InRangeLikeContext(const SyntaxTreeContext& context) {
   const bool inside_range_like_node = context.IsInsideFirst(
       {NodeEnum::kSelectVariableDimension, NodeEnum::kDimensionRange,
        NodeEnum::kDimensionSlice, NodeEnum::kCycleDelayRange},
       {});
   return inside_range_like_node;
 }

 // Returns true if 'ftoken' is ';' or one of its syntactically disambiguated
 // variants (SemicolonEndOfAssertionVariableDeclarations).
 static bool IsAnySemicolon(const PreFormatToken& ftoken) {
   const auto token_enum = ftoken.TokenEnum();
   if (token_enum == ';') {
     return true;
   }
   return token_enum ==
          verilog_tokentype::SemicolonEndOfAssertionVariableDeclarations;
 }

 // Returns minimum number of spaces required between left and right token.
 // Returning kUnhandledSpacesRequired means the case was not explicitly
 // handled, and it is up to the caller to decide what to do when this happens.
 //
 // The body is a sequence of rules, each of which returns as soon as it
 // matches, so the first matching rule wins and the order of the rules
 // determines their precedence.
 //
 // Parameters:
 //   left, right: the two adjacent tokens, in order of appearance.
 //   left_context, right_context: syntax tree contexts consulted for the left
 //     and right token, respectively.
 // Returns:
 //   A {space count, reason} pair; the reason is a human-readable string that
 //   names the rule that produced the value.
 static WithReason<int> SpacesRequiredBetween(
     const PreFormatToken& left, const PreFormatToken& right,
     const SyntaxTreeContext& left_context,
     const SyntaxTreeContext& right_context) {
   VLOG(3) << "Spacing between " << verilog_symbol_name(left.TokenEnum())
           << " and " << verilog_symbol_name(right.TokenEnum());
   // Higher precedence rules should be handled earlier in this function.

   // Preserve space after escaped identifiers.
   if (left.TokenEnum() == EscapedIdentifier) {
     return {1, "Escaped identifiers must end with whitespace."};
   }

   if (right.TokenEnum() == verilog_tokentype::TK_LINE_CONT) {
     return {0, "Add no spaces before \\ line continuation."};
   }
   if (left.TokenEnum() == verilog_tokentype::TK_LINE_CONT) {
     return {0, "Add no spaces after \\ line continuation."};
   }

   if (IsComment(FormatTokenType(right.format_token_enum))) {
     return {2, "Style: require 2+ spaces before comments"};
     // TODO(fangism): Take this from FormatStyle.
   }

   if (left.format_token_enum == FormatTokenType::open_group ||
       right.format_token_enum == FormatTokenType::close_group) {
     return {0,
             "Prefer \"(foo)\" over \"( foo )\", \"[x]\" over \"[ x ]\", "
             "and \"{y}\" over \"{ y }\"."};
   }

   // For now, leave everything inside [dimensions] alone.
   if (InDeclaredDimensions(right_context)) {
     // ... except for the spacing before '[' and around ':',
     // which are covered elsewhere.
     if (right.TokenEnum() != '[' && left.TokenEnum() != ':' &&
         right.TokenEnum() != ':') {
       return {kUnhandledSpacesRequired,
               "Leave [expressions] inside scalar and range dimensions alone "
               "(for now)."};
     }
   }

   // Unary operators (context-sensitive)
   // The second half of the condition skips this rule when 'left' is also
   // classified as a binary operator and 'right' is itself a unary operator.
   if (IsUnaryPrefixExpressionOperand(left, right_context) &&
       (left.format_token_enum != FormatTokenType::binary_operator ||
        !IsUnaryOperator(static_cast<verilog_tokentype>(right.TokenEnum())))) {
     // TODO: There are _some_ unary operators on the right that could
     // be formatted with 0-space, for example:
     // 'a = & ~b'; could be 'a = &~b;'
     return {0, "Bind unary prefix operator close to its operand."};
   }

   if (left.TokenEnum() == TK_SCOPE_RES) {
     return {0, "Prefer \"::id\" over \":: id\", \"::*\" over \":: *\""};
   }

   // Delimiters, list separators
   if (right.TokenEnum() == ',') return {0, "No space before comma"};
   if (left.TokenEnum() == ',') return {1, "Require space after comma"};

   // Every pair whose right token is any kind of semicolon and that reaches
   // this point returns here, so later rules never see a semicolon on the
   // right.
   if (IsAnySemicolon(right)) {
     if (left.TokenEnum() == ':') {
       return {1, "Space between semicolon and colon, (e.g. \"default: ;\")"};
     }
     return {0, "No space before semicolon"};
   }
   if (IsAnySemicolon(left)) {
     return {1, "Require space after semicolon"};
   }

   if (right_context.IsInsideFirst({NodeEnum::kStreamingConcatenation}, {})) {
     if (left.TokenEnum() == TK_LS || left.TokenEnum() == TK_RS) {
       return {0, "No space around streaming operators"};
     } else if (left.format_token_enum == FormatTokenType::numeric_literal ||
                left.format_token_enum == FormatTokenType::identifier ||
                left.format_token_enum == FormatTokenType::keyword) {
       return {0, "No space around streaming operator slice size"};
     }
   }

   // "@(" vs. "@ (" for event control
   // "@*" vs. "@ *" for event control, '*' is not a binary operator here
   if (left.TokenEnum() == '@') {
     return {0, "No space after \"@\" in most cases."};
   }
   if (right.TokenEnum() == '@') {
     return {1, "Space before \"@\" in most cases."};
   }

   // Do not force space between '^' and '{' operators
   if (right_context.IsInsideFirst({NodeEnum::kUnaryPrefixExpression}, {})) {
     if (IsUnaryOperator(static_cast<verilog_tokentype>(left.TokenEnum())) &&
         right.TokenEnum() == '{') {
       return {0, "No space between unary and concatenation operators"};
     }
   }

   // Add missing space around either side of all types of assignment operator.
   // "assign foo = bar;"  instead of "assign foo =bar;"
   // Consider assignment operators in the same class as binary operators.
   if (left.format_token_enum == FormatTokenType::binary_operator ||
       right.format_token_enum == FormatTokenType::binary_operator) {
     // Inside [], allows 0 or 1 spaces, and symmetrize.
     // TODO(fangism): make this behavior configurable
     if (right.format_token_enum == FormatTokenType::binary_operator &&
         InRangeLikeContext(right_context)) {
       // Clamp the original spacing before the operator to at most 1.
       int spaces = right.OriginalLeadingSpaces().length();
       if (spaces > 1) {
         spaces = 1;
       }
       return {spaces, "Limit <= 1 space before binary operator inside []."};
     }
     if (left.format_token_enum == FormatTokenType::binary_operator &&
         InRangeLikeContext(left_context)) {
       // Reuse the spacing recorded on the operator itself; this relies on
       // left.before.spaces_required having been set before this call.
       return {left.before.spaces_required,
               "Symmetrize spaces before and after binary operator inside []."};
     }
     return {1, "Space around binary and assignment operators"};
   }

   // If the token on either side is an empty string, do not inject any
   // additional spaces.  This can occur with some lexical tokens like
   // verilog_tokentype::PP_define_body.
   if (left.token->text().empty() || right.token->text().empty()) {
     return {0, "No additional space around empty-string tokens."};
   }

   // Remove any extra spaces between numeric literals' width, base and digits.
   // "16'h123, 'h123" instead of "16 'h123", "16'h 123, 'h 123"
   if (IsInsideNumericLiteral(left, right)) {
     return {0, "No space inside based numeric literals"};
   }

   if (right_context.IsInsideFirst(
           {NodeEnum::kUdpCombEntry, NodeEnum::kUdpSequenceEntry}, {})) {
     // Spacing before ';' is handled above
     return {1, "One space around UDP entries"};
   }

   // TODO(fangism): Never insert trailing spaces before a newline.

   // Hierarchy examples: "a.b", "a::b"
   if (left.format_token_enum == FormatTokenType::hierarchy ||
       right.format_token_enum == FormatTokenType::hierarchy)
     return {0,
             "No space separating hierarchy components "
             "(separated by . or ::)"};
   // TODO(fangism): space between numeric literals and '.'
   // Don't want to accidentally form m.d floating-point values.

   // cast operator, e.g. "void'(...)"
   if (right.TokenEnum() == '\'' || left.TokenEnum() == '\'') {
     return {0, "No space around cast operator '\\''"};
   }

   if (right.TokenEnum() == '(') {
     // "#(" vs. "# (" for parameter formals and arguments
     if (left.TokenEnum() == '#') return {0, "Fuse \"#(\""};

     // ") (" vs. ")(" for between parameter and port formals
     if (left.TokenEnum() == ')') {
       return {1, "Separate \") (\" between parameters and ports"};
     }

     // General handling of ID '(' spacing:
     if (left.format_token_enum == FormatTokenType::identifier ||
         IsKeywordCallable(verilog_tokentype(left.TokenEnum()))) {
       if (right_context.IsInside(NodeEnum::kActualNamedPort) ||
           right_context.IsInside(NodeEnum::kPort)) {
         return {0, "Named port: no space between ID and '('"};
       }
       if (right_context.IsInside(NodeEnum::kGateInstance) ||
           right_context.IsInside(NodeEnum::kPrimitiveGateInstance)) {
         return {1, "Module/primitive instance: want space between ID and '('"};
       }
       if (right_context.IsInside(NodeEnum::kModuleHeader)) {
         return {1,
                 "Module/interface declarations: want space between ID and '('"};
       }
       // Default: This case intended to cover function/task/macro calls:
       return {0, "Function/constructor calls: no space before ("};
     }
     // Other left tokens before '(' fall through to the rules below.
   }

   if (left.TokenEnum() == ':') {
     // Spacing in ranges
     if (InRangeLikeContext(right_context)) {
       // Take advantage here that the left token was already annotated (above)
       return {left.before.spaces_required,
               "Symmetrize spaces before and after ':' in bit slice"};
     }
     // Most contexts want a space after ':'.
     return {1, "Default to 1 space after ':'"};
   }

   if (left.TokenEnum() == '}') {
     // e.g. typedef struct { ... } foo_t;
     return {1, "Space after '}' in most other cases."};
   }
   if (right.TokenEnum() == '{') {
     if (left.format_token_enum == FormatTokenType::keyword) {
       return {1, "Space between keyword and '{'."};
     }
     if (right_context.DirectParentsAre(
             {NodeEnum::kBraceGroup, NodeEnum::kConstraintDeclaration})) {
       return {1, "Space before '{' when opening a constraint definition body."};
     }
     if (right_context.DirectParentsAre(
             {NodeEnum::kBraceGroup, NodeEnum::kCoverPoint})) {
       return {1, "Space before '{' when opening a coverpoint body."};
     }
     if (left.TokenEnum() == ')') {
       // NOTE(review): the reason string below misspells "between"; it is a
       // runtime string, so it is left untouched here.
       return {1, "Space betwen ')' and '{', e.g. conditional constraint."};
     }
     if (left.TokenEnum() == ']' && InDeclaredDimensions(left_context)) {
       return {1, "Space between declared array type and '{' (e.g. in typedef)"};
     }
     return {0, "No space before '{' in most other contexts."};
   }

   // Handle padding around packed array dimensions like "type [N] id;"
   if ((left.format_token_enum == FormatTokenType::keyword ||
        left.format_token_enum == FormatTokenType::identifier) &&
       right.TokenEnum() == '[') {
     if (right_context.IsInsideFirst({NodeEnum::kPackedDimensions},
                                     {NodeEnum::kExpression})) {
       // "type [packed...]" (space between type and packed dimensions)
       // avoid touching any expressions inside the packed dimensions
       return {1, "spacing before [packed dimensions] of declarations"};
     }
     // All other contexts, such as "a[i]" indices, no space.
     return {0, "All other cases of \".*[\", no space"};
   }
   if (left.TokenEnum() == ']' &&
       right.format_token_enum == FormatTokenType::identifier) {
     if (right_context.DirectParentsAre(
             {NodeEnum::kUnqualifiedId,
              NodeEnum::kDataTypeImplicitBasicIdDimensions})) {
       // "[packed...] id" (space between packed dimensions and id)
       return {1, "spacing after [packed dimensions] of declarations"};
     }
     // Not sure if "] id" appears in any other context, so leave it unhandled.
   }

   // Cannot merge tokens that would result in a different token.
   if (PairwiseNonmergeable(left) && PairwiseNonmergeable(right)) {
     return {1, "Cannot pair {number, identifier, keyword} without space."};
   }

   if (right.TokenEnum() == ':') {
     if (left.TokenEnum() == TK_default) {
       return {0, "No space inside \"default:\""};
     }
     if (right_context.DirectParentIsOneOf(
             {NodeEnum::kCaseItem, NodeEnum::kCaseInsideItem,
              NodeEnum::kCasePatternItem, NodeEnum::kGenerateCaseItem,
              NodeEnum::kPropertyCaseItem, NodeEnum::kRandSequenceCaseItem,
              NodeEnum::kCoverPoint})) {
       return {0, "Case-like items, no space before ':'"};
     }

     // Everything that resembles an end-label should have 1 space
     //   example nodes: kLabel, kEndNew, kFunctionEndLabel
     if (IsEndKeyword(verilog_tokentype(left.TokenEnum()))) {
       return {1, "Want 1 space between end-keyword and ':'"};
     }

     // Spacing between 'begin' and ':' is already covered
     // Spacing between 'fork' and ':' is already covered

     // Everything that resembles a prefix-statement label,
     // and label before 'begin'
     if (right_context.DirectParentIsOneOf({NodeEnum::kBlockIdentifier,
                                            NodeEnum::kLabeledStatement,
                                            NodeEnum::kGenerateBlock})) {
       return {1, "1 space before ':' in prefix block labels"};
     }

     // kTernaryExpression should have 1 space
     if (right_context.DirectParentIs(NodeEnum::kTernaryExpression)) {
       return {1, "Ternary ?: expression wants 1 space around ':'"};
     }

     // Spacing in ranges
     if (InRangeLikeContext(right_context)) {
       // Clamp the original spacing before ':' to at most 1.
       int spaces = right.OriginalLeadingSpaces().length();
       if (spaces > 1) {
         spaces = 1;
       }
       return {spaces, "Limit spaces before ':' in bit slice to 0 or 1"};
     }
     if (right_context.DirectParentIs(NodeEnum::kValueRange)) {
       return {1, "Spaces around ':' in value ranges."};
     }

     // TODO(fangism): Everything that resembles a range (in index, dimensions)
     // should have 1 space.
     //   kValueRange, kCycleRange
     //   kMinTypMax expressions?

     // TODO(fangism): Other unknowns:
     //   'enum_name' in verilog.y
     //   kMemberPattern?
     //   kPatternExpression?
     //   ':' as a polarity operator?
     //   as a UDP combinational entry? UDP sequence entry?
     //   kBindDirective?
     //   kCoverCross? kCoverPoint?
     //   kProduction? (randsequence)

     // For now, if case is not explicitly handled, preserve existing space.
     // (Control falls through to the remaining rules below.)
   }

   // "if (...)", "for (...)" instead of "if(...)", "for(...)",
   // "case ...", "return ..."
   if (left.format_token_enum == FormatTokenType::keyword) {
     // TODO(b/144605476): function-like keywords, however, do not get a space.
     return {1, "Space between flow control keywords and ("};
   }

   if (left.TokenEnum() == verilog_tokentype::TK_TimeLiteral) {
     // NOTE(review): a ';' on the right already returned in the
     // IsAnySemicolon(right) rule above, so this inner branch looks
     // unreachable -- confirm before relying on it.
     if (right.TokenEnum() == ';') {
       return {0, "No space between time literal and ';'."};
     }
     return {1, "Space after time literals in most other cases."};
   }

   if (right.TokenEnum() == TK_POUNDPOUND)
     return {1, "Space before ## (delay) operator"};
   if (left.format_token_enum == FormatTokenType::unary_operator)
     return {0, "++i over ++ i"};  // "++i" instead of "++ i"
   if (right.format_token_enum == FormatTokenType::unary_operator)
     return {0, "i++ over i ++"};  // "i++" instead of "i ++"

   // TODO(fangism): handle ranges [ ... : ... ]

   if (left.TokenEnum() == TK_DecNumber &&
       right.TokenEnum() == TK_UnBasedNumber) {
     // e.g. 1'b1, 16'hbabe
     return {0, "No space between numeric width and un-based number"};
   }

   // Brackets in multi-dimensional arrays/indices.
   if (left.TokenEnum() == ']' && right.TokenEnum() == '[') {
     return {0, "No spaces separating multidimensional arrays/indices"};
   }

   if (left.TokenEnum() == '#') {
     return {0, "No spaces after # (delay expressions, parameters)."};
   }
   if (right.TokenEnum() == '#') {
     // This may be controversial or context-dependent, as parameterized
     // classes often appear with method calls like:
     //   type#(params...)::method(...);
     if (left_context.DirectParentIs(NodeEnum::kUnqualifiedId) &&
         !left_context.IsInsideFirst(
             {NodeEnum::kInstantiationType, NodeEnum::kBindTargetInstance},
             {})) {
       return {0, "No space before # when direct parent is kUnqualifiedId."};
     } else {
       return {1, "Spaces before # in most other contexts."};
     }
   }

   if (right.format_token_enum == FormatTokenType::keyword) {
     return {1, "Space before keywords in most other cases."};
   }

   // e.g. always_ff @(posedge clk) begin ...
   // e.g. case (expr): ...
   if (left.TokenEnum() == ')') {
     switch (right.TokenEnum()) {
       case ':':
         return {0, "No space between ')' and ':'."};
       default:
         break;
     }
     return {1, "Space between ')' and most other tokens"};
   }
   if (left.TokenEnum() == verilog_tokentype::MacroCallCloseToEndLine) {
     // NOTE(review): any semicolon on the right already returned in the
     // IsAnySemicolon(right) rule above, so this inner branch looks
     // unreachable -- confirm before relying on it.
     if (IsAnySemicolon(right)) {
       return {0, "No space between macro-closing ')' and ';'"};
     }
     // Really only expect comments to follow macro-closing ')'
     return {1, "Space between macro-closing ')' and most other tokens"};
   }
   if (left.TokenEnum() == ']') {
     return {1, "Space between ']' and most other tokens"};
   }

   if (IsPreprocessorKeyword(
           static_cast<verilog_tokentype>(right.TokenEnum()))) {
     // most of these should start on their own line anyway
     return {1, "Preprocessor keywords should be separated from token on left."};
   }

   if (IsComment(FormatTokenType(left.format_token_enum))) {
     // Nothing should ever be to the right of an EOL comment.
     // But we have to explicitly handle these cases to prevent them from
     // unintentionally preserving spacing after comments.
     return {1, "Handle left=comment to avoid preserving unwanted spaces."};
   }

   // Case was not explicitly handled.
   return {kUnhandledSpacesRequired, "Default: spacing not explicitly handled"};
 }

 // Result of the FormatStyle overload of SpacesRequiredBetween(): the spacing
 // decision for one pair of adjacent tokens.
 struct SpacePolicy {
   // Number of spaces required between the two tokens.
   int spaces_required;
   // Set to true when no spacing rule explicitly handled the token pair (and
   // spaces_required is only a conservative default); false otherwise.
   bool force_preserve_spaces;
 };

 // Wraps the token/context overload of SpacesRequiredBetween() and converts
 // its result into a SpacePolicy.  When no rule handled the token pair, the
 // policy falls back to a default space count and requests that the existing
 // spacing be preserved.  'style' is currently not consulted.
 static SpacePolicy SpacesRequiredBetween(
     const FormatStyle& style, const PreFormatToken& left,
     const PreFormatToken& right, const SyntaxTreeContext& left_context,
     const SyntaxTreeContext& right_context) {
   // Default for unhandled cases, 1 space to be conservative.
   constexpr int kUnhandledSpacesDefault = 1;

   const auto rule_result =
       SpacesRequiredBetween(left, right, left_context, right_context);
   VLOG(1) << "spaces: " << rule_result.value
           << ", reason: " << rule_result.reason;

   const bool explicitly_handled =
       rule_result.value != kUnhandledSpacesRequired;
   if (explicitly_handled) {
     // Spacing was decided by one of the rules.
     return SpacePolicy{rule_result.value, false};
   }

   VLOG(1) << "Unhandled inter-token spacing between "
           << verilog_symbol_name(left.TokenEnum()) << " and "
           << verilog_symbol_name(right.TokenEnum()) << ", defaulting to "
           << kUnhandledSpacesDefault;
   return SpacePolicy{kUnhandledSpacesDefault, true};
 }

 // Computes the part of the line-break penalty that depends only on the two
 // adjacent tokens, not on their syntax tree context.
 // Rules are listed in order of precedence; the first one that matches
 // determines the returned {penalty, reason} pair.
 static WithReason<int> BreakPenaltyBetweenTokens(
     const verible::PreFormatToken& left, const verible::PreFormatToken& right) {
   const bool right_opens_group =
       right.format_token_enum == FormatTokenType::open_group;

   if (right_opens_group &&
       left.format_token_enum == FormatTokenType::identifier) {
     return {20, "identifier, open-group"};
   }

   // Hierarchy examples: "a.b", "a::b"
   // TODO(fangism): '.' is not always hierarchy, differentiate by context.
   // slightly prefer to break on the left: "a .b" better than "a. b"
   if (left.format_token_enum == FormatTokenType::hierarchy) {
     return {50, "hierarchy separator on left"};
   }
   if (right.format_token_enum == FormatTokenType::hierarchy) {
     return {45, "hierarchy separator on right"};
   }

   // Prefer to split after commas and semicolons rather than before them.
   switch (right.TokenEnum()) {
     case ',':
       return {10, "avoid breaking before ','"};
     case ';':
       return {10, "avoid breaking before ';'"};
     default:
       break;
   }
   switch (left.TokenEnum()) {
     case ',':
       return {-5, "encourage breaking after ','"};
     case ';':
       return {-5, "encourage breaking after ';'"};
     default:
       break;
   }

   // Prefer to split after an assignment operator, rather than before.
   // TODO(fangism): use context to cover all assignment-like cases
   if (right.TokenEnum() == '=') {
     return {5, "right is '='"};
   }

   // Prefer to keep '(' with whatever is on the left.
   // TODO(fangism): ... except when () is used as precedence.
   if (right_opens_group) {
     return {5, "right is open-group"};
   }

   // e.g. 1'b1, 16'hbabe
   // doesn't really matter, because we never break here
   const bool width_then_unbased_number =
       left.TokenEnum() == TK_DecNumber &&
       right.TokenEnum() == TK_UnBasedNumber;
   if (width_then_unbased_number) {
     return {90, "numeric width, base"};
   }

   return {0, "no further adjustment (default)"};
 }

 static int CommonAncestors(const SyntaxTreeContext& left,
                            const SyntaxTreeContext& right) {
   // TODO(fangism): re-check of common ancestry is slow (linear-time),
   // and could be avoided by memoizing the point of common ancestry between
   // leaves *during* the traversal.
   const auto* shorter = &left;
   const auto* longer = &right;
   // For C++11 compatibility, we use the 3-iterator form of std::mismatch().
   if (shorter->size() > longer->size()) std::swap(shorter, longer);
   const auto first_mismatches =
       std::mismatch(shorter->begin(), shorter->end(), longer->begin());
   const int short_common =
       std::distance(shorter->begin(), first_mismatches.first);
   const int long_common =
       std::distance(longer->begin(), first_mismatches.second);
   CHECK_GE(short_common, 0);
   CHECK_EQ(short_common, long_common);
   return short_common;
 }

 // Computes the part of the line-break penalty that depends only on the
 // syntax tree contexts, not on the tokens themselves.
 // The penalty grows with the number of context elements the two sides have
 // in common, favoring keeping elements deeper in the tree closer together.
 // The current simple model gives equal weight to every element in the
 // context stack.
 // TODO(fangism): custom weights by syntax tree node type.
 static int ContextBasedPenalty(const SyntaxTreeContext& left_context,
                                const SyntaxTreeContext& right_context) {
   constexpr int kDepthScaleFactor = 2;
   return CommonAncestors(left_context, right_context) * kDepthScaleFactor;
 }

 // Computes a line-break penalty adjustment that depends on both the tokens
 // and their syntax tree contexts.  Ternary operators are considered before
 // binary operators, and within each kind the right token is considered
 // before the left token; the first match determines the result.
 static WithReason<int> TokensWithContextBreakPenalty(
     const verible::PreFormatToken& left, const verible::PreFormatToken& right,
     const SyntaxTreeContext& left_context,
     const SyntaxTreeContext& right_context) {
   // Ternary operators directly under a kTernaryExpression.
   if (right_context.DirectParentIs(NodeEnum::kTernaryExpression)) {
     if (IsTernaryOperator(static_cast<verilog_tokentype>(right.TokenEnum()))) {
       return {3, "Prefer to split after ternary operators (+3 on left)."};
     }
   }
   if (left_context.DirectParentIs(NodeEnum::kTernaryExpression)) {
     if (IsTernaryOperator(static_cast<verilog_tokentype>(left.TokenEnum()))) {
       return {-1, "Prefer to split after ternary operators (-1 on right)."};
     }
   }

   // Binary operators directly under a kBinaryExpression.
   if (right_context.DirectParentIs(NodeEnum::kBinaryExpression)) {
     if (right.format_token_enum == FormatTokenType::binary_operator) {
       // This value should be kept small so that binding affinity still honors
       // operator precedence which is currently reflected in syntax tree depth.
       return {8, "Prefer to split after binary operators (+8 on left)."};
     }
   }
   if (left_context.DirectParentIs(NodeEnum::kBinaryExpression)) {
     if (left.format_token_enum == FormatTokenType::binary_operator) {
       return {0, "Prefer to split after binary operators (+0 on right)."};
     }
   }

   return {0, "No adjustment."};
 }

 // Returns the split penalty for line-breaking before the right token.
 // The penalty is a baseline bias plus three contributions (context depth,
 // token pair, and token pair with context), clamped from below to a minimum.
 // The accompanying reason is the one reported for the token-pair factor.
 static WithReason<int> BreakPenaltyBetween(
     const verible::PreFormatToken& left, const verible::PreFormatToken& right,
     const SyntaxTreeContext& left_context,
     const SyntaxTreeContext& right_context) {
   constexpr int kMinPenalty = 1;   // absolute minimum
   constexpr int kPenaltyBias = 5;  // baseline penalty value

   VLOG(3) << "Inter-token penalty between "
           << verilog_symbol_name(left.TokenEnum()) << " and "
           << verilog_symbol_name(right.TokenEnum());

   // Factor that only looks at the syntax tree contexts.
   const int context_factor = ContextBasedPenalty(left_context, right_context);
   VLOG(3) << "context break penalty: " << context_factor;

   // Factor that only looks at the left and right tokens.
   const auto token_factor = BreakPenaltyBetweenTokens(left, right);
   VLOG(3) << "inter-token break penalty: " << token_factor.value << ", "
           << token_factor.reason;

   // Factor that looks at the tokens together with their contexts.
   const auto token_context_factor =
       TokensWithContextBreakPenalty(left, right, left_context, right_context);
   VLOG(3) << "token+context break penalty: " << token_context_factor.value
           << ", " << token_context_factor.reason;

   const int unclamped_penalty = kPenaltyBias + context_factor +
                                 token_factor.value +
                                 token_context_factor.value;
   const int total_penalty =
       unclamped_penalty < kMinPenalty ? kMinPenalty : unclamped_penalty;

   VLOG(3) << "total break penalty: " << total_penalty;
   return {total_penalty, token_factor.reason};
 }

 // Returns decision whether to break, not break, or evaluate both choices.
 static WithReason<SpacingOptions> BreakDecisionBetween(
     const FormatStyle& style, const PreFormatToken& left,
     const PreFormatToken& right, const SyntaxTreeContext& left_context,
     const SyntaxTreeContext& right_context) {
   // For now, leave everything inside [dimensions] alone.
   if (InDeclaredDimensions(right_context)) {
     // ... except for the spacing immediately around '[' and ']',
     // which is covered by other rules.
     if (left.TokenEnum() != '[' && left.TokenEnum() != ']' &&
         right.TokenEnum() != '[' && right.TokenEnum() != ']' &&
         left.TokenEnum() != ':' && right.TokenEnum() != ':') {
       return {SpacingOptions::Preserve,
               "For now, leave spaces inside [] untouched."};
     }
   }

   if (right.TokenEnum() == verilog_tokentype::TK_LINE_CONT) {
     return {SpacingOptions::MustAppend,
             "Keep \\ line continuation attached to its left neighbor."};
   }

   if (left.TokenEnum() == verilog_tokentype::TK_LINE_CONT) {
     return {SpacingOptions::MustWrap,
             "Keep \\ line continuation is always followed by \\n."};
   }

   if (left.TokenEnum() == PP_define) {
     return {SpacingOptions::MustAppend,
             "Keep `define and macro name together."};
   }
   if (right.TokenEnum() == PP_define_body) {
     // TODO(b/141517267): reflow macro definition text with flexible
     // line-continuations.
     const absl::string_view text = right.Text();
     if (std::count(text.begin(), text.end(), '\n') >= 2) {
       return {SpacingOptions::Preserve,
               "Preserve spacing before a multi-line macro definition body."};
     } else {
       return {SpacingOptions::MustAppend,
               "Macro definition body must start on same line (but may be "
               "line-continued)."};
     }
   }

   // Check for mandatory line breaks.
   if (left.format_token_enum == FTT::eol_comment ||
       left.TokenEnum() == PP_define_body  // definition excludes trailing '\n'
   ) {
     return {SpacingOptions::MustWrap, "Token must be newline-terminated"};
   }

   if (right.format_token_enum == FTT::eol_comment) {
     // Check if there are any newlines between these tokens' texts.
     // Caution: when testing this case, must provide valid text between
     // tokens to avoid reading uninitialized memory.
     auto preceding_whitespace = verible::make_string_view_range(
         left.token->text().end(), right.token->text().begin());

     auto pos = preceding_whitespace.find_first_of('\n', 0);
     if (pos == absl::string_view::npos) {
       // There are other tokens on this line
       return {SpacingOptions::MustAppend,
               "EOL comment cannot break from "
               "tokens to the left on its line"};
     }
   }

   // TODO(fangism): check for all token types in verilog.lex that
   // scan to an end-of-line, even if it returns the newline to scanning with
   // yyless().

   // Unary operators (context-sensitive)
   // For now, never separate unary prefix operators from their operands.
   if (IsUnaryPrefixExpressionOperand(left, right_context)) {
     return {SpacingOptions::MustAppend,
             "Never separate unary prefix operator from its operand"};
   }

   if (IsInsideNumericLiteral(left, right)) {
     return {SpacingOptions::MustAppend,
             "Never separate numeric width, base, and digits"};
   }

   // Preprocessor macro definitions with args: no space between ID and '('.
   if (left.TokenEnum() == PP_Identifier && right.TokenEnum() == '(') {
     return {SpacingOptions::MustAppend, "No space between macro call id and ("};
   }

   // TODO(fangism): No break between `define and PP_Identifier.

   if (IsEndKeyword(verilog_tokentype(right.TokenEnum()))) {
     return {SpacingOptions::MustWrap, "end* keywords should start own lines"};
   }

   if (right.TokenEnum() == TK_else) {
     if (left.TokenEnum() != TK_end)
       return {SpacingOptions::MustWrap,
               "'else' token should start its own line unless preceded by 'end' "
               "without label."};
     else
       return {SpacingOptions::MustAppend,
               "'end'-'else' tokens should be together on one line."};
   }

   if ((left.TokenEnum() == TK_else) && (right.TokenEnum() == TK_begin)) {
     return {SpacingOptions::MustAppend,
             "'else'-'begin' tokens should be together on one line."};
   }

   if ((left.TokenEnum() == ')') && (right.TokenEnum() == TK_begin)) {
     return {SpacingOptions::MustAppend,
             "')'-'begin' tokens should be together on one line."};
   }

   if (left.TokenEnum() == verilog_tokentype::MacroCallCloseToEndLine) {
     if (!IsComment(FormatTokenType(right.format_token_enum)) &&
         !IsAnySemicolon(right)) {
       return {SpacingOptions::MustWrap,
               "Macro-closing ')' should end its own line except for comments "
               "nad ';'."};
     }
   }

   if (left.TokenEnum() == PP_else || left.TokenEnum() == PP_endif) {
     if (IsComment(FormatTokenType(right.format_token_enum))) {
       return {SpacingOptions::Undecided, "Comment may follow `else and `end"};
     }
     return {SpacingOptions::MustWrap,
             "`end and `else should be on their own line except for comments."};
   }

   if (IsPreprocessorKeyword(
           static_cast<verilog_tokentype>(right.TokenEnum()))) {
     // The tree unwrapper should make sure these start their own partition.
     return {SpacingOptions::MustWrap,
             "Preprocessor directives should start their own line."};
   }

   if (left.TokenEnum() == '#') {
     return {SpacingOptions::MustAppend,
             "Never separate # from whatever follows (delay expressions)."};
   }
   if (left.TokenEnum() == verilog_tokentype::TK_TimeLiteral) {
     if (right.TokenEnum() == ';') {
       return {SpacingOptions::MustAppend,
               "Keep delay statements together, like \"#1ps;\"."};
     }
   }

   if (left.TokenEnum() == ',' &&
       right.TokenEnum() == verilog_tokentype::MacroArg) {
     const absl::string_view text(right.Text());
     if (std::find(text.begin(), text.end(), '\n') != text.end()) {
       return {SpacingOptions::MustWrap,
               "Multi-line unlexed macro arguments start on their own line."};
     }
   }

   // By default, leave undecided for penalty minimization.
   return {SpacingOptions::Undecided,
           "Default: leave wrap decision to algorithm"};
 }

 // Fills in the inter-token annotations stored in curr_token->before
 // (required spaces, break penalty, break decision) for the boundary between
 // prev_token and curr_token.
 // Extern linkage for sake of direct testing, though not exposed in public
 // headers.
 // TODO(fangism): could move this to a -internal.h header.
 void AnnotateFormatToken(const FormatStyle& style,
                          const PreFormatToken& prev_token,
                          PreFormatToken* curr_token,
                          const SyntaxTreeContext& prev_context,
                          const SyntaxTreeContext& curr_context) {
   auto& before = curr_token->before;

   const auto spacing = SpacesRequiredBetween(style, prev_token, *curr_token,
                                              prev_context, curr_context);
   before.spaces_required = spacing.spaces_required;

   if (spacing.force_preserve_spaces) {
     // Spacing is being preserved: skip the penalty and break-decision
     // calculations entirely.
     before.break_decision = SpacingOptions::Preserve;
     return;
   }

   // Record the penalty for breaking before curr_token ...
   const auto penalty = BreakPenaltyBetween(prev_token, *curr_token,
                                            prev_context, curr_context);
   before.break_penalty = penalty.value;

   // ... and whether a break there is forced, forbidden, or left undecided.
   const auto decision = BreakDecisionBetween(style, prev_token, *curr_token,
                                              prev_context, curr_context);
   before.break_decision = decision.value;
   VLOG(3) << "line break constraint: " << decision.value << ": "
           << decision.reason;
 }

 // Convenience overload: extracts the start of the text buffer, the syntax
 // tree root, and the EOF token from text_structure, then delegates to the
 // overload that takes those pieces individually.
 void AnnotateFormattingInformation(
     const FormatStyle& style, const verible::TextStructureView& text_structure,
     std::vector<verible::PreFormatToken>* format_tokens) {
   const char* const contents_begin = text_structure.Contents().begin();
   const verible::Symbol* const tree_root = text_structure.SyntaxTree().get();
   AnnotateFormattingInformation(style, contents_begin, tree_root,
                                 text_structure.EOFToken(), format_tokens);
 }

 // Annotates every token in format_tokens with inter-token formatting
 // information, using the syntax tree rooted at syntax_tree_root for context.
 // buffer_start may be nullptr, in which case the preserved-space linking
 // step is skipped.
 void AnnotateFormattingInformation(
     const FormatStyle& style, const char* buffer_start,
     const verible::Symbol* syntax_tree_root,
     const verible::TokenInfo& eof_token,
     std::vector<verible::PreFormatToken>* format_tokens) {
   // No tokens, nothing to annotate.
   if (format_tokens->empty()) return;

   // In unit tests, tokens' text snippets don't necessarily originate from
   // the same contiguous string buffer; a null buffer_start skips this step.
   if (buffer_start != nullptr) {
     ConnectPreFormatTokensPreservedSpaceStarts(buffer_start, format_tokens);
   }

   // Per-token-pair annotator: binds the FormatStyle and forwards all other
   // arguments unchanged.
   const auto annotate_pair = [&style](const PreFormatToken& left_token,
                                       PreFormatToken* right_token,
                                       const SyntaxTreeContext& left_context,
                                       const SyntaxTreeContext& right_context) {
     AnnotateFormatToken(style, left_token, right_token, left_context,
                         right_context);
   };

   // Walk the tokens alongside the syntax tree so that each pair is annotated
   // with its syntactic context.
   AnnotateFormatTokensUsingSyntaxContext(syntax_tree_root, eof_token,
                                          format_tokens->begin(),
                                          format_tokens->end(), annotate_pair);
 }

 }  // namespace formatter
 }  // namespace verilog