common/lexer/flex_lexer_adapter.h - third_party/verible - Git at Google

 // Copyright 2017-2020 The Verible Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // FlexLexerAdapter class adapts Flex-generated lexers to Lexer interface.
 //
 // Template parameter L must be a Flex-generated lexer (yyFlexLexer).
 // The adapter inherits from this generated class to override functions.
 //
 // Main lexing function yylex() must be defined in a subclass.
 //
 // Example usage:
 // in verilog_lexer.h:
 // class verilogFlexLexer;  // generated by flex
 // class VerilogLexer : public verible::FlexLexerAdapter<verilogFlexLexer> {
 //   ...
 // };
 //
 // and in verilog.lex:
 // %option yyclass="verilog::VerilogLexer"

 #ifndef VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_
 #define VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_

 #include <cstdlib>
 #include <iostream>
 #include <sstream>  // IWYU pragma: keep  // for istringstream
 #include <string>

 #include "absl/strings/string_view.h"
 #include "common/lexer/lexer.h"
 #include "common/text/token_info.h"
 #include "common/util/logging.h"

 namespace verible {

 // The "L" base class of FlexLexerAdapter needs to use code_stream_ in its
 // constructor, which means that code_stream_ must be initialized first.  All
 // base classes are initialized before any non-static data members, so to
 // achieve that, we need to also put code_stream_ in a base class that is
 // ordered before "L" in FlexLexerAdapter's base classes.
 class CodeStreamHolder {
  protected:
   // Input stream that is handed to the flex-generated lexer, which consumes
   // its text through the standard istream interface.
   //
   // The stream scans a private copy of the input string.  The byte offsets
   // tracked during scanning can nevertheless be used to build string_views
   // relative to the start address of the original string.
   //
   // Going through an istream also makes it possible to switch buffers,
   // e.g. during preprocessing.
   std::istringstream code_stream_{};
 };

 // L is a (flex-generated) yyFlexLexer-like class.
 template <typename L>
 class FlexLexerAdapter : private CodeStreamHolder, protected L, public Lexer {
  public:
   // Constructs a lexer over 'code'.  Only a view of the text is retained in
   // code_; code_stream_ (which the base lexer L is constructed with) receives
   // its own copy of the text.
   explicit FlexLexerAdapter(absl::string_view code)
       : L(&code_stream_),
         code_(code),
         // Begin with an empty token located at the very start of code_.
         last_token_(0 /* enum doesn't matter */, code_.substr(0, 0)) {
     // The istringstream stores a private copy of the text.
     code_stream_.str(std::string(code_));
   }

   // Accessor for the most recently lexed token, i.e. the one whose location
   // was set by the latest UpdateLocation() call.
   const TokenInfo& GetLastToken() const final { return last_token_; }

   // Lexes the next token, records it as the last token, and returns it.
   const TokenInfo& DoNextToken() override {
     if (!at_eof_) {
       // Normal operation: let yylex() scan the next token.  yylex() is
       // expected to have updated the token location by the time it returns.
       const auto token_enum = this->yylex();
       last_token_.set_token_enum(token_enum);
     } else {
       // Once EOF has been seen, yylex() must not be called again, because
       // that would trigger the fatal error:
       // "fatal flex scanner internal error--end of buffer missed"
       last_token_ = TokenInfo::EOFToken(code_);
     }
     return last_token_;
   }

  protected:
   // Subclasses must call this to move the current token's location forward
   // by the length of the text that was just matched.
   void UpdateLocation() {
     const auto matched_length = this->YYLeng();
     last_token_.AdvanceText(matched_length);
   }

   // Location update for explicitly handled <<EOF>> rules.
   //
   // At EOF, yyleng is set to include a terminating \0 (NUL) character, so
   // the token text is advanced by one less than yyleng in order not to span
   // that NUL.  Rewinding the window with yyless after EOF is not an option:
   // it messes up the internal state machine and causes (flex) errors like:
   // "fatal flex scanner internal error--end of buffer missed"
   //
   // This should only be needed in lexer states that need to explicitly
   // handle <<EOF>>.
   void UpdateLocationEOF() {
     const auto length_without_nul = this->YYLeng() - 1;
     last_token_.AdvanceText(length_without_nul);
     at_eof_ = true;
   }

   // Points the lexer at a new piece of text and resets all lexing state.
   void Restart(absl::string_view code) override {
     // Reset the adapter's own bookkeeping first.
     at_eof_ = false;
     code_ = code;
     code_stream_.str(std::string(code));
     last_token_ = TokenInfo(0, code_.substr(0, 0));

     // Unwind the buffer stack.  (Keep bottom buffer only.)
     while (L::yy_buffer_stack_top > 1) {
       L::yypop_buffer_state();
     }

     // Make the current buffer read from the refreshed stream.
     L::yyrestart(&code_stream_);

     // Unwind the start condition stack.  (Keep INITIAL state.)
     while (L::yy_start_stack_ptr > 1) {
       L::yy_pop_state();
     }
   }

   // Replaces yyFlexLexer's implementation to deal with unrecognized chars.
   //
   // This is a last-resort guard.  The preferred way to handle unrecognized
   // chars is a wildcard rule at the end of the lexer definition that just
   // calls UpdateLocation().
   void LexerOutput(const char* buf, int size) final {
     VLOG(1) << "LexerOutput: rejected text: \"" << std::string(buf, size)
             << '\"';

     // Skip past the unrecognized sequence.
     last_token_.AdvanceText(size);
     // TODO(fangism): Communicate some sort of error token to the consumer.
   }

   // Replaces yyFlexLexer's implementation: prints the message to stderr and
   // terminates the program.
   void LexerError(const char* msg) final {
     std::cerr << "Fatal LexerError: " << msg;
     std::abort();
   }

  private:
   // Read-only view spanning all of the text being scanned.
   absl::string_view code_;

   // Enumeration and substring slice of the most recently lexed token.
   TokenInfo last_token_;

   // Kludge: the generated FlexLexer (subclass) doesn't expose a way to
   // determine whether an EOF has already been encountered:
   //   (yy_buffer_stack[yy_buffer_stack_top]->yy_buffer_status
   //       == YY_BUFFER_EOF_PENDING)
   // because yy_buffer_state's implementation is private.
   // This flag is therefore set manually when <<EOF>> is encountered.
   bool at_eof_ = false;
 };

 }  // namespace verible

 #endif  // VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_
	// Copyright 2017-2020 The Verible Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// FlexLexerAdapter class adapts Flex-generated lexers to Lexer interface.
	//
	// Template parameter L must be a Flex-generated lexer (yyFlexLexer).
	// The adapter inherits from this generated class to override functions.
	//
	// Main lexing function yylex() must be defined in a subclass.
	//
	// Example usage:
	// in verilog_lexer.h:
	// class verilogFlexLexer; // generated by flex
	// class VerilogLexer : public verible::FlexLexerAdapter<verilogFlexLexer> {
	// ...
	// };
	//
	// and in verilog.lex:
	// %option yyclass="verilog::VerilogLexer"

	#ifndef VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_
	#define VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_

	#include <cstdlib>
	#include <iostream>
	#include <sstream> // IWYU pragma: keep // for istringstream
	#include <string>

	#include "absl/strings/string_view.h"
	#include "common/lexer/lexer.h"
	#include "common/text/token_info.h"
	#include "common/util/logging.h"

	namespace verible {

	// The "L" base class of FlexLexerAdapter needs to use code_stream_ in its
	// constructor, which means that code_stream_ must be initialized first. All
	// base classes are initialized before any non-static data members, so to
	// achieve that, we need to also put code_stream_ in a base class that is
	// ordered before "L" in FlexLexerAdapter's base classes.
	class CodeStreamHolder {
	 protected:
	  // Input stream that is handed to the flex-generated lexer, which consumes
	  // its text through the standard istream interface.
	  //
	  // The stream scans a private copy of the input string.  The byte offsets
	  // tracked during scanning can nevertheless be used to build string_views
	  // relative to the start address of the original string.
	  //
	  // Going through an istream also makes it possible to switch buffers,
	  // e.g. during preprocessing.
	  std::istringstream code_stream_{};
	};

	// L is a (flex-generated) yyFlexLexer-like class.
	template <typename L>
	class FlexLexerAdapter : private CodeStreamHolder, protected L, public Lexer {
	 public:
	  // Constructs a lexer over 'code'.  Only a view of the text is retained in
	  // code_; code_stream_ (which the base lexer L is constructed with) receives
	  // its own copy of the text.
	  explicit FlexLexerAdapter(absl::string_view code)
	      : L(&code_stream_),
	        code_(code),
	        // Begin with an empty token located at the very start of code_.
	        last_token_(0 /* enum doesn't matter */, code_.substr(0, 0)) {
	    // The istringstream stores a private copy of the text.
	    code_stream_.str(std::string(code_));
	  }

	  // Accessor for the most recently lexed token, i.e. the one whose location
	  // was set by the latest UpdateLocation() call.
	  const TokenInfo& GetLastToken() const final { return last_token_; }

	  // Lexes the next token, records it as the last token, and returns it.
	  const TokenInfo& DoNextToken() override {
	    if (!at_eof_) {
	      // Normal operation: let yylex() scan the next token.  yylex() is
	      // expected to have updated the token location by the time it returns.
	      const auto token_enum = this->yylex();
	      last_token_.set_token_enum(token_enum);
	    } else {
	      // Once EOF has been seen, yylex() must not be called again, because
	      // that would trigger the fatal error:
	      // "fatal flex scanner internal error--end of buffer missed"
	      last_token_ = TokenInfo::EOFToken(code_);
	    }
	    return last_token_;
	  }

	 protected:
	  // Subclasses must call this to move the current token's location forward
	  // by the length of the text that was just matched.
	  void UpdateLocation() {
	    const auto matched_length = this->YYLeng();
	    last_token_.AdvanceText(matched_length);
	  }

	  // Location update for explicitly handled <<EOF>> rules.
	  //
	  // At EOF, yyleng is set to include a terminating \0 (NUL) character, so
	  // the token text is advanced by one less than yyleng in order not to span
	  // that NUL.  Rewinding the window with yyless after EOF is not an option:
	  // it messes up the internal state machine and causes (flex) errors like:
	  // "fatal flex scanner internal error--end of buffer missed"
	  //
	  // This should only be needed in lexer states that need to explicitly
	  // handle <<EOF>>.
	  void UpdateLocationEOF() {
	    const auto length_without_nul = this->YYLeng() - 1;
	    last_token_.AdvanceText(length_without_nul);
	    at_eof_ = true;
	  }

	  // Points the lexer at a new piece of text and resets all lexing state.
	  void Restart(absl::string_view code) override {
	    // Reset the adapter's own bookkeeping first.
	    at_eof_ = false;
	    code_ = code;
	    code_stream_.str(std::string(code));
	    last_token_ = TokenInfo(0, code_.substr(0, 0));

	    // Unwind the buffer stack.  (Keep bottom buffer only.)
	    while (L::yy_buffer_stack_top > 1) {
	      L::yypop_buffer_state();
	    }

	    // Make the current buffer read from the refreshed stream.
	    L::yyrestart(&code_stream_);

	    // Unwind the start condition stack.  (Keep INITIAL state.)
	    while (L::yy_start_stack_ptr > 1) {
	      L::yy_pop_state();
	    }
	  }

	  // Replaces yyFlexLexer's implementation to deal with unrecognized chars.
	  //
	  // This is a last-resort guard.  The preferred way to handle unrecognized
	  // chars is a wildcard rule at the end of the lexer definition that just
	  // calls UpdateLocation().
	  void LexerOutput(const char* buf, int size) final {
	    VLOG(1) << "LexerOutput: rejected text: \"" << std::string(buf, size)
	            << '\"';

	    // Skip past the unrecognized sequence.
	    last_token_.AdvanceText(size);
	    // TODO(fangism): Communicate some sort of error token to the consumer.
	  }

	  // Replaces yyFlexLexer's implementation: prints the message to stderr and
	  // terminates the program.
	  void LexerError(const char* msg) final {
	    std::cerr << "Fatal LexerError: " << msg;
	    std::abort();
	  }

	 private:
	  // Read-only view spanning all of the text being scanned.
	  absl::string_view code_;

	  // Enumeration and substring slice of the most recently lexed token.
	  TokenInfo last_token_;

	  // Kludge: the generated FlexLexer (subclass) doesn't expose a way to
	  // determine whether an EOF has already been encountered:
	  //   (yy_buffer_stack[yy_buffer_stack_top]->yy_buffer_status
	  //       == YY_BUFFER_EOF_PENDING)
	  // because yy_buffer_state's implementation is private.
	  // This flag is therefore set manually when <<EOF>> is encountered.
	  bool at_eof_ = false;
	};

	} // namespace verible

	#endif // VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_