blob: 596dd5ba3da24ffe742dafe186af31d09c0e3cbe [file] [log] [blame]
// Copyright 2017-2020 The Verible Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// FlexLexerAdapter class adapts Flex-generated lexers to Lexer interface.
//
// Template parameter L must be a Flex-generated lexer (yyFlexLexer).
// The adapter inherits from this generated class to override functions.
//
// Main lexing function yylex() must be defined in a subclass.
//
// Example usage:
// in verilog_lexer.h:
// class verilogFlexLexer; // generated by flex
// class VerilogLexer : public verible::FlexLexerAdapter<verilogFlexLexer> {
// ...
// };
//
// and in verilog.lex:
// %option yyclass="verilog::VerilogLexer"
#ifndef VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_
#define VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_
#include <cstdlib>
#include <iostream>
#include <sstream> // IWYU pragma: keep // for istringstream
#include <string>
#include "absl/strings/string_view.h"
#include "common/lexer/lexer.h"
#include "common/text/token_info.h"
#include "common/util/logging.h"
namespace verible {
// The "L" base class of FlexLexerAdapter needs to use code_stream_ in its
// constructor, which means that code_stream_ must be initialized first. All
// base classes are initialized before any non-static data members, so to
// achieve that, we need to also put code_stream_ in a base class that is
// ordered before "L" in FlexLexerAdapter's base classes.
class CodeStreamHolder {
 protected:
  // Input stream handed to the FlexLexer as its source of characters.
  //
  // The stream scans its own internal copy of the text, but the byte offsets
  // tracked during scanning remain valid relative to the original string's
  // start address, so they can be used to build string_views into the
  // original text.
  //
  // Going through the standard istream interface also makes it possible to
  // switch buffers, e.g. during preprocessing.
  std::istringstream code_stream_;
};
// Adapts a flex-generated C++ scanner to the Lexer interface.
//
// L is a (flex-generated) yyFlexLexer-like class. This adapter uses the
// following members of L: a constructor taking a std::istream*, yylex(),
// YYLeng(), yyrestart(), yypop_buffer_state(), yy_pop_state(),
// yy_buffer_stack_top and yy_start_stack_ptr.
//
// CodeStreamHolder is listed first among the bases so that code_stream_
// already exists when its address is passed to L's constructor.
template <typename L>
class FlexLexerAdapter : private CodeStreamHolder, protected L, public Lexer {
 public:
  // Constructs a lexer that scans 'code'.
  // Only a non-owning view of 'code' is kept in code_, so the underlying text
  // must remain valid for as long as this lexer (and its tokens) are in use.
  explicit FlexLexerAdapter(absl::string_view code)
      : L(&code_stream_),
        code_(code),
        // last_token_ points to the beginning of the code_ buffer
        last_token_(0 /* enum doesn't matter */, code_.substr(0, 0)) {
    // istringstream copies text into its own internal buffer.
    code_stream_.str(std::string(code));
  }

  // Returns the token associated with the last UpdateLocation() call.
  const TokenInfo& GetLastToken() const final { return last_token_; }

  // Returns next token and updates its location.
  const TokenInfo& DoNextToken() override {
    if (at_eof_) {
      // Do not call yylex(), because that will result in the fatal error:
      //   "fatal flex scanner internal error--end of buffer missed"
      last_token_ = TokenInfo::EOFToken(code_);
    } else {
      // In normal operation, call yylex() to extract the next token.
      // yylex() is expected to have called UpdateLocation() itself, so only
      // the token enum needs to be recorded here.
      last_token_.set_token_enum(this->yylex());
    }
    return last_token_;
  }

 protected:
  // Must be called by subclasses to update location of the current token.
  // Advances last_token_'s text by the length of the current match.
  void UpdateLocation() { last_token_.AdvanceText(this->YYLeng()); }

  // EOF needs special handling because yyleng is set to include a terminating
  // \0 (NUL) character.  Once EOF is encountered it is also not possible to
  // yyless-rewind the window -- doing so messes up the internal state machine,
  // and causes (flex) errors like:
  //   "fatal flex scanner internal error--end of buffer missed"
  // We advance the token text without spanning the NUL character, and record
  // that EOF was seen so that DoNextToken() stops calling yylex().
  // This should only be needed in lexer states that need to explicitly
  // handle <<EOF>>.
  void UpdateLocationEOF() {
    last_token_.AdvanceText(this->YYLeng() - 1);
    at_eof_ = true;
  }

  // Restart lexer by pointing to new input stream, and reset all state.
  // 'code' is viewed, not owned; see the constructor for lifetime
  // requirements.
  void Restart(absl::string_view code) override {
    at_eof_ = false;
    code_ = code;
    // str() replaces the stream's contents but leaves its state flags alone,
    // so explicitly drop any eof/fail condition left over from the previous
    // input before reusing the stream.
    code_stream_.clear();
    code_stream_.str(std::string(code_));
    last_token_ = TokenInfo(0, code_.substr(0, 0));
    // Reset buffer stack.
    // NOTE(review): the "> 1" bounds here and below are preserved as-is;
    // confirm against the generated scanner whether these counters are
    // 0-based or 1-based.
    while (L::yy_buffer_stack_top > 1) {  // Keep bottom buffer only.
      L::yypop_buffer_state();
    }
    // Reset the current buffer to use new stream.
    L::yyrestart(&code_stream_);
    // Reset start condition stack.
    while (L::yy_start_stack_ptr > 1) {  // Keep INITIAL state.
      L::yy_pop_state();
    }
  }

  // Overrides yyFlexLexer's implementation to handle unrecognized chars.
  // 'buf' points to the rejected text and 'size' is its length in bytes.
  void LexerOutput(const char* buf, int size) final {
    VLOG(1) << "LexerOutput: rejected text: \"" << std::string(buf, size)
            << '\"';
    // Update location by the size of the unrecognized sequence.
    // Note, this is a last-resort guard. The preferred way
    // to handle unrecognized chars is to add wildcard rule
    // at the end of the lexer definition that just calls
    // UpdateLocation().
    last_token_.AdvanceText(size);
    // TODO(fangism): Communicate some sort of error token to the consumer.
  }

  // Overrides yyFlexLexer's implementation to do proper error handling:
  // reports 'msg' on stderr and terminates the process. Does not return.
  void LexerError(const char* msg) final {
    // std::abort() is not guaranteed to flush open streams, so terminate the
    // line and flush explicitly before aborting.
    std::cerr << "Fatal LexerError: " << msg << std::endl;
    std::abort();
  }

 private:
  // A read-only view of the entire text to be scanned.
  absl::string_view code_;

  // Contains the enumeration and the substring slice of the last lexed token.
  TokenInfo last_token_;

  // Kludge: the generated FlexLexer (subclass) doesn't expose a way to
  // determine whether an EOF has already been encountered:
  //   (yy_buffer_stack[yy_buffer_stack_top]->yy_buffer_status
  //      == YY_BUFFER_EOF_PENDING)
  // because yy_buffer_state's implementation is private.
  // Thus, we manually set this bit upon encountering <<EOF>>.
  bool at_eof_ = false;
};
} // namespace verible
#endif // VERIBLE_COMMON_LEXER_FLEX_LEXER_ADAPTER_H_