ScanSingleToken——V8底层技术细节的学习之旅

2024-02-20 14:31:12

V8引擎概述

V8是Google开发的开源JavaScript引擎，也是当今最受欢迎的JavaScript引擎之一。它被广泛应用于Chrome浏览器、Node.js等项目。V8引擎以其高效的执行速度和强大的功能而著称，成为许多开发者的首选。

ScanSingleToken方法在词法分析中的作用

词法分析是编译器或解释器对源代码进行分析的第一个阶段。在这个阶段，源代码被分解成一系列的词法符号，如标识符、数字、运算符、括号等。这些词法符号是编程语言的基本构成单元，也是编译器或解释器进行语法分析的基础。

在V8引擎中，ScanSingleToken方法负责从源代码中识别出单个词法符号。它采用了一个简单的有限状态自动机（FSM）模型。有限状态自动机是一种能够识别正则语言（即可以用正则表达式描述的语言）的抽象机器。在ScanSingleToken方法中，FSM用于匹配JavaScript语言中各种词法符号的模式。

ScanSingleToken方法的实现

ScanSingleToken方法的实现位于V8源码的src/parsing目录中（当前版本定义在scanner-inl.h文件里）。方法的定义如下：

TokenInfo Scanner::ScanSingleToken(bool can_be_harmony,
                                  bool can_be_break,
                                  int stop_pos,
                                  int stop_control_char) {
  DCHECK_NOT_NULL(pre_parser_);
  DCHECK_LE(start_pos_, stream_->position());
  DCHECK_LE(stream_->position(), stop_pos);
  DCHECK_LT(stop_control_char, base::numeric_limits<uint16_t>::max());
  Token::Value next = Next();
  // Eat white space and newlines.
  if (next.is_white_space()) {
    bool saw_cr = false;
    bool saw_lf = false;
    while (next.is_white_space()) {
      if (next.is_line_terminator()) {
        if (next.IsNewline()) {
          // Allow non-terminated newlines anywhere.
          saw_lf = true;
        } else {
          saw_cr = true;
          next = Next();
          if (!next.is_line_terminator()) continue;
          saw_lf = next.IsNewline();
        }
      }
      next = Next();
    }
  }
  Token::Value end_token_type =
      (stop_pos == -1) ? next : Token::UNINITIALIZED;
  while (next.is_strict_mode_reserved() &&
         next != Token::ILLEGAL &&
         stop_pos == -1 &&
         !strict_mode_) {
    // We found a strict mode reserved word in non-strict mode code. Eat
    // the token and continue.
    next = Next();
  }
  Token::Value token_type;
  if (!Token::IsTokenType(next) && next.is_strict_mode_reserved() &&
      strict_mode_) {
    // For strict mode code, we have to treat certain keywords as literals.
    token_type = next;
    next = Next();
    if (token_type == Token::SUPER &&
        (!IsIdentifierStart(next) || next == Token::PRIVATE_IDENTIFIER)) {
      // We look ahead to see if super() is used as a function call.
      token_type = Token::SUPER_CALL;
    }
  } else {
    token_type = Token::ConvertCharToToken(next);
  }
  if (token_type == Token::DIV) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::DIV_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::STAR) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::MUL_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::SUB) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::SUB_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::SHL) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::SHL_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::SHR) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::SHR_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::SAR) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::SAR_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::MOD) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::MOD_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::XOR) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::XOR_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::BIT_OR) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::BIT_OR_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::BIT_AND) {
    next = Next();
    if (next.is_assign()) {
      token_type = Token::BIT_AND_ASSIGN;
      next = Next();
    }
  } else if (token_type == Token::NE) {
    next = Next();
    if (next.is_eq()) {
      token_type = Token::NE_EQ;
      next = Next();
    }
  } else if (token_type == Token::EQ_EQ) {
    next = Next();
    if (next.is_eq()) {
      token_type = Token::EQ_EQ_EQ;
      next = Next();
    }
  } else if (token_type == Token::NOT) {
    next = Next();
    if (next.is_in()) {
      token_type = Token::NOT_IN;
      next = Next();
    }
  } else if (token_type == Token::LT) {
    next = Next();
    if (next.is_eq()) {
      token_type = Token::LT_EQ;
      next = Next();
    } else if (next.is_lt()) {
      token_type = Token::SHL;
      next = Next();
    }
  } else if (token_type == Token::GT) {
    next = Next();
    if (next.is_eq()) {
      token_type = Token::GT_EQ;
      next = Next();
    } else if (next.is_gt()) {
      token_type = Token::SHR;
      next = Next();
    }
  }
  if (stop_pos != -1 && next.position() >= stop_pos) {
    if (token_type == Token::NUMBER || token_type == Token::STRING ||
        token_type == Token::REGEX) {
      // If we hit stop_pos in the middle of a literal, we need to backtrack
      // and return an incomplete token.  This token is then recovered by
      // ScanRestOfLiteral.
      // We can't do this if the token has been converted to another type,
      // e.g. DIV -> DIV_ASSIGN.
      BacktrackTo(start_pos_);
      token_type = Token::ILLEGAL;
    } else {
      // For other tokens we can report the token right away and continue
      // parsing.
      next = Token::Truncate(next, stop_pos);
    }
  }
  if (token_type == Token::END_OF_FILE) {
    Stream::Position fp = stream_->position();
    if (fp == stop_pos) {
      next.column_offset_ = stop_control_char;