Continued work on the Tokenizer. Starting to track source position.

This commit is contained in:
Pat Garrity 2024-02-20 21:47:36 -06:00
parent c5ea827944
commit 7beaef3b0d
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
6 changed files with 289 additions and 16 deletions

View file

@@ -22,7 +22,7 @@ lower ::= 'a' | ... | 'z' | Ll
| Lm lower only;
letter ::= upper | lower;
escape_unicode ::= '\', 'u', hex, hex, hex, hex;
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"');
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"' | "'");
escape_seq ::= escape_unicode | escape_char;
number_integer ::= '0' | digit, [{digit}];
number_float ::= number_integer, '.', number_integer;

View file

@@ -0,0 +1,47 @@
package ava.parser
/** Enumeration for all supported character escapes.
*
* @param name
The character escape name, as it appears in source code.
* @param output
* The character emitted by the escape.
*/
sealed abstract class CharEscape(
val name: Char,
val output: Char
)
object CharEscape:
case object NewLine extends CharEscape('n', '\n')
case object CarriageReturn extends CharEscape('r', '\r')
case object Backspace extends CharEscape('b', '\b')
case object FormFeed extends CharEscape('f', '\f')
case object Tab extends CharEscape('t', '\t')
case object BackSlash extends CharEscape('\\', '\\')
case object DoubleQuote extends CharEscape('"', '"')
case object SingleQuote extends CharEscape('\'', '\'')
val All: List[CharEscape] = List(
NewLine,
CarriageReturn,
Backspace,
FormFeed,
Tab,
BackSlash,
DoubleQuote,
SingleQuote
)
/** Given some escape name, attempt to resolve to a supported [[CharEscape]].
*
* @param escapeName
* The candidate name.
* @return
The relevant [[CharEscape]], or `None` if no such escape exists.
*/
def resolve(escapeName: Char): Option[CharEscape] =
All.find(_.name == escapeName)
end CharEscape
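
As a quick sanity check of the new enumeration, a minimal usage sketch (the results in the comments follow from the definitions above, not from a test run):

CharEscape.resolve('n')  // Some(CharEscape.NewLine); its output is '\n'
CharEscape.resolve('\'') // Some(CharEscape.SingleQuote), matching the escape added to the grammar
CharEscape.resolve('q')  // None: 'q' is not a supported escape name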

View file

@@ -32,10 +32,18 @@ class CharacterReader(
private var peekedAhead: Boolean = false
private var lookAhead: Char = 0
// Tracking for callers.
private var charInFile: Long = 0
private var lineNumber: Long = 0
private var charInLine: Long = 0
/** Close the underlying stream.
*/
def close(): Unit = input.close()
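/** Get the current [[SourcePosition]] of this reader.
*/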
def currentSourcePosition(): SourcePosition =
SourcePosition(charInFile, lineNumber, charInLine)
/** Set EOF and close the underlying stream.
*/
private def setEof(): Unit =
@@ -182,6 +190,14 @@ class CharacterReader(
if currentChar != 0 then lastChars.push(currentChar)
else ()
// Update tracking information.
charInFile = charInFile + 1
if ch == '\n' then
lineNumber = lineNumber + 1
charInLine = 0
else charInLine = charInLine + 1
// Finally, remap the current character.
currentChar = ch
object CharacterReader:
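
A rough sketch of the position tracking this hunk adds, assuming consume() returns Option[Char] (as the Tokenizer below uses it). The fromString constructor is hypothetical and only stands in for however the reader is actually built:

val reader = CharacterReader.fromString("ab\ncd") // hypothetical helper, for illustration only
reader.consume()               // Some('a')
reader.consume()               // Some('b')
reader.consume()               // Some('\n'): lineNumber becomes 1, charInLine resets to 0
reader.currentSourcePosition() // SourcePosition(characterInFile = 3, lineNumber = 1, characterInLine = 0)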

View file

@@ -0,0 +1,7 @@
package ava.parser
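/** A position within a source file, as tracked by [[CharacterReader]].
*
* @param characterInFile
* The number of characters consumed in the file so far.
* @param lineNumber
* The zero-based line number.
* @param characterInLine
* The number of characters consumed on the current line.
*/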
case class SourcePosition(
characterInFile: Long,
lineNumber: Long,
characterInLine: Long
)

View file

@@ -18,6 +18,32 @@ object Token:
*/
case class Comment(value: String) extends Token
/** String literals are captured in full, with escapes resolved.
*
* @param value
* The resolved string literal value.
* @param errors
* The list of errors that occurred while resolving this string literal, if
* any.
*/
case class StringLiteral(
value: String,
errors: List[Tokenizer.Error]
) extends Token
/** Character literals are captured in full, with escapes resolved.
*
* @param value
* The resolved character literal value.
* @param errors
* The list of errors that occurred while resolving this character literal,
* if any.
*/
case class CharacterLiteral(
value: Char,
errors: List[Tokenizer.Error]
) extends Token
/** The '(' character.
*/
case object OpenParen extends Token
@@ -34,14 +60,6 @@ object Token:
*/
case object Comma extends Token
/** The '"' character.
*/
case object DoubleQuote extends Token
/** The ''' character.
*/
case object SingleQuote extends Token
/** The ':' character.
*/
case object Colon extends Token
@@ -49,4 +67,8 @@ object Token:
/** The '#' character.
*/
case object Tuple extends Token
/** Represents end of file (EOF) reached normally, not as an error.
*/
case object Eof extends Token
end Token
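
To make the intent of the new literal tokens concrete: an invalid escape no longer aborts tokenization, it travels with the token. A sketch of the value such a token would carry, with pos standing in for whatever SourcePosition the reader reports (illustrative, not taken from the test suite):

// Source text: "a\qb"  ('\q' is not a recognized escape)
Token.StringLiteral(
  value = "ab", // the unrecognized escape contributes nothing to the resolved value
  errors = List(Tokenizer.Error.InvalidCharEscape(pos, 'q'))
)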

View file

@@ -1,26 +1,207 @@
package ava.parser
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Stack
class Tokenizer(private val reader: CharacterReader):
import Tokenizer.*
private var state: State = State.Initial
private val buffer: ListBuffer[Char] = ListBuffer.empty
private val states: Stack[State] = Stack.empty
private val errors: ListBuffer[Error] = ListBuffer.empty
def next(): Option[Token] = None
private def dumpStack(): List[State] = states.toList
/** Consume the next available token.
*
* @return
* The next available token, or an error if resolving a token fails.
*/
def next(): Either[Error, Token] =
buffer.clear()
states.clear()
errors.clear()
nextInternal(State.Initial)
/** Close this Tokenizer instance and free all resources, including the
* [[CharacterReader]].
*/
def close(): Unit = reader.close()
@tailrec
private def nextInternal(state: State): Either[Error, Token] =
state match
case State.Initial =>
Left(Error.NotImplemented)
case State.PotentialComment(startPos) =>
Left(Error.NotImplemented)
case State.InComment(startPos) =>
Left(Error.NotImplemented)
case State.InDoubleQuote(startPos) =>
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
ch match
case '\n' =>
// Literal newlines are not allowed within string literals.
Left(
Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
)
case '\\' =>
// Character escapes are supported within string literals.
states.push(state)
nextInternal(State.InCharEscape(reader.currentSourcePosition()))
case '"' =>
// This string literal is now closed. Any errors captured while
// resolving the literal are not raised here; they are returned as
// part of the token so that parsing may continue.
Right(
Token.StringLiteral(
value = buffer.mkString,
errors = errors.toList
)
)
case _ =>
// Continue accumulating characters.
buffer.addOne(ch)
nextInternal(state)
case State.InSingleQuote(startPos) =>
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
ch match
case '\n' =>
// Literal newlines are not allowed within character literals.
Left(
Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
)
case '\\' =>
// Character escapes are supported within character literals.
states.push(state)
nextInternal(State.InCharEscape(reader.currentSourcePosition()))
case '\'' =>
// This character literal is now closed.
createCharacterLiteral()
case _ =>
// Continue accumulating characters.
buffer.addOne(ch)
nextInternal(state)
case State.InCharEscape(startPos) =>
reader.consume() match
case None =>
Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
resolveCharEscape(ch) match
case Left(error) =>
// Character escapes always live within some other state.
// Capture the error, but don't just stop consuming -- pop the
// state and return to wherever (string or character literal)
// this error originated.
errors.addOne(error)
nextInternal(states.pop())
case Right(actual) =>
// Add the resolved character to the buffer and return to the
// parent state.
val _ = buffer.addOne(actual)
nextInternal(states.pop())
case State.InGeneric(startPos) =>
Left(Error.NotImplemented)
private def resolveCharEscape(ch: Char): Either[Error, Char] =
CharEscape.resolve(ch) match
case None =>
Left(Error.InvalidCharEscape(reader.currentSourcePosition(), ch))
case Some(escape) =>
Right(escape.output)
private def createCharacterLiteral(): Either[Error, Token] =
val dump = buffer.mkString
if dump.length() > 1 then
Left(
Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
)
else
val out: Char = if dump.isEmpty() then '\u0000' else dump.charAt(0)
Right(
Token.CharacterLiteral(
value = out,
errors = errors.toList
)
)
object Tokenizer:
sealed trait State
object State:
/** The initial state. This is always the state of the [[Tokenizer]] when it
* starts, as well as between each token.
*/
case object Initial extends State
case object PotentialComment extends State
case object InComment extends State
case object InQuote extends State
case object InGeneric extends State
case class PotentialComment(start: SourcePosition) extends State
case class InComment(start: SourcePosition) extends State
case class InDoubleQuote(start: SourcePosition) extends State
case class InSingleQuote(start: SourcePosition) extends State
case class InCharEscape(start: SourcePosition) extends State
case class InGeneric(start: SourcePosition) extends State
given CanEqual[State, State] = CanEqual.derived
end State
sealed trait Error
object Error:
sealed trait Positional extends Error:
def sourcePosition: SourcePosition
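/** Placeholder error for tokenizer states that are not yet implemented.
*/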
case object NotImplemented extends Error
/** This error occurs when the end of file is reached while the tokenizer is
* still expecting more characters.
*
* @param stack
* The stack of states at the time of the error. This helps communicate
* which additional characters were expected.
*/
case class PrematureEof(stack: List[State]) extends Error
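/** This error occurs when a literal newline character appears inside a
* string or character literal.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param currentState
* The state the tokenizer was in when the newline was encountered.
*/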
case class UnexpectedNewLine(
sourcePosition: SourcePosition,
currentState: State
) extends Positional
/** This error occurs when the source contains a character escape indicated
* by a backslash character within some string/character literal, but the
* escape type is not recognized.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param candidate
* The escape name that was attempted.
*/
case class InvalidCharEscape(
sourcePosition: SourcePosition,
candidate: Char
) extends Positional
/** This error occurs when a character literal is closed, but more than
* one character is present in the accumulated buffer.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param candidate
* The candidate string.
*/
case class MultipleCharactersInLiteral(
sourcePosition: SourcePosition,
candidate: String
) extends Error
end Error
end Tokenizer
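
Finally, a sketch of how the new next() interface might be driven. As of this commit the Initial state still returns Error.NotImplemented, so treat the Eof handling below as the intended shape rather than current behaviour:

import scala.annotation.tailrec
import ava.parser.{Token, Tokenizer}

@tailrec
def drain(tokenizer: Tokenizer, acc: List[Token] = Nil): Either[Tokenizer.Error, List[Token]] =
  tokenizer.next() match
    case Left(error)      => Left(error)                     // fatal: stop and surface the error
    case Right(Token.Eof) => Right(acc.reverse)              // end of input, tokens in source order
    case Right(token)     => drain(tokenizer, token :: acc)  // accumulate and keep consuming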