Add one big test case and improve code documentation.

This commit is contained in:
Pat Garrity 2024-02-24 23:35:19 -06:00
parent b27e16776e
commit 1bd0e383ed
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
3 changed files with 130 additions and 11 deletions

View file

@@ -5,17 +5,38 @@ import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Stack
/** Transforms a stream of characters into a stream of tokens.
*
* @param reader
* The [[CharacterReader]].
*/
class Tokenizer(private val reader: CharacterReader):
import Tokenizer.*
private val buffer: ListBuffer[Char] = ListBuffer.empty
private val states: Stack[State] = Stack.empty
private val errors: ListBuffer[Error] = ListBuffer.empty
private var whiteSpaceOnly: Boolean = true
// Accumulates characters for tokens.
private val buffer: ListBuffer[Char] = ListBuffer.empty
// Stack of states. This is refreshed on each call for a new token.
private val states: Stack[State] = Stack.empty
// List of errors within the scope of the current token.
private val errors: ListBuffer[Error] = ListBuffer.empty
// Tracks whether ONLY white space has been seen since the last newline. Used
// to determine if comments are allowed. Comments are only allowed as the
// first non-whitespace character on a line.
private var whiteSpaceOnly: Boolean = true
/** Dump the current [[State]] stack.
*
* `toList` copies the stack's elements, so the result is a snapshot: later
* pushes or pops on `states` do not affect a previously returned list.
*
* @return
* List of [[State]], from top to bottom (most recently pushed first).
*/
private def dumpStack(): List[State] = states.toList
/** Consume the next available token.
*
* This function is **not** thread safe.
*
* @return
* The next available token, or an error if resolving a token fails.
@@ -267,6 +288,8 @@ class Tokenizer(private val reader: CharacterReader):
object Tokenizer:
/** Enumeration which defines the [[Tokenizer]] internal state.
*/
sealed trait State
object State:
@@ -276,17 +299,61 @@ object Tokenizer:
*/
case object Initial extends State
/** Used if the `-` character is seen as the first character on some line.
* In this case, the token may or may not be a comment.
*
* @param start
* The [[SourcePosition]] of the `-` character.
*/
case class PotentialComment(start: SourcePosition) extends State
case class InComment(start: SourcePosition) extends State
case class InDoubleQuote(start: SourcePosition) extends State
case class InSingleQuote(start: SourcePosition) extends State
case class InCharEscape(start: SourcePosition) extends State
case class InGeneric(start: SourcePosition) extends State
/** State for being within a comment. This state ends when a newline is hit.
*
* @param start
* The [[SourcePosition]] of the beginning of the comment.
*/
case class InComment(start: SourcePosition) extends State
/** State for being within double quotes -- a String Literal. This state
* ends when a closing double quote is hit. Newlines are not allowed within
* this state.
*
* @param start
* The [[SourcePosition]] of the beginning of the string.
*/
case class InDoubleQuote(start: SourcePosition) extends State
/** State for being within single quotes -- a Char Literal. This state ends
* when a closing single quote is hit. Newlines are not allowed within this
* state.
*
* @param start
* The [[SourcePosition]] of the beginning of the character.
*/
case class InSingleQuote(start: SourcePosition) extends State
/** State indicating that a character escape within a string or character
* literal was initiated.
*
* @param start
* The [[SourcePosition]] of the character escape.
*/
case class InCharEscape(start: SourcePosition) extends State
/** State that indicates some generic token is being parsed. These might be
* keywords, operators, or names.
*
* @param start
* The [[SourcePosition]] of the start of the token.
*/
case class InGeneric(start: SourcePosition) extends State
given CanEqual[State, State] = CanEqual.derived
end State
/** Enumeration which defines all possible [[Tokenizer]] errors.
*/
sealed trait Error
object Error:
@@ -294,8 +361,6 @@ object Tokenizer:
sealed trait Positional extends Error:
def sourcePosition: SourcePosition
case object NotImplemented extends Error
/** This error occurs when the end of file is reached if the tokenizer is
* still expecting more characters.
*

View file

@@ -0,0 +1,9 @@
namespace unit.test
--- Type class for type constructors which can be mapped over.
given F *
class Functor
--- Transform some wrapped data from one type to another.
given A, B
defn map: F A -> (A -> B) -> F B
end class

View file

@@ -302,6 +302,51 @@ class TokenizerTests extends munit.FunSuite:
)
}
test("should tokenize a valid file (case 3)") {
  // End-to-end check against the `tokenizer-3.ava` fixture: exercises
  // comments (only legal as the first non-whitespace on a line), `given`
  // clauses, punctuation (dot, comma, colon, parens), and the `->` operator.
  val source = loadFileToString("tokenizer-3.ava")
  // NOTE: dropped the leftover debug `println(source)` — tests should not
  // write to stdout on every run.
  assertTokens(
    source,
    Right(Token.Generic("namespace")),
    Right(Token.Generic("unit")),
    Right(Token.Dot),
    Right(Token.Generic("test")),
    Right(
      Token.Comment(
        "- Type class for type constructors which can be mapped over."
      )
    ),
    Right(Token.Generic("given")),
    Right(Token.Generic("F")),
    Right(Token.Generic("*")),
    Right(Token.Generic("class")),
    Right(Token.Generic("Functor")),
    Right(
      Token.Comment("- Transform some wrapped data from one type to another.")
    ),
    Right(Token.Generic("given")),
    Right(Token.Generic("A")),
    Right(Token.Comma),
    Right(Token.Generic("B")),
    Right(Token.Generic("defn")),
    Right(Token.Generic("map")),
    Right(Token.Colon),
    Right(Token.Generic("F")),
    Right(Token.Generic("A")),
    Right(Token.Generic("->")),
    Right(Token.OpenParen),
    Right(Token.Generic("A")),
    Right(Token.Generic("->")),
    Right(Token.Generic("B")),
    Right(Token.CloseParen),
    Right(Token.Generic("->")),
    Right(Token.Generic("F")),
    Right(Token.Generic("B")),
    Right(Token.Generic("end")),
    Right(Token.Generic("class"))
  )
}
private def assertTokens(
source: String,
expectedOutput: Either[Error, Token]*