From f7bc41e5394f64a1e5420aebc64c31af110766a9 Mon Sep 17 00:00:00 2001
From: Pat Garrity
Date: Sat, 24 Feb 2024 10:21:00 -0600
Subject: [PATCH] Writing tests and test utilities for the Tokenizer

---
 .../scala/ava/parser/CharacterReader.scala    |   2 +-
 .../scala/ava/parser/SourcePosition.scala     |   4 +
 .../src/main/scala/ava/parser/Token.scala     |   8 ++
 .../src/main/scala/ava/parser/Tokenizer.scala |  76 ++++++++++---
 .../scala/ava/parser/TokenizerTests.scala     | 106 ++++++++++++++++++
 5 files changed, 180 insertions(+), 16 deletions(-)
 create mode 100644 modules/parser/src/test/scala/ava/parser/TokenizerTests.scala

diff --git a/modules/parser/src/main/scala/ava/parser/CharacterReader.scala b/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
index 65e9c05..94a7875 100644
--- a/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
+++ b/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
@@ -34,7 +34,7 @@ class CharacterReader(
 
   // Tracking for callers.
   private var charInFile: Long = 0
-  private var lineNumber: Long = 0
+  private var lineNumber: Long = 1
   private var charInLine: Long = 0
 
   /** Close the underlying stream.
diff --git a/modules/parser/src/main/scala/ava/parser/SourcePosition.scala b/modules/parser/src/main/scala/ava/parser/SourcePosition.scala
index b39e6c4..b798753 100644
--- a/modules/parser/src/main/scala/ava/parser/SourcePosition.scala
+++ b/modules/parser/src/main/scala/ava/parser/SourcePosition.scala
@@ -5,3 +5,7 @@ case class SourcePosition(
   lineNumber: Long,
   characterInLine: Long
 )
+
+object SourcePosition:
+  given CanEqual[SourcePosition, SourcePosition] = CanEqual.derived
+end SourcePosition
diff --git a/modules/parser/src/main/scala/ava/parser/Token.scala b/modules/parser/src/main/scala/ava/parser/Token.scala
index 2685cd2..461bc3b 100644
--- a/modules/parser/src/main/scala/ava/parser/Token.scala
+++ b/modules/parser/src/main/scala/ava/parser/Token.scala
@@ -3,6 +3,8 @@ package ava.parser
 sealed trait Token
 
 object Token:
+  given CanEqual[Token, Token] = CanEqual.derived
+
   /** Most tokens are generic tokens. They represent some arbitrary grouping of
    * characters that will be refined later.
    *
@@ -10,6 +12,7 @@ object Token:
    *   The token value.
    */
   case class Generic(value: String) extends Token
+  given CanEqual[Generic, Generic] = CanEqual.derived
 
   /** Comments are detected at time of tokenization, and are arbitrary strings.
    *
@@ -17,6 +20,7 @@ object Token:
    *   The comment value.
    */
   case class Comment(value: String) extends Token
+  given CanEqual[Comment, Comment] = CanEqual.derived
 
   /** String literals are captured in total, with escapes resolved.
    *
@@ -31,6 +35,8 @@ object Token:
     errors: List[Tokenizer.Error]
   ) extends Token
 
+  given CanEqual[StringLiteral, StringLiteral] = CanEqual.derived
+
   /** Character literals are captured in total, with escapes resolved.
    *
    * @param value
@@ -44,6 +50,8 @@ object Token:
     errors: List[Tokenizer.Error]
   ) extends Token
 
+  given CanEqual[CharacterLiteral, CharacterLiteral] = CanEqual.derived
+
   /** The '(' character.
    */
   case object OpenParen extends Token
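Note: the `CanEqual` givens added to `SourcePosition` and `Token` only matter under strict equality, so the build presumably enables `-language:strictEquality`. A minimal sketch of the effect; the `Shade` hierarchy is hypothetical, not part of this codebase:

```scala
// Sketch only: assumes the build enables -language:strictEquality.
import scala.language.strictEquality

sealed trait Shade
object Shade:
  // Same pattern as Token: one derived given for the whole hierarchy.
  given CanEqual[Shade, Shade] = CanEqual.derived
  case object Light extends Shade
  case object Dark extends Shade

@main def canEqualDemo(): Unit =
  val a: Shade = Shade.Light
  val b: Shade = Shade.Dark
  println(a == b) // compiles only because of the given above; prints false
  // println(a == "Light") // would not compile: no CanEqual[Shade, String]
```

Because `CanEqual` is contravariant in both type parameters, the trait-level given also covers comparisons between the concrete members, which is why one given per sealed hierarchy is usually enough.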
diff --git a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
index 0e58efa..aed23e6 100644
--- a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
+++ b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
@@ -11,6 +11,7 @@ class Tokenizer(private val reader: CharacterReader):
   private val buffer: ListBuffer[Char] = ListBuffer.empty
   private val states: Stack[State] = Stack.empty
   private val errors: ListBuffer[Error] = ListBuffer.empty
+  private var whiteSpaceOnly: Boolean = true
 
   private def dumpStack(): List[State] = states.toList
 
@@ -20,9 +21,7 @@ class Tokenizer(private val reader: CharacterReader):
    *   The next available token, or an error if resolving a token fails.
    */
  def next(): Either[Error, Token] =
-    buffer.clear()
-    states.clear()
-    errors.clear()
+    resetState()
     nextInternal(State.Initial)
 
   /** Close this Tokenizer instance and free all resources, including the
@@ -30,6 +29,11 @@ class Tokenizer(private val reader: CharacterReader):
    */
   def close(): Unit = reader.close()
 
+  private def resetState(): Unit =
+    buffer.clear()
+    states.clear()
+    errors.clear()
+
   @tailrec
   private def nextInternal(state: State): Either[Error, Token] =
     state match
@@ -38,28 +42,57 @@ class Tokenizer(private val reader: CharacterReader):
           case None => Right(Token.Eof)
           case Some(ch) =>
            ch match
-              case _ if isWhiteSpace(ch) => nextInternal(state)
-              case TokenDelimiter.OpenParen => Right(Token.OpenParen)
-              case TokenDelimiter.CloseParen => Right(Token.CloseParen)
-              case TokenDelimiter.Comma => Right(Token.Comma)
-              case TokenDelimiter.Colon => Right(Token.Colon)
-              case TokenDelimiter.Dot => Right(Token.Dot)
-              case TokenDelimiter.Tuple => Right(Token.Tuple)
+              case '\n' =>
+                // Every newline resets our white space check for comments.
+                whiteSpaceOnly = true
+                nextInternal(state)
+              case _ if isWhiteSpace(ch) => nextInternal(state)
+              case TokenDelimiter.OpenParen =>
+                whiteSpaceOnly = false
+                Right(Token.OpenParen)
+              case TokenDelimiter.CloseParen =>
+                whiteSpaceOnly = false
+                Right(Token.CloseParen)
+              case TokenDelimiter.Comma =>
+                whiteSpaceOnly = false
+                Right(Token.Comma)
+              case TokenDelimiter.Colon =>
+                whiteSpaceOnly = false
+                Right(Token.Colon)
+              case TokenDelimiter.Dot =>
+                whiteSpaceOnly = false
+                Right(Token.Dot)
+              case TokenDelimiter.Tuple =>
+                whiteSpaceOnly = false
+                Right(Token.Tuple)
               case TokenDelimiter.BackSlash =>
+                whiteSpaceOnly = false
                 Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
               case TokenDelimiter.DoubleQuote =>
+                whiteSpaceOnly = false
                 nextInternal(
                   State.InDoubleQuote(reader.currentSourcePosition())
                 )
               case TokenDelimiter.SingleQuote =>
+                whiteSpaceOnly = false
                 nextInternal(
                   State.InSingleQuote(reader.currentSourcePosition())
                 )
               case '-' =>
-                nextInternal(
-                  State.PotentialComment(reader.currentSourcePosition())
-                )
+                if whiteSpaceOnly then
+                  whiteSpaceOnly = false
+                  nextInternal(
+                    State.PotentialComment(reader.currentSourcePosition())
+                  )
+                else
+                  whiteSpaceOnly = false
+                  buffer.addOne(ch)
+                  nextInternal(
+                    State.InGeneric(reader.currentSourcePosition())
+                  )
               case _ =>
+                whiteSpaceOnly = false
+                buffer.addOne(ch)
                 nextInternal(State.InGeneric(reader.currentSourcePosition()))
       case State.PotentialComment(startPos) =>
         reader.consume() match
@@ -71,13 +104,21 @@ class Tokenizer(private val reader: CharacterReader):
             // so that we can read the remainder of the line.
             states.push(state)
             nextInternal(State.InComment(reader.currentSourcePosition()))
-          case ' ' | '\t' | '\n' =>
+          case '\n' =>
+            // This token is only a `-` character and has been delimited by
+            // a newline. Reset our white space check and return the token.
+            whiteSpaceOnly = true
+            Right(Token.Generic("-"))
+          case ' ' | '\t' =>
             // This token is only a `-` character and has been delimited by
             // whitespace on either side. Return the token.
+            whiteSpaceOnly = false
             Right(Token.Generic("-"))
           case _ =>
             // This is some generic token that starts with `-`. Switch
             // states now that we're sure about what we're reading.
+            whiteSpaceOnly = false
+            buffer.addOne('-')
             buffer.addOne(ch)
             nextInternal(State.InGeneric(startPos))
       case State.InComment(startPos) =>
@@ -89,6 +130,7 @@ class Tokenizer(private val reader: CharacterReader):
             ch match
               case '\n' =>
                 // Newlines terminate a comment.
+                whiteSpaceOnly = true
                 Right(Token.Comment(buffer.mkString))
               case _ =>
                 // Any non-newline character is considered part of a comment.
@@ -167,7 +209,11 @@ class Tokenizer(private val reader: CharacterReader):
            // EOF is permitted for tokens - it forcibly terminates them.
            Right(Token.Generic(buffer.mkString))
          case Some(ch) =>
-           if TokenDelimiter.isDelimiter(ch) then
+           if ch == '\n' then
+             // New lines close the token AND reset our white space state.
+             whiteSpaceOnly = true
+             Right(Token.Generic(buffer.mkString))
+           else if TokenDelimiter.isDelimiter(ch) then
             // Any delimiter forcibly terminates a token.
             Right(Token.Generic(buffer.mkString))
           else
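Note: the net effect of the new `whiteSpaceOnly` flag is a line-based comment rule: a `-` can open a `PotentialComment` only when nothing but whitespace precedes it on the current line; any other `-` is buffered as part of a generic token. A sketch of driving the tokenizer by hand, using only APIs that appear in this patch (`CharacterReader.forInputStream`, `next()`, `close()`); the entry point and the expected output are illustrative:

```scala
import java.io.ByteArrayInputStream

@main def walkthrough(): Unit =
  val source = "x -- not a comment\n-- a real comment\n"
  val reader = CharacterReader.forInputStream(
    new ByteArrayInputStream(source.getBytes("UTF-8"))
  )
  val tokenizer = new Tokenizer(reader)
  try
    var done = false
    while !done do
      tokenizer.next() match
        case Right(Token.Eof) => done = true
        case other            => println(other)
    // Expected, roughly: Generic("x"), Generic("--"), Generic("not"),
    // Generic("a"), Generic("comment"), then a single Comment token for
    // the second line, where `-` follows nothing but whitespace.
  finally tokenizer.close()
```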
diff --git a/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
new file mode 100644
index 0000000..1276ac1
--- /dev/null
+++ b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
@@ -0,0 +1,106 @@
+package ava.parser
+
+import cats.effect.IO
+import cats.effect.Resource
+import cats.effect.unsafe.IORuntime
+import java.io.ByteArrayInputStream
+import java.io.InputStream
+
+class TokenizerTests extends munit.FunSuite:
+  implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
+
+  private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
+
+  test("should produce the EOF token for an empty stream") {
+    run(newCR(EmptyStream)) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce the EOF token for a stream of only whitespace characters"
+  ) {
+    run(newCR(stringStream(" \t\n\r\n "))) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce an error, followed by EOF, if only a backslash character is present"
+  ) {
+    assertTokens(
+      " \\ ",
+      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+    )
+  }
+
+  test("should capture all fixed, single-character tokens") {
+    assertTokens(
+      "(),:.#",
+      Right(Token.OpenParen),
+      Right(Token.CloseParen),
+      Right(Token.Comma),
+      Right(Token.Colon),
+      Right(Token.Dot),
+      Right(Token.Tuple)
+    )
+  }
+
+  test("should not recognize comments if preceded by any non-whitespace") {
+    assertTokens(
+      "( --",
+      Right(Token.OpenParen),
+      Right(Token.Generic("--"))
+    )
+  }
+
+  test("should recognize potential comments that are just a dash (space)") {
+    assertTokens("- ", Right(Token.Generic("-")))
+  }
+
+  test("should recognize potential comments that are just a dash (tab)") {
+    assertTokens("-\t", Right(Token.Generic("-")))
+  }
+
+  test("should recognize potential comments that are just a dash (newline)") {
+    assertTokens("-\n", Right(Token.Generic("-")))
+  }
+
+  private def assertTokens(
+      source: String,
+      expectedOutput: Either[Tokenizer.Error, Token]*
+  ): Unit =
+    run(newCR(stringStream(source))) { tokenizer =>
+      consumeAll(tokenizer).map { tokens =>
+        assertEquals(tokens, expectedOutput.toList)
+      }
+    }
+
+  private def consumeAll(tokenizer: Tokenizer)
+      : IO[List[Either[Tokenizer.Error, Token]]] =
+    fs2.Stream
+      .repeatEval(IO(tokenizer.next()))
+      .takeWhile(_ != Right(Token.Eof))
+      .compile
+      .toList
+
+  private def pos(
+      charInFile: Long,
+      lineNumber: Long,
+      charInLine: Long
+  ): SourcePosition =
+    SourcePosition(charInFile, lineNumber, charInLine)
+
+  private def stringStream(contents: String): InputStream =
+    new ByteArrayInputStream(contents.getBytes())
+
+  private def newCR(is: InputStream): CharacterReader =
+    CharacterReader.forInputStream(is)
+
+  private def run(reader: CharacterReader)(testF: Tokenizer => IO[Unit]): Unit =
+    Resource
+      .make(
+        acquire = IO(new Tokenizer(reader))
+      )(release = t => IO(t.close()))
+      .use(testF)
+      .unsafeRunSync()
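A note on the test harness above: `consumeAll` stops with `takeWhile(_ != Right(Token.Eof))`, so the expected-token lists passed to `assertTokens` never include the terminal `Eof`, and the EOF-only tests call `next()` directly instead. If asserting the terminator ever becomes useful, fs2's `takeThrough` keeps the first element that fails the predicate. A sketch, with a hypothetical helper name and otherwise the same context as `TokenizerTests`:

```scala
import cats.effect.IO

// Sketch: like consumeAll above, but keeps the terminating Eof element.
def consumeAllWithEof(
    tokenizer: Tokenizer
): IO[List[Either[Tokenizer.Error, Token]]] =
  fs2.Stream
    .repeatEval(IO(tokenizer.next()))
    .takeThrough(_ != Right(Token.Eof))
    .compile
    .toList
```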
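Also worth connecting: the one-character `CharacterReader` change at the top of the patch (starting `lineNumber` at 1 instead of 0) is what makes the expected position in the backslash test read naturally. In the input `" \ "` (space, backslash, space), the backslash is the second character of the file, on line 1, at column 2, hence `pos(2, 1, 2)`. A sketch of that convention; it assumes `consume()` advances the position that `currentSourcePosition()` reports:

```scala
import java.io.ByteArrayInputStream

// Sketch: positions appear to be 1-based once lineNumber starts at 1.
@main def positionDemo(): Unit =
  val reader = CharacterReader.forInputStream(
    new ByteArrayInputStream(" \\ ".getBytes("UTF-8"))
  )
  reader.consume() // the leading space
  reader.consume() // the backslash
  // Expected per the backslash test above: SourcePosition(2, 1, 2).
  println(reader.currentSourcePosition())
  reader.close()
```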