Writing tests and test utilities for the Tokenizer

Pat Garrity 2024-02-24 10:21:00 -06:00
parent d265e1b617
commit f7bc41e539
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
5 changed files with 180 additions and 16 deletions


@@ -34,7 +34,7 @@ class CharacterReader(
   // Tracking for callers.
   private var charInFile: Long = 0
-  private var lineNumber: Long = 0
+  private var lineNumber: Long = 1
   private var charInLine: Long = 0

   /** Close the underlying stream.
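The change above makes lineNumber 1-based, so reported positions match how editors number lines. A minimal sketch of the expected behavior, using the CharacterReader.forInputStream factory and currentSourcePosition() exactly as the new tests below do:

    import java.io.ByteArrayInputStream

    val reader = CharacterReader.forInputStream(
      new ByteArrayInputStream(" \\ ".getBytes())
    )
    reader.consume() // Some(' ')
    reader.consume() // Some('\\')
    // 2nd character of the file, line 1, 2nd character of the line:
    // the pos(2, 1, 2) that the new backslash test expects.
    reader.currentSourcePosition() // SourcePosition(2, 1, 2)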


@@ -5,3 +5,7 @@ case class SourcePosition(
   lineNumber: Long,
   characterInLine: Long
 )
+
+object SourcePosition:
+  given CanEqual[SourcePosition, SourcePosition] = CanEqual.derived
+end SourcePosition
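The new companion object exists so the test suite can compare SourcePosition values with ==. A sketch, assuming the project compiles with strict equality enabled (e.g. -language:strictEquality, an assumption suggested by the CanEqual givens throughout this commit):

    val a = SourcePosition(2, 1, 2)
    val b = SourcePosition(2, 1, 2)
    // Under strict equality, `a == b` only compiles when a
    // CanEqual[SourcePosition, SourcePosition] instance is in scope.
    assert(a == b)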


@@ -3,6 +3,8 @@ package ava.parser
 sealed trait Token

 object Token:
+  given CanEqual[Token, Token] = CanEqual.derived
+
   /** Most tokens are generic tokens. They represent some arbitrary grouping of
    * characters that will be refined later.
    *
@@ -10,6 +12,7 @@ object Token:
    *   The token value.
    */
   case class Generic(value: String) extends Token
+  given CanEqual[Generic, Generic] = CanEqual.derived

   /** Comments are detected at time of tokenization, and are arbitrary strings.
    *
@@ -17,6 +20,7 @@ object Token:
    *   The comment value.
    */
   case class Comment(value: String) extends Token
+  given CanEqual[Comment, Comment] = CanEqual.derived

   /** String literals are captured in total, with escapes resolved.
    *
@@ -31,6 +35,8 @@ object Token:
     errors: List[Tokenizer.Error]
   ) extends Token
+  given CanEqual[StringLiteral, StringLiteral] = CanEqual.derived
+
   /** Character literals are captured in total, with escapes resolved.
    *
    * @param value
@@ -44,6 +50,8 @@ object Token:
     errors: List[Tokenizer.Error]
   ) extends Token
+  given CanEqual[CharacterLiteral, CharacterLiteral] = CanEqual.derived
+
   /** The '(' character.
    */
   case object OpenParen extends Token
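These givens let Token values be compared with == under strict equality; together with the SourcePosition instance above, they are what allow the new test suite to compare Either[Tokenizer.Error, Token] values, since the standard library derives CanEqual for Either from instances for both sides. A sketch, under the same strict-equality assumption:

    val a: Token = Token.OpenParen
    val b: Token = Token.Generic("-")
    a == b // false; permitted by CanEqual[Token, Token]

    // CanEqual is contravariant in both parameters, so the trait-level
    // given already covers the concrete cases; the per-case givens above
    // appear to be explicit reinforcement of that intent.
    Token.Generic("-") == Token.Generic("-") // true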


@@ -11,6 +11,7 @@ class Tokenizer(private val reader: CharacterReader):
   private val buffer: ListBuffer[Char] = ListBuffer.empty
   private val states: Stack[State] = Stack.empty
   private val errors: ListBuffer[Error] = ListBuffer.empty
+  private var whiteSpaceOnly: Boolean = true

   private def dumpStack(): List[State] = states.toList
@@ -20,9 +21,7 @@ class Tokenizer(private val reader: CharacterReader):
    *   The next available token, or an error if resolving a token fails.
    */
   def next(): Either[Error, Token] =
-    buffer.clear()
-    states.clear()
-    errors.clear()
+    resetState()
     nextInternal(State.Initial)

   /** Close this Tokenizer instance and free all resources, including the
@@ -30,6 +29,11 @@ class Tokenizer(private val reader: CharacterReader):
    */
   def close(): Unit = reader.close()

+  private def resetState(): Unit =
+    buffer.clear()
+    states.clear()
+    errors.clear()
+
   @tailrec
   private def nextInternal(state: State): Either[Error, Token] =
     state match
@@ -38,28 +42,57 @@ class Tokenizer(private val reader: CharacterReader):
         case None => Right(Token.Eof)
         case Some(ch) =>
           ch match
+            case '\n' =>
+              // Every newline resets our white space check for comments.
+              whiteSpaceOnly = true
+              nextInternal(state)
             case _ if isWhiteSpace(ch) => nextInternal(state)
-            case TokenDelimiter.OpenParen => Right(Token.OpenParen)
-            case TokenDelimiter.CloseParen => Right(Token.CloseParen)
-            case TokenDelimiter.Comma => Right(Token.Comma)
-            case TokenDelimiter.Colon => Right(Token.Colon)
-            case TokenDelimiter.Dot => Right(Token.Dot)
-            case TokenDelimiter.Tuple => Right(Token.Tuple)
+            case TokenDelimiter.OpenParen =>
+              whiteSpaceOnly = false
+              Right(Token.OpenParen)
+            case TokenDelimiter.CloseParen =>
+              whiteSpaceOnly = false
+              Right(Token.CloseParen)
+            case TokenDelimiter.Comma =>
+              whiteSpaceOnly = false
+              Right(Token.Comma)
+            case TokenDelimiter.Colon =>
+              whiteSpaceOnly = false
+              Right(Token.Colon)
+            case TokenDelimiter.Dot =>
+              whiteSpaceOnly = false
+              Right(Token.Dot)
+            case TokenDelimiter.Tuple =>
+              whiteSpaceOnly = false
+              Right(Token.Tuple)
             case TokenDelimiter.BackSlash =>
+              whiteSpaceOnly = false
               Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
             case TokenDelimiter.DoubleQuote =>
+              whiteSpaceOnly = false
               nextInternal(
                 State.InDoubleQuote(reader.currentSourcePosition())
               )
             case TokenDelimiter.SingleQuote =>
+              whiteSpaceOnly = false
               nextInternal(
                 State.InSingleQuote(reader.currentSourcePosition())
               )
             case '-' =>
-              nextInternal(
-                State.PotentialComment(reader.currentSourcePosition())
-              )
+              if whiteSpaceOnly then
+                whiteSpaceOnly = false
+                nextInternal(
+                  State.PotentialComment(reader.currentSourcePosition())
+                )
+              else
+                whiteSpaceOnly = false
+                buffer.addOne(ch)
+                nextInternal(
+                  State.InGeneric(reader.currentSourcePosition())
+                )
             case _ =>
+              whiteSpaceOnly = false
               buffer.addOne(ch)
               nextInternal(State.InGeneric(reader.currentSourcePosition()))
       case State.PotentialComment(startPos) =>
         reader.consume() match
@@ -71,13 +104,21 @@ class Tokenizer(private val reader: CharacterReader):
             // so that we can read the remainder of the line.
             states.push(state)
             nextInternal(State.InComment(reader.currentSourcePosition()))
-          case ' ' | '\t' | '\n' =>
+          case '\n' =>
+            // This token is only a `-` character and has been delimited by
+            // a newline. Reset our white space check and return the token.
+            whiteSpaceOnly = true
+            Right(Token.Generic("-"))
+          case ' ' | '\t' =>
             // This token is only a `-` character and has been delimited by
             // whitespace on either side. Return the token.
+            whiteSpaceOnly = false
             Right(Token.Generic("-"))
           case _ =>
             // This is some generic token that starts with `-`. Switch
             // states now that we're sure about what we're reading.
+            whiteSpaceOnly = false
+            buffer.addOne('-')
             buffer.addOne(ch)
             nextInternal(State.InGeneric(startPos))
       case State.InComment(startPos) =>
@@ -89,6 +130,7 @@ class Tokenizer(private val reader: CharacterReader):
         ch match
           case '\n' =>
             // Newlines terminate a comment.
+            whiteSpaceOnly = true
             Right(Token.Comment(buffer.mkString))
           case _ =>
             // Any non-newline character is considered part of a comment.
@@ -167,7 +209,11 @@ class Tokenizer(private val reader: CharacterReader):
           // EOF is permitted for tokens - it forcibly terminates them.
           Right(Token.Generic(buffer.mkString))
         case Some(ch) =>
-          if TokenDelimiter.isDelimiter(ch) then
+          if ch == '\n' then
+            // New lines close the token AND reset our white space state.
+            whiteSpaceOnly = true
+            Right(Token.Generic(buffer.mkString))
+          else if TokenDelimiter.isDelimiter(ch) then
             // Any delimiter forcibly terminates a token.
             Right(Token.Generic(buffer.mkString))
           else
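Taken together, these hunks implement the rule that `--` opens a comment only when nothing but whitespace has appeared since the last newline; anywhere else, a dash starts or continues a generic token. A sketch of the observable behavior, using only constructs from this commit (the first pair of tokens mirrors the new test below; the Comment payload in the second example is an assumption based on the buffer handling above, since the two dashes are never buffered):

    import java.io.ByteArrayInputStream

    val t1 = new Tokenizer(CharacterReader.forInputStream(
      new ByteArrayInputStream("( --".getBytes())
    ))
    t1.next() // Right(Token.OpenParen)
    t1.next() // Right(Token.Generic("--")): the '(' disqualifies a comment

    val t2 = new Tokenizer(CharacterReader.forInputStream(
      new ByteArrayInputStream("-- hi\n".getBytes())
    ))
    t2.next() // Right(Token.Comment(" hi")): assumed payload; the leading
              // space survives because only post-dash characters are buffered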


@@ -0,0 +1,106 @@
+package ava.parser
+
+import cats.effect.IO
+import cats.effect.Resource
+import cats.effect.unsafe.IORuntime
+
+import java.io.ByteArrayInputStream
+import java.io.InputStream
+
+class TokenizerTests extends munit.FunSuite:
+
+  implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
+
+  private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
+
+  test("should produce the EOF token for an empty stream") {
+    run(newCR(EmptyStream)) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce the EOF token for a stream of only whitespace characters"
+  ) {
+    run(newCR(stringStream(" \t\n\r\n "))) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce an error, followed by EOF, if only a backslash character is present"
+  ) {
+    assertTokens(
+      " \\ ",
+      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+    )
+  }
test("should capture all fixed, single-character tokens") {
assertTokens(
"(),:.#",
Right(Token.OpenParen),
Right(Token.CloseParen),
Right(Token.Comma),
Right(Token.Colon),
Right(Token.Dot),
Right(Token.Tuple)
)
}
test("should not recognize comments if preceded by any non-whitespace") {
assertTokens(
"( --",
Right(Token.OpenParen),
Right(Token.Generic("--"))
)
}
test("should recognize potential comments that are just a dash (space)") {
assertTokens("- ", Right(Token.Generic("-")))
}
test("should recognize potential comments that are just a dash (tab)") {
assertTokens("-\t", Right(Token.Generic("-")))
}
test("should recognize potential comments that are just a dash (newline)") {
assertTokens("-\n", Right(Token.Generic("-")))
}
private def assertTokens(
source: String,
expectedOutput: Either[Tokenizer.Error, Token]*
): Unit =
run(newCR(stringStream(source))) { tokenizer =>
consumeAll(tokenizer).map { tokens =>
assertEquals(tokens, expectedOutput.toList)
}
}
private def consumeAll(tokenizer: Tokenizer)
: IO[List[Either[Tokenizer.Error, Token]]] =
fs2.Stream
.repeatEval(IO(tokenizer.next()))
.takeWhile(_ != Right(Token.Eof))
.compile
.toList
private def pos(
charInFile: Long,
lineNumber: Long,
charInLine: Long
): SourcePosition =
SourcePosition(charInFile, lineNumber, charInLine)
private def stringStream(contents: String): InputStream =
new ByteArrayInputStream(contents.getBytes())
private def newCR(is: InputStream): CharacterReader =
CharacterReader.forInputStream(is)
private def run(reader: CharacterReader)(testF: Tokenizer => IO[Unit]): Unit =
Resource
.make(
acquire = IO(new Tokenizer(reader))
)(release = t => IO(t.close()))
.use(testF)
.unsafeRunSync()
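The run helper acquires the Tokenizer as a cats-effect Resource, so the underlying reader is closed even when an assertion fails, and assertTokens reduces each new case to a single call. A hypothetical follow-up test reusing the helpers (the expected Comment payload is an assumption, as noted earlier, not something this commit pins down):

    test("should capture a full-line comment (sketch)") {
      assertTokens("-- hi\n", Right(Token.Comment(" hi")))
    }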