Writing tests and test utilities for the Tokenizer
parent d265e1b617
commit f7bc41e539
5 changed files with 180 additions and 16 deletions
@@ -34,7 +34,7 @@ class CharacterReader(

   // Tracking for callers.
   private var charInFile: Long = 0
-  private var lineNumber: Long = 0
+  private var lineNumber: Long = 1
   private var charInLine: Long = 0

   /** Close the underlying stream.
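The only change here is the starting value of the line counter (0 becomes 1), so the first line of input reports as line 1. An illustrative sketch, not part of the commit, assuming CharacterReader's consume()/currentSourcePosition() behave as the Tokenizer below uses them and that the ava.parser package is on the classpath:

import java.io.ByteArrayInputStream

@main def positionDemo(): Unit =
  val reader = CharacterReader.forInputStream(
    new ByteArrayInputStream(" \\ ".getBytes())
  )
  reader.consume() // Some(' ')
  reader.consume() // Some('\\')
  // With lineNumber starting at 1, the backslash test added in this
  // commit expects SourcePosition(charInFile = 2, lineNumber = 1,
  // charInLine = 2) at this point.
  println(reader.currentSourcePosition())
  reader.close()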
@@ -5,3 +5,7 @@ case class SourcePosition(
     lineNumber: Long,
     characterInLine: Long
 )
+
+object SourcePosition:
+  given CanEqual[SourcePosition, SourcePosition] = CanEqual.derived
+end SourcePosition
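The CanEqual given added here (and the ones added to Token below) suggest the project compiles with Scala 3 strict equality; without an instance, == on SourcePosition values would not compile under that setting. A minimal standalone sketch of the mechanism (Id is a placeholder type, not from this codebase):

import scala.language.strictEquality

case class Id(value: Int)

object Id:
  given CanEqual[Id, Id] = CanEqual.derived

@main def canEqualDemo(): Unit =
  // This comparison compiles only because the given above
  // licenses Id == Id under strict equality.
  println(Id(1) == Id(1)) // true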
@@ -3,6 +3,8 @@ package ava.parser
 sealed trait Token

 object Token:
+  given CanEqual[Token, Token] = CanEqual.derived
+
   /** Most tokens are generic tokens. They represent some arbitrary grouping of
     * characters that will be refined later.
     *
@@ -10,6 +12,7 @@ object Token:
     * The token value.
     */
   case class Generic(value: String) extends Token
+  given CanEqual[Generic, Generic] = CanEqual.derived

   /** Comments are detected at time of tokenization, and are arbitrary strings.
     *
@@ -17,6 +20,7 @@ object Token:
     * The comment value.
     */
   case class Comment(value: String) extends Token
+  given CanEqual[Comment, Comment] = CanEqual.derived

   /** String literals are captured in total, with escapes resolved.
     *
@@ -31,6 +35,8 @@ object Token:
       errors: List[Tokenizer.Error]
   ) extends Token

+  given CanEqual[StringLiteral, StringLiteral] = CanEqual.derived
+
   /** Character literals are captured in total, with escapes resolved.
     *
     * @param value
@@ -44,6 +50,8 @@ object Token:
       errors: List[Tokenizer.Error]
   ) extends Token

+  given CanEqual[CharacterLiteral, CharacterLiteral] = CanEqual.derived
+
   /** The '(' character.
     */
   case object OpenParen extends Token
@@ -11,6 +11,7 @@ class Tokenizer(private val reader: CharacterReader):
   private val buffer: ListBuffer[Char] = ListBuffer.empty
   private val states: Stack[State] = Stack.empty
   private val errors: ListBuffer[Error] = ListBuffer.empty
+  private var whiteSpaceOnly: Boolean = true

   private def dumpStack(): List[State] = states.toList

@@ -20,9 +21,7 @@ class Tokenizer(private val reader: CharacterReader):
     * The next available token, or an error if resolving a token fails.
     */
   def next(): Either[Error, Token] =
-    buffer.clear()
-    states.clear()
-    errors.clear()
+    resetState()
     nextInternal(State.Initial)

   /** Close this Tokenizer instance and free all resources, including the
@@ -30,6 +29,11 @@ class Tokenizer(private val reader: CharacterReader):
     */
   def close(): Unit = reader.close()

+  private def resetState(): Unit =
+    buffer.clear()
+    states.clear()
+    errors.clear()
+
   @tailrec
   private def nextInternal(state: State): Either[Error, Token] =
     state match
@@ -38,28 +42,57 @@ class Tokenizer(private val reader: CharacterReader):
           case None => Right(Token.Eof)
           case Some(ch) =>
             ch match
+              case '\n' =>
+                // Every newline resets our white space check for comments.
+                whiteSpaceOnly = true
+                nextInternal(state)
               case _ if isWhiteSpace(ch) => nextInternal(state)
-              case TokenDelimiter.OpenParen => Right(Token.OpenParen)
-              case TokenDelimiter.CloseParen => Right(Token.CloseParen)
-              case TokenDelimiter.Comma => Right(Token.Comma)
-              case TokenDelimiter.Colon => Right(Token.Colon)
-              case TokenDelimiter.Dot => Right(Token.Dot)
-              case TokenDelimiter.Tuple => Right(Token.Tuple)
+              case TokenDelimiter.OpenParen =>
+                whiteSpaceOnly = false
+                Right(Token.OpenParen)
+              case TokenDelimiter.CloseParen =>
+                whiteSpaceOnly = false
+                Right(Token.CloseParen)
+              case TokenDelimiter.Comma =>
+                whiteSpaceOnly = false
+                Right(Token.Comma)
+              case TokenDelimiter.Colon =>
+                whiteSpaceOnly = false
+                Right(Token.Colon)
+              case TokenDelimiter.Dot =>
+                whiteSpaceOnly = false
+                Right(Token.Dot)
+              case TokenDelimiter.Tuple =>
+                whiteSpaceOnly = false
+                Right(Token.Tuple)
               case TokenDelimiter.BackSlash =>
+                whiteSpaceOnly = false
                 Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
               case TokenDelimiter.DoubleQuote =>
+                whiteSpaceOnly = false
                 nextInternal(
                   State.InDoubleQuote(reader.currentSourcePosition())
                 )
               case TokenDelimiter.SingleQuote =>
+                whiteSpaceOnly = false
                 nextInternal(
                   State.InSingleQuote(reader.currentSourcePosition())
                 )
               case '-' =>
+                if whiteSpaceOnly then
+                  whiteSpaceOnly = false
                   nextInternal(
                     State.PotentialComment(reader.currentSourcePosition())
                   )
+                else
+                  whiteSpaceOnly = false
+                  buffer.addOne(ch)
+                  nextInternal(
+                    State.InGeneric(reader.currentSourcePosition())
+                  )
               case _ =>
+                whiteSpaceOnly = false
                 buffer.addOne(ch)
                 nextInternal(State.InGeneric(reader.currentSourcePosition()))
         case State.PotentialComment(startPos) =>
           reader.consume() match
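The new whiteSpaceOnly flag is what gates comment detection: a '-' can only open a PotentialComment when nothing but whitespace has been seen since the last newline. A standalone sketch of the observable behavior; the output for "( --" comes straight from the tests added below, while the comment output is inferred from the InComment state and should print something like the comment shown:

import java.io.ByteArrayInputStream

@main def commentGateDemo(): Unit =
  def tokensOf(source: String): List[Either[Tokenizer.Error, Token]] =
    val tokenizer = new Tokenizer(
      CharacterReader.forInputStream(
        new ByteArrayInputStream(source.getBytes())
      )
    )
    try
      Iterator
        .continually(tokenizer.next())
        .takeWhile(_ != Right(Token.Eof))
        .toList
    finally tokenizer.close()

  // Dashes at the start of a line open a comment.
  println(tokensOf("-- note\n")) // List(Right(Comment(" note")))
  // After any non-whitespace, the same dashes are a generic token.
  println(tokensOf("( --")) // List(Right(OpenParen), Right(Generic("--")))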
@@ -71,13 +104,21 @@ class Tokenizer(private val reader: CharacterReader):
               // so that we can read the remainder of the line.
               states.push(state)
               nextInternal(State.InComment(reader.currentSourcePosition()))
-            case ' ' | '\t' | '\n' =>
+            case '\n' =>
+              // This token is only a `-` character and has been delimited by
+              // a newline. Reset our white space check and return the token.
+              whiteSpaceOnly = true
+              Right(Token.Generic("-"))
+            case ' ' | '\t' =>
               // This token is only a `-` character and has been delimited by
               // whitespace on either side. Return the token.
+              whiteSpaceOnly = false
               Right(Token.Generic("-"))
             case _ =>
               // This is some generic token that starts with `-`. Switch
               // states now that we're sure about what we're reading.
+              whiteSpaceOnly = false
+              buffer.addOne('-')
               buffer.addOne(ch)
               nextInternal(State.InGeneric(startPos))
         case State.InComment(startPos) =>
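Worth flagging in the hunk above: the fallthrough case now writes the leading dash into the buffer (buffer.addOne('-')) before the current character, so a generic token that merely starts with '-' keeps its dash instead of losing it. A standalone sketch, assuming the same ava.parser types as the tests:

import java.io.ByteArrayInputStream

@main def dashDemo(): Unit =
  val tokenizer = new Tokenizer(
    CharacterReader.forInputStream(
      new ByteArrayInputStream("-x\n".getBytes())
    )
  )
  // PotentialComment falls through to InGeneric with the '-' preserved.
  println(tokenizer.next()) // Right(Generic("-x"))
  tokenizer.close()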
@@ -89,6 +130,7 @@ class Tokenizer(private val reader: CharacterReader):
             ch match
               case '\n' =>
                 // Newlines terminate a comment.
+                whiteSpaceOnly = true
                 Right(Token.Comment(buffer.mkString))
               case _ =>
                 // Any non-newline character is considered part of a comment.
@@ -167,7 +209,11 @@ class Tokenizer(private val reader: CharacterReader):
             // EOF is permitted for tokens - it forcibly terminates them.
             Right(Token.Generic(buffer.mkString))
           case Some(ch) =>
-            if TokenDelimiter.isDelimiter(ch) then
+            if ch == '\n' then
+              // New lines close the token AND reset our white space state.
+              whiteSpaceOnly = true
+              Right(Token.Generic(buffer.mkString))
+            else if TokenDelimiter.isDelimiter(ch) then
               // Any delimiter forcibly terminates a token.
               Right(Token.Generic(buffer.mkString))
             else
modules/parser/src/test/scala/ava/parser/TokenizerTests.scala (new file, 106 lines)
@@ -0,0 +1,106 @@
+package ava.parser
+
+import cats.effect.IO
+import cats.effect.Resource
+import cats.effect.unsafe.IORuntime
+
+import java.io.ByteArrayInputStream
+import java.io.InputStream
+
+class TokenizerTests extends munit.FunSuite:
+  implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
+
+  private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
+
+  test("should produce the EOF token for an empty stream") {
+    run(newCR(EmptyStream)) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce the EOF token for a stream of only whitespace characters"
+  ) {
+    run(newCR(stringStream(" \t\n\r\n "))) { tokenizer =>
+      IO(assertEquals(tokenizer.next(), Right(Token.Eof)))
+    }
+  }
+
+  test(
+    "should produce an error, followed by EOF, if only a backslash character is present"
+  ) {
+    assertTokens(
+      " \\ ",
+      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+    )
+  }
+
+  test("should capture all fixed, single-character tokens") {
+    assertTokens(
+      "(),:.#",
+      Right(Token.OpenParen),
+      Right(Token.CloseParen),
+      Right(Token.Comma),
+      Right(Token.Colon),
+      Right(Token.Dot),
+      Right(Token.Tuple)
+    )
+  }
+
+  test("should not recognize comments if preceded by any non-whitespace") {
+    assertTokens(
+      "( --",
+      Right(Token.OpenParen),
+      Right(Token.Generic("--"))
+    )
+  }
+
+  test("should recognize potential comments that are just a dash (space)") {
+    assertTokens("- ", Right(Token.Generic("-")))
+  }
+
+  test("should recognize potential comments that are just a dash (tab)") {
+    assertTokens("-\t", Right(Token.Generic("-")))
+  }
+
+  test("should recognize potential comments that are just a dash (newline)") {
+    assertTokens("-\n", Right(Token.Generic("-")))
+  }
+
+  private def assertTokens(
+      source: String,
+      expectedOutput: Either[Tokenizer.Error, Token]*
+  ): Unit =
+    run(newCR(stringStream(source))) { tokenizer =>
+      consumeAll(tokenizer).map { tokens =>
+        assertEquals(tokens, expectedOutput.toList)
+      }
+    }
+
+  private def consumeAll(
+      tokenizer: Tokenizer
+  ): IO[List[Either[Tokenizer.Error, Token]]] =
+    fs2.Stream
+      .repeatEval(IO(tokenizer.next()))
+      .takeWhile(_ != Right(Token.Eof))
+      .compile
+      .toList
+
+  private def pos(
+      charInFile: Long,
+      lineNumber: Long,
+      charInLine: Long
+  ): SourcePosition =
+    SourcePosition(charInFile, lineNumber, charInLine)
+
+  private def stringStream(contents: String): InputStream =
+    new ByteArrayInputStream(contents.getBytes())
+
+  private def newCR(is: InputStream): CharacterReader =
+    CharacterReader.forInputStream(is)
+
+  private def run(reader: CharacterReader)(testF: Tokenizer => IO[Unit]): Unit =
+    Resource
+      .make(
+        acquire = IO(new Tokenizer(reader))
+      )(release = t => IO(t.close()))
+      .use(testF)
+      .unsafeRunSync()
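The assertTokens/consumeAll helpers make further cases cheap to add. A hypothetical extra test, not in this commit, exercising the newline handling added to the InGeneric state above; it would slot into the suite alongside the existing tests:

  test("should close generic tokens at a newline") {
    assertTokens(
      "ab\ncd",
      Right(Token.Generic("ab")),
      Right(Token.Generic("cd"))
    )
  }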