Full coverage on tokenization, including a small real file example.

Pat Garrity 2024-02-24 15:25:33 -06:00
parent cd98dc0342
commit b27e16776e
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
7 changed files with 236 additions and 29 deletions

Operator.scala

@@ -30,4 +30,16 @@ object Operator:
     Case
   )
 
+  /** List of unambiguous operators that can always be trivially parsed. Other
+    * operators are either specific tokens, or a combination of tokens.
+    */
+  val Trivial: List[Operator] = List(
+    Hole,
+    ImportSplat,
+    Union,
+    FnReturn,
+    BindDo,
+    Case
+  )
 end Operator
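Note: the intent of Trivial is that these operators can be recognized by plain text equality, with no lookahead. A minimal sketch of such a lookup (the parseTrivial helper is hypothetical, not part of this commit; Operator#value is the operator's source text, as used by the tests below):

    // Hypothetical: map a completed token's text to an unambiguous operator.
    def parseTrivial(text: String): Option[Operator] =
      Operator.Trivial.find(_.value == text)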

TokenDelimiter.scala

@@ -39,6 +39,10 @@ object TokenDelimiter:
     Tuple
   )
 
+  def isDelimiter(str: String): Boolean =
+    if str.length() == 1 then isDelimiter(str.charAt(0))
+    else false
+
   def isDelimiter(ch: Char): Boolean = All.contains(ch)
 
   def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)
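Note: the String overload delegates to the Char overload only when the input is exactly one character, so multi-character operators can never be classified as delimiters. Expected behavior, assuming '.' is in All (which the dot-splitting tests below imply):

    TokenDelimiter.isDelimiter(".")  // true: single char, and '.' is a delimiter
    TokenDelimiter.isDelimiter("->") // false: length != 1, never reaches All
    TokenDelimiter.isDelimiter("")   // false: length != 1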

Tokenizer.scala

@@ -98,7 +98,8 @@ class Tokenizer(private val reader: CharacterReader):
         nextInternal(st)
       case State.PotentialComment(startPos) =>
         reader.consume() match
-          case None => Left(Error.PrematureEof(dumpStack()))
+          case None =>
+            Right(Token.Generic("-"))
           case Some(ch) =>
             ch match
               case '-' =>
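Note: a lone '-' at EOF is now tokenized as a generic token rather than failing with PrematureEof, matching the existing behavior of '-' followed by a newline. The new test below pins this down (the "before" value is a sketch, assuming the stack held only the PotentialComment state):

    // before: Left(Error.PrematureEof(List(State.PotentialComment(...))))
    // after:
    assertTokens("-", Right(Token.Generic("-")))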
@@ -152,8 +153,9 @@ class Tokenizer(private val reader: CharacterReader):
               )
             case '\\' =>
               // Character escapes are supported within string literals.
-              states.push(state)
-              nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+              val st = State.InCharEscape(reader.currentSourcePosition())
+              states.push(st)
+              nextInternal(st)
             case '"' =>
               // This string literal is now closed. If anything failed inside
               // this literal, suppress those errors and return them as part
@@ -179,12 +181,13 @@ class Tokenizer(private val reader: CharacterReader):
                 Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
               )
             case '\\' =>
-              // Character escapse are supported within character literals.
-              states.push(state)
-              nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+              // Character escapes are supported within character literals.
+              val st = State.InCharEscape(reader.currentSourcePosition())
+              states.push(st)
+              nextInternal(st)
             case '\'' =>
               // This character literal is now closed.
-              createCharacterLiteral()
+              createCharacterLiteral(startPos)
             case _ =>
               // Continue accumulating characters.
               buffer.addOne(ch)
@@ -201,27 +204,35 @@ class Tokenizer(private val reader: CharacterReader):
             // state and return to wherever (string or character literal)
            // this error originated.
             errors.addOne(error)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
           case Right(actual) =>
             // Add the resolved character to the buffer and return to the
             // parent state.
             val _ = buffer.addOne(actual)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
       case State.InGeneric(startPos) =>
-        reader.consume() match
+        // PEEK here! This allows us to react to the character in case it's
+        // something like a delimiter, allowing it to be naturally consumed
+        // later.
+        reader.peek() match
           case None =>
             // EOF is permitted for tokens - it forcibly terminates them.
             Right(Token.Generic(buffer.mkString))
           case Some(ch) =>
             if ch == '\n' then
               // New lines close the token AND reset our white space state.
+              val _ = reader.consume()
               whiteSpaceOnly = true
               Right(Token.Generic(buffer.mkString))
             else if TokenDelimiter.isDelimiter(ch) then
               // Any delimiter forcibly terminates a token.
+              // Do not consume the character so it gets picked up later.
               Right(Token.Generic(buffer.mkString))
             else
               // Non-delimiter characters are added to the token.
+              val _ = reader.consume()
               buffer.addOne(ch)
               nextInternal(state)
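Note: peeking rather than consuming is what lets a delimiter close a generic token without being swallowed; the delimiter stays in the reader and is emitted as its own token on the next call. A reduced version of the tokenizer-1.ava test below illustrates the sequence:

    assertTokens(
      "unit.test",
      Right(Token.Generic("unit")), // '.' peeked: closes the token, not consumed
      Right(Token.Dot),             // '.' consumed and emitted on the next call
      Right(Token.Generic("test"))  // EOF forcibly terminates the final token
    )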
@@ -232,20 +243,27 @@ class Tokenizer(private val reader: CharacterReader):
       case Some(escape) =>
         Right(escape.output)
 
-  private def createCharacterLiteral(): Either[Error, Token] =
+  private def createCharacterLiteral(pos: SourcePosition)
+      : Either[Error, Token] =
     val dump = buffer.mkString
     if dump.length() > 1 then
       Left(
-        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos,
+          candidate = dump,
+          errors = errors.toList
+        )
       )
-    else
-      val out = if dump.isEmpty() then 0 else dump.charAt(0)
-      Right(
-        Token.CharacterLiteral(
-          value = out,
-          errors = errors.toList
-        )
-      )
+    else if dump.length() < 1 then
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos,
+          errors = errors.toList
+        )
+      )
+    else
+      val out = dump.charAt(0)
+      Right(Token.CharacterLiteral(value = out, errors = errors.toList))
 
 object Tokenizer:
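Note: createCharacterLiteral now distinguishes three cases instead of silently mapping an empty literal to 0. The two failure branches are covered by new tests below; the success branch would look like this (the single-character case is an assumption, mirroring the CharacterLiteral constructor shown above):

    assertTokens("\'a\'", Right(Token.CharacterLiteral('a', Nil)))             // exactly one char
    assertTokens("\'\'", Left(Error.NoCharactersInLiteral(pos(1, 1, 1), Nil))) // empty
    assertTokens(
      "\'abc\'",
      Left(Error.MultipleCharactersInLiteral(pos(1, 1, 1), "abc", Nil))        // too many
    )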
@@ -316,7 +334,13 @@ object Tokenizer:
     */
   case class MultipleCharactersInLiteral(
     sourcePosition: SourcePosition,
-    candidate: String
+    candidate: String,
+    errors: List[Error]
+  ) extends Error
+
+  case class NoCharactersInLiteral(
+    sourcePosition: SourcePosition,
+    errors: List[Error]
   ) extends Error
 
   case class BackSlashNotAllowed(

tokenizer-1.ava

@@ -0,0 +1 @@
+namespace unit.test

tokenizer-2.ava

@@ -0,0 +1,8 @@
+namespace unit.test
+
+import one.two.three.*
+import x.y
+
+fn foo: Int -> Int
+λ value => value
+end fn

TokenizerTests.scala

@@ -5,8 +5,13 @@ import cats.effect.Resource
 import cats.effect.unsafe.IORuntime
 import java.io.ByteArrayInputStream
 import java.io.InputStream
+import scala.io.Source
 
 class TokenizerTests extends munit.FunSuite:
+
+  import Tokenizer.Error
+  import Tokenizer.State
+
   implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
 
   private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
@@ -30,7 +35,7 @@ class TokenizerTests extends munit.FunSuite:
   ) {
     assertTokens(
       " \\ ",
-      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+      Left(Error.BackSlashNotAllowed(pos(2, 1, 2)))
     )
   }
@@ -66,6 +71,10 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens("-\n", Right(Token.Generic("-")))
   }
 
+  test("should capture a generic - if a potential comment hits EOF") {
+    assertTokens("-", Right(Token.Generic("-")))
+  }
+
   test("should handle basic comments") {
     assertTokens("-- Comment", Right(Token.Comment("Comment")))
   }
@@ -91,8 +100,8 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\"",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InDoubleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InDoubleQuote(pos(1, 1, 1)))
         )
       )
     )
@@ -102,8 +111,33 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\'",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InSingleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InSingleQuote(pos(1, 1, 1)))
         )
       )
     )
   }
+
+  test("should throw an error if EOF is reached inside a character ESCAPE") {
+    assertTokens(
+      "\'\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InSingleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
+    assertTokens(
+      "\"\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InDoubleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
+  }
@@ -140,15 +174,137 @@ class TokenizerTests extends munit.FunSuite:
       Right(
         Token.StringLiteral(
           "foobar",
-          List(Tokenizer.Error.InvalidCharEscape(pos(6, 1, 6), '!'))
+          List(Error.InvalidCharEscape(pos(6, 1, 6), '!'))
         )
       )
     )
   }
 
+  test("should reject a string literal with a newline") {
+    assertTokens(
+      "\"\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InDoubleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with a newline") {
+    assertTokens(
+      "\'\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InSingleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should accept an empty string literal") {
+    assertTokens("\"\"", Right(Token.StringLiteral("", Nil)))
+  }
+
+  test("should reject an empty character literal") {
+    assertTokens(
+      "\'\'",
+      Left(Error.NoCharactersInLiteral(pos(1, 1, 1), Nil))
+    )
+  }
+
+  test("should reject an empty character literal with a failed escape") {
+    assertTokens(
+      "\'\\!\'",
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          errors = List(Error.InvalidCharEscape(pos(3, 1, 3), '!'))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with multiple characters") {
+    assertTokens(
+      "\'abc\'",
+      Left(
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          candidate = "abc",
+          errors = Nil
+        )
+      )
+    )
+  }
+
+  test("should consume a generic token") {
+    assertTokens("abcdefg", Right(Token.Generic("abcdefg")))
+  }
+
+  test("should consume a generic token for every possible keyword") {
+    Keyword.All.foreach { keyword =>
+      assertTokens(keyword.value, Right(Token.Generic(keyword.value)))
+    }
+  }
+
+  test("should consume a generic token for every trivial operator") {
+    Operator.Trivial.foreach { op =>
+      assertTokens(op.value, Right(Token.Generic(op.value)))
+    }
+  }
+
+  test("should tokenize a valid file (case 1)") {
+    val source = loadFileToString("tokenizer-1.ava")
+    println(source)
+    assertTokens(
+      source,
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test"))
+    )
+  }
+
+  test("should tokenize a valid file (case 2)") {
+    assertTokens(
+      loadFileToString("tokenizer-2.ava"),
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("one")),
+      Right(Token.Dot),
+      Right(Token.Generic("two")),
+      Right(Token.Dot),
+      Right(Token.Generic("three")),
+      Right(Token.Dot),
+      Right(Token.Generic("*")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("x")),
+      Right(Token.Dot),
+      Right(Token.Generic("y")),
+      Right(Token.Generic("fn")),
+      Right(Token.Generic("foo")),
+      Right(Token.Colon),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("λ")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("=>")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("end")),
+      Right(Token.Generic("fn"))
+    )
+  }
+
   private def assertTokens(
       source: String,
-      expectedOutput: Either[Tokenizer.Error, Token]*
+      expectedOutput: Either[Error, Token]*
   ): Unit =
     run(newCR(stringStream(source))) { tokenizer =>
       consumeAll(tokenizer).map { tokens =>
@@ -156,8 +312,7 @@ class TokenizerTests extends munit.FunSuite:
     }
   }
 
-  private def consumeAll(tokenizer: Tokenizer)
-      : IO[List[Either[Tokenizer.Error, Token]]] =
+  private def consumeAll(tokenizer: Tokenizer): IO[List[Either[Error, Token]]] =
     fs2.Stream
       .repeatEval(IO(tokenizer.next()))
       .takeWhile(_ != Right(Token.Eof))
@@ -184,3 +339,6 @@ class TokenizerTests extends munit.FunSuite:
       )(release = t => IO(t.close()))
       .use(testF)
       .unsafeRunSync()
+
+  private def loadFileToString(name: String): String =
+    Source.fromResource(name).mkString
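Note: Source.fromResource resolves the name against the classpath, so the two .ava fixtures only need to sit on the test classpath (under the default sbt layout that would be src/test/resources/, an assumption here). Usage as in the file tests above:

    val source = loadFileToString("tokenizer-1.ava") // reads a classpath resource as a String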

project/plugins.sbt

@@ -5,6 +5,6 @@ externalResolvers := Seq(
   "Garrity Software Releases" at "https://maven.garrity.co/gs"
 )
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.10")
 addSbtPlugin("gs" % "sbt-garrity-software" % "0.2.0")
 addSbtPlugin("gs" % "sbt-gs-semver" % "0.2.0")