Full coverage on tokenization, including a small real file example.
parent cd98dc0342
commit b27e16776e

7 changed files with 236 additions and 29 deletions
@@ -30,4 +30,16 @@ object Operator:
     Case
   )
 
+  /** List of unambiguous operators that can always be trivially parsed. Other
+    * operators are either specific tokens, or a combination of tokens.
+    */
+  val Trivial: List[Operator] = List(
+    Hole,
+    ImportSplat,
+    Union,
+    FnReturn,
+    BindDo,
+    Case
+  )
+
 end Operator
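Note: "trivially parsed" here means a single-token lookup suffices, with no
lookahead. A minimal sketch of such a lookup, assuming each Operator exposes
a `value` string as the tests further down suggest (the table and function
names are hypothetical):

    // Build a text -> operator table once, then match single tokens against it.
    val table: Map[String, Operator] =
      Operator.Trivial.map(op => op.value -> op).toMap
    def parseTrivial(tokenText: String): Option[Operator] = table.get(tokenText)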
@@ -39,6 +39,10 @@ object TokenDelimiter:
     Tuple
   )
 
+  def isDelimiter(str: String): Boolean =
+    if str.length() == 1 then isDelimiter(str.charAt(0))
+    else false
+
   def isDelimiter(ch: Char): Boolean = All.contains(ch)
 
   def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)
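Note: the new String overload only ever matches single-character input; any
longer string is never a delimiter. A self-contained sketch of the same logic
with an assumed delimiter set (the real sets live in TokenDelimiter.All and
TokenDelimiter.WhiteSpace):

    object DelimiterSketch:
      // Assumed, illustrative character set; the real one is defined above.
      private val All: Set[Char] = Set(' ', '\t', '\n', '(', ')', ',')

      def isDelimiter(str: String): Boolean =
        if str.length() == 1 then isDelimiter(str.charAt(0))
        else false

      def isDelimiter(ch: Char): Boolean = All.contains(ch)

    // DelimiterSketch.isDelimiter(",")  == true
    // DelimiterSketch.isDelimiter("::") == false: multi-char strings never match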
@@ -98,7 +98,8 @@ class Tokenizer(private val reader: CharacterReader):
           nextInternal(st)
       case State.PotentialComment(startPos) =>
         reader.consume() match
-          case None => Left(Error.PrematureEof(dumpStack()))
+          case None =>
+            Right(Token.Generic("-"))
           case Some(ch) =>
             ch match
               case '-' =>
@@ -152,8 +153,9 @@ class Tokenizer(private val reader: CharacterReader):
             )
           case '\\' =>
             // Character escapes are supported within string literals.
-            states.push(state)
-            nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+            val st = State.InCharEscape(reader.currentSourcePosition())
+            states.push(st)
+            nextInternal(st)
           case '"' =>
             // This string literal is now closed. If anything failed inside
             // this literal, suppress those errors and return them as part
@@ -179,12 +181,13 @@ class Tokenizer(private val reader: CharacterReader):
               Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
             )
           case '\\' =>
-            // Character escapse are supported within character literals.
-            states.push(state)
-            nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+            // Character escapes are supported within character literals.
+            val st = State.InCharEscape(reader.currentSourcePosition())
+            states.push(st)
+            nextInternal(st)
           case '\'' =>
             // This character literal is now closed.
-            createCharacterLiteral()
+            createCharacterLiteral(startPos)
           case _ =>
             // Continue accumulating characters.
             buffer.addOne(ch)
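Note: both literal branches now push the InCharEscape state itself rather
than the parent state, so an EOF inside an escape reports both frames (the
pop side of this discipline appears in the next hunk, and the payoff is
visible in the PrematureEof escape test below). A toy model, with strings
standing in for the real State values:

    import scala.collection.mutable

    val states = mutable.Stack("InSingleQuote") // parent literal state
    states.push("InCharEscape")                 // entering the escape
    // ... escape resolves ...
    val _ = states.pop()                        // drop the escape frame
    assert(states.top == "InSingleQuote")       // resume the parent literal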
@@ -201,27 +204,35 @@ class Tokenizer(private val reader: CharacterReader):
             // state and return to wherever (string or character literal)
             // this error originated.
             errors.addOne(error)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
           case Right(actual) =>
             // Add the resolved character to the buffer and return to the
             // parent state.
             val _ = buffer.addOne(actual)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
       case State.InGeneric(startPos) =>
-        reader.consume() match
+        // PEEK here! This allows us to react to the character in case it's
+        // something like a delimiter, allowing it to be naturally consumed
+        // later.
+        reader.peek() match
           case None =>
             // EOF is permitted for tokens - it forcibly terminates them.
             Right(Token.Generic(buffer.mkString))
           case Some(ch) =>
             if ch == '\n' then
               // New lines close the token AND reset our white space state.
+              val _ = reader.consume()
               whiteSpaceOnly = true
               Right(Token.Generic(buffer.mkString))
             else if TokenDelimiter.isDelimiter(ch) then
               // Any delimiter forcibly terminates a token.
+              // Do not consume the character so it gets picked up later.
               Right(Token.Generic(buffer.mkString))
             else
               // Non-delimiter characters are added to the token.
+              val _ = reader.consume()
               buffer.addOne(ch)
               nextInternal(state)
 
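Note: switching from consume() to peek() is what lets a delimiter both
terminate the current generic token and still be emitted as its own token on
a later call. A minimal reader sketch showing the peek/consume split
(CharacterReader's real interface is assumed to be similar):

    final class PeekReader(input: String):
      private var i = 0
      // Look at the next character without advancing.
      def peek(): Option[Char] =
        if i < input.length then Some(input.charAt(i)) else None
      // Return the next character and advance past it.
      def consume(): Option[Char] =
        val c = peek()
        if c.isDefined then i += 1
        c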
@@ -232,20 +243,27 @@ class Tokenizer(private val reader: CharacterReader):
       case Some(escape) =>
         Right(escape.output)
 
-  private def createCharacterLiteral(): Either[Error, Token] =
+  private def createCharacterLiteral(pos: SourcePosition)
+      : Either[Error, Token] =
     val dump = buffer.mkString
     if dump.length() > 1 then
       Left(
-        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
-      )
-    else
-      val out = if dump.isEmpty() then 0 else dump.charAt(0)
-      Right(
-        Token.CharacterLiteral(
-          value = out,
-          errors = errors.toList
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos,
+          candidate = dump,
+          errors = errors.toList
+        )
+      )
+    else if dump.length() < 1 then
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos,
+          errors = errors.toList
+        )
+      )
+    else
+      val out = dump.charAt(0)
+      Right(Token.CharacterLiteral(value = out, errors = errors.toList))
 
 object Tokenizer:
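Note: the rewritten createCharacterLiteral distinguishes three cases instead
of silently coercing an empty literal to the NUL character as the old code
did. The decision table, sketched on a plain string:

    def classify(dump: String): String =
      if dump.length > 1 then s"MultipleCharactersInLiteral($dump)" // e.g. 'abc'
      else if dump.length < 1 then "NoCharactersInLiteral"          // e.g. ''
      else s"CharacterLiteral(${dump.charAt(0)})"                   // e.g. 'a'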
@@ -316,7 +334,13 @@ object Tokenizer:
       */
     case class MultipleCharactersInLiteral(
         sourcePosition: SourcePosition,
-        candidate: String
+        candidate: String,
+        errors: List[Error]
     ) extends Error
 
+    case class NoCharactersInLiteral(
+        sourcePosition: SourcePosition,
+        errors: List[Error]
+    ) extends Error
+
     case class BackSlashNotAllowed(
modules/parser/src/test/resources/tokenizer-1.ava (new file)
@@ -0,0 +1 @@
+namespace unit.test
modules/parser/src/test/resources/tokenizer-2.ava (new file)
@@ -0,0 +1,8 @@
+namespace unit.test
+
+import one.two.three.*
+import x.y
+
+fn foo: Int -> Int
+λ value => value
+end fn
@@ -5,8 +5,13 @@ import cats.effect.Resource
 import cats.effect.unsafe.IORuntime
 import java.io.ByteArrayInputStream
 import java.io.InputStream
+import scala.io.Source
 
 class TokenizerTests extends munit.FunSuite:
 
+  import Tokenizer.Error
+  import Tokenizer.State
+
   implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
 
   private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
@@ -30,7 +35,7 @@ class TokenizerTests extends munit.FunSuite:
   ) {
     assertTokens(
       " \\ ",
-      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+      Left(Error.BackSlashNotAllowed(pos(2, 1, 2)))
     )
   }
 
@@ -66,6 +71,10 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens("-\n", Right(Token.Generic("-")))
   }
 
+  test("should capture a generic - if a potential comment hits EOF") {
+    assertTokens("-", Right(Token.Generic("-")))
+  }
+
   test("should handle basic comments") {
     assertTokens("-- Comment", Right(Token.Comment("Comment")))
   }
@@ -91,8 +100,8 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\"",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InDoubleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InDoubleQuote(pos(1, 1, 1)))
         )
       )
     )
@@ -102,8 +111,33 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\'",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InSingleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InSingleQuote(pos(1, 1, 1)))
         )
       )
     )
   }
 
+  test("should throw an error if EOF is reached inside a character ESCAPE") {
+    assertTokens(
+      "\'\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InSingleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
+    assertTokens(
+      "\"\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InDoubleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
@@ -140,15 +174,137 @@ class TokenizerTests extends munit.FunSuite:
       Right(
         Token.StringLiteral(
           "foobar",
-          List(Tokenizer.Error.InvalidCharEscape(pos(6, 1, 6), '!'))
+          List(Error.InvalidCharEscape(pos(6, 1, 6), '!'))
         )
       )
     )
   }
 
+  test("should reject a string literal with a newline") {
+    assertTokens(
+      "\"\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InDoubleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with a newline") {
+    assertTokens(
+      "\'\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InSingleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should accept an empty string literal") {
+    assertTokens("\"\"", Right(Token.StringLiteral("", Nil)))
+  }
+
+  test("should reject an empty character literal") {
+    assertTokens(
+      "\'\'",
+      Left(Error.NoCharactersInLiteral(pos(1, 1, 1), Nil))
+    )
+  }
+
+  test("should reject an empty character literal with a failed escape") {
+    assertTokens(
+      "\'\\!\'",
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          List(Error.InvalidCharEscape(pos(3, 1, 3), '!'))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with multiple characters") {
+    assertTokens(
+      "\'abc\'",
+      Left(
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          candidate = "abc",
+          errors = Nil
+        )
+      )
+    )
+  }
+
+  test("should consume a generic token") {
+    assertTokens("abcdefg", Right(Token.Generic("abcdefg")))
+  }
+
+  test("should consume a generic token for every possible keyword") {
+    Keyword.All.foreach { keyword =>
+      assertTokens(keyword.value, Right(Token.Generic(keyword.value)))
+    }
+  }
+
+  test("should consume a generic token for every trivial operator") {
+    Operator.Trivial.foreach { op =>
+      assertTokens(op.value, Right(Token.Generic(op.value)))
+    }
+  }
+
+  test("should tokenize a valid file (case 1)") {
+    val source = loadFileToString("tokenizer-1.ava")
+    println(source)
+    assertTokens(
+      source,
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test"))
+    )
+  }
+
+  test("should tokenize a valid file (case 2)") {
+    assertTokens(
+      loadFileToString("tokenizer-2.ava"),
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("one")),
+      Right(Token.Dot),
+      Right(Token.Generic("two")),
+      Right(Token.Dot),
+      Right(Token.Generic("three")),
+      Right(Token.Dot),
+      Right(Token.Generic("*")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("x")),
+      Right(Token.Dot),
+      Right(Token.Generic("y")),
+      Right(Token.Generic("fn")),
+      Right(Token.Generic("foo")),
+      Right(Token.Colon),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("λ")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("=>")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("end")),
+      Right(Token.Generic("fn"))
+    )
+  }
+
   private def assertTokens(
       source: String,
-      expectedOutput: Either[Tokenizer.Error, Token]*
+      expectedOutput: Either[Error, Token]*
   ): Unit =
     run(newCR(stringStream(source))) { tokenizer =>
       consumeAll(tokenizer).map { tokens =>
@@ -156,8 +312,7 @@ class TokenizerTests extends munit.FunSuite:
     }
   }
 
-  private def consumeAll(tokenizer: Tokenizer)
-      : IO[List[Either[Tokenizer.Error, Token]]] =
+  private def consumeAll(tokenizer: Tokenizer): IO[List[Either[Error, Token]]] =
     fs2.Stream
       .repeatEval(IO(tokenizer.next()))
       .takeWhile(_ != Right(Token.Eof))
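Note: consumeAll adapts the pull-based tokenizer to an fs2 stream by
re-evaluating next() until Eof; the hunk is truncated here, but presumably
the stream is then compiled to a list. The same pattern on a plain counter,
as a self-contained illustration:

    import cats.effect.IO
    import cats.effect.unsafe.implicits.global

    var n = 5
    val values: List[Int] = fs2.Stream
      .repeatEval(IO { n -= 1; n }) // repeatedly evaluate the effect
      .takeWhile(_ > 0)             // stop once the terminal value appears
      .compile
      .toList
      .unsafeRunSync()              // List(4, 3, 2, 1)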
@@ -184,3 +339,6 @@ class TokenizerTests extends munit.FunSuite:
     )(release = t => IO(t.close()))
       .use(testF)
       .unsafeRunSync()
+
+  private def loadFileToString(name: String): String =
+    Source.fromResource(name).mkString
@@ -5,6 +5,6 @@ externalResolvers := Seq(
   "Garrity Software Releases" at "https://maven.garrity.co/gs"
 )
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.10")
 addSbtPlugin("gs" % "sbt-garrity-software" % "0.2.0")
 addSbtPlugin("gs" % "sbt-gs-semver" % "0.2.0")