Full coverage on tokenization, including a small real file example.
This commit is contained in: parent cd98dc0342 · commit b27e16776e
7 changed files with 236 additions and 29 deletions
@@ -30,4 +30,16 @@ object Operator:
     Case
   )
 
+  /** List of unambiguous operators that can always be trivially parsed. Other
+    * operators are either specific tokens, or a combination of tokens.
+    */
+  val Trivial: List[Operator] = List(
+    Hole,
+    ImportSplat,
+    Union,
+    FnReturn,
+    BindDo,
+    Case
+  )
+
 end Operator
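Aside (not part of the commit): the new tests further down drive `Operator.Trivial` through each operator's `value` field, so a minimal, hypothetical lookup from raw token text to a trivial operator could look like this sketch:

    // Sketch only. Assumes each Operator exposes `value: String`, as the
    // test `Operator.Trivial.foreach { op => ... op.value ... }` implies.
    def trivialOperator(raw: String): Option[Operator] =
      Operator.Trivial.find(_.value == raw)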
@@ -39,6 +39,10 @@ object TokenDelimiter:
     Tuple
   )
 
+  def isDelimiter(str: String): Boolean =
+    if str.length() == 1 then isDelimiter(str.charAt(0))
+    else false
+
   def isDelimiter(ch: Char): Boolean = All.contains(ch)
 
   def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)
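Aside: a minimal sketch of the new `String` overload's contract. It only ever delegates to the `Char` overload for one-character strings; the example inputs are assumptions, not project data:

    // Only one-character strings can ever be delimiters.
    val a = TokenDelimiter.isDelimiter("(")  // delegates to isDelimiter('(')
    val b = TokenDelimiter.isDelimiter("ab") // always false: length != 1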
@@ -98,7 +98,8 @@ class Tokenizer(private val reader: CharacterReader):
         nextInternal(st)
       case State.PotentialComment(startPos) =>
         reader.consume() match
-          case None => Left(Error.PrematureEof(dumpStack()))
+          case None =>
+            Right(Token.Generic("-"))
           case Some(ch) =>
             ch match
               case '-' =>
@ -152,8 +153,9 @@ class Tokenizer(private val reader: CharacterReader):
|
||||||
)
|
)
|
||||||
case '\\' =>
|
case '\\' =>
|
||||||
// Character escapes are supported within string literals.
|
// Character escapes are supported within string literals.
|
||||||
states.push(state)
|
val st = State.InCharEscape(reader.currentSourcePosition())
|
||||||
nextInternal(State.InCharEscape(reader.currentSourcePosition()))
|
states.push(st)
|
||||||
|
nextInternal(st)
|
||||||
case '"' =>
|
case '"' =>
|
||||||
// This string literal is now closed. If anything failed inside
|
// This string literal is now closed. If anything failed inside
|
||||||
// this literal, suppress those errors and return them as part
|
// this literal, suppress those errors and return them as part
|
||||||
|
@@ -179,12 +181,13 @@ class Tokenizer(private val reader: CharacterReader):
               Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
             )
           case '\\' =>
-            // Character escapse are supported within character literals.
-            states.push(state)
-            nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+            // Character escapes are supported within character literals.
+            val st = State.InCharEscape(reader.currentSourcePosition())
+            states.push(st)
+            nextInternal(st)
           case '\'' =>
             // This character literal is now closed.
-            createCharacterLiteral()
+            createCharacterLiteral(startPos)
           case _ =>
             // Continue accumulating characters.
             buffer.addOne(ch)
@@ -201,27 +204,35 @@ class Tokenizer(private val reader: CharacterReader):
             // state and return to wherever (string or character literal)
             // this error originated.
             errors.addOne(error)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
           case Right(actual) =>
             // Add the resolved character to the buffer and return to the
             // parent state.
             val _ = buffer.addOne(actual)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
       case State.InGeneric(startPos) =>
-        reader.consume() match
+        // PEEK here! This allows us to react to the character in case it's
+        // something like a delimiter, allowing it to be naturally consumed
+        // later.
+        reader.peek() match
           case None =>
             // EOF is permitted for tokens - it forcibly terminates them.
             Right(Token.Generic(buffer.mkString))
           case Some(ch) =>
             if ch == '\n' then
              // New lines close the token AND reset our white space state.
+              val _ = reader.consume()
              whiteSpaceOnly = true
              Right(Token.Generic(buffer.mkString))
            else if TokenDelimiter.isDelimiter(ch) then
              // Any delimiter forcibly terminates a token.
+              // Do not consume the character so it gets picked up later.
              Right(Token.Generic(buffer.mkString))
            else
              // Non-delimiter characters are added to the token.
+              val _ = reader.consume()
              buffer.addOne(ch)
              nextInternal(state)
 
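Aside: the switch from `consume()` to `peek()` matters because a delimiter that terminates a generic token must remain unread so the next call can tokenize it on its own. A hypothetical mini-reader (not the project's `CharacterReader`) illustrating the two calls' semantics:

    // Sketch only: peek inspects without advancing; consume advances.
    final class MiniReader(s: String):
      private var i = 0
      def peek(): Option[Char] =
        if i < s.length then Some(s.charAt(i)) else None
      def consume(): Option[Char] =
        val c = peek()
        if c.isDefined then i += 1
        c
    // Over "ab(", the generic token "ab" ends when peek() sees '(',
    // leaving '(' to be consumed as its own token on the next call.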
@@ -232,20 +243,27 @@ class Tokenizer(private val reader: CharacterReader):
       case Some(escape) =>
         Right(escape.output)
 
-  private def createCharacterLiteral(): Either[Error, Token] =
+  private def createCharacterLiteral(pos: SourcePosition)
+      : Either[Error, Token] =
     val dump = buffer.mkString
     if dump.length() > 1 then
       Left(
-        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
-      )
-    else
-      val out = if dump.isEmpty() then 0 else dump.charAt(0)
-      Right(
-        Token.CharacterLiteral(
-          value = out,
-          errors = errors.toList
-        )
-      )
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos,
+          candidate = dump,
+          errors = errors.toList
+        )
+      )
+    else if dump.length() < 1 then
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos,
+          errors = errors.toList
+        )
+      )
+    else
+      val out = dump.charAt(0)
+      Right(Token.CharacterLiteral(value = out, errors = errors.toList))
 
 object Tokenizer:
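Aside: the rewritten helper now distinguishes three cases instead of silently coercing an empty literal to 0. A standalone sketch of the same branch logic (names illustrative, not the project's API):

    // Mirrors the three-way validation above: too many, none, exactly one.
    def validate(dump: String): Either[String, Char] =
      if dump.length > 1 then Left(s"multiple characters: $dump")
      else if dump.length < 1 then Left("no characters in literal")
      else Right(dump.charAt(0))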
@@ -316,7 +334,13 @@ object Tokenizer:
       */
     case class MultipleCharactersInLiteral(
         sourcePosition: SourcePosition,
-        candidate: String
+        candidate: String,
+        errors: List[Error]
+    ) extends Error
+
+    case class NoCharactersInLiteral(
+        sourcePosition: SourcePosition,
+        errors: List[Error]
     ) extends Error
 
     case class BackSlashNotAllowed(
modules/parser/src/test/resources/tokenizer-1.ava (new file, +1)
@@ -0,0 +1 @@
+namespace unit.test
modules/parser/src/test/resources/tokenizer-2.ava (new file, +8)
@@ -0,0 +1,8 @@
+namespace unit.test
+
+import one.two.three.*
+import x.y
+
+fn foo: Int -> Int
+λ value => value
+end fn
@@ -5,8 +5,13 @@ import cats.effect.Resource
 import cats.effect.unsafe.IORuntime
 import java.io.ByteArrayInputStream
 import java.io.InputStream
+import scala.io.Source
 
 class TokenizerTests extends munit.FunSuite:
 
+  import Tokenizer.Error
+  import Tokenizer.State
+
   implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
 
   private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
@@ -30,7 +35,7 @@ class TokenizerTests extends munit.FunSuite:
   ) {
     assertTokens(
       " \\ ",
-      Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+      Left(Error.BackSlashNotAllowed(pos(2, 1, 2)))
     )
   }
 
@@ -66,6 +71,10 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens("-\n", Right(Token.Generic("-")))
   }
 
+  test("should capture a generic - if a potential comment hits EOF") {
+    assertTokens("-", Right(Token.Generic("-")))
+  }
+
   test("should handle basic comments") {
     assertTokens("-- Comment", Right(Token.Comment("Comment")))
   }
@@ -91,8 +100,8 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\"",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InDoubleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InDoubleQuote(pos(1, 1, 1)))
         )
       )
     )
@@ -102,8 +111,33 @@ class TokenizerTests extends munit.FunSuite:
     assertTokens(
       "\'",
       Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InSingleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InSingleQuote(pos(1, 1, 1)))
+        )
+      )
+    )
+  }
+
+  test("should throw an error if EOF is reached inside a character ESCAPE") {
+    assertTokens(
+      "\'\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InSingleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
+    assertTokens(
+      "\"\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InDoubleQuote(pos(1, 1, 1))
+          )
         )
       )
     )
@@ -140,15 +174,137 @@ class TokenizerTests extends munit.FunSuite:
       Right(
         Token.StringLiteral(
           "foobar",
-          List(Tokenizer.Error.InvalidCharEscape(pos(6, 1, 6), '!'))
+          List(Error.InvalidCharEscape(pos(6, 1, 6), '!'))
         )
       )
     )
   }
 
+  test("should reject a string literal with a newline") {
+    assertTokens(
+      "\"\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InDoubleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with a newline") {
+    assertTokens(
+      "\'\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InSingleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should accept an empty string literal") {
+    assertTokens("\"\"", Right(Token.StringLiteral("", Nil)))
+  }
+
+  test("should reject an empty character literal") {
+    assertTokens(
+      "\'\'",
+      Left(Error.NoCharactersInLiteral(pos(1, 1, 1), Nil))
+    )
+  }
+
+  test("should reject an empty character literal with a failed escape") {
+    assertTokens(
+      "\'\\!\'",
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          List(Error.InvalidCharEscape(pos(3, 1, 3), '!'))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with multiple characters") {
+    assertTokens(
+      "\'abc\'",
+      Left(
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          candidate = "abc",
+          errors = Nil
+        )
+      )
+    )
+  }
+
+  test("should consume a generic token") {
+    assertTokens("abcdefg", Right(Token.Generic("abcdefg")))
+  }
+
+  test("should consume a generic token for every possible keyword") {
+    Keyword.All.foreach { keyword =>
+      assertTokens(keyword.value, Right(Token.Generic(keyword.value)))
+    }
+  }
+
+  test("should consume a generic token for every trivial operator") {
+    Operator.Trivial.foreach { op =>
+      assertTokens(op.value, Right(Token.Generic(op.value)))
+    }
+  }
+
+  test("should tokenize a valid file (case 1)") {
+    val source = loadFileToString("tokenizer-1.ava")
+    println(source)
+    assertTokens(
+      source,
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test"))
+    )
+  }
+
+  test("should tokenize a valid file (case 2)") {
+    assertTokens(
+      loadFileToString("tokenizer-2.ava"),
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("one")),
+      Right(Token.Dot),
+      Right(Token.Generic("two")),
+      Right(Token.Dot),
+      Right(Token.Generic("three")),
+      Right(Token.Dot),
+      Right(Token.Generic("*")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("x")),
+      Right(Token.Dot),
+      Right(Token.Generic("y")),
+      Right(Token.Generic("fn")),
+      Right(Token.Generic("foo")),
+      Right(Token.Colon),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("λ")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("=>")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("end")),
+      Right(Token.Generic("fn"))
+    )
+  }
+
   private def assertTokens(
       source: String,
-      expectedOutput: Either[Tokenizer.Error, Token]*
+      expectedOutput: Either[Error, Token]*
   ): Unit =
     run(newCR(stringStream(source))) { tokenizer =>
       consumeAll(tokenizer).map { tokens =>
@@ -156,8 +312,7 @@ class TokenizerTests extends munit.FunSuite:
       }
     }
 
-  private def consumeAll(tokenizer: Tokenizer)
-      : IO[List[Either[Tokenizer.Error, Token]]] =
+  private def consumeAll(tokenizer: Tokenizer): IO[List[Either[Error, Token]]] =
     fs2.Stream
       .repeatEval(IO(tokenizer.next()))
       .takeWhile(_ != Right(Token.Eof))
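Aside: the hunk is cut off by diff context, so the rest of the pipeline is not shown. A hedged sketch of how such a stream is typically drained with fs2 (the continuation is an assumption, not the commit's code):

    // Assumes fs2 + cats-effect IO, as the imports above suggest.
    fs2.Stream
      .repeatEval(IO(tokenizer.next()))
      .takeWhile(_ != Right(Token.Eof))
      .compile
      .toList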
@@ -184,3 +339,6 @@ class TokenizerTests extends munit.FunSuite:
       )(release = t => IO(t.close()))
       .use(testF)
       .unsafeRunSync()
+
+  private def loadFileToString(name: String): String =
+    Source.fromResource(name).mkString
@@ -5,6 +5,6 @@ externalResolvers := Seq(
   "Garrity Software Releases" at "https://maven.garrity.co/gs"
 )
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.10")
 addSbtPlugin("gs" % "sbt-garrity-software" % "0.2.0")
 addSbtPlugin("gs" % "sbt-gs-semver" % "0.2.0")