From b27e16776e0dfcf5ba9f545c3c17c6b8bf7dcb45 Mon Sep 17 00:00:00 2001
From: Pat Garrity
Date: Sat, 24 Feb 2024 15:25:33 -0600
Subject: [PATCH] Full coverage on tokenization, including a small real file
 example.

---
 .../src/main/scala/ava/parser/Operator.scala  |  12 ++
 .../scala/ava/parser/TokenDelimiter.scala     |   4 +
 .../src/main/scala/ava/parser/Tokenizer.scala |  62 ++++--
 .../parser/src/test/resources/tokenizer-1.ava |   1 +
 .../parser/src/test/resources/tokenizer-2.ava |   8 +
 .../scala/ava/parser/TokenizerTests.scala     | 176 +++++++++++++++++-
 project/plugins.sbt                           |   2 +-
 7 files changed, 236 insertions(+), 29 deletions(-)
 create mode 100644 modules/parser/src/test/resources/tokenizer-1.ava
 create mode 100644 modules/parser/src/test/resources/tokenizer-2.ava

diff --git a/modules/parser/src/main/scala/ava/parser/Operator.scala b/modules/parser/src/main/scala/ava/parser/Operator.scala
index 402dfc5..23bfcfb 100644
--- a/modules/parser/src/main/scala/ava/parser/Operator.scala
+++ b/modules/parser/src/main/scala/ava/parser/Operator.scala
@@ -30,4 +30,16 @@ object Operator:
     Case
   )
 
+  /** List of unambiguous operators that can always be trivially parsed. Other
+    * operators are either specific tokens, or a combination of tokens.
+    */
+  val Trivial: List[Operator] = List(
+    Hole,
+    ImportSplat,
+    Union,
+    FnReturn,
+    BindDo,
+    Case
+  )
+
 end Operator
diff --git a/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala b/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala
index e670a75..9719395 100644
--- a/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala
+++ b/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala
@@ -39,6 +39,10 @@ object TokenDelimiter:
     Tuple
   )
 
+  def isDelimiter(str: String): Boolean =
+    if str.length() == 1 then isDelimiter(str.charAt(0))
+    else false
+
   def isDelimiter(ch: Char): Boolean = All.contains(ch)
 
   def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)
diff --git a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
index 53118cf..718212a 100644
--- a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
+++ b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
@@ -98,7 +98,8 @@ class Tokenizer(private val reader: CharacterReader):
           nextInternal(st)
       case State.PotentialComment(startPos) =>
         reader.consume() match
-          case None => Left(Error.PrematureEof(dumpStack()))
+          case None =>
+            Right(Token.Generic("-"))
           case Some(ch) =>
             ch match
               case '-' =>
@@ -152,8 +153,9 @@
             )
           case '\\' =>
             // Character escapes are supported within string literals.
-            states.push(state)
-            nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+            val st = State.InCharEscape(reader.currentSourcePosition())
+            states.push(st)
+            nextInternal(st)
           case '"' =>
             // This string literal is now closed. If anything failed inside
             // this literal, suppress those errors and return them as part
@@ -179,12 +181,13 @@
               Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
             )
           case '\\' =>
-            // Character escapse are supported within character literals.
-            states.push(state)
-            nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+            // Character escapes are supported within character literals.
+            val st = State.InCharEscape(reader.currentSourcePosition())
+            states.push(st)
+            nextInternal(st)
           case '\'' =>
            // This character literal is now closed.
-            createCharacterLiteral()
+            createCharacterLiteral(startPos)
          case _ =>
            // Continue accumulating characters.
            buffer.addOne(ch)
@@ -201,27 +204,35 @@
            // state and return to wherever (string or character literal)
            // this error originated.
            errors.addOne(error)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
          case Right(actual) =>
            // Add the resolved character to the buffer and return to the
            // parent state.
            val _ = buffer.addOne(actual)
-            nextInternal(states.pop())
+            val _ = states.pop()
+            nextInternal(states.top)
      case State.InGeneric(startPos) =>
-        reader.consume() match
+        // PEEK here! This allows us to react to the character in case it's
+        // something like a delimiter, allowing it to be naturally consumed
+        // later.
+        reader.peek() match
          case None =>
            // EOF is permitted for tokens - it forcibly terminates them.
            Right(Token.Generic(buffer.mkString))
          case Some(ch) =>
            if ch == '\n' then
              // New lines close the token AND reset our white space state.
+              val _ = reader.consume()
              whiteSpaceOnly = true
              Right(Token.Generic(buffer.mkString))
            else if TokenDelimiter.isDelimiter(ch) then
              // Any delimiter forcibly terminates a token.
+              // Do not consume the character so it gets picked up later.
              Right(Token.Generic(buffer.mkString))
            else
              // Non-delimiter characters are added to the token.
+              val _ = reader.consume()
              buffer.addOne(ch)
              nextInternal(state)
@@ -232,20 +243,27 @@
      case Some(escape) => Right(escape.output)

-  private def createCharacterLiteral(): Either[Error, Token] =
+  private def createCharacterLiteral(pos: SourcePosition)
+      : Either[Error, Token] =
    val dump = buffer.mkString
    if dump.length() > 1 then
      Left(
-        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
-      )
-    else
-      val out = if dump.isEmpty() then 0 else dump.charAt(0)
-      Right(
-        Token.CharacterLiteral(
-          value = out,
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos,
+          candidate = dump,
          errors = errors.toList
        )
      )
+    else if dump.length() < 1 then
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos,
+          errors = errors.toList
+        )
+      )
+    else
+      val out = dump.charAt(0)
+      Right(Token.CharacterLiteral(value = out, errors = errors.toList))

 object Tokenizer:
@@ -316,7 +334,13 @@ object Tokenizer:
      */
    case class MultipleCharactersInLiteral(
        sourcePosition: SourcePosition,
-        candidate: String
+        candidate: String,
+        errors: List[Error]
+    ) extends Error
+
+    case class NoCharactersInLiteral(
+        sourcePosition: SourcePosition,
+        errors: List[Error]
    ) extends Error

    case class BackSlashNotAllowed(
diff --git a/modules/parser/src/test/resources/tokenizer-1.ava b/modules/parser/src/test/resources/tokenizer-1.ava
new file mode 100644
index 0000000..2b91e84
--- /dev/null
+++ b/modules/parser/src/test/resources/tokenizer-1.ava
@@ -0,0 +1 @@
+namespace unit.test
diff --git a/modules/parser/src/test/resources/tokenizer-2.ava b/modules/parser/src/test/resources/tokenizer-2.ava
new file mode 100644
index 0000000..6026581
--- /dev/null
+++ b/modules/parser/src/test/resources/tokenizer-2.ava
@@ -0,0 +1,8 @@
+namespace unit.test
+
+import one.two.three.*
+import x.y
+
+fn foo: Int -> Int
+  λ value => value
+end fn
diff --git a/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
index dc4f5e2..5be3d66 100644
--- a/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
+++ b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
@@ -5,8 +5,13 @@ import cats.effect.Resource
 import cats.effect.unsafe.IORuntime
 import java.io.ByteArrayInputStream
 import java.io.InputStream
+import scala.io.Source
 
 class TokenizerTests extends munit.FunSuite:
+
+  import Tokenizer.Error
+  import Tokenizer.State
+
   implicit val runtime: IORuntime = cats.effect.unsafe.IORuntime.global
 
   private val EmptyStream: InputStream = new ByteArrayInputStream(Array[Byte]())
@@ -30,7 +35,7 @@ class TokenizerTests extends munit.FunSuite:
    ) {
      assertTokens(
        " \\ ",
-        Left(Tokenizer.Error.BackSlashNotAllowed(pos(2, 1, 2)))
+        Left(Error.BackSlashNotAllowed(pos(2, 1, 2)))
      )
    }

@@ -66,6 +71,10 @@
    assertTokens("-\n", Right(Token.Generic("-")))
  }

+  test("should capture a generic - if a potential comment hits EOF") {
+    assertTokens("-", Right(Token.Generic("-")))
+  }
+
  test("should handle basic comments") {
    assertTokens("-- Comment", Right(Token.Comment("Comment")))
  }
@@ -91,8 +100,8 @@
    assertTokens(
      "\"",
      Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InDoubleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InDoubleQuote(pos(1, 1, 1)))
        )
      )
    )
@@ -102,8 +111,33 @@
    assertTokens(
      "\'",
      Left(
-        Tokenizer.Error.PrematureEof(
-          List(Tokenizer.State.InSingleQuote(pos(1, 1, 1)))
+        Error.PrematureEof(
+          List(State.InSingleQuote(pos(1, 1, 1)))
+        )
+      )
+    )
+  }
+
+  test("should throw an error if EOF is reached inside a character ESCAPE") {
+    assertTokens(
+      "\'\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InSingleQuote(pos(1, 1, 1))
+          )
+        )
+      )
+    )
+    assertTokens(
+      "\"\\",
+      Left(
+        Error.PrematureEof(
+          List(
+            State.InCharEscape(pos(2, 1, 2)),
+            State.InDoubleQuote(pos(1, 1, 1))
+          )
        )
      )
    )
@@ -140,15 +174,137 @@
      Right(
        Token.StringLiteral(
          "foobar",
-          List(Tokenizer.Error.InvalidCharEscape(pos(6, 1, 6), '!'))
+          List(Error.InvalidCharEscape(pos(6, 1, 6), '!'))
        )
      )
    )
  }

+  test("should reject a string literal with a newline") {
+    assertTokens(
+      "\"\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InDoubleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with a newline") {
+    assertTokens(
+      "\'\n",
+      Left(
+        Error.UnexpectedNewLine(
+          sourcePosition = pos(2, 2, 0),
+          currentState = State.InSingleQuote(pos(1, 1, 1))
+        )
+      )
+    )
+  }
+
+  test("should accept an empty string literal") {
+    assertTokens("\"\"", Right(Token.StringLiteral("", Nil)))
+  }
+
+  test("should reject an empty character literal") {
+    assertTokens(
+      "\'\'",
+      Left(Error.NoCharactersInLiteral(pos(1, 1, 1), Nil))
+    )
+  }
+
+  test("should reject an empty character literal with a failed escape") {
+    assertTokens(
+      "\'\\!\'",
+      Left(
+        Error.NoCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          List(Error.InvalidCharEscape(pos(3, 1, 3), '!'))
+        )
+      )
+    )
+  }
+
+  test("should reject a character literal with multiple characters") {
+    assertTokens(
+      "\'abc\'",
+      Left(
+        Error.MultipleCharactersInLiteral(
+          sourcePosition = pos(1, 1, 1),
+          candidate = "abc",
+          errors = Nil
+        )
+      )
+    )
+  }
+
+  test("should consume a generic token") {
+    assertTokens("abcdefg", Right(Token.Generic("abcdefg")))
+  }
+
+  test("should consume a generic token for every possible keyword") {
+    Keyword.All.foreach { keyword =>
+      assertTokens(keyword.value, Right(Token.Generic(keyword.value)))
+    }
+  }
+
+  test("should consume a generic token for every trivial operator") {
+    Operator.Trivial.foreach { op =>
+      assertTokens(op.value, Right(Token.Generic(op.value)))
+    }
+  }
+
+  test("should tokenize a valid file (case 1)") {
+    val source = loadFileToString("tokenizer-1.ava")
+    assertTokens(
+      source,
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test"))
+    )
+  }
+
+  test("should tokenize a valid file (case 2)") {
+    assertTokens(
+      loadFileToString("tokenizer-2.ava"),
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("one")),
+      Right(Token.Dot),
+      Right(Token.Generic("two")),
+      Right(Token.Dot),
+      Right(Token.Generic("three")),
+      Right(Token.Dot),
+      Right(Token.Generic("*")),
+      Right(Token.Generic("import")),
+      Right(Token.Generic("x")),
+      Right(Token.Dot),
+      Right(Token.Generic("y")),
+      Right(Token.Generic("fn")),
+      Right(Token.Generic("foo")),
+      Right(Token.Colon),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("Int")),
+      Right(Token.Generic("λ")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("=>")),
+      Right(Token.Generic("value")),
+      Right(Token.Generic("end")),
+      Right(Token.Generic("fn"))
+    )
+  }
+
  private def assertTokens(
      source: String,
-      expectedOutput: Either[Tokenizer.Error, Token]*
+      expectedOutput: Either[Error, Token]*
  ): Unit =
    run(newCR(stringStream(source))) { tokenizer =>
      consumeAll(tokenizer).map { tokens =>
@@ -156,8 +312,7 @@
      }
    }

-  private def consumeAll(tokenizer: Tokenizer)
-      : IO[List[Either[Tokenizer.Error, Token]]] =
+  private def consumeAll(tokenizer: Tokenizer): IO[List[Either[Error, Token]]] =
    fs2.Stream
      .repeatEval(IO(tokenizer.next()))
      .takeWhile(_ != Right(Token.Eof))
@@ -184,3 +339,6 @@
    )(release = t => IO(t.close()))
      .use(testF)
      .unsafeRunSync()
+
+  private def loadFileToString(name: String): String =
+    Source.fromResource(name).mkString
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 96d399c..7c3ba3e 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -5,6 +5,6 @@ externalResolvers := Seq(
   "Garrity Software Releases" at "https://maven.garrity.co/gs"
 )
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.10")
 addSbtPlugin("gs" % "sbt-garrity-software" % "0.2.0")
 addSbtPlugin("gs" % "sbt-gs-semver" % "0.2.0")
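
Note for reviewers: the central behavioral change in Tokenizer.scala is that
State.InGeneric now peeks instead of consuming, so a delimiter terminates the
current generic token but stays in the input and becomes its own token on the
next call to next(). Below is a minimal, standalone Scala 3 sketch of that
peek-versus-consume discipline; MiniReader, Delimiters, and nextToken are
hypothetical stand-ins for illustration only, not the project's
CharacterReader or Tokenizer API.

    object PeekVsConsume:

      // Minimal stand-in for a character reader: peek() inspects the next
      // character without advancing, consume() advances past it.
      final class MiniReader(input: String):
        private var index = 0
        def peek(): Option[Char] =
          if index < input.length then Some(input.charAt(index)) else None
        def consume(): Option[Char] =
          val ch = peek()
          if ch.isDefined then index += 1
          ch

      private val Delimiters: Set[Char] = Set('.', ':', ',', ' ', '\n')

      // Next token: skip whitespace, emit a delimiter as its own token, and
      // otherwise accumulate characters until a delimiter is peeked (never
      // consumed) or EOF is reached.
      def nextToken(reader: MiniReader): Option[String] =
        reader.peek() match
          case None => None
          case Some(ch) if ch == ' ' || ch == '\n' =>
            val _ = reader.consume()
            nextToken(reader)
          case Some(ch) if Delimiters(ch) =>
            val _ = reader.consume()
            Some(ch.toString)
          case Some(_) =>
            val buffer = StringBuilder()
            while reader.peek().exists(c => !Delimiters(c)) do
              buffer.addOne(reader.consume().get)
            Some(buffer.result())

      def main(args: Array[String]): Unit =
        val reader = MiniReader("namespace unit.test")
        Iterator
          .continually(nextToken(reader))
          .takeWhile(_.isDefined)
          .flatten
          .foreach(println)
        // Prints: namespace, unit, ., test -- the '.' is emitted as its own
        // token because the generic scan peeked at it but never consumed it.
    end PeekVsConsume

This is why the new test for tokenizer-1.ava expects Token.Dot between "unit"
and "test": the '.' is left unconsumed by the generic scan and re-read as a
delimiter token on the following call.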
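
A second, subtler change is the escape-state stack discipline. The old code
pushed the parent state and returned to states.pop(); the new code pushes the
state being entered (InCharEscape) and, when the escape resolves, pops it and
resumes at states.top. This keeps the current state on the stack, which is
what lets the new PrematureEof tests report both InCharEscape and the
enclosing literal state when EOF hits mid-escape. A tiny sketch of the
corrected discipline, with plain strings standing in for the real State
values:

    import scala.collection.mutable

    val states = mutable.Stack[String]("InDoubleQuote") // enclosing literal

    // Entering the escape: push the state being entered, so an error dump
    // taken right now would show both states.
    states.push("InCharEscape")

    // Leaving the escape: discard it and resume whatever is underneath.
    val _ = states.pop()                  // drops "InCharEscape"
    assert(states.top == "InDoubleQuote") // parent literal resumes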
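
Finally, createCharacterLiteral previously mapped an empty literal to the
character value 0 (val out = if dump.isEmpty() then 0 else dump.charAt(0));
it now rejects '' with the new Error.NoCharactersInLiteral and threads the
accumulated escape errors through both error cases. A toy version of the
resulting three-way decision table, where CharLitResult and validate are
illustrative names only (the real method returns
Either[Tokenizer.Error, Token]):

    enum CharLitResult:
      case NoChars                 // ''    -> Error.NoCharactersInLiteral
      case TooMany(found: String)  // 'abc' -> Error.MultipleCharactersInLiteral
      case Ok(value: Char)         // 'a'   -> Token.CharacterLiteral

    def validate(buffered: String): CharLitResult =
      if buffered.length > 1 then CharLitResult.TooMany(buffered)
      else if buffered.isEmpty then CharLitResult.NoChars
      else CharLitResult.Ok(buffered.charAt(0))

    // validate("")   == CharLitResult.NoChars
    // validate("ab") == CharLitResult.TooMany("ab")
    // validate("a")  == CharLitResult.Ok('a')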