diff --git a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
index 718212a..00c3278 100644
--- a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
+++ b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
@@ -5,17 +5,38 @@
 import scala.annotation.tailrec
 import scala.collection.mutable.ListBuffer
 import scala.collection.mutable.Stack
 
+/** Transforms a stream of characters into a stream of tokens.
+  *
+  * @param reader
+  *   The [[CharacterReader]].
+  */
 class Tokenizer(private val reader: CharacterReader):
   import Tokenizer.*
 
-  private val buffer: ListBuffer[Char] = ListBuffer.empty
-  private val states: Stack[State] = Stack.empty
-  private val errors: ListBuffer[Error] = ListBuffer.empty
-  private var whiteSpaceOnly: Boolean = true
+  // Accumulates characters for tokens.
+  private val buffer: ListBuffer[Char] = ListBuffer.empty
+
+  // Stack of states. This is refreshed on each call for a new token.
+  private val states: Stack[State] = Stack.empty
+
+  // List of errors within the scope of the current token.
+  private val errors: ListBuffer[Error] = ListBuffer.empty
+
+  // Tracks whether ONLY white space has been seen since the last newline. Used
+  // to determine if comments are allowed. Comments are only allowed as the
+  // first non-whitespace character on a line.
+  private var whiteSpaceOnly: Boolean = true
+
+  /** Dump the current [[State]] stack.
+    *
+    * @return
+    *   List of [[State]], from top to bottom.
+    */
   private def dumpStack(): List[State] = states.toList
 
   /** Consume the next available token.
+    *
+    * This function is **not** thread safe.
     *
     * @return
     *   The next available token, or an error if resolving a token fails.
@@ -267,6 +288,8 @@ class Tokenizer(private val reader: CharacterReader):
 
 object Tokenizer:
 
+  /** Enumeration which defines the [[Tokenizer]]'s internal state.
+    */
   sealed trait State
 
   object State:
@@ -276,17 +299,61 @@ object Tokenizer:
       */
    case object Initial extends State
 
+    /** Used if the `-` character is seen as the first character on some line.
+      * In this case, the token may or may not be a comment.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the `-` character.
+      */
     case class PotentialComment(start: SourcePosition) extends State
-    case class InComment(start: SourcePosition) extends State
-    case class InDoubleQuote(start: SourcePosition) extends State
-    case class InSingleQuote(start: SourcePosition) extends State
-    case class InCharEscape(start: SourcePosition) extends State
-    case class InGeneric(start: SourcePosition) extends State
+
+    /** State for being within a comment. This state ends when a newline is hit.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the beginning of the comment.
+      */
+    case class InComment(start: SourcePosition) extends State
+
+    /** State for being within double quotes -- a String Literal. This state
+      * ends when a closing double quote is hit. Newlines are not allowed within
+      * this state.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the beginning of the string.
+      */
+    case class InDoubleQuote(start: SourcePosition) extends State
+
+    /** State for being within single quotes -- a Char Literal. This state ends
+      * when a closing single quote is hit. Newlines are not allowed within this
+      * state.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the beginning of the character.
+      */
+    case class InSingleQuote(start: SourcePosition) extends State
+
+    /** State indicating that a character escape within a string or character
+      * literal was initiated.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the character escape.
+      */
+    case class InCharEscape(start: SourcePosition) extends State
+
+    /** State that indicates some generic token is being parsed. These might be
+      * keywords, operators, or names.
+      *
+      * @param start
+      *   The [[SourcePosition]] of the start of the token.
+      */
+    case class InGeneric(start: SourcePosition) extends State
 
     given CanEqual[State, State] = CanEqual.derived
 
   end State
 
+  /** Enumeration which defines all possible [[Tokenizer]] errors.
+    */
   sealed trait Error
 
   object Error:
@@ -294,8 +361,6 @@ object Tokenizer:
     sealed trait Positional extends Error:
       def sourcePosition: SourcePosition
 
-    case object NotImplemented extends Error
-
     /** This error occurs when the end of file is reached if the tokenizer is
       * still expecting more characters.
       *
diff --git a/modules/parser/src/test/resources/tokenizer-3.ava b/modules/parser/src/test/resources/tokenizer-3.ava
new file mode 100644
index 0000000..6112983
--- /dev/null
+++ b/modules/parser/src/test/resources/tokenizer-3.ava
@@ -0,0 +1,9 @@
+namespace unit.test
+
+--- Type class for type constructors which can be mapped over.
+given F *
+class Functor
+  --- Transform some wrapped data from one type to another.
+  given A, B
+  defn map: F A -> (A -> B) -> F B
+end class
diff --git a/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
index 5be3d66..b2c4f18 100644
--- a/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
+++ b/modules/parser/src/test/scala/ava/parser/TokenizerTests.scala
@@ -302,6 +302,51 @@ class TokenizerTests extends munit.FunSuite:
     )
   }
 
+  test("should tokenize a valid file (case 3)") {
+    val source = loadFileToString("tokenizer-3.ava")
+    println(source)
+    assertTokens(
+      source,
+      Right(Token.Generic("namespace")),
+      Right(Token.Generic("unit")),
+      Right(Token.Dot),
+      Right(Token.Generic("test")),
+      Right(
+        Token.Comment(
+          "- Type class for type constructors which can be mapped over."
+        )
+      ),
+      Right(Token.Generic("given")),
+      Right(Token.Generic("F")),
+      Right(Token.Generic("*")),
+      Right(Token.Generic("class")),
+      Right(Token.Generic("Functor")),
+      Right(
+        Token.Comment("- Transform some wrapped data from one type to another.")
+      ),
+      Right(Token.Generic("given")),
+      Right(Token.Generic("A")),
+      Right(Token.Comma),
+      Right(Token.Generic("B")),
+      Right(Token.Generic("defn")),
+      Right(Token.Generic("map")),
+      Right(Token.Colon),
+      Right(Token.Generic("F")),
+      Right(Token.Generic("A")),
+      Right(Token.Generic("->")),
+      Right(Token.OpenParen),
+      Right(Token.Generic("A")),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("B")),
+      Right(Token.CloseParen),
+      Right(Token.Generic("->")),
+      Right(Token.Generic("F")),
+      Right(Token.Generic("B")),
+      Right(Token.Generic("end")),
+      Right(Token.Generic("class"))
+    )
+  }
+
   private def assertTokens(
       source: String,
       expectedOutput: Either[Error, Token]*