From d265e1b617c90ba735052259a37e5dda395b5dea Mon Sep 17 00:00:00 2001
From: Pat Garrity
Date: Wed, 21 Feb 2024 19:59:59 -0600
Subject: [PATCH] Finished untested version of tokenizer.

---
 .../scala/ava/parser/TokenDelimiter.scala     | 46 +++++++++++
 .../src/main/scala/ava/parser/Tokenizer.scala | 81 ++++++++++++++++++-
 2 files changed, 123 insertions(+), 4 deletions(-)
 create mode 100644 modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala

diff --git a/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala b/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala
new file mode 100644
index 0000000..e670a75
--- /dev/null
+++ b/modules/parser/src/main/scala/ava/parser/TokenDelimiter.scala
@@ -0,0 +1,46 @@
+package ava.parser
+
+object TokenDelimiter:
+
+  val Space: Char = ' '
+  val NewLine: Char = '\n'
+  val CarriageReturn: Char = '\r'
+  val Tab: Char = '\t'
+  val OpenParen: Char = '('
+  val CloseParen: Char = ')'
+  val Comma: Char = ','
+  val Colon: Char = ':'
+  val DoubleQuote: Char = '"'
+  val SingleQuote: Char = '\''
+  val Dot: Char = '.'
+  val BackSlash: Char = '\\'
+  val Tuple: Char = '#'
+
+  val WhiteSpace: List[Char] = List(
+    Space,
+    NewLine,
+    CarriageReturn,
+    Tab
+  )
+
+  val All: List[Char] = List(
+    Space,
+    NewLine,
+    CarriageReturn,
+    Tab,
+    OpenParen,
+    CloseParen,
+    Comma,
+    Colon,
+    DoubleQuote,
+    SingleQuote,
+    Dot,
+    BackSlash,
+    Tuple
+  )
+
+  def isDelimiter(ch: Char): Boolean = All.contains(ch)
+
+  def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)
+
+end TokenDelimiter
diff --git a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
index 92287e7..0e58efa 100644
--- a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
+++ b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
@@ -1,5 +1,6 @@
 package ava.parser
 
+import ava.parser.TokenDelimiter.isWhiteSpace
 import scala.annotation.tailrec
 import scala.collection.mutable.ListBuffer
 import scala.collection.mutable.Stack
@@ -33,11 +34,68 @@ class Tokenizer(private val reader: CharacterReader):
   private def nextInternal(state: State): Either[Error, Token] = state match
     case State.Initial =>
-      Left(Error.NotImplemented)
+      reader.consume() match
+        case None => Right(Token.Eof)
+        case Some(ch) =>
+          ch match
+            case _ if isWhiteSpace(ch) => nextInternal(state)
+            case TokenDelimiter.OpenParen => Right(Token.OpenParen)
+            case TokenDelimiter.CloseParen => Right(Token.CloseParen)
+            case TokenDelimiter.Comma => Right(Token.Comma)
+            case TokenDelimiter.Colon => Right(Token.Colon)
+            case TokenDelimiter.Dot => Right(Token.Dot)
+            case TokenDelimiter.Tuple => Right(Token.Tuple)
+            case TokenDelimiter.BackSlash =>
+              Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
+            case TokenDelimiter.DoubleQuote =>
+              nextInternal(
+                State.InDoubleQuote(reader.currentSourcePosition())
+              )
+            case TokenDelimiter.SingleQuote =>
+              nextInternal(
+                State.InSingleQuote(reader.currentSourcePosition())
+              )
+            case '-' =>
+              nextInternal(
+                State.PotentialComment(reader.currentSourcePosition())
+              )
+            case _ =>
+              // Start of a generic token. Buffer this character so it is kept.
+              buffer.addOne(ch)
+              nextInternal(State.InGeneric(reader.currentSourcePosition()))
     case State.PotentialComment(startPos) =>
-      Left(Error.NotImplemented)
+      reader.consume() match
+        case None => Left(Error.PrematureEof(dumpStack()))
+        case Some(ch) =>
+          ch match
+            case '-' =>
+              // Confirmed that this is a comment. Switch to the comment state
+              // so that we can read the remainder of the line.
+              states.push(state)
+              nextInternal(State.InComment(reader.currentSourcePosition()))
+            case _ if isWhiteSpace(ch) =>
+              // This token is only a `-` character and has been delimited by
+              // whitespace on either side. Return the token.
+              Right(Token.Generic("-"))
+            case _ =>
+              // This is some generic token that starts with `-`. Switch
+              // states now that we're sure about what we're reading.
+              buffer.addOne('-').addOne(ch)
+              nextInternal(State.InGeneric(startPos))
     case State.InComment(startPos) =>
-      Left(Error.NotImplemented)
+      reader.consume() match
+        case None =>
+          // Reaching EOF during a comment is perfectly fine.
+          Right(Token.Comment(buffer.mkString))
+        case Some(ch) =>
+          ch match
+            case '\n' =>
+              // Newlines terminate a comment.
+              Right(Token.Comment(buffer.mkString))
+            case _ =>
+              // Any non-newline character is considered part of a comment.
+              buffer.addOne(ch)
+              nextInternal(state)
     case State.InDoubleQuote(startPos) =>
       reader.consume() match
         case None => Left(Error.PrematureEof(dumpStack()))
         case Some(ch) =>
@@ -106,7 +164,18 @@ class Tokenizer(private val reader: CharacterReader):
           val _ = buffer.addOne(actual)
           nextInternal(states.pop())
     case State.InGeneric(startPos) =>
-      Left(Error.NotImplemented)
+      reader.consume() match
+        case None =>
+          // EOF is permitted for tokens - it forcibly terminates them.
+          Right(Token.Generic(buffer.mkString))
+        case Some(ch) =>
+          if TokenDelimiter.isDelimiter(ch) then
+            // Any delimiter forcibly terminates a token.
+            Right(Token.Generic(buffer.mkString))
+          else
+            // Non-delimiter characters are added to the token.
+            buffer.addOne(ch)
+            nextInternal(state)
 
   private def resolveCharEscape(ch: Char): Either[Error, Char] =
     CharEscape.resolve(ch) match
@@ -202,6 +271,10 @@ object Tokenizer:
       candidate: String
     ) extends Error
 
+    case class BackSlashNotAllowed(
+      sourcePosition: SourcePosition
+    ) extends Error
+
   end Error
 
 end Tokenizer
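
As a rough illustration of how this state machine is meant to be driven, here is a minimal usage sketch. It is not part of the patch: the public `next()` wrapper around `nextInternal`, the `CharacterReader.fromString` constructor, and the exact locations of `Token` and `Error` (shown here under the `Tokenizer` companion) do not appear in this diff and should be read as placeholders.

  // Usage sketch only: `CharacterReader.fromString` and `Tokenizer#next` are
  // assumed to exist; this patch only shows the private `nextInternal`.
  import ava.parser.{CharacterReader, Tokenizer}
  import ava.parser.Tokenizer.Token
  import scala.annotation.tailrec

  def tokenizeAll(source: String): Either[Tokenizer.Error, List[Token]] =
    val tokenizer = new Tokenizer(CharacterReader.fromString(source))

    // Pull tokens in input order until Eof or the first error.
    @tailrec
    def loop(acc: List[Token]): Either[Tokenizer.Error, List[Token]] =
      tokenizer.next() match
        case Left(error)      => Left(error)
        case Right(Token.Eof) => Right((Token.Eof :: acc).reverse)
        case Right(token)     => loop(token :: acc)

    loop(Nil)

Once those pieces exist, a test can feed small inputs such as a lone `-`, a `-- comment` line, and a quoted string through `tokenizeAll` and assert on the resulting token sequence.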