Finished untested version of tokenizer.

This commit is contained in:
Pat Garrity 2024-02-21 19:59:59 -06:00
parent 7beaef3b0d
commit d265e1b617
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
2 changed files with 121 additions and 4 deletions

View file

@@ -0,0 +1,46 @@
package ava.parser
/** Character constants that act as token delimiters for the tokenizer.
  *
  * A delimiter forcibly terminates the token currently being read. The
  * whitespace delimiters are additionally skipped between tokens, while the
  * punctuation delimiters (parens, comma, colon, quotes, dot, backslash,
  * tuple marker) each produce a token of their own.
  */
object TokenDelimiter:
  val Space: Char = ' '
  val NewLine: Char = '\n'
  val CarriageReturn: Char = '\r'
  val Tab: Char = '\t'
  val OpenParen: Char = '('
  val CloseParen: Char = ')'
  val Comma: Char = ','
  val Colon: Char = ':'
  val DoubleQuote: Char = '"'
  val SingleQuote: Char = '\''
  val Dot: Char = '.'
  val BackSlash: Char = '\\'
  val Tuple: Char = '#'

  /** All whitespace delimiters, in declaration order. */
  val WhiteSpace: List[Char] = List(
    Space,
    NewLine,
    CarriageReturn,
    Tab
  )

  /** Every delimiter (whitespace plus punctuation), in declaration order. */
  val All: List[Char] = List(
    Space,
    NewLine,
    CarriageReturn,
    Tab,
    OpenParen,
    CloseParen,
    Comma,
    Colon,
    DoubleQuote,
    SingleQuote,
    Dot,
    BackSlash,
    Tuple
  )

  // Set-backed views of the lists above: the predicates below run once per
  // input character, so membership should be O(1) rather than a linear scan
  // of a List. The public List vals are kept unchanged for callers that
  // depend on ordering.
  private val WhiteSpaceSet: Set[Char] = WhiteSpace.toSet
  private val AllSet: Set[Char] = All.toSet

  /** @return true if `ch` is any delimiter (whitespace or punctuation). */
  def isDelimiter(ch: Char): Boolean = AllSet.contains(ch)

  /** @return true if `ch` is a whitespace delimiter. */
  def isWhiteSpace(ch: Char): Boolean = WhiteSpaceSet.contains(ch)
end TokenDelimiter

View file

@@ -1,5 +1,6 @@
package ava.parser
import ava.parser.TokenDelimiter.isWhiteSpace
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Stack
@@ -33,11 +34,66 @@ class Tokenizer(private val reader: CharacterReader):
private def nextInternal(state: State): Either[Error, Token] =
state match
case State.Initial =>
Left(Error.NotImplemented)
reader.consume() match
case None => Right(Token.Eof)
case Some(ch) =>
ch match
case _ if isWhiteSpace(ch) => nextInternal(state)
case TokenDelimiter.OpenParen => Right(Token.OpenParen)
case TokenDelimiter.CloseParen => Right(Token.CloseParen)
case TokenDelimiter.Comma => Right(Token.Comma)
case TokenDelimiter.Colon => Right(Token.Colon)
case TokenDelimiter.Dot => Right(Token.Dot)
case TokenDelimiter.Tuple => Right(Token.Tuple)
case TokenDelimiter.BackSlash =>
Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
case TokenDelimiter.DoubleQuote =>
nextInternal(
State.InDoubleQuote(reader.currentSourcePosition())
)
case TokenDelimiter.SingleQuote =>
nextInternal(
State.InSingleQuote(reader.currentSourcePosition())
)
case '-' =>
nextInternal(
State.PotentialComment(reader.currentSourcePosition())
)
case _ =>
nextInternal(State.InGeneric(reader.currentSourcePosition()))
case State.PotentialComment(startPos) =>
Left(Error.NotImplemented)
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
ch match
case '-' =>
// Confirmed that this is a comment. Switch to the comment state
// so that we can read the remainder of the line.
states.push(state)
nextInternal(State.InComment(reader.currentSourcePosition()))
case ' ' | '\t' | '\n' =>
// This token is only a `-` character and has been delimited by
// whitespace on either side. Return the token.
Right(Token.Generic("-"))
case _ =>
// This is some generic token that starts with `-`. Switch
// states now that we're sure about what we're reading.
buffer.addOne(ch)
nextInternal(State.InGeneric(startPos))
case State.InComment(startPos) =>
Left(Error.NotImplemented)
reader.consume() match
case None =>
// Reaching EOF during a comment is perfectly fine.
Right(Token.Comment(buffer.mkString))
case Some(ch) =>
ch match
case '\n' =>
// Newlines terminate a comment.
Right(Token.Comment(buffer.mkString))
case _ =>
// Any non-newline character is considered part of a comment.
buffer.addOne(ch)
nextInternal(state)
case State.InDoubleQuote(startPos) =>
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
@@ -106,7 +162,18 @@ class Tokenizer(private val reader: CharacterReader):
val _ = buffer.addOne(actual)
nextInternal(states.pop())
case State.InGeneric(startPos) =>
Left(Error.NotImplemented)
reader.consume() match
case None =>
// EOF is permitted for tokens - it forcibly terminates them.
Right(Token.Generic(buffer.mkString))
case Some(ch) =>
if TokenDelimiter.isDelimiter(ch) then
// Any delimiter forcibly terminates a token.
Right(Token.Generic(buffer.mkString))
else
// Non-delimiter characters are added to the token.
buffer.addOne(ch)
nextInternal(state)
private def resolveCharEscape(ch: Char): Either[Error, Char] =
CharEscape.resolve(ch) match
@@ -202,6 +269,10 @@ object Tokenizer:
candidate: String
) extends Error
case class BackSlashNotAllowed(
sourcePosition: SourcePosition
) extends Error
end Error
end Tokenizer