Finished untested version of tokenizer.
parent 7beaef3b0d
commit d265e1b617

2 changed files with 121 additions and 4 deletions
@@ -0,0 +1,46 @@
package ava.parser

object TokenDelimiter:

  val Space: Char = ' '
  val NewLine: Char = '\n'
  val CarriageReturn: Char = '\r'
  val Tab: Char = '\t'
  val OpenParen: Char = '('
  val CloseParen: Char = ')'
  val Comma: Char = ','
  val Colon: Char = ':'
  val DoubleQuote: Char = '"'
  val SingleQuote: Char = '\''
  val Dot: Char = '.'
  val BackSlash: Char = '\\'
  val Tuple: Char = '#'

  val WhiteSpace: List[Char] = List(
    Space,
    NewLine,
    CarriageReturn,
    Tab
  )

  val All: List[Char] = List(
    Space,
    NewLine,
    CarriageReturn,
    Tab,
    OpenParen,
    CloseParen,
    Comma,
    Colon,
    DoubleQuote,
    SingleQuote,
    Dot,
    BackSlash,
    Tuple
  )

  def isDelimiter(ch: Char): Boolean = All.contains(ch)

  def isWhiteSpace(ch: Char): Boolean = WhiteSpace.contains(ch)

end TokenDelimiter
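As a quick aside (not part of the commit): the snippet below is a hedged sketch of how these helpers might be exercised. It uses only the TokenDelimiter members defined above; the demo entry point itself is hypothetical.

import ava.parser.TokenDelimiter

// Hypothetical demo: classify the characters of a sample input
// using only the helpers defined above.
@main def tokenDelimiterDemo(): Unit =
  val sample = "add(x, y): Int"
  // Split the sample into delimiter and non-delimiter characters.
  val (delims, rest) = sample.partition(TokenDelimiter.isDelimiter)
  println(s"delimiters:     ${delims.toList}")
  println(s"non-delimiters: $rest")
  // Count whitespace characters with the narrower helper.
  println(s"whitespace:     ${sample.count(TokenDelimiter.isWhiteSpace)}")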
@@ -1,5 +1,6 @@
package ava.parser

import ava.parser.TokenDelimiter.isWhiteSpace
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Stack
@@ -33,11 +34,66 @@ class Tokenizer(private val reader: CharacterReader):
  private def nextInternal(state: State): Either[Error, Token] =
    state match
      case State.Initial =>
-       Left(Error.NotImplemented)
        reader.consume() match
          case None => Right(Token.Eof)
          case Some(ch) =>
            ch match
              case _ if isWhiteSpace(ch) => nextInternal(state)
              case TokenDelimiter.OpenParen => Right(Token.OpenParen)
              case TokenDelimiter.CloseParen => Right(Token.CloseParen)
              case TokenDelimiter.Comma => Right(Token.Comma)
              case TokenDelimiter.Colon => Right(Token.Colon)
              case TokenDelimiter.Dot => Right(Token.Dot)
              case TokenDelimiter.Tuple => Right(Token.Tuple)
              case TokenDelimiter.BackSlash =>
                Left(Error.BackSlashNotAllowed(reader.currentSourcePosition()))
              case TokenDelimiter.DoubleQuote =>
                nextInternal(
                  State.InDoubleQuote(reader.currentSourcePosition())
                )
              case TokenDelimiter.SingleQuote =>
                nextInternal(
                  State.InSingleQuote(reader.currentSourcePosition())
                )
              case '-' =>
                nextInternal(
                  State.PotentialComment(reader.currentSourcePosition())
                )
              case _ =>
                nextInternal(State.InGeneric(reader.currentSourcePosition()))
      case State.PotentialComment(startPos) =>
-       Left(Error.NotImplemented)
        reader.consume() match
          case None => Left(Error.PrematureEof(dumpStack()))
          case Some(ch) =>
            ch match
              case '-' =>
                // Confirmed that this is a comment. Switch to the comment state
                // so that we can read the remainder of the line.
                states.push(state)
                nextInternal(State.InComment(reader.currentSourcePosition()))
              case ' ' | '\t' | '\n' =>
                // This token is only a `-` character and has been delimited by
                // whitespace on either side. Return the token.
                Right(Token.Generic("-"))
              case _ =>
                // This is some generic token that starts with `-`. Switch
                // states now that we're sure about what we're reading.
                buffer.addOne(ch)
                nextInternal(State.InGeneric(startPos))
      case State.InComment(startPos) =>
-       Left(Error.NotImplemented)
        reader.consume() match
          case None =>
            // Reaching EOF during a comment is perfectly fine.
            Right(Token.Comment(buffer.mkString))
          case Some(ch) =>
            ch match
              case '\n' =>
                // Newlines terminate a comment.
                Right(Token.Comment(buffer.mkString))
              case _ =>
                // Any non-newline character is considered part of a comment.
                buffer.addOne(ch)
                nextInternal(state)
      case State.InDoubleQuote(startPos) =>
        reader.consume() match
          case None => Left(Error.PrematureEof(dumpStack()))
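The dispatch above is a recursive state machine: each case either produces a token or an error, or re-enters nextInternal in a more specific state. The standalone sketch below (not the project's code; ToyToken and nextToken are illustrative names, and it walks a String index instead of a CharacterReader) shows the same shape for just the whitespace, `--` comment, and generic-token branches.

import scala.annotation.tailrec

// Standalone toy analogue of the state dispatch above.
enum ToyToken:
  case Comment(text: String)
  case Generic(text: String)
  case Eof

def nextToken(input: String, start: Int): (ToyToken, Int) =
  @tailrec
  def loop(i: Int): (ToyToken, Int) =
    if i >= input.length then (ToyToken.Eof, i)
    else if input(i).isWhitespace then loop(i + 1) // skip leading whitespace
    else if input(i) == '-' && i + 1 < input.length && input(i + 1) == '-' then
      // A confirmed `--` comment runs to the end of the current line.
      val end = input.indexOf('\n', i) match
        case -1 => input.length
        case n  => n
      (ToyToken.Comment(input.substring(i + 2, end)), end)
    else
      // Anything else is a generic token, terminated by whitespace.
      val end = input.indexWhere(_.isWhitespace, i) match
        case -1 => input.length
        case n  => n
      (ToyToken.Generic(input.substring(i, end)), end)
  loop(start)

For example, nextToken("-- hi\nfoo", 0) yields Comment(" hi"), while nextToken("- x", 0) yields Generic("-"), mirroring the PotentialComment handling above.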
@@ -106,7 +162,18 @@ class Tokenizer(private val reader: CharacterReader):
            val _ = buffer.addOne(actual)
            nextInternal(states.pop())
      case State.InGeneric(startPos) =>
-       Left(Error.NotImplemented)
        reader.consume() match
          case None =>
            // EOF is permitted for tokens - it forcibly terminates them.
            Right(Token.Generic(buffer.mkString))
          case Some(ch) =>
            if TokenDelimiter.isDelimiter(ch) then
              // Any delimiter forcibly terminates a token.
              Right(Token.Generic(buffer.mkString))
            else
              // Non-delimiter characters are added to the token.
              buffer.addOne(ch)
              nextInternal(state)

  private def resolveCharEscape(ch: Char): Either[Error, Char] =
    CharEscape.resolve(ch) match
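As an aside on the termination rule in this hunk (any delimiter forcibly ends a generic token), here is a hedged, standalone sketch that applies the same rule to a whole string at once. It uses only TokenDelimiter.isDelimiter from this commit; genericChunks is an illustrative name, not part of the project.

import ava.parser.TokenDelimiter

// Illustrative only: cut a raw string into generic-token candidates,
// ending the current candidate at every delimiter character.
def genericChunks(input: String): List[String] =
  val out = List.newBuilder[String]
  val cur = new StringBuilder
  for ch <- input do
    if TokenDelimiter.isDelimiter(ch) then
      // Any delimiter forcibly terminates the token being built.
      if cur.nonEmpty then out += cur.toString
      cur.clear()
    else cur.append(ch)
  if cur.nonEmpty then out += cur.toString
  out.result()

// genericChunks("foo(bar, baz).qux") == List("foo", "bar", "baz", "qux")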
@@ -202,6 +269,10 @@ object Tokenizer:
        candidate: String
    ) extends Error

    case class BackSlashNotAllowed(
        sourcePosition: SourcePosition
    ) extends Error

  end Error

end Tokenizer