Continued work on the Tokenizer. Starting to track source position.
This commit is contained in:
parent
c5ea827944
commit
7beaef3b0d
6 changed files with 289 additions and 16 deletions
2
ava.ebnf
2
ava.ebnf
|
@ -22,7 +22,7 @@ lower ::= 'a' | ... | 'z' | Ll
|
|||
| Lm lower only;
|
||||
letter ::= upper | lower;
|
||||
escape_unicode ::= '\', 'u', hex, hex, hex, hex;
|
||||
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"');
|
||||
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"' | "'");
|
||||
escape_seq ::= escape_unicode | escape_char;
|
||||
number_integer ::= '0' | digit, [{digit}];
|
||||
number_float ::= number_integer, '.', number_integer;
|
||||
|
|
47
modules/parser/src/main/scala/ava/parser/CharEscape.scala
Normal file
47
modules/parser/src/main/scala/ava/parser/CharEscape.scala
Normal file
|
@ -0,0 +1,47 @@
|
|||
package ava.parser
|
||||
|
||||
/** The closed set of single-character escapes understood by the parser.
  *
  * @param name
  *   The character that identifies the escape in source code.
  * @param output
  *   The character the escape produces.
  */
sealed abstract class CharEscape(val name: Char, val output: Char)

object CharEscape:

  case object NewLine        extends CharEscape('n', '\n')
  case object CarriageReturn extends CharEscape('r', '\r')
  case object Backspace      extends CharEscape('b', '\b')
  case object FormFeed       extends CharEscape('f', '\f')
  case object Tab            extends CharEscape('t', '\t')
  case object BackSlash      extends CharEscape('\\', '\\')
  case object DoubleQuote    extends CharEscape('"', '"')
  case object SingleQuote    extends CharEscape('\'', '\'')

  /** Every supported escape, in declaration order.
    */
  val All: List[CharEscape] =
    List(
      NewLine, CarriageReturn, Backspace, FormFeed,
      Tab, BackSlash, DoubleQuote, SingleQuote
    )

  /** Look up the supported [[CharEscape]] identified by the given name.
    *
    * @param escapeName
    *   The candidate escape name.
    * @return
    *   The relevant [[CharEscape]], or `None` if no escape has that name.
    */
  def resolve(escapeName: Char): Option[CharEscape] =
    All.collectFirst {
      case candidate if candidate.name == escapeName => candidate
    }

end CharEscape
|
|
@ -32,10 +32,18 @@ class CharacterReader(
|
|||
private var peekedAhead: Boolean = false
|
||||
private var lookAhead: Char = 0
|
||||
|
||||
// Tracking for callers.
|
||||
private var charInFile: Long = 0
|
||||
private var lineNumber: Long = 0
|
||||
private var charInLine: Long = 0
|
||||
|
||||
/** Release the underlying input stream by closing it.
  */
def close(): Unit =
  input.close()
|
||||
|
||||
/** Snapshot the reader's position counters.
  *
  * @return
  *   A [[SourcePosition]] built from the current character-in-file, line
  *   number and character-in-line counters.
  */
def currentSourcePosition(): SourcePosition =
  SourcePosition(
    characterInFile = charInFile,
    lineNumber = lineNumber,
    characterInLine = charInLine
  )
|
||||
|
||||
/** Set EOF and close the underlying stream.
|
||||
*/
|
||||
private def setEof(): Unit =
|
||||
|
@ -182,6 +190,14 @@ class CharacterReader(
|
|||
if currentChar != 0 then lastChars.push(currentChar)
|
||||
else ()
|
||||
|
||||
// Update tracking information.
|
||||
charInFile = charInFile + 1
|
||||
if ch == '\n' then
|
||||
lineNumber = lineNumber + 1
|
||||
charInLine = 0
|
||||
else charInLine = charInLine + 1
|
||||
|
||||
// Finally, remap the current character.
|
||||
currentChar = ch
|
||||
|
||||
object CharacterReader:
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
package ava.parser
|
||||
|
||||
/** A location within a source file.
  *
  * NOTE(review): whether the counters are zero- or one-based is decided by
  * whoever produces the value -- confirm against the producer.
  *
  * @param characterInFile
  *   Character count relative to the whole file.
  * @param lineNumber
  *   Line count within the file.
  * @param characterInLine
  *   Character count relative to the current line.
  */
case class SourcePosition(characterInFile: Long, lineNumber: Long, characterInLine: Long)
|
|
@ -18,6 +18,32 @@ object Token:
|
|||
*/
|
||||
case class Comment(value: String) extends Token
|
||||
|
||||
/** A complete string literal whose escapes have already been resolved.
  *
  * @param value
  *   The resolved contents of the string literal.
  * @param errors
  *   Any errors that occurred while resolving this literal; empty when none
  *   occurred.
  */
case class StringLiteral(value: String, errors: List[Tokenizer.Error]) extends Token
|
||||
|
||||
/** A complete character literal whose escapes have already been resolved.
  *
  * @param value
  *   The resolved character.
  * @param errors
  *   Any errors that occurred while resolving this literal; empty when none
  *   occurred.
  */
case class CharacterLiteral(value: Char, errors: List[Tokenizer.Error]) extends Token
|
||||
|
||||
/** The '(' character.
|
||||
*/
|
||||
case object OpenParen extends Token
|
||||
|
@ -34,14 +60,6 @@ object Token:
|
|||
*/
|
||||
case object Comma extends Token
|
||||
|
||||
/** The '"' character.
|
||||
*/
|
||||
case object DoubleQuote extends Token
|
||||
|
||||
/** The ''' character.
|
||||
*/
|
||||
case object SingleQuote extends Token
|
||||
|
||||
/** The ':' character.
|
||||
*/
|
||||
case object Colon extends Token
|
||||
|
@ -49,4 +67,8 @@ object Token:
|
|||
/** The '#' character.
|
||||
*/
|
||||
case object Tuple extends Token
|
||||
|
||||
/** Represents end of file (EOF), not in error.
|
||||
*/
|
||||
case object Eof extends Token
|
||||
end Token
|
||||
|
|
|
@ -1,26 +1,207 @@
|
|||
package ava.parser
|
||||
|
||||
import scala.annotation.tailrec
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.collection.mutable.Stack
|
||||
|
||||
class Tokenizer(private val reader: CharacterReader):
|
||||
import Tokenizer.*
|
||||
|
||||
private var state: State = State.Initial
|
||||
private val buffer: ListBuffer[Char] = ListBuffer.empty
|
||||
private val states: Stack[State] = Stack.empty
|
||||
private val errors: ListBuffer[Error] = ListBuffer.empty
|
||||
|
||||
def next(): Option[Token] = None
|
||||
/** Copy the current state stack into an immutable list (used when reporting
  * errors).
  */
private def dumpStack(): List[State] =
  states.iterator.toList
|
||||
|
||||
/** Read the next token from the input.
  *
  * The character buffer, the state stack and the collected errors are all
  * reset before scanning starts from [[Tokenizer.State.Initial]].
  *
  * @return
  *   The next available token, or an error if resolving a token fails.
  */
def next(): Either[Error, Token] =
  // Drop everything accumulated for the previous token.
  errors.clear()
  states.clear()
  buffer.clear()
  nextInternal(State.Initial)
|
||||
|
||||
/** Shut down this Tokenizer by closing the [[CharacterReader]] it reads
  * from.
  */
def close(): Unit =
  reader.close()
|
||||
|
||||
/** Drive the tokenizer state machine until a token or a fatal error is
  * produced.
  *
  * @param state
  *   The state to process next.
  * @return
  *   The completed token, or the error that stopped tokenization.
  */
@tailrec
private def nextInternal(state: State): Either[Error, Token] =
  state match
    // These states have no implementation yet.
    case State.Initial | State.PotentialComment(_) | State.InComment(_) |
        State.InGeneric(_) =>
      Left(Error.NotImplemented)

    case State.InDoubleQuote(_) =>
      reader.consume() match
        case None =>
          Left(Error.PrematureEof(dumpStack()))
        case Some('\n') =>
          // Literal newlines are not allowed within string literals.
          Left(Error.UnexpectedNewLine(reader.currentSourcePosition(), state))
        case Some('\\') =>
          // Character escapes are supported within string literals. Remember
          // this state so the escape handler can return to it.
          states.push(state)
          nextInternal(State.InCharEscape(reader.currentSourcePosition()))
        case Some('"') =>
          // The literal is closed. Errors collected inside it travel with the
          // token so that parsing may continue.
          Right(Token.StringLiteral(value = buffer.mkString, errors = errors.toList))
        case Some(other) =>
          // Continue accumulating characters.
          buffer.addOne(other)
          nextInternal(state)

    case State.InSingleQuote(_) =>
      reader.consume() match
        case None =>
          Left(Error.PrematureEof(dumpStack()))
        case Some('\n') =>
          // Literal newlines are not allowed within character literals.
          Left(Error.UnexpectedNewLine(reader.currentSourcePosition(), state))
        case Some('\\') =>
          // Character escapes are supported within character literals.
          states.push(state)
          nextInternal(State.InCharEscape(reader.currentSourcePosition()))
        case Some('\'') =>
          // This character literal is now closed.
          createCharacterLiteral()
        case Some(other) =>
          // Continue accumulating characters.
          buffer.addOne(other)
          nextInternal(state)

    case State.InCharEscape(_) =>
      reader.consume() match
        case None =>
          Left(Error.PrematureEof(dumpStack()))
        case Some(escapeName) =>
          // Character escapes always live within some other state. A failed
          // escape is recorded rather than aborting; a successful one adds
          // its resolved character to the buffer.
          resolveCharEscape(escapeName) match
            case Left(error) =>
              val _ = errors.addOne(error)
            case Right(actual) =>
              val _ = buffer.addOne(actual)
          // Either way, return to the enclosing string or character literal.
          nextInternal(states.pop())
|
||||
|
||||
/** Translate an escape name into the character it stands for.
  *
  * @param ch
  *   The character that followed the backslash.
  * @return
  *   The escaped character, or an [[Error.InvalidCharEscape]] carrying the
  *   reader's current position when the name is not a supported escape.
  */
private def resolveCharEscape(ch: Char): Either[Error, Char] =
  CharEscape
    .resolve(ch)
    .map(_.output)
    // The left value is by-name, so the position is read only on failure.
    .toRight(Error.InvalidCharEscape(reader.currentSourcePosition(), ch))
|
||||
|
||||
/** Build a character literal token from the accumulated buffer.
  *
  * NOTE(review): an empty buffer (the literal `''`) yields the NUL character
  * without recording any error -- confirm this is intended.
  *
  * @return
  *   The character literal token together with any collected errors, or
  *   [[Error.MultipleCharactersInLiteral]] when the buffer holds more than
  *   one character.
  */
private def createCharacterLiteral(): Either[Error, Token] =
  val contents = buffer.mkString
  contents.length match
    case 0 =>
      Right(Token.CharacterLiteral(value = '\u0000', errors = errors.toList))
    case 1 =>
      Right(Token.CharacterLiteral(value = contents.charAt(0), errors = errors.toList))
    case _ =>
      Left(
        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), contents)
      )
|
||||
|
||||
object Tokenizer:
|
||||
|
||||
sealed trait State
|
||||
|
||||
object State:
|
||||
|
||||
case object Initial extends State
|
||||
case object PotentialComment extends State
|
||||
case object InComment extends State
|
||||
case object InQuote extends State
|
||||
case object InGeneric extends State
|
||||
/** The initial state. This is always the state of the [[Tokenizer]] when it
|
||||
* starts, as well as between each token.
|
||||
*/
|
||||
case object Initial extends State
|
||||
|
||||
case class PotentialComment(start: SourcePosition) extends State
|
||||
case class InComment(start: SourcePosition) extends State
|
||||
case class InDoubleQuote(start: SourcePosition) extends State
|
||||
case class InSingleQuote(start: SourcePosition) extends State
|
||||
case class InCharEscape(start: SourcePosition) extends State
|
||||
case class InGeneric(start: SourcePosition) extends State
|
||||
|
||||
given CanEqual[State, State] = CanEqual.derived
|
||||
|
||||
end State
|
||||
|
||||
/** Base type for every error the [[Tokenizer]] can report.
  */
sealed trait Error

object Error:

  /** An error that carries the [[SourcePosition]] at which it was raised.
    */
  sealed trait Positional extends Error:
    def sourcePosition: SourcePosition

  /** Reported for tokenizer states that do not have an implementation yet.
    */
  case object NotImplemented extends Error

  /** This error occurs when the end of file is reached if the tokenizer is
    * still expecting more characters.
    *
    * @param stack
    *   The stack of states at time of error. Helps communicate what
    *   additional characters were expected.
    */
  case class PrematureEof(stack: List[State]) extends Error

  /** This error occurs when a literal newline is encountered in a state that
    * does not permit one (string and character literals).
    *
    * @param sourcePosition
    *   The [[SourcePosition]] where the error triggered.
    * @param currentState
    *   The state the tokenizer was in when the newline was read.
    */
  case class UnexpectedNewLine(
      sourcePosition: SourcePosition,
      currentState: State
  ) extends Positional

  /** This error occurs when the source contains a character escape indicated
    * by a backslash character within some string/character literal, but the
    * escape type is not recognized.
    *
    * @param sourcePosition
    *   The [[SourcePosition]] where the error triggered.
    * @param candidate
    *   The escape name that was attempted.
    */
  case class InvalidCharEscape(
      sourcePosition: SourcePosition,
      candidate: Char
  ) extends Positional

  /** This error occurs when a character literal is expected, but more than
    * one character is present in the accumulated buffer.
    *
    * Like the other errors that carry a position, this is a [[Positional]]
    * error so that callers handling positional errors generically see it too.
    *
    * @param sourcePosition
    *   The [[SourcePosition]] where the error triggered.
    * @param candidate
    *   The candidate string.
    */
  case class MultipleCharactersInLiteral(
      sourcePosition: SourcePosition,
      candidate: String
  ) extends Positional

end Error
|
||||
|
||||
end Tokenizer
|
||||
|
|
Loading…
Add table
Reference in a new issue