Continued work on the Tokenizer. Starting to track source position.

This commit is contained in:
Pat Garrity 2024-02-20 21:47:36 -06:00
parent c5ea827944
commit 7beaef3b0d
Signed by: pfm
GPG key ID: 5CA5D21BAB7F3A76
6 changed files with 289 additions and 16 deletions

View file

@@ -22,7 +22,7 @@ lower ::= 'a' | ... | 'z' | Ll
| Lm lower only;
letter ::= upper | lower;
escape_unicode ::= '\', 'u', hex, hex, hex, hex;
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"');
escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"' | "'");
escape_seq ::= escape_unicode | escape_char;
number_integer ::= '0' | digit, [{digit}];
number_float ::= number_integer, '.', number_integer;

View file

@@ -0,0 +1,47 @@
package ava.parser
/** Enumeration for all supported character escapes.
*
* @param name
The character escape name, as it appears in source code.
* @param output
* The character emitted by the escape.
*/
sealed abstract class CharEscape(
val name: Char,
val output: Char
)
object CharEscape:
case object NewLine extends CharEscape('n', '\n')
case object CarriageReturn extends CharEscape('r', '\r')
case object Backspace extends CharEscape('b', '\b')
case object FormFeed extends CharEscape('f', '\f')
case object Tab extends CharEscape('t', '\t')
case object BackSlash extends CharEscape('\\', '\\')
case object DoubleQuote extends CharEscape('"', '"')
case object SingleQuote extends CharEscape('\'', '\'')
val All: List[CharEscape] = List(
NewLine,
CarriageReturn,
Backspace,
FormFeed,
Tab,
BackSlash,
DoubleQuote,
SingleQuote
)
/** Given some escape name, attempt to resolve to a supported [[CharEscape]].
*
* @param escapeName
* The candidate name.
* @return
The relevant [[CharEscape]], or `None` if no such escape exists.
*/
def resolve(escapeName: Char): Option[CharEscape] =
All.find(_.name == escapeName)
end CharEscape
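
As a quick sanity check of the new enumeration, a minimal usage sketch (the results in the comments follow from the definitions above, not from a test run):

CharEscape.resolve('n')  // Some(CharEscape.NewLine); its output is '\n'
CharEscape.resolve('\'') // Some(CharEscape.SingleQuote), matching the escape added to the grammar
CharEscape.resolve('q')  // None: 'q' is not a supported escape name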

View file

@@ -32,10 +32,18 @@ class CharacterReader(
private var peekedAhead: Boolean = false
private var lookAhead: Char = 0
// Tracking for callers.
private var charInFile: Long = 0
private var lineNumber: Long = 0
private var charInLine: Long = 0
/** Close the underlying stream.
*/
def close(): Unit = input.close()
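/** Get the current [[SourcePosition]] of this reader.
*/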
def currentSourcePosition(): SourcePosition =
SourcePosition(charInFile, lineNumber, charInLine)
/** Set EOF and close the underlying stream.
*/
private def setEof(): Unit =
@@ -182,6 +190,14 @@ class CharacterReader(
if currentChar != 0 then lastChars.push(currentChar)
else ()
// Update tracking information.
charInFile = charInFile + 1
if ch == '\n' then
lineNumber = lineNumber + 1
charInLine = 0
else charInLine = charInLine + 1
// Finally, remap the current character.
currentChar = ch
object CharacterReader:
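
A rough sketch of the position tracking this hunk adds, assuming consume() returns Option[Char] (as the Tokenizer below uses it). The fromString constructor is hypothetical and only stands in for however the reader is actually built:

val reader = CharacterReader.fromString("ab\ncd") // hypothetical helper, for illustration only
reader.consume()               // Some('a')
reader.consume()               // Some('b')
reader.consume()               // Some('\n'): lineNumber becomes 1, charInLine resets to 0
reader.currentSourcePosition() // SourcePosition(characterInFile = 3, lineNumber = 1, characterInLine = 0)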

View file

@@ -0,0 +1,7 @@
package ava.parser
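/** A position within a source file, as tracked by [[CharacterReader]].
*
* @param characterInFile
* The number of characters consumed in the file so far.
* @param lineNumber
* The zero-based line number.
* @param characterInLine
* The number of characters consumed on the current line.
*/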
case class SourcePosition(
characterInFile: Long,
lineNumber: Long,
characterInLine: Long
)

View file

@@ -18,6 +18,32 @@ object Token:
*/
case class Comment(value: String) extends Token
/** String literals are captured in full, with escapes resolved.
*
* @param value
* The resolved string literal value.
* @param errors
* The list of errors that occurred while resolving this string literal, if
* any.
*/
case class StringLiteral(
value: String,
errors: List[Tokenizer.Error]
) extends Token
/** Character literals are captured in full, with escapes resolved.
*
* @param value
* The resolved character literal value.
* @param errors
* The list of errors that occurred while resolving this character literal,
* if any.
*/
case class CharacterLiteral(
value: Char,
errors: List[Tokenizer.Error]
) extends Token
/** The '(' character.
*/
case object OpenParen extends Token
@@ -34,14 +60,6 @@ object Token:
*/
case object Comma extends Token
/** The '"' character.
*/
case object DoubleQuote extends Token
/** The ''' character.
*/
case object SingleQuote extends Token
/** The ':' character.
*/
case object Colon extends Token
@@ -49,4 +67,8 @@ object Token:
/** The '#' character.
*/
case object Tuple extends Token
/** Represents end of file (EOF) reached normally, not as an error.
*/
case object Eof extends Token
end Token
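
To make the intent of the new literal tokens concrete: an invalid escape no longer aborts tokenization, it travels with the token. A sketch of the value such a token would carry, with pos standing in for whatever SourcePosition the reader reports (illustrative, not taken from the test suite):

// Source text: "a\qb"  ('\q' is not a recognized escape)
Token.StringLiteral(
  value = "ab", // the unrecognized escape contributes nothing to the resolved value
  errors = List(Tokenizer.Error.InvalidCharEscape(pos, 'q'))
)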

View file

@@ -1,26 +1,207 @@
package ava.parser
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Stack
class Tokenizer(private val reader: CharacterReader):
import Tokenizer.*
private var state: State = State.Initial
private val buffer: ListBuffer[Char] = ListBuffer.empty
private val states: Stack[State] = Stack.empty
private val errors: ListBuffer[Error] = ListBuffer.empty
def next(): Option[Token] = None
private def dumpStack(): List[State] = states.toList
/** Consume the next available token.
*
* @return
* The next available token, or an error if resolving a token fails.
*/
def next(): Either[Error, Token] =
buffer.clear()
states.clear()
errors.clear()
nextInternal(State.Initial)
/** Close this Tokenizer instance and free all resources, including the
* [[CharacterReader]].
*/
def close(): Unit = reader.close()
@tailrec
private def nextInternal(state: State): Either[Error, Token] =
state match
case State.Initial =>
Left(Error.NotImplemented)
case State.PotentialComment(startPos) =>
Left(Error.NotImplemented)
case State.InComment(startPos) =>
Left(Error.NotImplemented)
case State.InDoubleQuote(startPos) =>
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
ch match
case '\n' =>
// Literal newlines are not allowed within string literals.
Left(
Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
)
case '\\' =>
// Character escapes are supported within string literals.
states.push(state)
nextInternal(State.InCharEscape(reader.currentSourcePosition()))
case '"' =>
// This string literal is now closed. Any errors captured while
// resolving the literal are not raised here; they are returned as
// part of the token so that parsing may continue.
Right(
Token.StringLiteral(
value = buffer.mkString,
errors = errors.toList
)
)
case _ =>
// Continue accumulating characters.
buffer.addOne(ch)
nextInternal(state)
case State.InSingleQuote(startPos) =>
reader.consume() match
case None => Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
ch match
case '\n' =>
// Literal newlines are not allowed within character literals.
Left(
Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
)
case '\\' =>
// Character escapes are supported within character literals.
states.push(state)
nextInternal(State.InCharEscape(reader.currentSourcePosition()))
case '\'' =>
// This character literal is now closed.
createCharacterLiteral()
case _ =>
// Continue accumulating characters.
buffer.addOne(ch)
nextInternal(state)
case State.InCharEscape(startPos) =>
reader.consume() match
case None =>
Left(Error.PrematureEof(dumpStack()))
case Some(ch) =>
resolveCharEscape(ch) match
case Left(error) =>
// Character escapes always live within some other state.
// Capture the error, but don't just stop consuming -- pop the
// state and return to wherever (string or character literal)
// this error originated.
errors.addOne(error)
nextInternal(states.pop())
case Right(actual) =>
// Add the resolved character to the buffer and return to the
// parent state.
val _ = buffer.addOne(actual)
nextInternal(states.pop())
case State.InGeneric(startPos) =>
Left(Error.NotImplemented)
private def resolveCharEscape(ch: Char): Either[Error, Char] =
CharEscape.resolve(ch) match
case None =>
Left(Error.InvalidCharEscape(reader.currentSourcePosition(), ch))
case Some(escape) =>
Right(escape.output)
private def createCharacterLiteral(): Either[Error, Token] =
val dump = buffer.mkString
if dump.length() > 1 then
Left(
Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
)
else
val out: Char = if dump.isEmpty() then '\u0000' else dump.charAt(0)
Right(
Token.CharacterLiteral(
value = out,
errors = errors.toList
)
)
object Tokenizer:
sealed trait State
object State:
/** The initial state. This is always the state of the [[Tokenizer]] when it
* starts, as well as between each token.
*/
case object Initial extends State
case object PotentialComment extends State
case object InComment extends State
case object InQuote extends State
case object InGeneric extends State
case class PotentialComment(start: SourcePosition) extends State
case class InComment(start: SourcePosition) extends State
case class InDoubleQuote(start: SourcePosition) extends State
case class InSingleQuote(start: SourcePosition) extends State
case class InCharEscape(start: SourcePosition) extends State
case class InGeneric(start: SourcePosition) extends State
given CanEqual[State, State] = CanEqual.derived
end State
sealed trait Error
object Error:
sealed trait Positional extends Error:
def sourcePosition: SourcePosition
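/** Placeholder error for tokenizer states that are not yet implemented.
*/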
case object NotImplemented extends Error
/** This error occurs when the end of file is reached while the tokenizer is
* still expecting more characters.
*
* @param stack
* The stack of states at the time of the error. This helps communicate
* which additional characters were expected.
*/
case class PrematureEof(stack: List[State]) extends Error
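/** This error occurs when a literal newline character appears inside a
* string or character literal.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param currentState
* The state the tokenizer was in when the newline was encountered.
*/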
case class UnexpectedNewLine(
sourcePosition: SourcePosition,
currentState: State
) extends Positional
/** This error occurs when the source contains a character escape indicated
* by a backslash character within some string/character literal, but the
* escape type is not recognized.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param candidate
* The escape name that was attempted.
*/
case class InvalidCharEscape(
sourcePosition: SourcePosition,
candidate: Char
) extends Positional
/** This error occurs when a character literal is closed, but more than
* one character is present in the accumulated buffer.
*
* @param sourcePosition
* The [[SourcePosition]] where the error triggered.
* @param candidate
* The candidate string.
*/
case class MultipleCharactersInLiteral(
sourcePosition: SourcePosition,
candidate: String
) extends Error
end Error
end Tokenizer
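
Finally, a sketch of how the new next() interface might be driven. As of this commit the Initial state still returns Error.NotImplemented, so treat the Eof handling below as the intended shape rather than current behaviour:

import scala.annotation.tailrec
import ava.parser.{Token, Tokenizer}

@tailrec
def drain(tokenizer: Tokenizer, acc: List[Token] = Nil): Either[Tokenizer.Error, List[Token]] =
  tokenizer.next() match
    case Left(error)      => Left(error)                     // fatal: stop and surface the error
    case Right(Token.Eof) => Right(acc.reverse)              // end of input, tokens in source order
    case Right(token)     => drain(tokenizer, token :: acc)  // accumulate and keep consuming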