diff --git a/ava.ebnf b/ava.ebnf
index 380e363..f012ae8 100644
--- a/ava.ebnf
+++ b/ava.ebnf
@@ -22,7 +22,7 @@ lower ::= 'a' | ... | 'z' | Ll | Lm lower only;
 
 letter ::= upper | lower;
 escape_unicode ::= '\', 'u', hex, hex, hex, hex;
-escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"');
+escape_char ::= '\', ('b' | 'f' | 'n' | 'r' | 't' | '\' | '"' | "'");
 escape_seq ::= escape_unicode | escape_char;
 number_integer ::= '0' | digit, [{digit}];
 number_float ::= number_integer, '.', number_integer;
diff --git a/modules/parser/src/main/scala/ava/parser/CharEscape.scala b/modules/parser/src/main/scala/ava/parser/CharEscape.scala
new file mode 100644
index 0000000..8ab822e
--- /dev/null
+++ b/modules/parser/src/main/scala/ava/parser/CharEscape.scala
@@ -0,0 +1,47 @@
+package ava.parser
+
+/** Enumeration for all supported character escapes.
+  *
+  * @param name
+  *   The character escape name - how it resolves in source code.
+  * @param output
+  *   The character emitted by the escape.
+  */
+sealed abstract class CharEscape(
+    val name: Char,
+    val output: Char
+)
+
+object CharEscape:
+
+  case object NewLine extends CharEscape('n', '\n')
+  case object CarriageReturn extends CharEscape('r', '\r')
+  case object Backspace extends CharEscape('b', '\b')
+  case object FormFeed extends CharEscape('f', '\f')
+  case object Tab extends CharEscape('t', '\t')
+  case object BackSlash extends CharEscape('\\', '\\')
+  case object DoubleQuote extends CharEscape('"', '"')
+  case object SingleQuote extends CharEscape('\'', '\'')
+
+  val All: List[CharEscape] = List(
+    NewLine,
+    CarriageReturn,
+    Backspace,
+    FormFeed,
+    Tab,
+    BackSlash,
+    DoubleQuote,
+    SingleQuote
+  )
+
+  /** Given some escape name, attempt to resolve to a supported [[CharEscape]].
+    *
+    * @param escapeName
+    *   The candidate name.
+    * @return
+    *   The relevant [[CharEscape]], or `None` if no such escape exists.
+    */
+  def resolve(escapeName: Char): Option[CharEscape] =
+    All.find(_.name == escapeName)
+
+end CharEscape
diff --git a/modules/parser/src/main/scala/ava/parser/CharacterReader.scala b/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
index 7c93729..65e9c05 100644
--- a/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
+++ b/modules/parser/src/main/scala/ava/parser/CharacterReader.scala
@@ -32,10 +32,18 @@
   private var peekedAhead: Boolean = false
   private var lookAhead: Char = 0
 
+  // Tracking for callers.
+  private var charInFile: Long = 0
+  private var lineNumber: Long = 0
+  private var charInLine: Long = 0
+
   /** Close the underlying stream.
     */
  def close(): Unit = input.close()
 
+  def currentSourcePosition(): SourcePosition =
+    SourcePosition(charInFile, lineNumber, charInLine)
+
   /** Set EOF and close the underlying stream.
     */
   private def setEof(): Unit =
@@ -182,6 +190,14 @@
     if currentChar != 0 then lastChars.push(currentChar)
     else ()
 
+    // Update tracking information.
+    charInFile = charInFile + 1
+    if ch == '\n' then
+      lineNumber = lineNumber + 1
+      charInLine = 0
+    else charInLine = charInLine + 1
+
+    // Finally, remap the current character.
     currentChar = ch
 
 object CharacterReader:
diff --git a/modules/parser/src/main/scala/ava/parser/SourcePosition.scala b/modules/parser/src/main/scala/ava/parser/SourcePosition.scala
new file mode 100644
index 0000000..b39e6c4
--- /dev/null
+++ b/modules/parser/src/main/scala/ava/parser/SourcePosition.scala
@@ -0,0 +1,7 @@
+package ava.parser
+
+case class SourcePosition(
+    characterInFile: Long,
+    lineNumber: Long,
+    characterInLine: Long
+)
diff --git a/modules/parser/src/main/scala/ava/parser/Token.scala b/modules/parser/src/main/scala/ava/parser/Token.scala
index b2d40f9..2685cd2 100644
--- a/modules/parser/src/main/scala/ava/parser/Token.scala
+++ b/modules/parser/src/main/scala/ava/parser/Token.scala
@@ -18,6 +18,32 @@ object Token:
     */
   case class Comment(value: String) extends Token
 
+  /** String literals are captured in total, with escapes resolved.
+    *
+    * @param value
+    *   The resolved string literal value.
+    * @param errors
+    *   The list of errors that occurred while resolving this string literal, if
+    *   any.
+    */
+  case class StringLiteral(
+      value: String,
+      errors: List[Tokenizer.Error]
+  ) extends Token
+
+  /** Character literals are captured in total, with escapes resolved.
+    *
+    * @param value
+    *   The resolved character literal value.
+    * @param errors
+    *   The list of errors that occurred while resolving this character literal,
+    *   if any.
+    */
+  case class CharacterLiteral(
+      value: Char,
+      errors: List[Tokenizer.Error]
+  ) extends Token
+
   /** The '(' character.
     */
   case object OpenParen extends Token
@@ -34,14 +60,6 @@
     */
   case object Comma extends Token
 
-  /** The '"' character.
-    */
-  case object DoubleQuote extends Token
-
-  /** The ''' character.
-    */
-  case object SingleQuote extends Token
-
   /** The ':' character.
     */
   case object Colon extends Token
@@ -49,4 +67,8 @@
   /** The '#' character.
     */
   case object Tuple extends Token
+
+  /** Represents end of file (EOF), not in error.
+    */
+  case object Eof extends Token
 end Token
diff --git a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
index 740fb53..92287e7 100644
--- a/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
+++ b/modules/parser/src/main/scala/ava/parser/Tokenizer.scala
@@ -1,26 +1,207 @@
 package ava.parser
 
+import scala.annotation.tailrec
+import scala.collection.mutable.ListBuffer
+import scala.collection.mutable.Stack
+
 class Tokenizer(private val reader: CharacterReader):
 
   import Tokenizer.*
 
-  private var state: State = State.Initial
+  private val buffer: ListBuffer[Char] = ListBuffer.empty
+  private val states: Stack[State] = Stack.empty
+  private val errors: ListBuffer[Error] = ListBuffer.empty
 
-  def next(): Option[Token] = None
+  private def dumpStack(): List[State] = states.toList
 
+  /** Consume the next available token.
+    *
+    * @return
+    *   The next available token, or an error if resolving a token fails.
+    */
+  def next(): Either[Error, Token] =
+    buffer.clear()
+    states.clear()
+    errors.clear()
+    nextInternal(State.Initial)
+
+  /** Close this Tokenizer instance and free all resources, including the
+    * [[CharacterReader]].
+    */
   def close(): Unit = reader.close()
 
+  @tailrec
+  private def nextInternal(state: State): Either[Error, Token] =
+    state match
+      case State.Initial =>
+        Left(Error.NotImplemented)
+      case State.PotentialComment(startPos) =>
+        Left(Error.NotImplemented)
+      case State.InComment(startPos) =>
+        Left(Error.NotImplemented)
+      case State.InDoubleQuote(startPos) =>
+        reader.consume() match
+          case None => Left(Error.PrematureEof(dumpStack()))
+          case Some(ch) =>
+            ch match
+              case '\n' =>
+                // Literal newlines are not allowed within string literals.
+                Left(
+                  Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
+                )
+              case '\\' =>
+                // Character escapes are supported within string literals.
+                states.push(state)
+                nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+              case '"' =>
+                // This string literal is now closed. If anything failed inside
+                // this literal, suppress those errors and return them as part
+                // of the token so that parsing may continue.
+                Right(
+                  Token.StringLiteral(
+                    value = buffer.mkString,
+                    errors = errors.toList
+                  )
+                )
+              case _ =>
+                // Continue accumulating characters.
+                buffer.addOne(ch)
+                nextInternal(state)
+      case State.InSingleQuote(startPos) =>
+        reader.consume() match
+          case None => Left(Error.PrematureEof(dumpStack()))
+          case Some(ch) =>
+            ch match
+              case '\n' =>
+                // Literal newlines are not allowed within character literals.
+                Left(
+                  Error.UnexpectedNewLine(reader.currentSourcePosition(), state)
+                )
+              case '\\' =>
+                // Character escapes are supported within character literals.
+                states.push(state)
+                nextInternal(State.InCharEscape(reader.currentSourcePosition()))
+              case '\'' =>
+                // This character literal is now closed.
+                createCharacterLiteral()
+              case _ =>
+                // Continue accumulating characters.
+                buffer.addOne(ch)
+                nextInternal(state)
+      case State.InCharEscape(startPos) =>
+        reader.consume() match
+          case None =>
+            Left(Error.PrematureEof(dumpStack()))
+          case Some(ch) =>
+            resolveCharEscape(ch) match
+              case Left(error) =>
+                // Character escapes always live within some other state.
+                // Capture the error, but don't stop consuming: pop the state
+                // and return to wherever (string or character literal) this
+                // error originated.
+                errors.addOne(error)
+                nextInternal(states.pop())
+              case Right(actual) =>
+                // Add the resolved character to the buffer and return to the
+                // parent state.
+                val _ = buffer.addOne(actual)
+                nextInternal(states.pop())
+      case State.InGeneric(startPos) =>
+        Left(Error.NotImplemented)
+
+  private def resolveCharEscape(ch: Char): Either[Error, Char] =
+    CharEscape.resolve(ch) match
+      case None =>
+        Left(Error.InvalidCharEscape(reader.currentSourcePosition(), ch))
+      case Some(escape) =>
+        Right(escape.output)
+
+  private def createCharacterLiteral(): Either[Error, Token] =
+    val dump = buffer.mkString
+    if dump.length() > 1 then
+      Left(
+        Error.MultipleCharactersInLiteral(reader.currentSourcePosition(), dump)
+      )
+    else
+      // Note: an empty literal resolves to the NUL character ('\u0000').
+      val out = if dump.isEmpty() then 0.toChar else dump.charAt(0)
+      Right(
+        Token.CharacterLiteral(
+          value = out,
+          errors = errors.toList
+        )
+      )
+
 object Tokenizer:
 
   sealed trait State
 
   object State:
-    case object Initial extends State
-    case object PotentialComment extends State
-    case object InComment extends State
-    case object InQuote extends State
-    case object InGeneric extends State
+    /** The initial state. This is always the state of the [[Tokenizer]] when it
+      * starts, as well as between each token.
+      */
+    case object Initial extends State
+
+    case class PotentialComment(start: SourcePosition) extends State
+    case class InComment(start: SourcePosition) extends State
+    case class InDoubleQuote(start: SourcePosition) extends State
+    case class InSingleQuote(start: SourcePosition) extends State
+    case class InCharEscape(start: SourcePosition) extends State
+    case class InGeneric(start: SourcePosition) extends State
+
+    given CanEqual[State, State] = CanEqual.derived
   end State
 
+  sealed trait Error
+
+  object Error:
+
+    sealed trait Positional extends Error:
+      def sourcePosition: SourcePosition
+
+    case object NotImplemented extends Error
+
+    /** This error occurs when the end of file is reached while the tokenizer
+      * is still expecting more characters.
+      *
+      * @param stack
+      *   The stack of states at the time of error. Helps communicate what
+      *   additional characters were expected.
+      */
+    case class PrematureEof(stack: List[State]) extends Error
+
+    case class UnexpectedNewLine(
+        sourcePosition: SourcePosition,
+        currentState: State
+    ) extends Positional
+
+    /** This error occurs when the source contains a character escape indicated
+      * by a backslash character within some string/character literal, but the
+      * escape type is not recognized.
+      *
+      * @param sourcePosition
+      *   The [[SourcePosition]] where the error triggered.
+      * @param candidate
+      *   The escape name that was attempted.
+      */
+    case class InvalidCharEscape(
+        sourcePosition: SourcePosition,
+        candidate: Char
+    ) extends Positional
+
+    /** This error occurs when a character literal is closed, but more than one
+      * character is present in the accumulated buffer.
+      *
+      * @param sourcePosition
+      *   The [[SourcePosition]] where the error triggered.
+      * @param candidate
+      *   The candidate string.
+      */
+    case class MultipleCharactersInLiteral(
+        sourcePosition: SourcePosition,
+        candidate: String
+    ) extends Error
+
+  end Error
+
 end Tokenizer
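
For reference, a minimal sketch (not part of the patch above) of how the new CharEscape API is expected to be used. Only CharEscape.resolve and the escape definitions come from the diff; the demo object, its name, and the printed output format are illustrative assumptions.

package ava.parser

// Hypothetical driver, not part of the patch: resolves a few escape names via
// CharEscape.resolve and reports the outcome for each.
object CharEscapeDemo:
  def main(args: Array[String]): Unit =
    val candidates = List('n', 't', '\'', 'q')
    candidates.foreach { name =>
      CharEscape.resolve(name) match
        case Some(escape) =>
          // For example, 'n' resolves to the newline character (code point 10).
          println(s"\\$name -> code point ${escape.output.toInt}")
        case None =>
          // 'q' is not a supported escape; inside a literal the tokenizer
          // records this case as Error.InvalidCharEscape.
          println(s"\\$name -> unsupported escape")
    }
end CharEscapeDemo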
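
Similarly, a sketch of the consumption pattern the new next()/Token.Eof contract appears to be aimed at, assuming the Initial-state dispatch is filled in later (it currently returns Error.NotImplemented). TokenStream and drain are hypothetical names, and constructing the CharacterReader and Tokenizer is left to the caller since that is outside this diff.

package ava.parser

import scala.annotation.tailrec

// Hypothetical helper, not part of the patch: drains a Tokenizer until it
// yields Token.Eof or fails with a hard error. Errors recovered inside
// string/character literals never surface here; they travel on the tokens.
object TokenStream:
  def drain(tokenizer: Tokenizer): Either[Tokenizer.Error, List[Token]] =
    @tailrec
    def loop(acc: List[Token]): Either[Tokenizer.Error, List[Token]] =
      tokenizer.next() match
        case Left(error)      => Left(error)
        case Right(Token.Eof) => Right(acc.reverse)
        case Right(token)     => loop(token :: acc)
    loop(Nil)
end TokenStream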