// Copyright (c) 2012-2023 Wojciech Figat. All rights reserved.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace Flax.Build
{
/// <summary>
/// Types of the tokens supported by the <see cref="Tokenizer"/>.
/// </summary>
public enum TokenType
{
/// <summary>
/// A whitespace.
/// </summary>
Whitespace,
/// <summary>
/// A newline.
/// </summary>
Newline,
/// <summary>
/// A multi-line comment.
/// </summary>
CommentMultiLine,
/// <summary>
/// A single-line comment.
/// </summary>
CommentSingleLine,
/// <summary>
/// An identifier.
/// </summary>
Identifier,
/// <summary>
/// A number in hexadecimal form.
/// </summary>
Hex,
/// <summary>
/// A number.
/// </summary>
Number,
/// <summary>
/// The symbol '='.
/// </summary>
Equal,
/// <summary>
/// A comma ','.
/// </summary>
Comma,
/// <summary>
/// A semicolon ';'.
/// </summary>
SemiColon,
/// <summary>
/// A left curly brace '{'.
/// </summary>
LeftCurlyBrace,
/// <summary>
/// A right curly brace '}'.
/// </summary>
RightCurlyBrace,
/// <summary>
/// A left parenthesis '('.
/// </summary>
LeftParent,
/// <summary>
/// A right parenthesis ')'.
/// </summary>
RightParent,
/// <summary>
/// A left bracket '['.
/// </summary>
LeftBracket,
/// <summary>
/// A right bracket ']'.
/// </summary>
RightBracket,
/// <summary>
/// A string of text.
/// </summary>
String,
/// <summary>
/// A character.
/// </summary>
Character,
/// <summary>
/// A preprocessor token '#'.
/// </summary>
Preprocessor,
/// <summary>
/// A colon ':'.
/// </summary>
Colon,
/// <summary>
/// A double colon '::'.
/// </summary>
DoubleColon,
/// <summary>
/// A dot '.'.
/// </summary>
Dot,
/// <summary>
/// A '&lt;'.
/// </summary>
LessThan,
/// <summary>
/// A '&gt;'.
/// </summary>
GreaterThan,
/// <summary>
/// A '&amp;'.
/// </summary>
And,
/// <summary>
/// A '*'.
/// </summary>
Multiply,
/// <summary>
/// A '/'.
/// </summary>
Divide,
/// <summary>
/// A '+'.
/// </summary>
Add,
/// <summary>
/// A '-'.
/// </summary>
Sub,
/// <summary>
/// An unknown symbol.
/// </summary>
Unknown,
/// <summary>
/// An end-of-file token.
/// </summary>
EndOfFile,
/// <summary>
/// A '&lt;'.
/// </summary>
LeftAngleBracket = LessThan,
/// <summary>
/// A '&gt;'.
/// </summary>
RightAngleBracket = GreaterThan,
}
/// <summary>
/// Contains information about a single language token.
/// </summary>
public class Token : IEquatable<Token>
{
/// <summary>
/// Initializes a new instance of the <see cref="Token"/> class.
/// </summary>
public Token()
{
}
/// <summary>
/// Initializes a new instance of the <see cref="Token"/> class.
/// </summary>
/// <param name="type">The type.</param>
/// <param name="value">The value.</param>
public Token(TokenType type, string value)
{
Type = type;
Value = value;
}
/// <summary>
/// The type of the token.
/// </summary>
public TokenType Type;
/// <summary>
/// Value of the token.
/// </summary>
public string Value;
/// <inheritdoc />
public override string ToString()
{
return string.Format("{{{0}:{1}}}", Type, Value);
}
/// <inheritdoc />
public bool Equals(Token other)
{
if (ReferenceEquals(null, other))
return false;
if (ReferenceEquals(this, other))
return true;
return Type == other.Type && Value == other.Value;
}
/// <inheritdoc />
public override bool Equals(object obj)
{
if (ReferenceEquals(null, obj))
return false;
if (ReferenceEquals(this, obj))
return true;
if (obj.GetType() != this.GetType())
return false;
return Equals((Token)obj);
}
/// <inheritdoc />
public override int GetHashCode()
{
unchecked
{
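// Combine both fields into a single hash code; 397 is the prime multiplier used by the common hash-combining pattern.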
return ((int)Type * 397) ^ (Value != null ? Value.GetHashCode() : 0);
}
}
}
/// <summary>
/// The token parsing utility that implements basic logic for parsing generic C-like syntax source code.
/// </summary>
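/// <example>
/// Example usage (illustrative sketch, using only the members defined below):
/// <code>
/// var tokenizer = new Tokenizer();
/// tokenizer.Tokenize(sourceCode);
/// for (var token = tokenizer.NextToken(); token.Type != TokenType.EndOfFile; token = tokenizer.NextToken())
/// {
///     // Handle the token (e.g. token.Type and token.Value)...
/// }
/// tokenizer.Dispose();
/// </code>
/// </example>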
public class Tokenizer : IDisposable
{
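// Note: the named capture groups below must stay in the same order as the TokenType enum values,
// because TokenizeInternal() maps each group's position within Match.Groups back to a TokenType.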
private static readonly Regex RegexTokenizer = new Regex
(
@"(?<Whitespace>[ \t]+)|" +
@"(?<Newline>(?:\r\n|\n))|" +
@"(?<CommentMultiLine>/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)|" +
@"(?<CommentSingleLine>//(.*?)\r?\n)|" +
@"(?<Identifier>[a-zA-Z_][a-zA-Z0-9_]*)|" +
@"(?<Hex>0x[0-9a-fA-F]+)|" +
@"(?<Number>[\-\+]?\s*[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?f?)|" +
@"(?<Equal>=)|" +
@"(?<Comma>,)|" +
@"(?<SemiColon>;)|" +
@"(?<LeftCurlyBrace>\{)|" +
@"(?<RightCurlyBrace>\})|" +
@"(?<LeftParent>\()|" +
@"(?<RightParent>\))|" +
@"(?<LeftBracket>\[)|" +
@"(?<RightBracket>\])|" +
@"(?<String>""[^""\\]*(?:\\.[^""\\]*)*"")|" +
@"(?<Character>'[^'\\]*(?:\\.[^'\\]*)*')|" +
@"(?<Preprocessor>#)|" +
@"(?<Colon>:)|" +
@"(?<DoubleColon>::)|" +
@"(?<Dot>\.)|" +
@"(?<LessThan>\<)|" +
@"(?<GreaterThan>\>)|" +
@"(?<And>\&)|" +
@"(?<Multiply>\*)|" +
@"(?<Divide>\/)|" +
@"(?<Add>\+)|" +
@"(?<Sub>\-)|" +
@"(?<Unknown>[^\s]+)",
RegexOptions.Compiled
);
private ITwoWayEnumerator<Token> _tokenEnumerator;
private int _line = 1;
/// <summary>
/// Gets the current token.
/// </summary>
public Token CurrentToken => _tokenEnumerator.Current;
/// <summary>
/// Gets the current line number (starting from 1).
/// </summary>
public int CurrentLine => _line;
/// <summary>
/// Tokenizes the given source code.
/// </summary>
/// <param name="sourceCode">The source code for this tokenizer to run on.</param>
public void Tokenize(string sourceCode)
{
if (_tokenEnumerator != null)
throw new Exception("This code is already parsed!");
var tokens = TokenizeInternal(sourceCode);
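// Wrap the token stream in a two-way enumerator so that both NextToken and PreviousToken can walk over it.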
_tokenEnumerator = tokens.GetTwoWayEnumerator();
}
/// <summary>
/// Gets the next token.
/// </summary>
/// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
/// <param name="includeComments">When false, all comment (single line and multi-line) tokens will be ignored.</param>
/// <returns>The token. Check for the <see cref="TokenType.EndOfFile"/> token type to detect the end of the file.</returns>
public Token NextToken(bool includeWhitespaces = false, bool includeComments = false)
{
while (_tokenEnumerator.MoveNext())
{
var token = _tokenEnumerator.Current;
if (token == null)
continue;
_line += CountLines(token);
if (token.Type == TokenType.Newline)
{
if (includeWhitespaces)
return token;
continue;
}
if (!includeWhitespaces && token.Type == TokenType.Whitespace)
{
continue;
}
return token;
}
return new Token(TokenType.EndOfFile, string.Empty);
}
/// <summary>
/// Moves to the previous token.
/// </summary>
/// <param name="includeWhitespaces">If set to true, includes whitespace tokens.</param>
/// <param name="includeComments">If set to true, includes comment tokens.</param>
/// <returns>The token. Check for the <see cref="TokenType.EndOfFile"/> token type to detect the end of the file.</returns>
public Token PreviousToken(bool includeWhitespaces = false, bool includeComments = false)
{
while (_tokenEnumerator.MovePrevious())
{
var token = _tokenEnumerator.Current;
if (token == null)
continue;
_line -= CountLines(token);
if (token.Type == TokenType.Newline)
{
if (includeWhitespaces)
return token;
continue;
}
if (!includeWhitespaces && token.Type == TokenType.Whitespace)
{
continue;
}
return token;
}
return new Token(TokenType.EndOfFile, string.Empty);
}
/// <summary>
/// Expects any token of the given types. Throws when such a token is not found.
/// </summary>
/// <param name="tokenTypes">The allowed token types.</param>
/// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
/// <param name="includeComments">When false, all comment (single line and multi-line) tokens will be ignored.</param>
/// <returns>The found token.</returns>
public Token ExpectAnyTokens(TokenType[] tokenTypes, bool includeWhitespaces = false, bool includeComments = false)
{
var token = NextToken(includeWhitespaces, includeComments);
if (tokenTypes.Contains(token.Type))
return token;
throw new Exception($"Expected {string.Join(" or ", tokenTypes)}, but got {token} at line {_line}.");
}
/// <summary>
/// Expects tokens of the given types, in the given order. Throws when any of them is not found.
/// </summary>
/// <param name="tokenTypes">The expected token types.</param>
/// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
/// <param name="includeComments">When false, all comment (single line and multi-line) tokens will be ignored.</param>
public void ExpectAllTokens(TokenType[] tokenTypes, bool includeWhitespaces = false, bool includeComments = false)
{
foreach (var tokenType in tokenTypes)
{
var token = NextToken(includeWhitespaces, includeComments);
if (token.Type != tokenType)
throw new Exception($"Expected {tokenType}, but got {token} at line {_line}.");
}
}
/// <summary>
/// Expects a token of the given type. Throws when such a token is not found.
/// </summary>
/// <param name="tokenType">The only allowed token type.</param>
/// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
/// <param name="includeComments">When false, all comment (single line and multi-line) tokens will be ignored.</param>
/// <returns>The found token.</returns>
public Token ExpectToken(TokenType tokenType, bool includeWhitespaces = false, bool includeComments = false)
{
var token = NextToken(includeWhitespaces, includeComments);
if (token.Type == tokenType)
return token;
throw new Exception($"Expected {tokenType}, but got {token} at line {_line}.");
}
/// <summary>
/// Skips all tokens until the tokenizer steps onto a token of the given type (which is also consumed, so <see cref="NextToken"/> will return the token after it).
/// </summary>
/// <param name="tokenType">The expected token type.</param>
public void SkipUntil(TokenType tokenType)
{
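// Consume tokens (including whitespaces) until a token of the requested type is reached; that token is consumed as well.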
do
{
} while (NextToken(true).Type != tokenType);
}
/// <summary>
/// Skips all tokens until the tokenizer steps onto a token of the given type (which is also consumed, so <see cref="NextToken"/> will return the token after it).
/// </summary>
/// <param name="tokenType">The expected token type.</param>
/// <param name="context">The output contents of the skipped tokens.</param>
public void SkipUntil(TokenType tokenType, out string context)
{
context = string.Empty;
while (NextToken(true).Type != tokenType)
{
context += CurrentToken.Value;
}
}
/// <summary>
/// Skips all tokens until the tokenizer steps onto a token of the given type (which is also consumed, so <see cref="NextToken"/> will return the token after it).
/// </summary>
/// <param name="tokenType">The expected token type.</param>
/// <param name="context">The output contents of the skipped tokens.</param>
/// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
public void SkipUntil(TokenType tokenType, out string context, bool includeWhitespaces)
{
context = string.Empty;
while (NextToken(true).Type != tokenType)
{
var token = CurrentToken;
if (!includeWhitespaces && (token.Type == TokenType.Newline || token.Type == TokenType.Whitespace))
continue;
context += token.Value;
}
}
/// <summary>
/// Disposes the <see cref="Tokenizer"/>.
/// </summary>
public void Dispose()
{
_tokenEnumerator?.Dispose();
}
private IEnumerable<Token> TokenizeInternal(string input)
{
var matches = RegexTokenizer.Matches(input);
foreach (Match match in matches)
{
var i = 0;
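// Match.Groups[0] is the entire match and Groups[1] is the unnamed inner group of CommentSingleLine,
// so the named token groups start at index 2 and map directly onto the TokenType enum values.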
foreach (Group group in match.Groups)
{
var matchValue = group.Value;
if (group.Success && i > 1)
{
yield return new Token
{
Type = (TokenType)(i - 2),
Value = matchValue
};
}
i++;
}
}
}
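// Counts how many line breaks the given token spans so that the current line number stays in sync while moving forward or backward.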
private int CountLines(Token token)
{
int result = 0;
switch (token.Type)
{
case TokenType.Newline:
case TokenType.CommentSingleLine:
result = 1;
break;
case TokenType.CommentMultiLine:
for (int i = 0; i < token.Value.Length; i++)
{
if (token.Value[i] == '\n')
result++;
}
break;
}
return result;
}
}
}