// Copyright (c) 2012-2023 Wojciech Figat. All rights reserved.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace Flax.Build
{
    /// <summary>
    /// Types of the tokens supported by the <see cref="Tokenizer"/>.
    /// </summary>
    public enum TokenType
    {
        /// <summary>
        /// A whitespace.
        /// </summary>
        Whitespace,

        /// <summary>
        /// A newline.
        /// </summary>
        Newline,

        /// <summary>
        /// A multi-line comment.
        /// </summary>
        CommentMultiLine,

        /// <summary>
        /// A single-line comment.
        /// </summary>
        CommentSingleLine,

        /// <summary>
        /// An identifier.
        /// </summary>
        Identifier,

        /// <summary>
        /// A number in hexadecimal form.
        /// </summary>
        Hex,

        /// <summary>
        /// A number.
        /// </summary>
        Number,

        /// <summary>
        /// The symbol '='.
        /// </summary>
        Equal,

        /// <summary>
        /// A comma ','.
        /// </summary>
        Comma,

        /// <summary>
        /// A semicolon ';'.
        /// </summary>
        SemiColon,

        /// <summary>
        /// A left curly brace '{'.
        /// </summary>
        LeftCurlyBrace,

        /// <summary>
        /// A right curly brace '}'.
        /// </summary>
        RightCurlyBrace,

        /// <summary>
        /// A left parenthesis '('.
        /// </summary>
        LeftParent,

        /// <summary>
        /// A right parenthesis ')'.
        /// </summary>
        RightParent,

        /// <summary>
        /// A left bracket '['.
        /// </summary>
        LeftBracket,

        /// <summary>
        /// A right bracket ']'.
        /// </summary>
        RightBracket,

        /// <summary>
        /// A string literal.
        /// </summary>
        String,

        /// <summary>
        /// A character literal.
        /// </summary>
        Character,

        /// <summary>
        /// A preprocessor token '#'.
        /// </summary>
        Preprocessor,

        /// <summary>
        /// A colon ':'.
        /// </summary>
        Colon,

        /// <summary>
        /// A double colon '::'.
        /// </summary>
        DoubleColon,

        /// <summary>
        /// A dot '.'.
        /// </summary>
        Dot,

        /// <summary>
        /// A '&lt;'.
        /// </summary>
        LessThan,

        /// <summary>
        /// A '&gt;'.
        /// </summary>
        GreaterThan,

        /// <summary>
        /// A '&amp;'.
        /// </summary>
        And,

        /// <summary>
        /// A '*'.
        /// </summary>
        Multiply,

        /// <summary>
        /// A '/'.
        /// </summary>
        Divide,

        /// <summary>
        /// A '+'.
        /// </summary>
        Add,

        /// <summary>
        /// A '-'.
        /// </summary>
        Sub,

        /// <summary>
        /// An unknown symbol.
        /// </summary>
        Unknown,

        /// <summary>
        /// An end of file token.
        /// </summary>
        EndOfFile,

        /// <summary>
        /// A '&lt;'.
        /// </summary>
        LeftAngleBracket = LessThan,

        /// <summary>
        /// A '&gt;'.
        /// </summary>
        RightAngleBracket = GreaterThan,
    }

    /// <summary>
    /// Contains information about a single token of the language.
    /// </summary>
    public class Token : IEquatable<Token>
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="Token"/> class.
        /// </summary>
        public Token()
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Token"/> class.
        /// </summary>
        /// <param name="type">The type.</param>
        /// <param name="value">The value.</param>
        public Token(TokenType type, string value)
        {
            Type = type;
            Value = value;
        }

        /// <summary>
        /// The type of the token.
        /// </summary>
        public TokenType Type;

        /// <summary>
        /// Value of the token.
        /// </summary>
        public string Value;

        /// <inheritdoc />
        public override string ToString()
        {
            return string.Format("{{{0}:{1}}}", Type, Value);
        }

        /// <inheritdoc />
        public bool Equals(Token other)
        {
            if (ReferenceEquals(null, other))
                return false;
            if (ReferenceEquals(this, other))
                return true;
            return Type == other.Type && Value == other.Value;
        }

        /// <inheritdoc />
        public override bool Equals(object obj)
        {
            if (ReferenceEquals(null, obj))
                return false;
            if (ReferenceEquals(this, obj))
                return true;
            if (obj.GetType() != this.GetType())
                return false;
            return Equals((Token)obj);
        }

        /// <inheritdoc />
        public override int GetHashCode()
        {
            unchecked
            {
                return ((int)Type * 397) ^ (Value != null ? Value.GetHashCode() : 0);
            }
        }
    }

    /// <summary>
    /// The tokens parsing utility that implements the basic logic for parsing source code with a generic C-like syntax.
    /// </summary>
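    /// <example>
    /// A minimal usage sketch (the input snippet below is illustrative only):
    /// <code>
    /// var tokenizer = new Tokenizer();
    /// tokenizer.Tokenize("int x = 10;");
    /// Token token;
    /// while ((token = tokenizer.NextToken()).Type != TokenType.EndOfFile)
    /// {
    ///     Console.WriteLine(token); // e.g. {Identifier:int}, {Identifier:x}, {Equal:=}, {Number:10}, {SemiColon:;}
    /// }
    /// tokenizer.Dispose();
    /// </code>
    /// </example>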
    public class Tokenizer : IDisposable
    {
        // Named groups are declared in the same order as the TokenType values; TokenizeInternal relies on this ordering.
        private static readonly Regex RegexTokenizer = new Regex
        (
            @"(?<Whitespace>[ \t]+)|" +
            @"(?<Newline>(?:\r\n|\n))|" +
            @"(?<CommentMultiLine>/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)|" +
            @"(?<CommentSingleLine>//(.*?)\r?\n)|" +
            @"(?<Identifier>[a-zA-Z_][a-zA-Z0-9_]*)|" +
            @"(?<Hex>0x[0-9a-fA-F]+)|" +
            @"(?<Number>[\-\+]?\s*[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?f?)|" +
            @"(?<Equal>=)|" +
            @"(?<Comma>,)|" +
            @"(?<SemiColon>;)|" +
            @"(?<LeftCurlyBrace>\{)|" +
            @"(?<RightCurlyBrace>\})|" +
            @"(?<LeftParent>\()|" +
            @"(?<RightParent>\))|" +
            @"(?<LeftBracket>\[)|" +
            @"(?<RightBracket>\])|" +
            @"(?<String>""[^""\\]*(?:\\.[^""\\]*)*"")|" +
            @"(?<Character>'[^'\\]*(?:\\.[^'\\]*)*')|" +
            @"(?<Preprocessor>#)|" +
            @"(?<Colon>:)|" +
            @"(?<DoubleColon>::)|" +
            @"(?<Dot>\.)|" +
            @"(?<LessThan>\<)|" +
            @"(?<GreaterThan>\>)|" +
            @"(?<And>\&)|" +
            @"(?<Multiply>\*)|" +
            @"(?<Divide>\/)|" +
            @"(?<Add>\+)|" +
            @"(?<Sub>\-)|" +
            @"(?<Unknown>[^\s]+)",
            RegexOptions.Compiled
        );

        private ITwoWayEnumerator<Token> _tokenEnumerator;
        private int _line = 1;

        /// <summary>
        /// Gets the current token.
        /// </summary>
        public Token CurrentToken => _tokenEnumerator.Current;

        /// <summary>
        /// Gets the current line number (starting from 1).
        /// </summary>
        public int CurrentLine => _line;

        /// <summary>
        /// Tokenizes the given source code.
        /// </summary>
        /// <param name="sourceCode">The source code for this tokenizer to run on.</param>
        public void Tokenize(string sourceCode)
        {
            if (_tokenEnumerator != null)
                throw new Exception("This code is already parsed!");
            var tokens = TokenizeInternal(sourceCode);
            _tokenEnumerator = tokens.GetTwoWayEnumerator();
        }

        /// <summary>
        /// Gets the next token.
        /// </summary>
        /// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
        /// <param name="includeComments">When false, all comment (single-line and multi-line) tokens will be ignored.</param>
        /// <returns>The token. Check for EndOfFile token-type to detect end-of-file.</returns>
        public Token NextToken(bool includeWhitespaces = false, bool includeComments = false)
        {
            while (_tokenEnumerator.MoveNext())
            {
                var token = _tokenEnumerator.Current;
                if (token == null)
                    continue;
                _line += CountLines(token);
                if (token.Type == TokenType.Newline)
                {
                    // Newlines count as whitespace when filtering
                    if (includeWhitespaces)
                        return token;
                    continue;
                }
                if (!includeWhitespaces && token.Type == TokenType.Whitespace)
                {
                    continue;
                }
                return token;
            }
            return new Token(TokenType.EndOfFile, string.Empty);
        }

        /// <summary>
        /// Moves to the previous token.
        /// </summary>
        /// <param name="includeWhitespaces">If set to true, includes whitespaces.</param>
        /// <param name="includeComments">If set to true, includes comments.</param>
        /// <returns>The token. Check for EndOfFile token-type to detect end-of-file.</returns>
        public Token PreviousToken(bool includeWhitespaces = false, bool includeComments = false)
        {
            while (_tokenEnumerator.MovePrevious())
            {
                var token = _tokenEnumerator.Current;
                if (token == null)
                    continue;
                _line -= CountLines(token);
                if (token.Type == TokenType.Newline)
                {
                    if (includeWhitespaces)
                        return token;
                    continue;
                }
                if (!includeWhitespaces && token.Type == TokenType.Whitespace)
                {
                    continue;
                }
                return token;
            }
            return new Token(TokenType.EndOfFile, string.Empty);
        }

        /// <summary>
        /// Expects any token of the given types. Throws when such a token is not found.
        /// </summary>
        /// <param name="tokenTypes">The allowed token types.</param>
        /// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
        /// <param name="includeComments">When false, all comment (single-line and multi-line) tokens will be ignored.</param>
        /// <returns>The found token.</returns>
        public Token ExpectAnyTokens(TokenType[] tokenTypes, bool includeWhitespaces = false, bool includeComments = false)
        {
            var token = NextToken(includeWhitespaces, includeComments);
            if (tokenTypes.Contains(token.Type))
                return token;
            throw new Exception($"Expected {string.Join(" or ", tokenTypes)}, but got {token} at line {_line}.");
        }

        /// <summary>
        /// Expects tokens of the given types in the same order. Throws when a token is not found.
        /// </summary>
        /// <param name="tokenTypes">The allowed token types.</param>
        /// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
        /// <param name="includeComments">When false, all comment (single-line and multi-line) tokens will be ignored.</param>
        public void ExpectAllTokens(TokenType[] tokenTypes, bool includeWhitespaces = false, bool includeComments = false)
        {
            foreach (var tokenType in tokenTypes)
            {
                var token = NextToken(includeWhitespaces, includeComments);
                if (token.Type != tokenType)
                    throw new Exception($"Expected {tokenType}, but got {token} at line {_line}.");
            }
        }

        /// <summary>
        /// Expects a token of the given type. Throws when such a token is not found.
        /// </summary>
        /// <param name="tokenType">The only allowed token type.</param>
        /// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
        /// <param name="includeComments">When false, all comment (single-line and multi-line) tokens will be ignored.</param>
        /// <returns>The found token.</returns>
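        /// <example>
        /// A minimal sketch, assuming the tokenizer was fed an assignment such as "x = 10;":
        /// <code>
        /// var name = tokenizer.ExpectToken(TokenType.Identifier); // {Identifier:x}
        /// tokenizer.ExpectToken(TokenType.Equal);
        /// var value = tokenizer.ExpectToken(TokenType.Number);    // {Number:10}
        /// tokenizer.ExpectToken(TokenType.SemiColon);
        /// </code>
        /// </example>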
        public Token ExpectToken(TokenType tokenType, bool includeWhitespaces = false, bool includeComments = false)
        {
            var token = NextToken(includeWhitespaces, includeComments);
            if (token.Type == tokenType)
                return token;
            throw new Exception($"Expected {tokenType}, but got {token} at line {_line}.");
        }

        /// <summary>
        /// Skips all tokens until the tokenizer steps into a token of the given type (that token is also skipped, so NextToken will return the one after it).
        /// </summary>
        /// <param name="tokenType">The expected token type.</param>
        public void SkipUntil(TokenType tokenType)
        {
            do
            {
            } while (NextToken(true).Type != tokenType);
        }

        /// <summary>
        /// Skips all tokens until the tokenizer steps into a token of the given type (that token is also skipped, so NextToken will return the one after it).
        /// </summary>
        /// <param name="tokenType">The expected token type.</param>
        /// <param name="context">The output contents of the skipped tokens.</param>
        public void SkipUntil(TokenType tokenType, out string context)
        {
            context = string.Empty;
            while (NextToken(true).Type != tokenType)
            {
                context += CurrentToken.Value;
            }
        }

        /// <summary>
        /// Skips all tokens until the tokenizer steps into a token of the given type (that token is also skipped, so NextToken will return the one after it).
        /// </summary>
        /// <param name="tokenType">The expected token type.</param>
        /// <param name="context">The output contents of the skipped tokens.</param>
        /// <param name="includeWhitespaces">When false, all white-space tokens will be ignored.</param>
        public void SkipUntil(TokenType tokenType, out string context, bool includeWhitespaces)
        {
            context = string.Empty;
            while (NextToken(true).Type != tokenType)
            {
                var token = CurrentToken;
                if (!includeWhitespaces && (token.Type == TokenType.Newline || token.Type == TokenType.Whitespace))
                    continue;
                context += token.Value;
            }
        }

        /// <summary>
        /// Disposes the <see cref="Tokenizer"/>.
        /// </summary>
        public void Dispose()
        {
            _tokenEnumerator?.Dispose();
        }

        private IEnumerable<Token> TokenizeInternal(string input)
        {
            var matches = RegexTokenizer.Matches(input);
            foreach (Match match in matches)
            {
                var i = 0;
                foreach (Group group in match.Groups)
                {
                    var matchValue = group.Value;

                    // Index 0 is the whole match and index 1 is the unnamed capture inside the single-line comment pattern;
                    // the named groups start at index 2 and are declared in the same order as the TokenType values.
                    if (group.Success && i > 1)
                    {
                        yield return new Token
                        {
                            Type = (TokenType)(i - 2),
                            Value = matchValue
                        };
                    }
                    i++;
                }
            }
        }

        private int CountLines(Token token)
        {
            int result = 0;
            switch (token.Type)
            {
            case TokenType.Newline:
            case TokenType.CommentSingleLine:
                result = 1;
                break;
            case TokenType.CommentMultiLine:
                // Multi-line comments can span several lines - count the embedded newlines
                for (int i = 0; i < token.Value.Length; i++)
                {
                    if (token.Value[i] == '\n')
                        result++;
                }
                break;
            }
            return result;
        }
    }
}
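// A minimal sketch of SkipUntil with captured context, assuming the tokenizer was fed "return a + b;":
//
//     var tokenizer = new Tokenizer();
//     tokenizer.Tokenize("return a + b;");
//     tokenizer.ExpectToken(TokenType.Identifier); // consumes 'return'
//     tokenizer.SkipUntil(TokenType.SemiColon, out var expression);
//     // expression == " a + b" (token values, including whitespace, concatenated up to the semicolon)
//     tokenizer.Dispose();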