// diff --git a/Source/Engine/Utilities/HtmlParser.cs b/Source/Engine/Utilities/HtmlParser.cs
// new file mode 100644
// index 000000000..07296602d
// --- /dev/null
// +++ b/Source/Engine/Utilities/HtmlParser.cs
// @@ -0,0 +1,348 @@
// Copyright (c) 2012-2022 Wojciech Figat. All rights reserved.

// Parsing HTML Tags in C# by Jonathan Wood (4 Jul 2012)
// https://www.codeproject.com/Articles/57176/Parsing-HTML-Tags-in-C

using System;
using System.Collections.Generic;

namespace FlaxEngine.Utilities
{
    /// <summary>
    /// The HTML tag description.
    /// </summary>
    public struct HtmlTag
    {
        /// <summary>
        /// Name of the tag.
        /// </summary>
        public string Name;

        /// <summary>
        /// Tag start position in the source text (character index of the opening less-than character).
        /// </summary>
        public int StartPosition;

        /// <summary>
        /// Tag end position in the source text (character index). Includes any tag attributes data.
        /// </summary>
        public int EndPosition;

        /// <summary>
        /// Collection of attributes for this tag (name + value pairs).
        /// </summary>
        /// <remarks>
        /// The parser hands out the same dictionary instance for every tag and clears it when the next requested tag is parsed. Copy the entries if they have to outlive that call.
        /// </remarks>
        public Dictionary<string, string> Attributes;

        /// <summary>
        /// True if this tag contained a trailing forward slash (end of the tag).
        /// </summary>
        public bool IsTrailingSlash;
    };

    /// <summary>
    /// Utility for HTML documents parsing into series of tags.
    /// </summary>
    public class HtmlParser
    {
        private string _html;
        private int _pos;
        private bool _scriptBegin;
        private readonly Dictionary<string, string> _attributes;
        private readonly char[] _newLineChars;

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlParser"/> class.
        /// </summary>
        public HtmlParser()
        : this(string.Empty)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlParser"/> class.
        /// </summary>
        /// <param name="html">Text to parse. Null is treated as an empty document.</param>
        public HtmlParser(string html)
        {
            // Every helper dereferences _html, so never store null
            _html = html ?? string.Empty;
            _pos = 0;
            _attributes = new Dictionary<string, string>();
            _newLineChars = new[] { '\0', '\r', '\n' }; // [0] is modified by ParseAttributeValue
        }

        /// <summary>
        /// Resets the reading position to the text beginning.
        /// </summary>
        /// <param name="html">Text to parse. Null if unchanged.</param>
        public void Reset(string html = null)
        {
            if (html != null)
                _html = html;
            _pos = 0;
        }

        /// <summary>
        /// Indicates if the current position is at the end of the current document.
        /// </summary>
        private bool EOF => _pos >= _html.Length;

        /// <summary>
        /// Parses the next tag that matches the specified tag name.
        /// </summary>
        /// <param name="tag">Returns information on the next occurrence of the specified tag, or a default value if none found.</param>
        /// <param name="name">Name of the tags to parse (null to parse all tags).</param>
        /// <returns>True if a tag was parsed or false if the end of the document was reached.</returns>
        public bool ParseNext(out HtmlTag tag, string name = null)
        {
            tag = new HtmlTag();

            // Loop until match is found or there are no more tags
            while (MoveToNextTag())
            {
                // Skip opening '<'
                Move();

                // Examine first tag character
                char c = Peek();
                if (c == '!' && Peek(1) == '-' && Peek(2) == '-')
                {
                    // Skip over comments (an unterminated comment consumes the rest of the document)
                    const string endComment = "-->";
                    _pos = _html.IndexOf(endComment, _pos, StringComparison.Ordinal);
                    NormalizePosition();
                    Move(endComment.Length);
                }
                else if (c == '/')
                {
                    // Skip over closing tags
                    _pos = _html.IndexOf('>', _pos);
                    NormalizePosition();
                    Move();
                }
                else
                {
                    // Parse tag
                    bool result = ParseTag(ref tag, name);

                    // Because scripts may contain tag characters, we need special handling to skip over script contents
                    if (_scriptBegin)
                    {
                        const string endScript = "</script";
                        _pos = _html.IndexOf(endScript, _pos, StringComparison.OrdinalIgnoreCase);
                        NormalizePosition();
                        Move(endScript.Length);
                        SkipWhitespace();
                        if (Peek() == '>')
                            Move();
                    }

                    // Return true if requested tag was found
                    if (result)
                        return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Parses the contents of an HTML tag. The current position should be at the first character following the tag's opening less-than character.
        /// </summary>
        /// <remarks>
        /// We parse to the end of the tag even if this tag was not requested by the caller. This ensures subsequent parsing takes place after this tag.
        /// </remarks>
        /// <param name="tag">Returns information on this tag if it's one the caller is requesting.</param>
        /// <param name="name">Name of the tags to parse (null to parse all tags).</param>
        /// <returns>True if data is being returned for a tag requested by the caller or false otherwise.</returns>
        private bool ParseTag(ref HtmlTag tag, string name = null)
        {
            // Get name of this tag
            int start = _pos;
            string s = ParseTagName();

            // Special handling
            bool doctype = _scriptBegin = false;
            if (string.Compare(s, "!DOCTYPE", StringComparison.OrdinalIgnoreCase) == 0)
                doctype = true;
            else if (string.Compare(s, "script", StringComparison.OrdinalIgnoreCase) == 0)
                _scriptBegin = true;

            // Is this a tag requested by caller?
            bool requested = false;
            if (name == null || string.Compare(s, name, StringComparison.OrdinalIgnoreCase) == 0)
            {
                // Setup new tag (the attribute dictionary is shared and reused between tags)
                _attributes.Clear();
                tag = new HtmlTag
                {
                    Name = s,
                    StartPosition = start - 1,
                    Attributes = _attributes,
                };
                requested = true;
            }

            // Parse attributes. Stop at the end of the document as well: Peek() yields '\0' there and the attribute parsers make no progress, so an unterminated tag would otherwise loop forever
            SkipWhitespace();
            while (!EOF && Peek() != '>')
            {
                if (Peek() == '/')
                {
                    // Handle trailing forward slash
                    if (requested)
                        tag.IsTrailingSlash = true;
                    Move();
                    SkipWhitespace();

                    // If this is a script tag, it was closed
                    _scriptBegin = false;
                }
                else
                {
                    // Parse attribute name (DOCTYPE entries are bare values that may be quoted)
                    s = !doctype ? ParseAttributeName() : ParseAttributeValue();
                    SkipWhitespace();

                    // Parse attribute value
                    var value = string.Empty;
                    if (Peek() == '=')
                    {
                        Move();
                        SkipWhitespace();
                        value = ParseAttributeValue();
                        SkipWhitespace();
                    }

                    // Add attribute to collection if requested tag
                    if (requested)
                    {
                        tag.Attributes[s] = value;
                    }
                }
            }

            // Skip over closing '>' (no-op at the end of the document)
            Move();

            tag.EndPosition = _pos;
            return requested;
        }

        /// <summary>
        /// Parses a tag name. The current position should be the first character of the name.
        /// </summary>
        /// <returns>Returns the parsed name string.</returns>
        private string ParseTagName()
        {
            int start = _pos;

            // Declarations such as <!DOCTYPE ...> start with '!', which is neither a letter nor a digit; accept it as the first character only so that ParseTag can recognize the "!DOCTYPE" name
            if (Peek() == '!')
                Move();

            while (!EOF && char.IsLetterOrDigit(Peek()))
                Move();
            return _html.Substring(start, _pos - start);
        }

        /// <summary>
        /// Parses an attribute name. The current position should be the first character of the name.
        /// </summary>
        /// <returns>Returns the parsed name string.</returns>
        private string ParseAttributeName()
        {
            int start = _pos;
            while (!EOF && !char.IsWhiteSpace(Peek()) && Peek() != '>' && Peek() != '=')
                Move();
            return _html.Substring(start, _pos - start);
        }

        /// <summary>
        /// Parses an attribute value. The current position should be the first non-whitespace character following the equal sign.
        /// </summary>
        /// <remarks>
        /// We terminate the name or value if we encounter a new line. This seems to be the best way of handling errors such as values missing closing quotes, etc.
        /// </remarks>
        /// <returns>Returns the parsed value string.</returns>
        private string ParseAttributeValue()
        {
            int start, end;
            char c = Peek();
            if (c == '"' || c == '\'')
            {
                // Move past opening quote
                Move();

                // Parse quoted value (ends at the matching quote or at a line break)
                start = _pos;
                _newLineChars[0] = c;
                _pos = _html.IndexOfAny(_newLineChars, start);
                NormalizePosition();
                end = _pos;

                // Move past closing quote
                if (Peek() == c)
                    Move();
            }
            else
            {
                // Parse unquoted value
                start = _pos;
                while (!EOF && !char.IsWhiteSpace(c) && c != '>')
                {
                    Move();
                    c = Peek();
                }
                end = _pos;
            }
            return _html.Substring(start, end - start);
        }

        /// <summary>
        /// Moves to the start of the next tag.
        /// </summary>
        /// <returns>True if another tag was found, false otherwise.</returns>
        private bool MoveToNextTag()
        {
            _pos = _html.IndexOf('<', _pos);
            NormalizePosition();
            return !EOF;
        }

        /// <summary>
        /// Returns the character at the specified number of characters beyond the current position, or a null character if the specified position is at the end of the document.
        /// </summary>
        /// <param name="ahead">The number of characters beyond the current position.</param>
        /// <returns>The character at the specified position.</returns>
        private char Peek(int ahead = 0)
        {
            int pos = _pos + ahead;
            if (pos < _html.Length)
                return _html[pos];
            return (char)0;
        }

        /// <summary>
        /// Moves the current position ahead the specified number of characters (clamped to the end of the document).
        /// </summary>
        /// <param name="ahead">The number of characters to move ahead.</param>
        private void Move(int ahead = 1)
        {
            _pos = Math.Min(_pos + ahead, _html.Length);
        }

        /// <summary>
        /// Moves the current position to the next character that is not whitespace.
        /// </summary>
        private void SkipWhitespace()
        {
            while (!EOF && char.IsWhiteSpace(Peek()))
                Move();
        }

        /// <summary>
        /// Normalizes the current position. This is primarily for handling conditions where IndexOf(), etc. return negative values when the item being sought was not found.
        /// </summary>
        private void NormalizePosition()
        {
            if (_pos < 0)
                _pos = _html.Length;
        }
    }
}

// diff --git a/Source/Tools/FlaxEngine.Tests/FlaxEngine.Tests.csproj b/Source/Tools/FlaxEngine.Tests/FlaxEngine.Tests.csproj
// index 8bd2c1ed3..7d0fb9d1d 100644
// --- a/Source/Tools/FlaxEngine.Tests/FlaxEngine.Tests.csproj
// +++ b/Source/Tools/FlaxEngine.Tests/FlaxEngine.Tests.csproj
// @@ -49,6 +49,7 @@
// diff --git a/Source/Tools/FlaxEngine.Tests/TestHtmlParser.cs b/Source/Tools/FlaxEngine.Tests/TestHtmlParser.cs
// new file mode 100644
// index 000000000..e48c7e0a6
// --- /dev/null
// +++ b/Source/Tools/FlaxEngine.Tests/TestHtmlParser.cs
// @@ -0,0 +1,99 @@
// Copyright (c) 2012-2022 Wojciech Figat. All rights reserved.
namespace FlaxEngine.Tests
{
    // Scoped to this namespace so the directives remain valid when other declarations precede it in the same file
    using System.Collections.Generic;
    using FlaxEngine.Utilities;
    using NUnit.Framework;

    /// <summary>
    /// Unit tests for <see cref="HtmlParser"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the markup inside the test inputs was reconstructed so that every case produces its listed expected result; confirm the exact inputs against the upstream test file.
    /// </remarks>
    [TestFixture]
    public class TestHtmlParser
    {
        /// <summary>
        /// Parses documents with and without tags and checks that every reported tag carries a name and an attribute collection.
        /// </summary>
        /// <param name="html">Document text to parse.</param>
        [TestCase("")]
        [TestCase("a")]
        [TestCase("a\na")]
        [TestCase("a\ra")]
        [TestCase("<a>")]
        [TestCase("<a/>")]
        [TestCase("<a></a>")]
        [TestCase("<a>b</a>")]
        [TestCase("<!-- a -->")]
        public void TestValid(string html)
        {
            var parser = new HtmlParser(html);
            while (parser.ParseNext(out var tag))
            {
                Assert.IsNotNull(tag.Name);
                Assert.IsNotNull(tag.Attributes);
            }
        }

        /// <summary>
        /// Collects the names of all opening tags in the document (closing tags are skipped by the parser).
        /// </summary>
        /// <param name="html">Document text to parse.</param>
        /// <returns>Tag names in document order.</returns>
        [TestCase("b<a>b", ExpectedResult = new[] { "a" })]
        [TestCase("b<a/>b", ExpectedResult = new[] { "a" })]
        [TestCase("b<a></a>b", ExpectedResult = new[] { "a" })]
        [TestCase("<a><a>b", ExpectedResult = new[] { "a", "a" })]
        [TestCase("<a>b<a>", ExpectedResult = new[] { "a", "a" })]
        [TestCase("<a/>b<a/>", ExpectedResult = new[] { "a", "a" })]
        public string[] TestTags(string html)
        {
            var tags = new List<string>();
            var parser = new HtmlParser(html);
            while (parser.ParseNext(out var tag))
            {
                tags.Add(tag.Name);
            }
            return tags.ToArray();
        }

        /// <summary>
        /// Flattens the attributes of all tags into a comma-separated list of name=value pairs.
        /// </summary>
        /// <param name="html">Document text to parse.</param>
        /// <returns>Attributes formatted as name=value and joined with commas.</returns>
        [TestCase("b<a size=50>b", ExpectedResult = "size=50")]
        [TestCase("b<a size=50 len=60>b", ExpectedResult = "size=50,len=60")]
        [TestCase("b<a size=\"50\">b", ExpectedResult = "size=50")]
        [TestCase("b<a size=\"5 0\">b", ExpectedResult = "size=5 0")]
        [TestCase("b<a size=50%>b", ExpectedResult = "size=50%")]
        [TestCase("b<a size=#FF>b", ExpectedResult = "size=#FF")]
        [TestCase("b<a=value>b", ExpectedResult = "=value")]
        public string TestAttributes(string html)
        {
            var pairs = new List<string>();
            var parser = new HtmlParser(html);
            while (parser.ParseNext(out var tag))
            {
                // Read the attributes before the next ParseNext call: the parser reuses the dictionary
                foreach (var e in tag.Attributes)
                    pairs.Add(e.Key + "=" + e.Value);
            }
            return string.Join(",", pairs);
        }

        /// <summary>
        /// Reports the start position of the first tag in the document.
        /// </summary>
        /// <param name="html">Document text to parse.</param>
        /// <returns>Character index of the first tag's opening less-than character, or -1 if there is no tag.</returns>
        [TestCase("b<a>", ExpectedResult = 1)]
        [TestCase("sd b <a>", ExpectedResult = 5)]
        public int TestStartPosition(string html)
        {
            var parser = new HtmlParser(html);
            return parser.ParseNext(out var tag) ? tag.StartPosition : -1;
        }

        /// <summary>
        /// Reports the end position of the first tag in the document.
        /// </summary>
        /// <param name="html">Document text to parse.</param>
        /// <returns>Character index just past the first tag's closing greater-than character, or -1 if there is no tag.</returns>
        [TestCase("b<a>", ExpectedResult = 4)]
        [TestCase("b<a/>", ExpectedResult = 5)]
        [TestCase("b<a />", ExpectedResult = 6)]
        [TestCase("b<a  />", ExpectedResult = 7)]
        [TestCase("b<a size=50>", ExpectedResult = 12)]
        [TestCase("b<a size=\"50\">", ExpectedResult = 14)]
        public int TestEndPosition(string html)
        {
            var parser = new HtmlParser(html);
            return parser.ParseNext(out var tag) ? tag.EndPosition : -1;
        }
    }
}