// Copyright (c) Wojciech Figat. All rights reserved.

// Parsing HTML Tags in C# by Jonathan Wood (4 Jul 2012)
// https://www.codeproject.com/Articles/57176/Parsing-HTML-Tags-in-C

using System;
using System.Collections.Generic;

namespace FlaxEngine.Utilities
{
    /// <summary>
    /// The HTML tag description.
    /// </summary>
    public struct HtmlTag
    {
        /// <summary>
        /// Name of the tag.
        /// </summary>
        public string Name;

        /// <summary>
        /// Tag start position in the source text (character index of the opening '&lt;').
        /// </summary>
        public int StartPosition;

        /// <summary>
        /// Tag end position in the source text (character index just past the closing '&gt;'). Includes any tag attributes data.
        /// </summary>
        public int EndPosition;

        /// <summary>
        /// Collection of attributes for this tag (name + value pairs).
        /// </summary>
        /// <remarks>
        /// The dictionary instance is owned and reused by the parser that produced this tag: it is cleared when the next requested tag is parsed.
        /// Copy the contents if they have to outlive the next parsing call.
        /// </remarks>
        public Dictionary<string, string> Attributes;

        /// <summary>
        /// True if this tag contained a leading forward slash, placed right after the opening '&lt;' (eg. <c>&lt;/b&gt;</c>).
        /// </summary>
        public bool IsLeadingSlash;

        /// <summary>
        /// True if this tag contained a trailing forward slash, placed after the tag name (eg. <c>&lt;br/&gt;</c>).
        /// </summary>
        public bool IsEndingSlash;

        /// <summary>
        /// True if this tag contained a leading or trailing forward slash.
        /// </summary>
        public bool IsSlash => IsLeadingSlash || IsEndingSlash;

        /// <inheritdoc />
        public override string ToString()
        {
            return Name;
        }
    }

    /// <summary>
    /// Utility for HTML documents parsing into series of tags.
    /// </summary>
    public class HtmlParser
    {
        private string _html;
        private int _pos;
        private bool _scriptBegin;
        private readonly Dictionary<string, string> _attributes;
        private readonly char[] _newLineChars;

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlParser"/> class.
        /// </summary>
        public HtmlParser()
        : this(string.Empty)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlParser"/> class.
        /// </summary>
        /// <param name="html">Text to parse. Null is treated as an empty document.</param>
        public HtmlParser(string html)
        {
            // Every read goes through _html.Length, so never keep a null reference here
            _html = html ?? string.Empty;
            _pos = 0;
            _attributes = new Dictionary<string, string>();
            _newLineChars = new[] { '\0', '\r', '\n' }; // [0] is modified by ParseAttributeValue
        }

        /// <summary>
        /// Resets the reading position to the text beginning.
        /// </summary>
        /// <param name="html">Text to parse. Null if unchanged.</param>
        public void Reset(string html = null)
        {
            if (html != null)
                _html = html;
            _pos = 0;
        }

        /// <summary>
        /// Indicates if the current position is at the end of the current document.
        /// </summary>
        private bool EOF => _pos >= _html.Length;

        /// <summary>
        /// Parses the next tag that matches the specified tag name.
        /// </summary>
        /// <param name="tag">Returns information on the next occurrence of the specified tag. Only meaningful when the method returns true.</param>
        /// <param name="name">Name of the tags to parse (null to parse all tags).</param>
        /// <returns>True if a tag was parsed or false if the end of the document was reached.</returns>
        public bool ParseNext(out HtmlTag tag, string name = null)
        {
            tag = new HtmlTag();

            // Loop until match is found or there are no more tags
            while (MoveToNextTag())
            {
                // Skip opening '<'
                Move();

                // Examine first tag character
                char c = Peek();
                if (c == '!' && Peek(1) == '-' && Peek(2) == '-')
                {
                    // Skip over comments
                    const string endComment = "-->";
                    _pos = _html.IndexOf(endComment, _pos, StringComparison.Ordinal);
                    NormalizePosition();
                    Move(endComment.Length);
                }
                else
                {
                    // Skip leading slash
                    bool isLeadingSlash = c == '/';
                    if (isLeadingSlash)
                        Move();

                    // Dont process if wrong slash is used
                    if (c == '\\')
                        return false;

                    // Parse tag
                    bool result = ParseTag(ref tag, name);

                    // Because scripts may contain tag characters, we need special handling to skip over script contents
                    if (_scriptBegin)
                    {
                        const string endScript = "</script";
                        _pos = _html.IndexOf(endScript, _pos, StringComparison.OrdinalIgnoreCase);
                        NormalizePosition();
                        Move(endScript.Length);
                        SkipWhitespace();
                        if (Peek() == '>')
                            Move();
                    }

                    if (result)
                    {
                        if (isLeadingSlash)
                        {
                            // Tag starts with '/': ParseTag placed the start at the slash, move it back onto '<'
                            tag.StartPosition--;
                            tag.IsLeadingSlash = true;
                        }
                        return true;
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Parses the contents of an HTML tag. The current position should be at the first character following the tag's opening less-than character.
        /// </summary>
        /// <remarks>
        /// We parse to the end of the tag even if this tag was not requested by the caller. This ensures subsequent parsing takes place after this tag.
        /// </remarks>
        /// <param name="tag">Returns information on this tag if it's one the caller is requesting.</param>
        /// <param name="name">Name of the tags to parse (null to parse all tags).</param>
        /// <returns>True if data is being returned for a tag requested by the caller or false otherwise.</returns>
        private bool ParseTag(ref HtmlTag tag, string name = null)
        {
            // Get name of this tag
            int start = _pos;
            string s = ParseTagName();

            // Special handling
            bool doctype = string.Equals(s, "!DOCTYPE", StringComparison.OrdinalIgnoreCase);
            _scriptBegin = !doctype && string.Equals(s, "script", StringComparison.OrdinalIgnoreCase);

            // Is this a tag requested by caller?
            bool requested = false;
            if (name == null || string.Equals(s, name, StringComparison.OrdinalIgnoreCase))
            {
                // Setup new tag (the attributes dictionary is shared between all tags produced by this parser)
                _attributes.Clear();
                tag = new HtmlTag
                {
                    Name = s,
                    StartPosition = start - 1,
                    Attributes = _attributes,
                };
                requested = true;
            }

            // Parse attributes
            SkipWhitespace();
            while (Peek() != '>')
            {
                // Return false if start of new html tag is detected
                if (Peek() == '<')
                    return false;

                if (Peek() == '/')
                {
                    // Handle trailing forward slash
                    if (requested)
                        tag.IsEndingSlash = true;
                    Move();
                    SkipWhitespace();

                    // If this is a script tag, it was closed
                    _scriptBegin = false;
                }
                else
                {
                    int attributeStart = _pos;

                    // Parse attribute name (doctype entries are bare values, so they use the value parser)
                    s = !doctype ? ParseAttributeName() : ParseAttributeValue();
                    SkipWhitespace();

                    // Parse attribute value
                    var value = string.Empty;
                    if (Peek() == '=')
                    {
                        Move();
                        SkipWhitespace();
                        value = ParseAttributeValue();
                        SkipWhitespace();
                    }

                    if (_pos == attributeStart)
                    {
                        // Nothing was consumed (character is not valid in a name and is not '='): skip it,
                        // otherwise this loop would spin forever on the same character
                        Move();
                    }
                    else if (requested)
                    {
                        // Add attribute to collection if requested tag
                        tag.Attributes[s] = value;
                    }
                }

                if (EOF)
                    return false;
            }

            // Skip over closing '>'
            Move();
            tag.EndPosition = _pos;

            return requested;
        }

        /// <summary>
        /// Parses a tag name. The current position should be the first character of the name.
        /// </summary>
        /// <returns>Returns the parsed name string.</returns>
        private string ParseTagName()
        {
            int start = _pos;

            // Declarations such as <!DOCTYPE ...> start with an exclamation mark that is a part of the name
            if (Peek() == '!')
                Move();

            while (!EOF && char.IsLetterOrDigit(Peek()))
                Move();
            return _html.Substring(start, _pos - start);
        }

        /// <summary>
        /// Parses an attribute name. The current position should be the first character of the name.
        /// </summary>
        /// <returns>Returns the parsed name string (empty if the current character cannot be a part of the name).</returns>
        private string ParseAttributeName()
        {
            int start = _pos;
            while (!EOF && IsAttributeNameChar(Peek()))
                Move();
            return _html.Substring(start, _pos - start);
        }

        /// <summary>
        /// Checks if the given character is accepted as a part of an attribute name.
        /// </summary>
        /// <param name="c">The character to check.</param>
        /// <returns>True for letters, digits and the '-', '_', ':' and '.' characters (eg. <c>data-id</c>, <c>xml:lang</c>).</returns>
        private static bool IsAttributeNameChar(char c)
        {
            return char.IsLetterOrDigit(c) || c == '-' || c == '_' || c == ':' || c == '.';
        }

        /// <summary>
        /// Parses an attribute value. The current position should be the first non-whitespace character following the equal sign.
        /// </summary>
        /// <remarks>
        /// We terminate the name or value if we encounter a new line. This seems to be the best way of handling errors such as values missing closing quotes, etc.
        /// </remarks>
        /// <returns>Returns the parsed value string.</returns>
        private string ParseAttributeValue()
        {
            int start, end;
            char c = Peek();
            if (c == '"' || c == '\'')
            {
                // Move past opening quote
                Move();

                // Parse quoted value (ends on the matching quote or on a new line, whichever comes first)
                start = _pos;
                _newLineChars[0] = c;
                _pos = _html.IndexOfAny(_newLineChars, start);
                NormalizePosition();
                end = _pos;

                // Move past closing quote
                if (Peek() == c)
                    Move();
            }
            else
            {
                // Parse unquoted value
                start = _pos;
                while (!EOF && !char.IsWhiteSpace(c) && c != '>' && c != '/')
                {
                    Move();
                    c = Peek();
                }
                end = _pos;
            }
            return _html.Substring(start, end - start);
        }

        /// <summary>
        /// Moves to the start of the next tag.
        /// </summary>
        /// <returns>True if another tag was found, false otherwise.</returns>
        private bool MoveToNextTag()
        {
            _pos = _html.IndexOf('<', _pos);
            NormalizePosition();
            return !EOF;
        }

        /// <summary>
        /// Returns the character at the specified number of characters beyond the current position, or a null character if the specified position is at the end of the document.
        /// </summary>
        /// <param name="ahead">The number of characters beyond the current position.</param>
        /// <returns>The character at the specified position.</returns>
        private char Peek(int ahead = 0)
        {
            int pos = _pos + ahead;
            if (pos < _html.Length)
                return _html[pos];
            return (char)0;
        }

        /// <summary>
        /// Moves the current position ahead the specified number of characters (clamped to the end of the document).
        /// </summary>
        /// <param name="ahead">The number of characters to move ahead.</param>
        private void Move(int ahead = 1)
        {
            _pos = Math.Min(_pos + ahead, _html.Length);
        }

        /// <summary>
        /// Moves the current position to the next character that is not whitespace.
        /// </summary>
        private void SkipWhitespace()
        {
            while (!EOF && char.IsWhiteSpace(Peek()))
                Move();
        }

        /// <summary>
        /// Normalizes the current position. This is primarily for handling conditions where IndexOf(), etc. return negative values when the item being sought was not found.
        /// </summary>
        private void NormalizePosition()
        {
            if (_pos < 0)
                _pos = _html.Length;
        }
    }
}