// basisu_transcoder_internal.h - Universal texture format transcoder library.
// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved.
//
// Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#ifdef _MSC_VER
#pragma warning (disable: 4127) //  conditional expression is constant
#endif

// v1.50: Added UASTC HDR 4x4 support
// v1.60: Added RDO ASTC HDR 6x6 and intermediate support
// v1.65: Added ASTC LDR 4x4-12x12 and XUASTC LDR 4x4-12x12
// v2.00: Added unified effort/quality options across all formats, fast direct transcoding of XUASTC 4x4/6x6/8x6 to BC7, adaptive deblocking, ZStd or arithmetic profiles, weight grid DCT
#define BASISD_LIB_VERSION 200
#define BASISD_VERSION_STRING "02.00"

#ifdef _DEBUG
#define BASISD_BUILD_DEBUG
#else
#define BASISD_BUILD_RELEASE
#endif

#include "basisu.h"
#include "basisu_astc_helpers.h"

#define BASISD_znew (z = 36969 * (z & 65535) + (z >> 16))

namespace basisu
{
	extern bool g_debug_printf;
}

namespace basist
{
	// Low-level formats directly supported by the transcoder (other supported texture formats are combinations of these low-level block formats).
	// You probably don't care about these enum's unless you are going pretty low-level and calling the transcoder to decode individual slices.
	enum class block_format
	{
		cETC1,								// ETC1S RGB 
		cETC2_RGBA,							// full ETC2 EAC RGBA8 block
		cBC1,								// DXT1 RGB 
		cBC3,								// BC4 block followed by a four color BC1 block
		cBC4,								// DXT5A (alpha block only)
		cBC5,								// two BC4 blocks
		cPVRTC1_4_RGB,						// opaque-only PVRTC1 4bpp
		cPVRTC1_4_RGBA,						// PVRTC1 4bpp RGBA
		cBC7,								// Full BC7 block, any mode
		cBC7_M5_COLOR,						// RGB BC7 mode 5 color (writes an opaque mode 5 block)
		cBC7_M5_ALPHA,						// alpha portion of BC7 mode 5 (cBC7_M5_COLOR output data must have been written to the output buffer first to set the mode/rot fields etc.)
		cETC2_EAC_A8,						// alpha block of ETC2 EAC (first 8 bytes of the 16-bit ETC2 EAC RGBA format)
		cASTC_LDR_4x4,						// ASTC LDR 4x4 (either color-only or color+alpha). Note that the transcoder always currently assumes sRGB decode mode is not enabled when outputting ASTC LDR for ETC1S/UASTC LDR 4x4.
											// data. If you use a sRGB ASTC format you'll get ~1 LSB of additional error, because of the different way ASTC decoders scale 8-bit endpoints to 16-bits during unpacking.
		
		cATC_RGB,
		cATC_RGBA_INTERPOLATED_ALPHA,
		cFXT1_RGB,							// Opaque-only, has oddball 8x4 pixel block size

		cPVRTC2_4_RGB,
		cPVRTC2_4_RGBA,

		cETC2_EAC_R11,
		cETC2_EAC_RG11,
												
		cIndices,							// Used internally: Write 16-bit endpoint and selector indices directly to output (output block must be at least 32-bits)

		cRGB32,								// Writes RGB components to 32bpp output pixels
		cRGBA32,							// Writes RGB255 components to 32bpp output pixels
		cA32,								// Writes alpha component to 32bpp output pixels
				
		cRGB565,
		cBGR565,
		
		cRGBA4444_COLOR,
		cRGBA4444_ALPHA,
		cRGBA4444_COLOR_OPAQUE,
		cRGBA4444,
		cRGBA_HALF,
		cRGB_HALF,
		cRGB_9E5,

		cUASTC_4x4,							// LDR, universal
		cUASTC_HDR_4x4,						// HDR, transcodes only to 4x4 HDR ASTC, BC6H, or uncompressed
		cBC6H,
		
		cASTC_HDR_4x4,
		cASTC_HDR_6x6,

		// The remaining ASTC LDR block sizes.
		cASTC_LDR_5x4,
		cASTC_LDR_5x5,
		cASTC_LDR_6x5,
		cASTC_LDR_6x6,
		cASTC_LDR_8x5,
		cASTC_LDR_8x6,
		cASTC_LDR_10x5,
		cASTC_LDR_10x6,
		cASTC_LDR_8x8,
		cASTC_LDR_10x8,
		cASTC_LDR_10x10,
		cASTC_LDR_12x10,
		cASTC_LDR_12x12,
										
		cTotalBlockFormats
	};

	inline bool block_format_is_hdr(block_format fmt)
	{
		switch (fmt)
		{
		case block_format::cUASTC_HDR_4x4:
		case block_format::cBC6H:
		case block_format::cASTC_HDR_4x4:
		case block_format::cASTC_HDR_6x6:
			return true;
		default:
			break;
		}

		return false;
	}

	// LDR or HDR ASTC?
	inline bool block_format_is_astc(block_format fmt)
	{
		switch (fmt)
		{
		case block_format::cASTC_LDR_4x4:
		case block_format::cASTC_LDR_5x4:
		case block_format::cASTC_LDR_5x5:
		case block_format::cASTC_LDR_6x5:
		case block_format::cASTC_LDR_6x6:
		case block_format::cASTC_LDR_8x5:
		case block_format::cASTC_LDR_8x6:
		case block_format::cASTC_LDR_10x5:
		case block_format::cASTC_LDR_10x6:
		case block_format::cASTC_LDR_8x8:
		case block_format::cASTC_LDR_10x8:
		case block_format::cASTC_LDR_10x10:
		case block_format::cASTC_LDR_12x10:
		case block_format::cASTC_LDR_12x12:
		case block_format::cASTC_HDR_4x4:
		case block_format::cASTC_HDR_6x6:
			return true;
		default:
			break;
		}

		return false;
	}

	inline uint32_t get_block_width(block_format fmt)
	{
		switch (fmt)
		{
		case block_format::cFXT1_RGB:
			return 8;
		case block_format::cASTC_HDR_6x6:
			return 6;

		case block_format::cASTC_LDR_5x4: return 5;
		case block_format::cASTC_LDR_5x5: return 5;
		case block_format::cASTC_LDR_6x5: return 6;
		case block_format::cASTC_LDR_6x6: return 6;
		case block_format::cASTC_LDR_8x5: return 8;
		case block_format::cASTC_LDR_8x6: return 8;
		case block_format::cASTC_LDR_10x5: return 10;
		case block_format::cASTC_LDR_10x6: return 10;
		case block_format::cASTC_LDR_8x8: return 8;
		case block_format::cASTC_LDR_10x8: return 10;
		case block_format::cASTC_LDR_10x10: return 10;
		case block_format::cASTC_LDR_12x10: return 12;
		case block_format::cASTC_LDR_12x12: return 12;

		default:
			break;
		}
		return 4;
	}

	inline uint32_t get_block_height(block_format fmt)
	{
		switch (fmt)
		{
		case block_format::cASTC_HDR_6x6:
			return 6;
					
		case block_format::cASTC_LDR_5x5: return 5;
		case block_format::cASTC_LDR_6x5: return 5;
		case block_format::cASTC_LDR_6x6: return 6;
		case block_format::cASTC_LDR_8x5: return 5;
		case block_format::cASTC_LDR_8x6: return 6;
		case block_format::cASTC_LDR_10x5: return 5;
		case block_format::cASTC_LDR_10x6: return 6;
		case block_format::cASTC_LDR_8x8: return 8;
		case block_format::cASTC_LDR_10x8: return 8;
		case block_format::cASTC_LDR_10x10: return 10;
		case block_format::cASTC_LDR_12x10: return 10;
		case block_format::cASTC_LDR_12x12: return 12;

		default:
			break;
		}
		return 4;
	}

	const int COLOR5_PAL0_PREV_HI = 9, COLOR5_PAL0_DELTA_LO = -9, COLOR5_PAL0_DELTA_HI = 31;
	const int COLOR5_PAL1_PREV_HI = 21, COLOR5_PAL1_DELTA_LO = -21, COLOR5_PAL1_DELTA_HI = 21;
	const int COLOR5_PAL2_PREV_HI = 31, COLOR5_PAL2_DELTA_LO = -31, COLOR5_PAL2_DELTA_HI = 9;
	const int COLOR5_PAL_MIN_DELTA_B_RUNLEN = 3, COLOR5_PAL_DELTA_5_RUNLEN_VLC_BITS = 3;

	const uint32_t ENDPOINT_PRED_TOTAL_SYMBOLS = (4 * 4 * 4 * 4) + 1;
	const uint32_t ENDPOINT_PRED_REPEAT_LAST_SYMBOL = ENDPOINT_PRED_TOTAL_SYMBOLS - 1;
	const uint32_t ENDPOINT_PRED_MIN_REPEAT_COUNT = 3;
	const uint32_t ENDPOINT_PRED_COUNT_VLC_BITS = 4;

	const uint32_t NUM_ENDPOINT_PREDS = 3;// BASISU_ARRAY_SIZE(g_endpoint_preds);
	const uint32_t CR_ENDPOINT_PRED_INDEX = NUM_ENDPOINT_PREDS - 1;
	const uint32_t NO_ENDPOINT_PRED_INDEX = 3;//NUM_ENDPOINT_PREDS;
	const uint32_t MAX_SELECTOR_HISTORY_BUF_SIZE = 64;
	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH = 3;
	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_BITS = 6;
	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL = (1 << SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);
		
	uint16_t crc16(const void *r, size_t size, uint16_t crc);

	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);

	template <typename Key>
	struct bit_hasher
	{
		inline std::size_t operator()(const Key& k) const
		{
			return hash_hsieh(reinterpret_cast<const uint8_t*>(&k), sizeof(k));
		}
	};

	struct string_hasher
	{
		inline std::size_t operator()(const std::string& k) const
		{
			size_t l = k.size();
			if (!l)
				return 0;
			return hash_hsieh(reinterpret_cast<const uint8_t*>(k.c_str()), l);
		}
	};

	class huffman_decoding_table
	{
		friend class bitwise_decoder;

	public:
		huffman_decoding_table()
		{
		}

		void clear()
		{
			basisu::clear_vector(m_code_sizes);
			basisu::clear_vector(m_lookup);
			basisu::clear_vector(m_tree);
		}

		bool init(uint32_t total_syms, const uint8_t *pCode_sizes, uint32_t fast_lookup_bits = basisu::cHuffmanFastLookupBits)
		{
			if (!total_syms)
			{
				clear();
				return true;
			}

			m_code_sizes.resize(total_syms);
			memcpy(&m_code_sizes[0], pCode_sizes, total_syms);

			const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits;

			m_lookup.resize(0);
			m_lookup.resize(huffman_fast_lookup_size);

			m_tree.resize(0);
			m_tree.resize(total_syms * 2);

			uint32_t syms_using_codesize[basisu::cHuffmanMaxSupportedInternalCodeSize + 1];
			basisu::clear_obj(syms_using_codesize);
			for (uint32_t i = 0; i < total_syms; i++)
			{
				if (pCode_sizes[i] > basisu::cHuffmanMaxSupportedInternalCodeSize)
					return false;
				syms_using_codesize[pCode_sizes[i]]++;
			}

			uint32_t next_code[basisu::cHuffmanMaxSupportedInternalCodeSize + 1];
			next_code[0] = next_code[1] = 0;

			uint32_t used_syms = 0, total = 0;
			for (uint32_t i = 1; i < basisu::cHuffmanMaxSupportedInternalCodeSize; i++)
			{
				used_syms += syms_using_codesize[i];
				next_code[i + 1] = (total = ((total + syms_using_codesize[i]) << 1));
			}

			if (((1U << basisu::cHuffmanMaxSupportedInternalCodeSize) != total) && (used_syms != 1U))
				return false;

			for (int tree_next = -1, sym_index = 0; sym_index < (int)total_syms; ++sym_index)
			{
				uint32_t rev_code = 0, l, cur_code, code_size = pCode_sizes[sym_index];
				if (!code_size)
					continue;

				cur_code = next_code[code_size]++;

				for (l = code_size; l > 0; l--, cur_code >>= 1)
					rev_code = (rev_code << 1) | (cur_code & 1);

				if (code_size <= fast_lookup_bits)
				{
					uint32_t k = (code_size << 16) | sym_index;
					while (rev_code < huffman_fast_lookup_size)
					{
						if (m_lookup[rev_code] != 0)
						{
							// Supplied codesizes can't create a valid prefix code.
							return false;
						}

						m_lookup[rev_code] = k;
						rev_code += (1 << code_size);
					}
					continue;
				}

				int tree_cur;
				if (0 == (tree_cur = m_lookup[rev_code & (huffman_fast_lookup_size - 1)]))
				{
					const uint32_t idx = rev_code & (huffman_fast_lookup_size - 1);
					if (m_lookup[idx] != 0)
					{
						// Supplied codesizes can't create a valid prefix code.
						return false;
					}

					m_lookup[idx] = tree_next;
					tree_cur = tree_next;
					tree_next -= 2;
				}

				if (tree_cur >= 0)
				{
					// Supplied codesizes can't create a valid prefix code.
					return false;
				}

				rev_code >>= (fast_lookup_bits - 1);

				for (int j = code_size; j > ((int)fast_lookup_bits + 1); j--)
				{
					tree_cur -= ((rev_code >>= 1) & 1);

					const int idx = -tree_cur - 1;
					if (idx < 0)
						return false;
					else if (idx >= (int)m_tree.size())
						m_tree.resize(idx + 1);
										
					if (!m_tree[idx])
					{
						m_tree[idx] = (int16_t)tree_next;
						tree_cur = tree_next;
						tree_next -= 2;
					}
					else
					{
						tree_cur = m_tree[idx];
						if (tree_cur >= 0)
						{
							// Supplied codesizes can't create a valid prefix code.
							return false;
						}
					}
				}

				tree_cur -= ((rev_code >>= 1) & 1);

				const int idx = -tree_cur - 1;
				if (idx < 0)
					return false;
				else if (idx >= (int)m_tree.size())
					m_tree.resize(idx + 1);

				if (m_tree[idx] != 0)
				{
					// Supplied codesizes can't create a valid prefix code.
					return false;
				}

				m_tree[idx] = (int16_t)sym_index;
			}

			return true;
		}

		const basisu::uint8_vec &get_code_sizes() const { return m_code_sizes; }
		const basisu::int_vec &get_lookup() const { return m_lookup; }
		const basisu::int16_vec &get_tree() const { return m_tree; }

		bool is_valid() const { return m_code_sizes.size() > 0; }

	private:
		basisu::uint8_vec m_code_sizes;
		basisu::int_vec m_lookup;
		basisu::int16_vec m_tree;
	};

	class bitwise_decoder
	{
	public:
		bitwise_decoder() :
			m_buf_size(0),
			m_pBuf(nullptr),
			m_pBuf_start(nullptr),
			m_pBuf_end(nullptr),
			m_bit_buf(0),
			m_bit_buf_size(0)
		{
		}

		void clear()
		{
			m_buf_size = 0;
			m_pBuf = nullptr;
			m_pBuf_start = nullptr;
			m_pBuf_end = nullptr;
			m_bit_buf = 0;
			m_bit_buf_size = 0;
		}

		bool init(const uint8_t *pBuf, uint32_t buf_size)
		{
			if ((!pBuf) && (buf_size))
				return false;

			m_buf_size = buf_size;
			m_pBuf = pBuf;
			m_pBuf_start = pBuf;
			m_pBuf_end = pBuf + buf_size;
			m_bit_buf = 0;
			m_bit_buf_size = 0;
			return true;
		}

		void stop()
		{
		}
				
		inline uint32_t peek_bits(uint32_t num_bits)
		{
			if (!num_bits)
				return 0;

			assert(num_bits <= 25);

			while (m_bit_buf_size < num_bits)
			{
				uint32_t c = 0;
				if (m_pBuf < m_pBuf_end)
					c = *m_pBuf++;

				m_bit_buf |= (c << m_bit_buf_size);
				m_bit_buf_size += 8;
				assert(m_bit_buf_size <= 32);
			}

			return m_bit_buf & ((1 << num_bits) - 1);
		}

		void remove_bits(uint32_t num_bits)
		{
			assert(m_bit_buf_size >= num_bits);

			m_bit_buf >>= num_bits;
			m_bit_buf_size -= num_bits;
		}

		uint32_t get_bits(uint32_t num_bits)
		{
			if (num_bits > 25)
			{
				assert(num_bits <= 32);

				const uint32_t bits0 = peek_bits(25);
				m_bit_buf >>= 25;
				m_bit_buf_size -= 25;
				num_bits -= 25;

				const uint32_t bits = peek_bits(num_bits);
				m_bit_buf >>= num_bits;
				m_bit_buf_size -= num_bits;

				return bits0 | (bits << 25);
			}

			const uint32_t bits = peek_bits(num_bits);

			m_bit_buf >>= num_bits;
			m_bit_buf_size -= num_bits;

			return bits;
		}

		uint32_t decode_truncated_binary(uint32_t n)
		{
			assert(n >= 2);

			const uint32_t k = basisu::floor_log2i(n);
			const uint32_t u = (1 << (k + 1)) - n;

			uint32_t result = get_bits(k);

			if (result >= u)
				result = ((result << 1) | get_bits(1)) - u;

			return result;
		}

		uint32_t decode_rice(uint32_t m)
		{
			assert(m);

			uint32_t q = 0;
			for (;;)
			{
				uint32_t k = peek_bits(16);
				
				uint32_t l = 0;
				while (k & 1)
				{
					l++;
					k >>= 1;
				}
				
				q += l;

				remove_bits(l);

				if (l < 16)
					break;
			}

			return (q << m) + (get_bits(m + 1) >> 1);
		}

		inline uint32_t decode_vlc(uint32_t chunk_bits)
		{
			assert(chunk_bits);

			const uint32_t chunk_size = 1 << chunk_bits;
			const uint32_t chunk_mask = chunk_size - 1;
					
			uint32_t v = 0;
			uint32_t ofs = 0;

			for ( ; ; )
			{
				uint32_t s = get_bits(chunk_bits + 1);
				v |= ((s & chunk_mask) << ofs);
				ofs += chunk_bits;

				if ((s & chunk_size) == 0)
					break;
				
				if (ofs >= 32)
				{
					assert(0);
					break;
				}
			}

			return v;
		}

		inline uint32_t decode_huffman(const huffman_decoding_table &ct, int fast_lookup_bits = basisu::cHuffmanFastLookupBits)
		{
			assert(ct.m_code_sizes.size());

			const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits;
						
			while (m_bit_buf_size < 16)
			{
				uint32_t c = 0;
				if (m_pBuf < m_pBuf_end)
					c = *m_pBuf++;

				m_bit_buf |= (c << m_bit_buf_size);
				m_bit_buf_size += 8;
				assert(m_bit_buf_size <= 32);
			}
						
			int code_len;

			int sym;
			if ((sym = ct.m_lookup[m_bit_buf & (huffman_fast_lookup_size - 1)]) >= 0)
			{
				code_len = sym >> 16;
				sym &= 0xFFFF;
			}
			else
			{
				code_len = fast_lookup_bits;
				do
				{
					sym = ct.m_tree[~sym + ((m_bit_buf >> code_len++) & 1)]; // ~sym = -sym - 1
				} while (sym < 0);
			}

			m_bit_buf >>= code_len;
			m_bit_buf_size -= code_len;

			return sym;
		}

		bool read_huffman_table(huffman_decoding_table &ct)
		{
			ct.clear();

			const uint32_t total_used_syms = get_bits(basisu::cHuffmanMaxSymsLog2);

			if (!total_used_syms)
				return true;
			if (total_used_syms > basisu::cHuffmanMaxSyms)
				return false;

			uint8_t code_length_code_sizes[basisu::cHuffmanTotalCodelengthCodes];
			basisu::clear_obj(code_length_code_sizes);

			const uint32_t num_codelength_codes = get_bits(5);
			if ((num_codelength_codes < 1) || (num_codelength_codes > basisu::cHuffmanTotalCodelengthCodes))
				return false;

			for (uint32_t i = 0; i < num_codelength_codes; i++)
				code_length_code_sizes[basisu::g_huffman_sorted_codelength_codes[i]] = static_cast<uint8_t>(get_bits(3));

			huffman_decoding_table code_length_table;
			if (!code_length_table.init(basisu::cHuffmanTotalCodelengthCodes, code_length_code_sizes))
				return false;

			if (!code_length_table.is_valid())
				return false;

			basisu::uint8_vec code_sizes(total_used_syms);

			uint32_t cur = 0;
			while (cur < total_used_syms)
			{
				int c = decode_huffman(code_length_table);

				if (c <= 16)
					code_sizes[cur++] = static_cast<uint8_t>(c);
				else if (c == basisu::cHuffmanSmallZeroRunCode)
					cur += get_bits(basisu::cHuffmanSmallZeroRunExtraBits) + basisu::cHuffmanSmallZeroRunSizeMin;
				else if (c == basisu::cHuffmanBigZeroRunCode)
					cur += get_bits(basisu::cHuffmanBigZeroRunExtraBits) + basisu::cHuffmanBigZeroRunSizeMin;
				else
				{
					if (!cur)
						return false;

					uint32_t l;
					if (c == basisu::cHuffmanSmallRepeatCode)
						l = get_bits(basisu::cHuffmanSmallRepeatExtraBits) + basisu::cHuffmanSmallRepeatSizeMin;
					else
						l = get_bits(basisu::cHuffmanBigRepeatExtraBits) + basisu::cHuffmanBigRepeatSizeMin;

					const uint8_t prev = code_sizes[cur - 1];
					if (prev == 0)
						return false;
					do
					{
						if (cur >= total_used_syms)
							return false;
						code_sizes[cur++] = prev;
					} while (--l > 0);
				}
			}

			if (cur != total_used_syms)
				return false;

			return ct.init(total_used_syms, &code_sizes[0]);
		}

		size_t get_bits_remaining() const
		{
			size_t total_bytes_remaining = m_pBuf_end - m_pBuf;
			return total_bytes_remaining * 8 + m_bit_buf_size;
		}

	private:
		uint32_t m_buf_size;
		const uint8_t *m_pBuf;
		const uint8_t *m_pBuf_start;
		const uint8_t *m_pBuf_end;

		uint32_t m_bit_buf;
		uint32_t m_bit_buf_size;
	};

	class simplified_bitwise_decoder
	{
	public:
		simplified_bitwise_decoder() :
			m_pBuf(nullptr),
			m_pBuf_end(nullptr),
			m_bit_buf(0)
		{
		}

		void clear()
		{
			m_pBuf = nullptr;
			m_pBuf_end = nullptr;
			m_bit_buf = 0;
		}

		bool init(const uint8_t* pBuf, size_t buf_size)
		{
			if ((!pBuf) && (buf_size))
				return false;

			m_pBuf = pBuf;
			m_pBuf_end = pBuf + buf_size;
			m_bit_buf = 1;
			return true;
		}

		bool init(const basisu::uint8_vec& buf)
		{
			return init(buf.data(), buf.size());
		}

		// num_bits must be 1, 2, 4 or 8 and codes cannot cross bytes
		inline uint32_t get_bits(uint32_t num_bits)
		{
			assert(m_pBuf);

			if (m_bit_buf <= 1)
				m_bit_buf = 256 | ((m_pBuf < m_pBuf_end) ? *m_pBuf++ : 0);

			const uint32_t mask = (1 << num_bits) - 1;
			const uint32_t res = m_bit_buf & mask;
			m_bit_buf >>= num_bits;
			assert(m_bit_buf >= 1);

			return res;
		}

		inline uint32_t get_bits1()
		{
			assert(m_pBuf);
			if (m_bit_buf <= 1)
				m_bit_buf = 256 | ((m_pBuf < m_pBuf_end) ? *m_pBuf++ : 0);
			const uint32_t res = m_bit_buf & 1;
			m_bit_buf >>= 1;
			assert(m_bit_buf >= 1);
			return res;
		}

		inline uint32_t get_bits2()
		{
			assert(m_pBuf);
			if (m_bit_buf <= 1)
				m_bit_buf = 256 | ((m_pBuf < m_pBuf_end) ? *m_pBuf++ : 0);
			const uint32_t res = m_bit_buf & 3;
			m_bit_buf >>= 2;
			assert(m_bit_buf >= 1);
			return res;
		}

		inline uint32_t get_bits4()
		{
			assert(m_pBuf);
			if (m_bit_buf <= 1)
				m_bit_buf = 256 | ((m_pBuf < m_pBuf_end) ? *m_pBuf++ : 0);
			const uint32_t res = m_bit_buf & 15;
			m_bit_buf >>= 4;
			assert(m_bit_buf >= 1);
			return res;
		}

		// No bitbuffer, can only ever retrieve bytes correctly.
		inline uint32_t get_bits8()
		{
			assert(m_pBuf);
			return (m_pBuf < m_pBuf_end) ? *m_pBuf++ : 0;
		}

		const uint8_t* m_pBuf;
		const uint8_t* m_pBuf_end;
		uint32_t m_bit_buf;
	};

	inline uint32_t basisd_rand(uint32_t seed)
	{
		if (!seed)
			seed++;
		uint32_t z = seed;
		BASISD_znew;
		return z;
	}

	// Returns random number in [0,limit). Max limit is 0xFFFF.
	inline uint32_t basisd_urand(uint32_t& seed, uint32_t limit)
	{
		seed = basisd_rand(seed);
		return (((seed ^ (seed >> 16)) & 0xFFFF) * limit) >> 16;
	}

	class approx_move_to_front
	{
	public:
		approx_move_to_front(uint32_t n)
		{
			init(n);
		}

		void init(uint32_t n)
		{
			m_values.resize(n);
			m_rover = n / 2;
		}

		const basisu::int_vec& get_values() const { return m_values; }
		basisu::int_vec& get_values() { return m_values; }

		uint32_t size() const { return (uint32_t)m_values.size(); }

		const int& operator[] (uint32_t index) const { return m_values[index]; }
		int operator[] (uint32_t index) { return m_values[index]; }

		void add(int new_value)
		{
			m_values[m_rover++] = new_value;
			if (m_rover == m_values.size())
				m_rover = (uint32_t)m_values.size() / 2;
		}

		void use(uint32_t index)
		{
			if (index)
			{
				//std::swap(m_values[index / 2], m_values[index]);
				int x = m_values[index / 2];
				int y = m_values[index];
				m_values[index / 2] = y;
				m_values[index] = x;
			}
		}

		// returns -1 if not found
		int find(int value) const
		{
			for (uint32_t i = 0; i < m_values.size(); i++)
				if (m_values[i] == value)
					return i;
			return -1;
		}

		void reset()
		{
			const uint32_t n = (uint32_t)m_values.size();

			m_values.clear();

			init(n);
		}

	private:
		basisu::int_vec m_values;
		uint32_t m_rover;
	};

	struct decoder_etc_block;
	
	inline uint8_t clamp255(int32_t i)
	{
		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
	}

	enum eNoClamp
	{
		cNoClamp = 0
	};

	struct color32
	{
		union
		{
			struct
			{
				uint8_t r;
				uint8_t g;
				uint8_t b;
				uint8_t a;
			};

			uint8_t c[4];
			
			uint32_t m;
		};

		//color32() { }
		color32() = default;

		color32(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); }
		color32(eNoClamp unused, uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { (void)unused; set_noclamp_rgba(vr, vg, vb, va); }

		void set(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); c[3] = static_cast<uint8_t>(va); }

		void set_noclamp_rgb(uint32_t vr, uint32_t vg, uint32_t vb) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); }
		void set_noclamp_rgba(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); }

		void set_clamped(int vr, int vg, int vb, int va) { c[0] = clamp255(vr); c[1] = clamp255(vg);	c[2] = clamp255(vb); c[3] = clamp255(va); }

		uint8_t operator[] (uint32_t idx) const { assert(idx < 4); return c[idx]; }
		uint8_t &operator[] (uint32_t idx) { assert(idx < 4); return c[idx]; }

		bool operator== (const color32&rhs) const { return m == rhs.m; }

		static color32 comp_min(const color32& a, const color32& b) { return color32(cNoClamp, basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }
		static color32 comp_max(const color32& a, const color32& b) { return color32(cNoClamp, basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }
	};

	struct endpoint
	{
		color32 m_color5;
		uint8_t m_inten5;
		bool operator== (const endpoint& rhs) const
		{
			return (m_color5.r == rhs.m_color5.r) && (m_color5.g == rhs.m_color5.g) && (m_color5.b == rhs.m_color5.b) && (m_inten5 == rhs.m_inten5);
		}
		bool operator!= (const endpoint& rhs) const { return !(*this == rhs); }
	};

	// This duplicates key functionality in the encoder library's color_rgba class. Porting and retesting code that uses it to color32 is impractical.
	class color_rgba
	{
	public:
		union
		{
			uint8_t m_comps[4];

			struct
			{
				uint8_t r;
				uint8_t g;
				uint8_t b;
				uint8_t a;
			};
		};

		inline color_rgba()
		{
			static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
			static_assert(sizeof(*this) == sizeof(color32), "sizeof(*this) != sizeof(basist::color32)");
		}

		inline color_rgba(const color32& other) :
			r(other.r),
			g(other.g),
			b(other.b),
			a(other.a)
		{
		}

		color_rgba& operator= (const basist::color32& rhs)
		{
			r = rhs.r;
			g = rhs.g;
			b = rhs.b;
			a = rhs.a;
			return *this;
		}

		inline color_rgba(int y)
		{
			set(y);
		}

		inline color_rgba(int y, int na)
		{
			set(y, na);
		}

		inline color_rgba(int sr, int sg, int sb, int sa)
		{
			set(sr, sg, sb, sa);
		}

		inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
		{
			set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
		}

		inline color_rgba& set_noclamp_y(int y)
		{
			m_comps[0] = (uint8_t)y;
			m_comps[1] = (uint8_t)y;
			m_comps[2] = (uint8_t)y;
			m_comps[3] = (uint8_t)255;
			return *this;
		}

		inline color_rgba& set_noclamp_rgba(int sr, int sg, int sb, int sa)
		{
			m_comps[0] = (uint8_t)sr;
			m_comps[1] = (uint8_t)sg;
			m_comps[2] = (uint8_t)sb;
			m_comps[3] = (uint8_t)sa;
			return *this;
		}

		inline color_rgba& set(int y)
		{
			m_comps[0] = static_cast<uint8_t>(basisu::clamp<int>(y, 0, 255));
			m_comps[1] = m_comps[0];
			m_comps[2] = m_comps[0];
			m_comps[3] = 255;
			return *this;
		}

		inline color_rgba& set(int y, int na)
		{
			m_comps[0] = static_cast<uint8_t>(basisu::clamp<int>(y, 0, 255));
			m_comps[1] = m_comps[0];
			m_comps[2] = m_comps[0];
			m_comps[3] = static_cast<uint8_t>(basisu::clamp<int>(na, 0, 255));
			return *this;
		}

		inline color_rgba& set(int sr, int sg, int sb, int sa)
		{
			m_comps[0] = static_cast<uint8_t>(basisu::clamp<int>(sr, 0, 255));
			m_comps[1] = static_cast<uint8_t>(basisu::clamp<int>(sg, 0, 255));
			m_comps[2] = static_cast<uint8_t>(basisu::clamp<int>(sb, 0, 255));
			m_comps[3] = static_cast<uint8_t>(basisu::clamp<int>(sa, 0, 255));
			return *this;
		}

		inline color_rgba& set_rgb(int sr, int sg, int sb)
		{
			m_comps[0] = static_cast<uint8_t>(basisu::clamp<int>(sr, 0, 255));
			m_comps[1] = static_cast<uint8_t>(basisu::clamp<int>(sg, 0, 255));
			m_comps[2] = static_cast<uint8_t>(basisu::clamp<int>(sb, 0, 255));
			return *this;
		}

		inline color_rgba& set_rgb(const color_rgba& other)
		{
			r = other.r;
			g = other.g;
			b = other.b;
			return *this;
		}

		inline const uint8_t& operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }
		inline uint8_t& operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }

		inline void clear()
		{
			m_comps[0] = 0;
			m_comps[1] = 0;
			m_comps[2] = 0;
			m_comps[3] = 0;
		}

		inline bool operator== (const color_rgba& rhs) const
		{
			if (m_comps[0] != rhs.m_comps[0]) return false;
			if (m_comps[1] != rhs.m_comps[1]) return false;
			if (m_comps[2] != rhs.m_comps[2]) return false;
			if (m_comps[3] != rhs.m_comps[3]) return false;
			return true;
		}

		inline bool operator!= (const color_rgba& rhs) const
		{
			return !(*this == rhs);
		}

		inline bool operator<(const color_rgba& rhs) const
		{
			for (int i = 0; i < 4; i++)
			{
				if (m_comps[i] < rhs.m_comps[i])
					return true;
				else if (m_comps[i] != rhs.m_comps[i])
					return false;
			}
			return false;
		}

		inline color32 get_color32() const
		{
			return color32(r, g, b, a);
		}

		inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; }
	};

	struct selector
	{
		// Plain selectors (2-bits per value)
		uint8_t m_selectors[4];

		// ETC1 selectors
		uint8_t m_bytes[4];

		uint8_t m_lo_selector, m_hi_selector;
		uint8_t m_num_unique_selectors;
		bool operator== (const selector& rhs) const
		{
			return (m_selectors[0] == rhs.m_selectors[0]) &&
				(m_selectors[1] == rhs.m_selectors[1]) &&
				(m_selectors[2] == rhs.m_selectors[2]) &&
				(m_selectors[3] == rhs.m_selectors[3]);
		}
		bool operator!= (const selector& rhs) const
		{
			return !(*this == rhs);
		}

		void init_flags()
		{
			uint32_t hist[4] = { 0, 0, 0, 0 };
			for (uint32_t y = 0; y < 4; y++)
			{
				for (uint32_t x = 0; x < 4; x++)
				{
					uint32_t s = get_selector(x, y);
					hist[s]++;
				}
			}

			m_lo_selector = 3;
			m_hi_selector = 0;
			m_num_unique_selectors = 0;

			for (uint32_t i = 0; i < 4; i++)
			{
				if (hist[i])
				{
					m_num_unique_selectors++;
					if (i < m_lo_selector) m_lo_selector = static_cast<uint8_t>(i);
					if (i > m_hi_selector) m_hi_selector = static_cast<uint8_t>(i);
				}
			}
		}

		// Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
		inline uint32_t get_selector(uint32_t x, uint32_t y) const
		{
			assert((x < 4) && (y < 4));
			return (m_selectors[y] >> (x * 2)) & 3;
		}

		void set_selector(uint32_t x, uint32_t y, uint32_t val)
		{
			static const uint8_t s_selector_index_to_etc1[4] = { 3, 2, 0, 1 };

			assert((x | y | val) < 4);

			m_selectors[y] &= ~(3 << (x * 2));
			m_selectors[y] |= (val << (x * 2));

			const uint32_t etc1_bit_index = x * 4 + y;

			uint8_t *p = &m_bytes[3 - (etc1_bit_index >> 3)];

			const uint32_t byte_bit_ofs = etc1_bit_index & 7;
			const uint32_t mask = 1 << byte_bit_ofs;

			const uint32_t etc1_val = s_selector_index_to_etc1[val];

			const uint32_t lsb = etc1_val & 1;
			const uint32_t msb = etc1_val >> 1;

			p[0] &= ~mask;
			p[0] |= (lsb << byte_bit_ofs);

			p[-2] &= ~mask;
			p[-2] |= (msb << byte_bit_ofs);
		}
	};

	bool basis_block_format_is_uncompressed(block_format tex_type);

	//------------------------------------

	typedef uint16_t half_float;

	const double MIN_DENORM_HALF_FLOAT = 0.000000059604645; // smallest positive subnormal number
	const double MIN_HALF_FLOAT = 0.00006103515625; // smallest positive normal number
	const double MAX_HALF_FLOAT = 65504.0; // largest normal number
	const uint32_t MAX_HALF_FLOAT_AS_INT_BITS = 0x7BFF; // the half float rep for 65504.0

	inline uint32_t get_bits(uint32_t val, int low, int high)
	{
		const int num_bits = (high - low) + 1;
		assert((num_bits >= 1) && (num_bits <= 32));

		val >>= low;
		if (num_bits != 32)
			val &= ((1u << num_bits) - 1);

		return val;
	}

	inline bool is_half_inf_or_nan(half_float v)
	{
		return get_bits(v, 10, 14) == 31;
	}

	inline bool is_half_denorm(half_float v)
	{
		int e = (v >> 10) & 31;
		return !e;
	}

	inline int get_half_exp(half_float v)
	{
		int e = ((v >> 10) & 31);
		return e ? (e - 15) : -14;
	}

	inline int get_half_mantissa(half_float v)
	{
		if (is_half_denorm(v))
			return v & 0x3FF;
		return (v & 0x3FF) | 0x400;
	}

	inline float get_half_mantissaf(half_float v)
	{
		return ((float)get_half_mantissa(v)) / 1024.0f;
	}

	inline int get_half_sign(half_float v)
	{
		return v ? ((v & 0x8000) ? -1 : 1) : 0;
	}

	inline bool half_is_signed(half_float v)
	{
		return (v & 0x8000) != 0;
	}

#if 0
	int hexp = get_half_exp(Cf);
	float hman = get_half_mantissaf(Cf);
	int hsign = get_half_sign(Cf);
	float k = powf(2.0f, hexp) * hman * hsign;
	if (is_half_inf_or_nan(Cf))
		k = std::numeric_limits<float>::quiet_NaN();
#endif

	half_float float_to_half(float val);

	inline float half_to_float(half_float hval)
	{
		union { float f; uint32_t u; } x = { 0 };

		uint32_t s = ((uint32_t)hval >> 15) & 1;
		uint32_t e = ((uint32_t)hval >> 10) & 0x1F;
		uint32_t m = (uint32_t)hval & 0x3FF;

		if (!e)
		{
			if (!m)
			{
				// +- 0
				x.u = s << 31;
				return x.f;
			}
			else
			{
				// denormalized
				while (!(m & 0x00000400))
				{
					m <<= 1;
					--e;
				}

				++e;
				m &= ~0x00000400;
			}
		}
		else if (e == 31)
		{
			if (m == 0)
			{
				// +/- INF
				x.u = (s << 31) | 0x7f800000;
				return x.f;
			}
			else
			{
				// +/- NaN
				x.u = (s << 31) | 0x7f800000 | (m << 13);
				return x.f;
			}
		}

		e = e + (127 - 15);
		m = m << 13;

		assert(s <= 1);
		assert(m <= 0x7FFFFF);
		assert(e <= 255);

		x.u = m | (e << 23) | (s << 31);
		return x.f;
	}

	// Originally from bc6h_enc.h

	void bc6h_enc_init();

	const uint32_t MAX_BLOG16_VAL = 0xFFFF;

	// BC6H internals
	const uint32_t NUM_BC6H_MODES = 14;
	const uint32_t BC6H_LAST_MODE_INDEX = 13;
	const uint32_t BC6H_FIRST_1SUBSET_MODE_INDEX = 10; // in the MS docs, this is "mode 11" (where the first mode is 1), 60 bits for endpoints (10.10, 10.10, 10.10), 63 bits for weights
	const uint32_t TOTAL_BC6H_PARTITION_PATTERNS = 32;

	extern const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4]; // base, r, g, b

	struct bc6h_bit_layout
	{
		int8_t m_comp; // R=0,G=1,B=2,D=3 (D=partition index)
		int8_t m_index; // 0-3, 0-1 Low/High subset 1, 2-3 Low/High subset 2, -1=partition index (d)
		int8_t m_last_bit;
		int8_t m_first_bit; // may be -1 if a single bit, may be >m_last_bit if reversed
	};

	const uint32_t MAX_BC6H_LAYOUT_INDEX = 25;
	extern const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX];

	extern const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4]; // [y][x]

	extern const uint8_t g_bc6h_weight3[8];
	extern const uint8_t g_bc6h_weight4[16];

	extern const int8_t g_bc6h_mode_lookup[32];
		
	// Converts b16 to half float
	inline half_float bc6h_blog16_to_half(uint32_t comp)
	{
		assert(comp <= 0xFFFF);

		// scale the magnitude by 31/64
		comp = (comp * 31u) >> 6u;
		return (half_float)comp;
	}

	const uint32_t MAX_BC6H_HALF_FLOAT_AS_UINT = 0x7BFF;

	// Inverts bc6h_blog16_to_half().
	// Returns the nearest blog16 given a half value. 
	inline uint32_t bc6h_half_to_blog16(half_float h)
	{
		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT);
		return (h * 64 + 30) / 31;
	}

	// Suboptimal, but very close.
	inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits)
	{
		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT);
		return (h * 64 + 30) / (31 * (1 << (16 - num_bits)));
	}

	struct bc6h_block
	{
		uint8_t m_bytes[16];
	};

	void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
	void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
	void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
	void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
	void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
	void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
	bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]);

	struct bc6h_logical_block
	{
		uint32_t m_mode;
		uint32_t m_partition_pattern;	// must be 0 if 1 subset
		uint32_t m_endpoints[3][4];		// [comp][subset*2+lh_index] - must be already properly packed
		uint8_t m_weights[16];			// weights must be of the proper size, taking into account skipped MSB's which must be 0

		void clear()
		{
			basisu::clear_obj(*this);
		}
	};

	void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk);
		
	namespace bc7_mode_5_encoder
	{
		void encode_bc7_mode_5_block(void* pDst_block, color32* pPixels, bool hq_mode);
	}

	namespace astc_6x6_hdr
	{
		extern uint8_t g_quantize_tables_preserve2[21 - 1][256]; // astc_helpers::TOTAL_ISE_RANGES=21
		extern uint8_t g_quantize_tables_preserve3[21 - 1][256];
	} // namespace astc_6x6_hdr

#if BASISD_SUPPORT_XUASTC
	namespace astc_ldr_t
	{
		const uint32_t ARITH_HEADER_MARKER = 0x01;
		const uint32_t ARITH_HEADER_MARKER_BITS = 5;

		const uint32_t FULL_ZSTD_HEADER_MARKER = 0x01;
		const uint32_t FULL_ZSTD_HEADER_MARKER_BITS = 5;

		const uint32_t FINAL_SYNC_MARKER = 0xAF;
		const uint32_t FINAL_SYNC_MARKER_BITS = 8;

		const uint32_t cMaxConfigReuseNeighbors = 3;

#pragma pack(push, 1)
		struct xuastc_ldr_arith_header
		{
			uint8_t m_flags;
			basisu::packed_uint<4> m_arith_bytes_len;
			basisu::packed_uint<4> m_mean0_bits_len;
			basisu::packed_uint<4> m_mean1_bytes_len;
			basisu::packed_uint<4> m_run_bytes_len;
			basisu::packed_uint<4> m_coeff_bytes_len;
			basisu::packed_uint<4> m_sign_bits_len;
			basisu::packed_uint<4> m_weight2_bits_len; // 2-bit weights (4 per byte), up to BISE_4_LEVELS
			basisu::packed_uint<4> m_weight3_bits_len; // 3-bit weights (2 per byte), up to BISE_8_LEVELS
			basisu::packed_uint<4> m_weight4_bits_len; // 4-bit weights (2 per byte), up to BISE_16_LEVELS
			basisu::packed_uint<4> m_weight8_bytes_len; // 8-bit weights (1 per byte), up to BISE_32_LEVELS
			basisu::packed_uint<4> m_unused;			// Future expansion
		};

		struct xuastc_ldr_full_zstd_header
		{
			uint8_t m_flags;

			// Control
			basisu::packed_uint<4> m_raw_bits_len; // uncompressed
			basisu::packed_uint<4> m_mode_bytes_len;
			basisu::packed_uint<4> m_solid_dpcm_bytes_len;
			
			// Endpoint DPCM
			basisu::packed_uint<4> m_endpoint_dpcm_reuse_indices_len;
			basisu::packed_uint<4> m_use_bc_bits_len;
			basisu::packed_uint<4> m_endpoint_dpcm_3bit_len;
			basisu::packed_uint<4> m_endpoint_dpcm_4bit_len;
			basisu::packed_uint<4> m_endpoint_dpcm_5bit_len;
			basisu::packed_uint<4> m_endpoint_dpcm_6bit_len;
			basisu::packed_uint<4> m_endpoint_dpcm_7bit_len;
			basisu::packed_uint<4> m_endpoint_dpcm_8bit_len;

			// Weight grid DCT
			basisu::packed_uint<4> m_mean0_bits_len;
			basisu::packed_uint<4> m_mean1_bytes_len;
			basisu::packed_uint<4> m_run_bytes_len;
			basisu::packed_uint<4> m_coeff_bytes_len;
			basisu::packed_uint<4> m_sign_bits_len;
			
			// Weight DPCM
			basisu::packed_uint<4> m_weight2_bits_len; // 2-bit weights (4 per byte), up to BISE_4_LEVELS
			basisu::packed_uint<4> m_weight3_bits_len; // 3-bit weights (4 per byte), up to BISE_8_LEVELS
			basisu::packed_uint<4> m_weight4_bits_len; // 4-bit weights (2 per byte), up to BISE_16_LEVELS
			basisu::packed_uint<4> m_weight8_bytes_len; // 8-bit weights (1 per byte), up to BISE_32_LEVELS

			basisu::packed_uint<4> m_unused;			// Future expansion
		};
#pragma pack(pop)

		const uint32_t DCT_RUN_LEN_EOB_SYM_INDEX = 64;
		const uint32_t DCT_MAX_ARITH_COEFF_MAG = 255;
		
		const uint32_t DCT_MEAN_LEVELS0 = 9, DCT_MEAN_LEVELS1 = 33;
		
		const uint32_t PART_HASH_BITS = 6u;
		const uint32_t PART_HASH_SIZE = 1u << PART_HASH_BITS;

		const uint32_t TM_HASH_BITS = 7u;
		const uint32_t TM_HASH_SIZE = 1u << TM_HASH_BITS;
				
		typedef basisu::vector<float> fvec;

		void init();

		color_rgba blue_contract_enc(color_rgba orig, bool& did_clamp, int encoded_b);
		color_rgba blue_contract_dec(int enc_r, int enc_g, int enc_b, int enc_a);
								
		struct astc_block_grid_config
		{
			uint16_t m_block_width, m_block_height;
			uint16_t m_grid_width, m_grid_height;

			astc_block_grid_config() {}

			astc_block_grid_config(uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height)
			{
				assert((block_width >= 4) && (block_width <= 12));
				assert((block_height >= 4) && (block_height <= 12));
				m_block_width = (uint16_t)block_width;
				m_block_height = (uint16_t)block_height;

				assert((grid_width >= 2) && (grid_width <= block_width));
				assert((grid_height >= 2) && (grid_height <= block_height));
				m_grid_width = (uint16_t)grid_width;
				m_grid_height = (uint16_t)grid_height;
			}

			bool operator==(const astc_block_grid_config& other) const
			{
				return (m_block_width == other.m_block_width) && (m_block_height == other.m_block_height) &&
					(m_grid_width == other.m_grid_width) && (m_grid_height == other.m_grid_height);
			}
		};

		struct astc_block_grid_data
		{
			float m_weight_gamma;

			// An unfortunate difference of containers, but in memory these matrices are both addressed as [r][c].
			basisu::vector2D<float> m_upsample_matrix;

			basisu::vector<float> m_downsample_matrix;

			astc_block_grid_data() {}
			astc_block_grid_data(float weight_gamma) : m_weight_gamma(weight_gamma) {}
		};

		typedef basisu::hash_map<astc_block_grid_config, astc_block_grid_data, bit_hasher<astc_block_grid_config> > astc_block_grid_data_hash_t;
						
		void decode_endpoints_ise20(uint32_t cem_index, const uint8_t* pEndpoint_vals, color32& l, color32& h);
		void decode_endpoints(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color32& l, color32& h, float* pScale = nullptr);

		void decode_endpoints_ise20(uint32_t cem_index, const uint8_t* pEndpoint_vals, color_rgba& l, color_rgba& h);
		void decode_endpoints(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba& l, color_rgba& h, float* pScale = nullptr);

		void compute_adjoint_downsample_matrix(basisu::vector<float>& downsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);
		void compute_upsample_matrix(basisu::vector2D<float>& upsample_matrix, uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);

		class dct2f
		{
			enum { cMaxSize = 12 };

		public:
			dct2f() : m_rows(0u), m_cols(0u) {}

			// call with grid_height/grid_width (INVERTED)
			bool init(uint32_t rows, uint32_t cols);
			
			uint32_t rows() const { return m_rows; }
			uint32_t cols() const { return m_cols; }

			void forward(const float* pSrc, float* pDst, fvec& work) const;
			
			void inverse(const float* pSrc, float* pDst, fvec& work) const;
			
			// check variants use a less optimized implementation, used for sanity checking
			void inverse_check(const float* pSrc, float* pDst, fvec& work) const;
			
			void forward(const float* pSrc, uint32_t src_stride,
				float* pDst, uint32_t dst_stride, fvec& work) const;
			
			void inverse(const float* pSrc, uint32_t src_stride,
				float* pDst, uint32_t dst_stride, fvec& work) const;
			
			void inverse_check(const float* pSrc, uint32_t src_stride,
				float* pDst, uint32_t dst_stride, fvec& work) const;

		private:
			uint32_t m_rows, m_cols;
			fvec m_c_col;   // [u*m_rows + x]
			fvec m_c_row;   // [v*m_cols + y]
			fvec m_a_col;   // alpha(u)
			fvec m_a_row;   // alpha(v)
		};

		struct dct_syms
		{
			dct_syms()
			{
				clear();
			}

			void clear()
			{
				m_dc_sym = 0;
				m_num_dc_levels = 0;
				m_coeffs.resize(0);
				m_max_coeff_mag = 0;
				m_max_zigzag_index = 0;
			}

			uint32_t m_dc_sym;
			uint32_t m_num_dc_levels;

			struct coeff
			{
				uint16_t m_num_zeros;
				int16_t m_coeff; // or INT16_MAX if invalid

				coeff() {}
				coeff(uint16_t num_zeros, int16_t coeff) : m_num_zeros(num_zeros), m_coeff(coeff) {}
			};

			basisu::static_vector<coeff, 65> m_coeffs;

			uint32_t m_max_coeff_mag;
			uint32_t m_max_zigzag_index;
		};
				
		struct grid_dim_key
		{
			int m_grid_width;
			int m_grid_height;

			grid_dim_key() {}

			grid_dim_key(int w, int h) : m_grid_width(w), m_grid_height(h) {}

			bool operator== (const grid_dim_key& rhs) const
			{
				return (m_grid_width == rhs.m_grid_width) && (m_grid_height == rhs.m_grid_height);
			}
		};

		struct grid_dim_value
		{
			basisu::int_vec m_zigzag;
			dct2f m_dct;
		};

		typedef basisu::hash_map<grid_dim_key, grid_dim_value, bit_hasher<grid_dim_key> > grid_dim_hash_map;

		void init_astc_block_grid_data_hash();

		const astc_block_grid_data* find_astc_block_grid_data(uint32_t block_width, uint32_t block_height, uint32_t grid_width, uint32_t grid_height);

		const float DEADZONE_ALPHA = .5f;
		const float SCALED_WEIGHT_BASE_CODING_SCALE = .5f; // typically ~5 bits [0,32], or 3 [0,8]

		struct sample_quant_table_state
		{
			float m_q, m_sx, m_sy, m_level_scale;

			void init(float q,
				uint32_t block_width, uint32_t block_height,
				float level_scale)
			{
				m_q = q;
				m_level_scale = level_scale;

				const int Bx = block_width, By = block_height;

				m_sx = (float)8.0f / (float)Bx;
				m_sy = (float)8.0f / (float)By;
			}
		};

		class grid_weight_dct
		{
		public:
			grid_weight_dct() { }

			void init(uint32_t block_width, uint32_t block_height);
			
			static uint32_t get_num_weight_dc_levels(uint32_t weight_ise_range)
			{
				float scaled_weight_coding_scale = SCALED_WEIGHT_BASE_CODING_SCALE;
				if (weight_ise_range <= astc_helpers::BISE_8_LEVELS)
					scaled_weight_coding_scale = 1.0f / 8.0f;

				return (uint32_t)(64.0f * scaled_weight_coding_scale) + 1;
			}

			struct block_stats
			{
				float m_mean_weight;
				uint32_t m_total_coded_acs;
				uint32_t m_max_ac_coeff;
			};

			bool decode_block_weights(
				float q, uint32_t plane_index, // plane of weights to decode and IDCT from stream
				astc_helpers::log_astc_block& log_blk, // must be initialized except for the plane weights which are decoded
				basist::bitwise_decoder* pDec,
				const astc_block_grid_data* pGrid_data, // grid data for this grid size
				block_stats* pS,
				fvec& dct_work, // thread local
				const dct_syms* pSyms = nullptr) const;
						
			enum { m_zero_run = 3, m_coeff = 2 };

			uint32_t m_block_width, m_block_height;

			grid_dim_hash_map m_grid_dim_key_vals;
						
			// Adaptively compensate for weight level quantization noise being fed into the DCT. 
			// The more coursely the weight levels are quantized, the more noise injected, and the more noise will be spread between multiple AC coefficients.
			// This will cause some previously 0 coefficients to increase in mag, but they're likely noise. So carefully nudge the quant step size to compensate.
			static float scale_quant_steps(int Q_astc, float gamma = 0.1f /*.13f*/, float clamp_max = 2.0f)
			{
				assert(Q_astc >= 2);
				float factor = 63.0f / (Q_astc - 1);
				// TODO: Approximate powf()
				float scaled = powf(factor, gamma);
				scaled = basisu::clamp<float>(scaled, 1.0f, clamp_max);
				return scaled;
			}

			float compute_level_scale(float q, float span_len, float weight_gamma, uint32_t grid_width, uint32_t grid_height, uint32_t weight_ise_range) const;

			int sample_quant_table(sample_quant_table_state& state, uint32_t x, uint32_t y) const;

			void compute_quant_table(float q,
				uint32_t grid_width, uint32_t grid_height,
				float level_scale, int* dct_quant_tab) const;
			
			float get_max_span_len(const astc_helpers::log_astc_block& log_blk, uint32_t plane_index) const;
			
			inline int quantize_deadzone(float d, int L, float alpha, uint32_t x, uint32_t y) const
			{
				assert((x < m_block_width) && (y < m_block_height));

				if (((x == 1) && (y == 0)) ||
					((x == 0) && (y == 1)))
				{
					return (int)std::round(d / (float)L);
				}

				// L = quant step, alpha in [0,1.2] (typical 0.7�0.85)
				if (L <= 0) 
					return 0;

				float s = fabsf(d);
				float tau = alpha * float(L);                 // half-width of the zero band

				if (s <= tau) 
					return 0;                       // inside dead-zone towards zero

				// Quantize the residual outside the dead-zone with mid-tread rounding
				float qf = (s - tau) / float(L);
				int   q = (int)floorf(qf + 0.5f);            // ties-nearest
				return (d < 0.0f) ? -q : q;
			}

			inline float dequant_deadzone(int q, int L, float alpha, uint32_t x, uint32_t y) const
			{
				assert((x < m_block_width) && (y < m_block_height));

				if (((x == 1) && (y == 0)) ||
					((x == 0) && (y == 1)))
				{
					return (float)q * (float)L;
				}

				if (q == 0 || L <= 0) 
					return 0.0f;

				float tau = alpha * float(L);
				float mag = tau + float(abs(q)) * float(L);   // center of the (nonzero) bin
				return (q < 0) ? -mag : mag;
			}
		};

		struct trial_mode
		{
			uint32_t m_grid_width;
			uint32_t m_grid_height;
			uint32_t m_cem;
			int m_ccs_index;
			uint32_t m_endpoint_ise_range;
			uint32_t m_weight_ise_range;
			uint32_t m_num_parts;

			bool operator==(const trial_mode& other) const
			{
#define BU_COMP(a) if (a != other.a) return false;
				BU_COMP(m_grid_width);
				BU_COMP(m_grid_height);
				BU_COMP(m_cem);
				BU_COMP(m_ccs_index);
				BU_COMP(m_endpoint_ise_range);
				BU_COMP(m_weight_ise_range);
				BU_COMP(m_num_parts);
#undef BU_COMP
				return true;
			}

			bool operator<(const trial_mode& rhs) const
			{
#define BU_COMP(a) if (a < rhs.a) return true; else if (a > rhs.a) return false;
				BU_COMP(m_grid_width);
				BU_COMP(m_grid_height);
				BU_COMP(m_cem);
				BU_COMP(m_ccs_index);
				BU_COMP(m_endpoint_ise_range);
				BU_COMP(m_weight_ise_range);
				BU_COMP(m_num_parts);
#undef BU_COMP
				return false;
			}

			operator size_t() const
			{
				size_t h = 0xABC1F419;
#define BU_FIELD(a) do { h ^= hash_hsieh(reinterpret_cast<const uint8_t *>(&a), sizeof(a)); } while(0)
				BU_FIELD(m_grid_width);
				BU_FIELD(m_grid_height);
				BU_FIELD(m_cem);
				BU_FIELD(m_ccs_index);
				BU_FIELD(m_endpoint_ise_range);
				BU_FIELD(m_weight_ise_range);
				BU_FIELD(m_num_parts);
#undef BU_FIELD
				return h;
			}
		};

		// Organize trial modes for faster initial mode triaging.
		const uint32_t OTM_NUM_CEMS = 14; // 0-13 (13=highest valid LDR CEM)
		const uint32_t OTM_NUM_SUBSETS = 3; // 1-3
		const uint32_t OTM_NUM_CCS = 5; // -1 to 3
		const uint32_t OTM_NUM_GRID_SIZES = 2; // 0=small or 1=large (grid_w>=block_w-1 and grid_h>=block_h-1)
		const uint32_t OTM_NUM_GRID_ANISOS = 3; // 0=W=H, 1=W>H, 2=W<H

		inline uint32_t calc_grid_aniso_val(uint32_t gw, uint32_t gh, uint32_t bw, uint32_t bh)
		{
			assert((gw > 0) && (gh > 0));
			assert((bw > 0) && (bh > 0));
			assert((gw <= 12) && (gh <= 12) && (bw <= 12) && (bh <= 12));
			assert((gw <= bw) && (gh <= bh));
						
#if 0
			// Prev. code:
			uint32_t grid_aniso = 0;
			if (tm.m_grid_width != tm.m_grid_height) // not optimal for non-square block sizes
			{
				const float grid_x_fract = (float)tm.m_grid_width / (float)block_width;
				const float grid_y_fract = (float)tm.m_grid_height / (float)block_height;
				if (grid_x_fract >= grid_y_fract)
					grid_aniso = 1;
				else if (grid_x_fract < grid_y_fract)
					grid_aniso = 2;
			}
#endif
			// Compare gw/bw vs. gh/bh using integer math:
			// gw*bh >= gh*bw  -> X-dominant (1), else Y-dominant (2)
			const uint32_t lhs = gw * bh;
			const uint32_t rhs = gh * bw;

			// Equal (isotropic), X=Y
			if (lhs == rhs)
				return 0;

			// Anisotropic - 1=X, 2=Y
			return (lhs >= rhs) ? 1 : 2;
		}

		struct grouped_trial_modes
		{
			basisu::uint_vec m_tm_groups[OTM_NUM_CEMS][OTM_NUM_SUBSETS][OTM_NUM_CCS][OTM_NUM_GRID_SIZES][OTM_NUM_GRID_ANISOS]; // indices of encoder trial modes in each bucket

			void clear()
			{
				for (uint32_t cem_iter = 0; cem_iter < OTM_NUM_CEMS; cem_iter++)
					for (uint32_t subsets_iter = 0; subsets_iter < OTM_NUM_SUBSETS; subsets_iter++)
						for (uint32_t ccs_iter = 0; ccs_iter < OTM_NUM_CCS; ccs_iter++)
							for (uint32_t grid_sizes_iter = 0; grid_sizes_iter < OTM_NUM_GRID_SIZES; grid_sizes_iter++)
								for (uint32_t grid_anisos_iter = 0; grid_anisos_iter < OTM_NUM_GRID_ANISOS; grid_anisos_iter++)
									m_tm_groups[cem_iter][subsets_iter][ccs_iter][grid_sizes_iter][grid_anisos_iter].clear();
			}

			void add(uint32_t block_width, uint32_t block_height,
				const trial_mode& tm, uint32_t tm_index)
			{
				const uint32_t cem_index = tm.m_cem;
				assert(cem_index < OTM_NUM_CEMS);

				const uint32_t subset_index = tm.m_num_parts - 1;
				assert(subset_index < OTM_NUM_SUBSETS);

				const uint32_t ccs_index = tm.m_ccs_index + 1;
				assert(ccs_index < OTM_NUM_CCS);

				const uint32_t grid_size = (tm.m_grid_width >= (block_width - 1)) && (tm.m_grid_height >= (block_height - 1));
				const uint32_t grid_aniso = calc_grid_aniso_val(tm.m_grid_width, tm.m_grid_height, block_width, block_height);

				basisu::uint_vec& v = m_tm_groups[cem_index][subset_index][ccs_index][grid_size][grid_aniso];
				if (!v.capacity())
					v.reserve(64);

				v.push_back(tm_index);
			}

			uint32_t count_used_groups() const
			{
				uint32_t n = 0;

				for (uint32_t cem_iter = 0; cem_iter < OTM_NUM_CEMS; cem_iter++)
					for (uint32_t subsets_iter = 0; subsets_iter < OTM_NUM_SUBSETS; subsets_iter++)
						for (uint32_t ccs_iter = 0; ccs_iter < OTM_NUM_CCS; ccs_iter++)
							for (uint32_t grid_sizes_iter = 0; grid_sizes_iter < OTM_NUM_GRID_SIZES; grid_sizes_iter++)
								for (uint32_t grid_anisos_iter = 0; grid_anisos_iter < OTM_NUM_GRID_ANISOS; grid_anisos_iter++)
								{
									if (m_tm_groups[cem_iter][subsets_iter][ccs_iter][grid_sizes_iter][grid_anisos_iter].size())
										n++;
								}
				return n;
			}
		};

		extern grouped_trial_modes g_grouped_encoder_trial_modes[astc_helpers::cTOTAL_BLOCK_SIZES];

		inline const basisu::uint_vec& get_tm_candidates(const grouped_trial_modes& grouped_enc_trial_modes,
			uint32_t cem_index, uint32_t subset_index, uint32_t ccs_index, uint32_t grid_size, uint32_t grid_aniso)
		{
			assert(cem_index < OTM_NUM_CEMS);
			assert(subset_index < OTM_NUM_SUBSETS);
			assert(ccs_index < OTM_NUM_CCS);
			assert(grid_size < OTM_NUM_GRID_SIZES);
			assert(grid_aniso < OTM_NUM_GRID_ANISOS);

			const basisu::uint_vec& modes = grouped_enc_trial_modes.m_tm_groups[cem_index][subset_index][ccs_index][grid_size][grid_aniso];
			return modes;
		}

		const uint32_t CFG_PACK_GRID_BITS = 7;
		const uint32_t CFG_PACK_CEM_BITS = 3;
		const uint32_t CFG_PACK_CCS_BITS = 3;
		const uint32_t CFG_PACK_SUBSETS_BITS = 2;
		const uint32_t CFG_PACK_WISE_BITS = 4;
		const uint32_t CFG_PACK_EISE_BITS = 5;

		extern const int s_unique_ldr_index_to_astc_cem[6];

		enum class xuastc_mode
		{
			cMODE_SOLID = 0,
			cMODE_RAW = 1,

			// Full cfg, partition ID, and all endpoint value reuse.
			cMODE_REUSE_CFG_ENDPOINTS_LEFT = 2,
			cMODE_REUSE_CFG_ENDPOINTS_UP = 3,
			cMODE_REUSE_CFG_ENDPOINTS_DIAG = 4,

			cMODE_RUN = 5,

			cMODE_TOTAL,
		};

		enum class xuastc_zstd_mode
		{
			// len=1 bits
			cMODE_RAW = 0b0,

			// len=2 bits
			cMODE_RUN = 0b01,

			// len=4 bits
			cMODE_SOLID = 0b0011,
			cMODE_REUSE_CFG_ENDPOINTS_LEFT = 0b0111, 
			cMODE_REUSE_CFG_ENDPOINTS_UP = 0b1011,   
			cMODE_REUSE_CFG_ENDPOINTS_DIAG = 0b1111  
		};

		const uint32_t XUASTC_LDR_MODE_BYTE_IS_BASE_OFS_FLAG = 1 << 3;
		const uint32_t XUASTC_LDR_MODE_BYTE_PART_HASH_HIT = 1 << 4;
		const uint32_t XUASTC_LDR_MODE_BYTE_DPCM_ENDPOINTS_FLAG = 1 << 5;
		const uint32_t XUASTC_LDR_MODE_BYTE_TM_HASH_HIT_FLAG = 1 << 6;
		const uint32_t XUASTC_LDR_MODE_BYTE_USE_DCT = 1 << 7;

		enum class xuastc_ldr_syntax
		{
			cFullArith = 0,
			cHybridArithZStd = 1,
			cFullZStd = 2,

			cTotal
		};

		void create_encoder_trial_modes_table(uint32_t block_width, uint32_t block_height,
			basisu::vector<trial_mode>& encoder_trial_modes, grouped_trial_modes& grouped_encoder_trial_modes,
			bool print_debug_info, bool print_modes);

		extern basisu::vector<trial_mode> g_encoder_trial_modes[astc_helpers::cTOTAL_BLOCK_SIZES];
				
		inline uint32_t part_hash_index(uint32_t x)
		{
			// fib hash
			return (x * 2654435769u) & (PART_HASH_SIZE - 1);
		}

		// Full ZStd syntax only
		inline uint32_t tm_hash_index(uint32_t x)
		{
			// fib hash
			return (x * 2654435769u) & (TM_HASH_SIZE - 1);
		}

		// TODO: Some fields are unused during transcoding.
		struct prev_block_state
		{
			bool m_was_solid_color;
			bool m_used_weight_dct;
			bool m_first_endpoint_uses_bc;
			bool m_reused_full_cfg;
			bool m_used_part_hash;

			int m_tm_index; // -1 if invalid (solid color block)
			uint32_t m_base_cem_index; // doesn't include base+ofs
			uint32_t m_subset_index, m_ccs_index, m_grid_size, m_grid_aniso;

			prev_block_state()
			{
				clear();
			}

			void clear()
			{
				basisu::clear_obj(*this);
			}
		};

		struct prev_block_state_full_zstd
		{
			int m_tm_index; // -1 if invalid (solid color block)
			
			bool was_solid_color() const { return m_tm_index < 0; }

			prev_block_state_full_zstd()
			{
				clear();
			}

			void clear()
			{
				basisu::clear_obj(*this);
			}
		};

		inline uint32_t cem_to_ldrcem_index(uint32_t cem)
		{
			switch (cem)
			{
			case astc_helpers::CEM_LDR_LUM_DIRECT: return 0;
			case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: return 1;
			case astc_helpers::CEM_LDR_RGB_BASE_SCALE: return 2;
			case astc_helpers::CEM_LDR_RGB_DIRECT: return 3;
			case astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET: return 4;
			case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: return 5;
			case astc_helpers::CEM_LDR_RGBA_DIRECT: return 6;
			case astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET: return 7;
			default:
				assert(0);
				break;
			}

			return 0;
		}
				
		bool pack_base_offset(
			uint32_t cem_index, uint32_t dst_ise_endpoint_range, uint8_t* pPacked_endpoints,
			const color_rgba& l, const color_rgba& h,
			bool use_blue_contraction, bool auto_disable_blue_contraction_if_clamped,
			bool& blue_contraction_clamped_flag, bool& base_ofs_clamped_flag, bool& endpoints_swapped);

		bool convert_endpoints_across_cems(
			uint32_t prev_cem, uint32_t prev_endpoint_ise_range, const uint8_t* pPrev_endpoints,
			uint32_t dst_cem, uint32_t dst_endpoint_ise_range, uint8_t* pDst_endpoints,
			bool always_repack,
			bool use_blue_contraction, bool auto_disable_blue_contraction_if_clamped,
			bool& blue_contraction_clamped_flag, bool& base_ofs_clamped_flag);

		uint32_t get_total_unique_patterns(uint32_t astc_block_size_index, uint32_t num_parts);
		//uint16_t unique_pat_index_to_part_seed(uint32_t astc_block_size_index, uint32_t num_parts, uint32_t unique_pat_index);

		typedef bool (*xuastc_decomp_image_init_callback_ptr)(uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t block_width, uint32_t block_height, bool srgb_decode_profile, float dct_q, bool has_alpha, void* pData);
		typedef bool (*xuastc_decomp_image_block_callback_ptr)(uint32_t bx, uint32_t by, const astc_helpers::log_astc_block& log_blk, void* pData);
				
		bool xuastc_ldr_decompress_image(
			const uint8_t* pComp_data, size_t comp_data_size,
			uint32_t& astc_block_width, uint32_t& astc_block_height,
			uint32_t& actual_width, uint32_t& actual_height, bool& has_alpha, bool& uses_srgb_astc_decode_mode,
			bool debug_output,
			xuastc_decomp_image_init_callback_ptr pInit_callback, void *pInit_callback_data,
			xuastc_decomp_image_block_callback_ptr pBlock_callback, void *pBlock_callback_data);
		
	} // namespace astc_ldr_t

	namespace arith_fastbits_f32
	{
		enum { TABLE_BITS = 8 };               // 256..1024 entries typical (8..10)
		enum { TABLE_SIZE = 1 << TABLE_BITS };
		enum { MANT_BITS = 23 };
		enum { FRAC_BITS = MANT_BITS - TABLE_BITS };
		enum { FRAC_MASK = (1u << FRAC_BITS) - 1u };

		extern bool g_initialized;
		extern float g_lut_edge[TABLE_SIZE + 1]; // samples at m = 1 + i/TABLE_SIZE (for linear)

		inline void init()
		{
			if (g_initialized)
				return;

			const float inv_ln2 = 1.4426950408889634f; // 1/ln(2)

			for (int i = 0; i <= TABLE_SIZE; ++i)
			{
				float m = 1.0f + float(i) / float(TABLE_SIZE);      // m in [1,2]
				g_lut_edge[i] = logf(m) * inv_ln2;             // log2(m)
			}

			g_initialized = true;
		}

		inline void unpack(float p, int& e_unbiased, uint32_t& mant)
		{
			// kill any denorms
			if (p < FLT_MIN)
				p = 0;

			union { float f; uint32_t u; } x;
			x.f = p;
			e_unbiased = int((x.u >> 23) & 0xFF) - 127;
			mant = (x.u & 0x7FFFFFu); // 23-bit mantissa
		}

		// Returns estimated bits given probability p, approximates -log2f(p).
		inline float bits_from_prob_linear(float p)
		{
			assert((p > 0.0f) && (p <= 1.0f));
			if (!g_initialized)
				init();

			int e; uint32_t mant;
			unpack(p, e, mant);

			uint32_t idx = mant >> FRAC_BITS;            // 0..TABLE_SIZE-1
			uint32_t frac = mant & FRAC_MASK;             // low FRAC_BITS
			const float inv_scale = 1.0f / float(1u << FRAC_BITS);
			float t = float(frac) * inv_scale;                 // [0,1)

			float y0 = g_lut_edge[idx];
			float y1 = g_lut_edge[idx + 1];
			float log2m = y0 + t * (y1 - y0);

			return -(float(e) + log2m);
		}

	} // namespace arith_fastbits_f32

	namespace arith
	{
		// A simple range coder
		const uint32_t ArithMaxSyms = 2048;
		const uint32_t DMLenShift = 15u;
		const uint32_t DMMaxCount = 1u << DMLenShift;
		const uint32_t BMLenShift = 13u;
		const uint32_t BMMaxCount = 1u << BMLenShift;
		const uint32_t ArithMinLen = 1u << 24u;
		const uint32_t ArithMaxLen = UINT32_MAX;
		const uint32_t ArithMinExpectedDataBufSize = 5;

		class arith_bit_model
		{
		public:
			arith_bit_model()
			{
				reset();
			}

			void init()
			{
				reset();
			}

			void reset()
			{
				m_bit0_count = 1;
				m_bit_count = 2;
				m_bit0_prob = 1U << (BMLenShift - 1);
				m_update_interval = 4;
				m_bits_until_update = 4;
			}

			float get_price(bool bit) const
			{
				const float prob_0 = (float)m_bit0_prob / (float)BMMaxCount;
				const float prob = bit ? (1.0f - prob_0) : prob_0;
				const float bits = arith_fastbits_f32::bits_from_prob_linear(prob);
				assert(fabs(bits - (-log2f(prob))) < .00125f); // basic sanity check
				return bits;
			}

			void update()
			{
				assert(m_bit_count >= 2);
				assert(m_bit0_count < m_bit_count);

				if (m_bit_count >= BMMaxCount)
				{
					assert(m_bit_count && m_bit0_count);

					m_bit_count = (m_bit_count + 1) >> 1;
					m_bit0_count = (m_bit0_count + 1) >> 1;

					if (m_bit0_count == m_bit_count)
						++m_bit_count;

					assert(m_bit0_count < m_bit_count);
				}

				const uint32_t scale = 0x80000000U / m_bit_count;
				m_bit0_prob = (m_bit0_count * scale) >> (31 - BMLenShift);

				m_update_interval = basisu::clamp<uint32_t>((5 * m_update_interval) >> 2, 4u, 128);

				m_bits_until_update = m_update_interval;
			}

			void print_prices(const char* pDesc)
			{
				if (pDesc)
					printf("arith_data_model bit prices for model %s:\n", pDesc);
				for (uint32_t i = 0; i < 2; i++)
					printf("%u: %3.3f bits\n", i, get_price(i));
				printf("\n");
			}

		private:
			friend class arith_enc;
			friend class arith_dec;

			uint32_t m_bit0_prob;	// snapshot made at last update

			uint32_t m_bit0_count;	// live
			uint32_t m_bit_count;	// live

			int m_bits_until_update;
			uint32_t m_update_interval;
		};

		enum { cARITH_GAMMA_MAX_TAIL_CTX = 4, cARITH_GAMMA_MAX_PREFIX_CTX = 3 };
		struct arith_gamma_contexts
		{
			arith_bit_model m_ctx_prefix[cARITH_GAMMA_MAX_PREFIX_CTX]; // for unary continue prefix
			arith_bit_model m_ctx_tail[cARITH_GAMMA_MAX_TAIL_CTX];     // for binary suffix bits
		};

		class arith_data_model
		{
		public:
			arith_data_model() :
				m_num_data_syms(0),
				m_total_sym_freq(0),
				m_update_interval(0),
				m_num_syms_until_next_update(0)
			{
			}

			arith_data_model(uint32_t num_syms, bool faster_update = false) :
				m_num_data_syms(0),
				m_total_sym_freq(0),
				m_update_interval(0),
				m_num_syms_until_next_update(0)
			{
				init(num_syms, faster_update);
			}

			void clear()
			{
				m_cum_sym_freqs.clear();
				m_sym_freqs.clear();

				m_num_data_syms = 0;
				m_total_sym_freq = 0;
				m_update_interval = 0;
				m_num_syms_until_next_update = 0;
			}

			void init(uint32_t num_syms, bool faster_update = false)
			{
				assert((num_syms >= 2) && (num_syms <= ArithMaxSyms));

				m_num_data_syms = num_syms;

				m_sym_freqs.resize(num_syms);
				m_cum_sym_freqs.resize(num_syms + 1);

				reset(faster_update);
			}

			void reset(bool faster_update = false)
			{
				if (!m_num_data_syms)
					return;

				m_sym_freqs.set_all(1);
				m_total_sym_freq = m_num_data_syms;

				m_update_interval = m_num_data_syms;
				m_num_syms_until_next_update = 0;

				update(false);

				if (faster_update)
				{
					m_update_interval = basisu::clamp<uint32_t>((m_num_data_syms + 7) / 8, 4u, (m_num_data_syms + 6) << 3);
					m_num_syms_until_next_update = m_update_interval;
				}
			}

			void update(bool enc_flag)
			{
				assert(m_num_data_syms);
				BASISU_NOTE_UNUSED(enc_flag);

				if (!m_num_data_syms)
					return;

				while (m_total_sym_freq >= DMMaxCount)
				{
					m_total_sym_freq = 0;

					for (uint32_t n = 0; n < m_num_data_syms; n++)
					{
						m_sym_freqs[n] = (m_sym_freqs[n] + 1u) >> 1u;
						m_total_sym_freq += m_sym_freqs[n];
					}
				}

				const uint32_t scale = 0x80000000U / m_total_sym_freq;

				uint32_t sum = 0;
				for (uint32_t i = 0; i < m_num_data_syms; ++i)
				{
					assert(((uint64_t)scale * sum) <= UINT32_MAX);
					m_cum_sym_freqs[i] = (scale * sum) >> (31 - DMLenShift);
					sum += m_sym_freqs[i];
				}
				assert(sum == m_total_sym_freq);

				m_cum_sym_freqs[m_num_data_syms] = DMMaxCount;

				m_update_interval = basisu::clamp<uint32_t>((5 * m_update_interval) >> 2, 4u, (m_num_data_syms + 6) << 3);

				m_num_syms_until_next_update = m_update_interval;
			}

			float get_price(uint32_t sym_index) const
			{
				assert(sym_index < m_num_data_syms);

				if (sym_index >= m_num_data_syms)
					return 0.0f;

				const float prob = (float)(m_cum_sym_freqs[sym_index + 1] - m_cum_sym_freqs[sym_index]) / (float)DMMaxCount;

				const float bits = arith_fastbits_f32::bits_from_prob_linear(prob);
				assert(fabs(bits - (-log2f(prob))) < .00125f); // basic sanity check
				return bits;
			}

			void print_prices(const char* pDesc)
			{
				if (pDesc)
					printf("arith_data_model bit prices for model %s:\n", pDesc);
				for (uint32_t i = 0; i < m_num_data_syms; i++)
					printf("%u: %3.3f bits\n", i, get_price(i));
				printf("\n");
			}

			uint32_t get_num_data_syms() const { return m_num_data_syms; }

		private:
			friend class arith_enc;
			friend class arith_dec;

			uint32_t m_num_data_syms;

			basisu::uint_vec m_sym_freqs;		// live histogram
			uint32_t m_total_sym_freq; // always live vs. m_sym_freqs

			basisu::uint_vec m_cum_sym_freqs; // has 1 extra entry, snapshot from last update

			uint32_t m_update_interval;
			int m_num_syms_until_next_update;

			uint32_t get_last_sym_index() const { return m_num_data_syms - 1; }
		};

		class arith_enc
		{
		public:
			arith_enc()
			{
				clear();
			}

			void clear()
			{
				m_data_buf.clear();

				m_base = 0;
				m_length = ArithMaxLen;
			}

			void init(size_t reserve_size)
			{
				m_data_buf.reserve(reserve_size);
				m_data_buf.resize(0);

				m_base = 0;
				m_length = ArithMaxLen;

				// Place 8-bit marker at beginning. 
				// This virtually always guarantees no backwards carries can be lost at the very beginning of the stream. (Should be impossible with this design.)
				// It always pushes out 1 0 byte at the very beginning to absorb future carries.
				// Caller does this now, we send a tiny header anyway
				//put_bits(0x1, 8);
				//assert(m_data_buf[0] != 0xFF);
			}

			void put_bit(uint32_t bit)
			{
				m_length >>= 1;

				if (bit)
				{
					const uint32_t orig_base = m_base;

					m_base += m_length;

					if (orig_base > m_base)
						prop_carry();
				}

				if (m_length < ArithMinLen)
					renorm();
			}

			enum { cMaxPutBitsLen = 20 };
			void put_bits(uint32_t val, uint32_t num_bits)
			{
				assert(num_bits && (num_bits <= cMaxPutBitsLen));
				assert(val < (1u << num_bits));

				m_length >>= num_bits;

				const uint32_t orig_base = m_base;

				m_base += val * m_length;

				if (orig_base > m_base)
					prop_carry();

				if (m_length < ArithMinLen)
					renorm();
			}

			// returns # of bits actually written
			inline uint32_t put_truncated_binary(uint32_t v, uint32_t n)
			{
				assert((n >= 2) && (v < n));

				uint32_t k = basisu::floor_log2i(n);
				uint32_t u = (1 << (k + 1)) - n;

				if (v < u)
				{
					put_bits(v, k);
					return k;
				}

				uint32_t x = v + u;
				assert((x >> 1) >= u);

				put_bits(x >> 1, k);
				put_bits(x & 1, 1);
				return k + 1;
			}

			static inline uint32_t get_truncated_binary_bits(uint32_t v, uint32_t n)
			{
				assert((n >= 2) && (v < n));

				uint32_t k = basisu::floor_log2i(n);
				uint32_t u = (1 << (k + 1)) - n;

				if (v < u)
					return k;

#ifdef _DEBUG
				uint32_t x = v + u;
				assert((x >> 1) >= u);
#endif

				return k + 1;
			}

			inline uint32_t put_rice(uint32_t v, uint32_t m)
			{
				assert(m);

				uint32_t q = v >> m, r = v & ((1 << m) - 1);

				// rice coding sanity check
				assert(q <= 64);

				uint32_t total_bits = q;

				// TODO: put_bits the pattern inverted in bit order
				while (q)
				{
					put_bit(1);
					q--;
				}

				put_bit(0);

				put_bits(r, m);

				total_bits += (m + 1);

				return total_bits;
			}

			static inline uint32_t get_rice_price(uint32_t v, uint32_t m)
			{
				assert(m);

				uint32_t q = v >> m;

				// rice coding sanity check
				assert(q <= 64);

				uint32_t total_bits = q + 1 + m;

				return total_bits;
			}

			inline void put_gamma(uint32_t n, arith_gamma_contexts& ctxs)
			{
				assert(n);
				if (!n)
					return;

				const int k = basisu::floor_log2i(n);
				if (k > 16)
				{
					assert(0);
					return;
				}

				// prefix: k times '1' then a '0'
				for (int i = 0; i < k; ++i)
					encode(1, ctxs.m_ctx_prefix[basisu::minimum<int>(i, cARITH_GAMMA_MAX_PREFIX_CTX - 1)]);

				encode(0, ctxs.m_ctx_prefix[basisu::minimum(k, cARITH_GAMMA_MAX_PREFIX_CTX - 1)]);

				// suffix: the k low bits of n
				for (int i = k - 1; i >= 0; --i)
				{
					uint32_t bit = (n >> i) & 1u;
					encode(bit, ctxs.m_ctx_tail[basisu::minimum<int>(i, cARITH_GAMMA_MAX_TAIL_CTX - 1)]);
				}
			}

			inline float put_gamma_and_return_price(uint32_t n, arith_gamma_contexts& ctxs)
			{
				assert(n);
				if (!n)
					return 0.0f;

				const int k = basisu::floor_log2i(n);
				if (k > 16)
				{
					assert(0);
					return 0.0f;
				}

				float total_price = 0.0f;

				// prefix: k times '1' then a '0'
				for (int i = 0; i < k; ++i)
				{
					total_price += ctxs.m_ctx_prefix[basisu::minimum<int>(i, cARITH_GAMMA_MAX_PREFIX_CTX - 1)].get_price(1);
					encode(1, ctxs.m_ctx_prefix[basisu::minimum<int>(i, cARITH_GAMMA_MAX_PREFIX_CTX - 1)]);
				}

				total_price += ctxs.m_ctx_prefix[basisu::minimum(k, cARITH_GAMMA_MAX_PREFIX_CTX - 1)].get_price(0);
				encode(0, ctxs.m_ctx_prefix[basisu::minimum(k, cARITH_GAMMA_MAX_PREFIX_CTX - 1)]);

				// suffix: the k low bits of n
				for (int i = k - 1; i >= 0; --i)
				{
					uint32_t bit = (n >> i) & 1u;
					total_price += ctxs.m_ctx_tail[basisu::minimum<int>(i, cARITH_GAMMA_MAX_TAIL_CTX - 1)].get_price(bit);
					encode(bit, ctxs.m_ctx_tail[basisu::minimum<int>(i, cARITH_GAMMA_MAX_TAIL_CTX - 1)]);
				}

				return total_price;
			}

			// prediced price, won't be accurate if a binary arith model decides to update in between
			inline float get_gamma_price(uint32_t n, const arith_gamma_contexts& ctxs)
			{
				assert(n);
				if (!n)
					return 0.0f;

				const int k = basisu::floor_log2i(n);
				if (k > 16)
				{
					assert(0);
					return 0.0f;
				}

				float total_price = 0.0f;

				// prefix: k times '1' then a '0'
				for (int i = 0; i < k; ++i)
					total_price += ctxs.m_ctx_prefix[basisu::minimum<int>(i, cARITH_GAMMA_MAX_PREFIX_CTX - 1)].get_price(1);

				total_price += ctxs.m_ctx_prefix[basisu::minimum(k, cARITH_GAMMA_MAX_PREFIX_CTX - 1)].get_price(0);

				// suffix: the k low bits of n
				for (int i = k - 1; i >= 0; --i)
				{
					uint32_t bit = (n >> i) & 1u;
					total_price += ctxs.m_ctx_tail[basisu::minimum<int>(i, cARITH_GAMMA_MAX_TAIL_CTX - 1)].get_price(bit);
				}

				return total_price;
			}

			void encode(uint32_t bit, arith_bit_model& dm)
			{
				uint32_t x = dm.m_bit0_prob * (m_length >> BMLenShift);

				if (!bit)
				{
					m_length = x;
					++dm.m_bit0_count;
				}
				else
				{
					const uint32_t orig_base = m_base;
					m_base += x;
					m_length -= x;

					if (orig_base > m_base)
						prop_carry();
				}
				++dm.m_bit_count;

				if (m_length < ArithMinLen)
					renorm();

				if (--dm.m_bits_until_update <= 0)
					dm.update();
			}

			float encode_and_return_price(uint32_t bit, arith_bit_model& dm)
			{
				const float price = dm.get_price(bit);
				encode(bit, dm);
				return price;
			}

			void encode(uint32_t sym, arith_data_model& dm)
			{
				assert(sym < dm.m_num_data_syms);

				const uint32_t orig_base = m_base;

				if (sym == dm.get_last_sym_index())
				{
					uint32_t x = dm.m_cum_sym_freqs[sym] * (m_length >> DMLenShift);
					m_base += x;
					m_length -= x;
				}
				else
				{
					m_length >>= DMLenShift;
					uint32_t x = dm.m_cum_sym_freqs[sym] * m_length;
					m_base += x;
					m_length = dm.m_cum_sym_freqs[sym + 1] * m_length - x;
				}

				if (orig_base > m_base)
					prop_carry();

				if (m_length < ArithMinLen)
					renorm();

				++dm.m_sym_freqs[sym];
				++dm.m_total_sym_freq;

				if (--dm.m_num_syms_until_next_update <= 0)
					dm.update(true);
			}

			float encode_and_return_price(uint32_t sym, arith_data_model& dm)
			{
				const float price = dm.get_price(sym);
				encode(sym, dm);
				return price;
			}

			void flush()
			{
				const uint32_t orig_base = m_base;

				if (m_length <= (2 * ArithMinLen))
				{
					m_base += ArithMinLen >> 1;
					m_length = ArithMinLen >> 9;
				}
				else
				{
					m_base += ArithMinLen;
					m_length = ArithMinLen >> 1;
				}

				if (orig_base > m_base)
					prop_carry();

				renorm();

				// Pad output to min 5 bytes - quite conservative; we're typically compressing large streams so the overhead shouldn't matter.
				if (m_data_buf.size() < ArithMinExpectedDataBufSize)
					m_data_buf.resize(ArithMinExpectedDataBufSize);
			}

			basisu::uint8_vec& get_data_buf() { return m_data_buf; }
			const basisu::uint8_vec& get_data_buf() const { return m_data_buf; }

		private:
			basisu::uint8_vec m_data_buf;
			uint32_t m_base, m_length;

			inline void prop_carry()
			{
				int64_t ofs = m_data_buf.size() - 1;

				for (; (ofs >= 0) && (m_data_buf[(size_t)ofs] == 0xFF); --ofs)
					m_data_buf[(size_t)ofs] = 0;

				if (ofs >= 0)
					++m_data_buf[(size_t)ofs];
			}

			inline void renorm()
			{
				assert(m_length < ArithMinLen);
				do
				{
					m_data_buf.push_back((uint8_t)(m_base >> 24u));
					m_base <<= 8u;
					m_length <<= 8u;
				} while (m_length < ArithMinLen);
			}
		};

		class arith_dec
		{
		public:
			arith_dec()
			{
				clear();
			}

			void clear()
			{
				m_pData_buf = nullptr;
				m_pData_buf_last_byte = nullptr;
				m_pData_buf_cur = nullptr;
				m_data_buf_size = 0;

				m_value = 0;
				m_length = 0;
			}

			bool init(const uint8_t* pBuf, size_t buf_size)
			{
				if (buf_size < ArithMinExpectedDataBufSize)
				{
					assert(0);
					return false;
				}

				m_pData_buf = pBuf;
				m_pData_buf_last_byte = pBuf + buf_size - 1;
				m_pData_buf_cur = m_pData_buf + 4;
				m_data_buf_size = buf_size;

				m_value = ((uint32_t)(pBuf[0]) << 24u) | ((uint32_t)(pBuf[1]) << 16u) | ((uint32_t)(pBuf[2]) << 8u) | (uint32_t)(pBuf[3]);
				m_length = ArithMaxLen;

				// Check for the 8-bit marker we always place at the beginning of the stream.
				//uint32_t marker = get_bits(8);
				//if (marker != 0x1)
				//	return false;

				return true;
			}

			uint32_t get_bit()
			{
				assert(m_data_buf_size);

				m_length >>= 1;

				uint32_t bit = (m_value >= m_length);

				if (bit)
					m_value -= m_length;

				if (m_length < ArithMinLen)
					renorm();

				return bit;
			}

			enum { cMaxGetBitsLen = 20 };

			uint32_t get_bits(uint32_t num_bits)
			{
				assert(m_data_buf_size);

				if ((num_bits < 1) || (num_bits > cMaxGetBitsLen))
				{
					assert(0);
					return 0;
				}

				m_length >>= num_bits;
				assert(m_length);

				const uint32_t v = m_value / m_length;

				m_value -= m_length * v;

				if (m_length < ArithMinLen)
					renorm();

				return v;
			}

			uint32_t decode_truncated_binary(uint32_t n)
			{
				assert(n >= 2);

				const uint32_t k = basisu::floor_log2i(n);
				const uint32_t u = (1 << (k + 1)) - n;

				uint32_t result = get_bits(k);

				if (result >= u)
					result = ((result << 1) | get_bits(1)) - u;

				return result;
			}

			uint32_t decode_rice(uint32_t m)
			{
				assert(m);

				uint32_t q = 0;
				for (;;)
				{
					uint32_t k = get_bit();
					if (!k)
						break;

					q++;
					if (q > 64)
					{
						assert(0);
						return 0;
					}
				}

				return (q << m) + get_bits(m);
			}

			uint32_t decode_bit(arith_bit_model& dm)
			{
				assert(m_data_buf_size);

				uint32_t x = dm.m_bit0_prob * (m_length >> BMLenShift);
				uint32_t bit = (m_value >= x);

				if (bit == 0)
				{
					m_length = x;
					++dm.m_bit0_count;
				}
				else
				{
					m_value -= x;
					m_length -= x;
				}
				++dm.m_bit_count;

				if (m_length < ArithMinLen)
					renorm();

				if (--dm.m_bits_until_update <= 0)
					dm.update();

				return bit;
			}

			inline uint32_t decode_gamma(arith_gamma_contexts& ctxs)
			{
				int k = 0;
				while (decode_bit(ctxs.m_ctx_prefix[basisu::minimum<int>(k, cARITH_GAMMA_MAX_PREFIX_CTX - 1)]))
				{
					++k;

					if (k > 16)
					{
						// something is very wrong
						assert(0);
						return 0;
					}
				}

				int n = 1 << k;
				for (int i = k - 1; i >= 0; --i)
				{
					uint32_t bit = decode_bit(ctxs.m_ctx_tail[basisu::minimum<int>(i, cARITH_GAMMA_MAX_TAIL_CTX - 1)]);
					n |= (bit << i);
				}

				return n;
			}

			uint32_t decode_sym(arith_data_model& dm)
			{
				assert(m_data_buf_size);
				assert(dm.m_num_data_syms);

				uint32_t x = 0, y = m_length;

				m_length >>= DMLenShift;

				uint32_t low_idx = 0, hi_idx = dm.m_num_data_syms;
				uint32_t mid_idx = hi_idx >> 1;

				do
				{
					uint32_t z = m_length * dm.m_cum_sym_freqs[mid_idx];

					if (z > m_value)
					{
						hi_idx = mid_idx;
						y = z;
					}
					else
					{
						low_idx = mid_idx;
						x = z;
					}
					mid_idx = (low_idx + hi_idx) >> 1;

				} while (mid_idx != low_idx);

				m_value -= x;
				m_length = y - x;

				if (m_length < ArithMinLen)
					renorm();

				++dm.m_sym_freqs[low_idx];
				++dm.m_total_sym_freq;

				if (--dm.m_num_syms_until_next_update <= 0)
					dm.update(false);

				return low_idx;
			}

		private:
			const uint8_t* m_pData_buf;
			const uint8_t* m_pData_buf_last_byte;
			const uint8_t* m_pData_buf_cur;
			size_t m_data_buf_size;

			uint32_t m_value, m_length;

			inline void renorm()
			{
				do
				{
					const uint32_t next_byte = (m_pData_buf_cur > m_pData_buf_last_byte) ? 0 : *m_pData_buf_cur++;

					m_value = (m_value << 8u) | next_byte;

				} while ((m_length <<= 8u) < ArithMinLen);
			}
		};

	} // namespace arith
#endif // BASISD_SUPPORT_XUASTC

#if BASISD_SUPPORT_XUASTC
	namespace bc7u
	{
		int determine_bc7_mode(const void* pBlock);
		int determine_bc7_mode_4_index_mode(const void* pBlock);
		int determine_bc7_mode_4_or_5_rotation(const void* pBlock);
		bool unpack_bc7_mode6(const void* pBlock_bits, color_rgba* pPixels);
		bool unpack_bc7(const void* pBlock, color_rgba* pPixels);
	} // namespace bc7u

	namespace bc7f 
	{
		enum
		{
			// Low-level BC7 encoder configuration flags.
			cPackBC7FlagUse2SubsetsRGB = 1, // use mode 1/3 for RGB blocks
			cPackBC7FlagUse2SubsetsRGBA = 2, // use mode 7 for RGBA blocks

			cPackBC7FlagUse3SubsetsRGB = 4, // also use mode 0/2, cPackBC7FlagUse2SubsetsRGB MUST be enabled too

			cPackBC7FlagUseDualPlaneRGB = 8, // enable mode 4/5 usage for RGB blocks
			cPackBC7FlagUseDualPlaneRGBA = 16, // enable mode 4/5 usage for RGBA blocks

			cPackBC7FlagPBitOpt = 32, // enable to disable usage of fixed p-bits on some modes; slower
			cPackBC7FlagPBitOptMode6 = 64, // enable to disable usage of fixed p-bits on mode 6, alpha on fully opaque blocks may be 254 however; slower

			cPackBC7FlagUseTrivialMode6 = 128, // enable trivial fast mode 6 encoder on blocks with very low variances (highly recommended)

			cPackBC7FlagPartiallyAnalyticalRGB = 256, // partially analytical mode for RGB blocks, slower but higher quality, computes actual SSE's on complex blocks to resolve which mode to use vs. predictions
			cPackBC7FlagPartiallyAnalyticalRGBA = 512, // partially analytical mode for RGBA blocks, slower but higher quality, computes actual SSE's on complex blocks to resolve which mode to use vs. predictions

			// Non-analytical is really still partially analytical on the mode pairs (0 vs. 2, 1 vs 3, 4 vs. 5).
			cPackBC7FlagNonAnalyticalRGB = 1024, // very slow/brute force, totally abuses the encoder, MUST use with cPackBC7FlagPartiallyAnalyticalRGB flag
			cPackBC7FlagNonAnalyticalRGBA = 2048, // very slow/brute force, totally abuses the encoder, MUST use with cPackBC7FlagPartiallyAnalyticalRGBA flag

			// Default to use first:

			// Decent analytical BC7 defaults
			cPackBC7FlagDefaultFastest = cPackBC7FlagUseTrivialMode6, // very weak particularly on alpha, mode 6 only for RGB/RGBA, 

			// Mode 6 with pbits for RGB, Modes 4,5,6 for alpha.
			cPackBC7FlagDefaultFaster = cPackBC7FlagPBitOpt | cPackBC7FlagUseDualPlaneRGBA | cPackBC7FlagUseTrivialMode6,

			cPackBC7FlagDefaultFast = cPackBC7FlagUse2SubsetsRGB | cPackBC7FlagUse2SubsetsRGBA | cPackBC7FlagUseDualPlaneRGBA |
				cPackBC7FlagPBitOpt | cPackBC7FlagUseTrivialMode6,

			cPackBC7FlagDefault = (cPackBC7FlagUse2SubsetsRGB | cPackBC7FlagUse2SubsetsRGBA | cPackBC7FlagUse3SubsetsRGB) |
				(cPackBC7FlagUseDualPlaneRGB | cPackBC7FlagUseDualPlaneRGBA) |
				(cPackBC7FlagPBitOpt | cPackBC7FlagPBitOptMode6) |
				cPackBC7FlagUseTrivialMode6,

			// Default partially analytical BC7 defaults (slower)
			cPackBC7FlagDefaultPartiallyAnalytical = cPackBC7FlagDefault | (cPackBC7FlagPartiallyAnalyticalRGB | cPackBC7FlagPartiallyAnalyticalRGBA),

			// Default non-analytical BC7 defaults (very slow). In reality the encoder is still analytical on the mode pairs, but at the highest level is non-analytical.
			cPackBC7FlagDefaultNonAnalytical = (cPackBC7FlagDefaultPartiallyAnalytical | (cPackBC7FlagNonAnalyticalRGB | cPackBC7FlagNonAnalyticalRGBA)) & ~cPackBC7FlagUseTrivialMode6
		};

		void init();
		
		void fast_pack_bc7_rgb_analytical(uint8_t* pBlock, const color_rgba* pPixels, uint32_t flags);
		uint32_t fast_pack_bc7_rgb_partial_analytical(uint8_t* pBlock, const color_rgba* pPixels, uint32_t flags);

		void fast_pack_bc7_rgba_analytical(uint8_t* pBlock, const color_rgba* pPixels, uint32_t flags);
		uint32_t fast_pack_bc7_rgba_partial_analytical(uint8_t* pBlock, const color_rgba* pPixels, uint32_t flags);
		
		uint32_t fast_pack_bc7_auto_rgba(uint8_t* pBlock, const color_rgba* pPixels, uint32_t flags);

		void print_perf_stats();

#if 0
		// Very basic BC7 mode 6 only to ASTC.
		void fast_pack_astc(void* pBlock, const color_rgba* pPixels);
#endif
		
		uint32_t calc_sse(const uint8_t* pBlock, const color_rgba* pPixels);

	} // namespace bc7f

	namespace etc1f
	{
		struct pack_etc1_state
		{
			uint64_t m_prev_solid_block;
			//decoder_etc_block m_prev_solid_block;

			int m_prev_solid_r8;
			int m_prev_solid_g8;
			int m_prev_solid_b8;

			pack_etc1_state()
			{
				clear();
			}

			void clear()
			{
				m_prev_solid_r8 = -1;
				m_prev_solid_g8 = -1;
				m_prev_solid_b8 = -1;
			}
		};

		void init();

		void pack_etc1_solid(uint8_t* pBlock, const color_rgba& color, pack_etc1_state& state, bool init_flag = false);

		void pack_etc1(uint8_t* pBlock, const color_rgba* pPixels, pack_etc1_state& state);
		
		void pack_etc1_grayscale(uint8_t* pBlock, const uint8_t* pPixels, pack_etc1_state& state);

	} // namespace etc1f
#endif // BASISD_SUPPORT_XUASTC

	// Private/internal XUASTC LDR transcoding helpers

	// XUASTC LDR formats only
	enum class transcoder_texture_format;
	block_format xuastc_get_block_format(transcoder_texture_format tex_fmt);

#if BASISD_SUPPORT_XUASTC
	// Low-quality, but fast, PVRTC1 RGB/RGBA encoder. Power of 2 texture dimensions required.
	// Note: Not yet part of our public API: this API may change! 
	void encode_pvrtc1(
		block_format fmt, void* pDst_blocks,
		const basisu::vector2D<color32>& temp_image,
		uint32_t dst_num_blocks_x, uint32_t dst_num_blocks_y, bool from_alpha);
		
	void transcode_4x4_block(
		block_format fmt, // desired output block format
		uint32_t block_x, uint32_t block_y, // 4x4 block being processed
		void* pDst_blocks, // base pointer to output buffer/bitmap
		uint8_t* pDst_block_u8, // pointer to output block/or first pixel to write
		const color32* block_pixels, // pointer to 4x4 (16) 32bpp RGBA pixels
		uint32_t output_block_or_pixel_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels, uint32_t output_rows_in_pixels, // output buffer dimensions
		int channel0, int channel1, // channels to process, used by some block formats
		bool high_quality, bool from_alpha,	// Flags specific to certain block formats
		uint32_t bc7f_flags, // Real-time bc7f BC7 encoder flags, see bc7f::cPackBC7FlagDefault etc.
		etc1f::pack_etc1_state& etc1_pack_state,  // etc1f thread local state
		int has_alpha = -1); // has_alpha = -1 unknown, 0=definitely no (a all 255's), 1=potentially yes
#endif // BASISD_SUPPORT_XUASTC

	struct bc7_mode_5
	{
		union
		{
			struct
			{
				uint64_t m_mode : 6;
				uint64_t m_rot : 2;

				uint64_t m_r0 : 7;
				uint64_t m_r1 : 7;
				uint64_t m_g0 : 7;
				uint64_t m_g1 : 7;
				uint64_t m_b0 : 7;
				uint64_t m_b1 : 7;
				uint64_t m_a0 : 8;
				uint64_t m_a1_0 : 6;

			} m_lo;

			uint64_t m_lo_bits;
		};

		union
		{
			struct
			{
				uint64_t m_a1_1 : 2;

				// bit 2
				uint64_t m_c00 : 1;
				uint64_t m_c10 : 2;
				uint64_t m_c20 : 2;
				uint64_t m_c30 : 2;

				uint64_t m_c01 : 2;
				uint64_t m_c11 : 2;
				uint64_t m_c21 : 2;
				uint64_t m_c31 : 2;

				uint64_t m_c02 : 2;
				uint64_t m_c12 : 2;
				uint64_t m_c22 : 2;
				uint64_t m_c32 : 2;

				uint64_t m_c03 : 2;
				uint64_t m_c13 : 2;
				uint64_t m_c23 : 2;
				uint64_t m_c33 : 2;

				// bit 33
				uint64_t m_a00 : 1;
				uint64_t m_a10 : 2;
				uint64_t m_a20 : 2;
				uint64_t m_a30 : 2;

				uint64_t m_a01 : 2;
				uint64_t m_a11 : 2;
				uint64_t m_a21 : 2;
				uint64_t m_a31 : 2;

				uint64_t m_a02 : 2;
				uint64_t m_a12 : 2;
				uint64_t m_a22 : 2;
				uint64_t m_a32 : 2;

				uint64_t m_a03 : 2;
				uint64_t m_a13 : 2;
				uint64_t m_a23 : 2;
				uint64_t m_a33 : 2;

			} m_hi;

			uint64_t m_hi_bits;
		};
	};

} // namespace basist