/****************************************************************************** * Universal Analytics for C * -- URL encoding module for UTF-8 compatibility with Google Analytics * Copyright (c) 2013, Analytics Pros * * This project is free software, distributed under the BSD license. * Analytics Pros offers consulting and integration services if your firm needs * assistance in strategy, implementation, or auditing existing work. ******************************************************************************/ #include #include #include #include #include #include "encode.h" /* The following constants are symbolized for readability in later encoding phases */ #define ASCII_HIGH 0x7F #define UTF8_INVALID 0xFFFF #define UTF8_SWAPCHAR 0xFFFE #define UTF8_HIGH_TWO_BYTES 0x7FF #define UTF8_HIGH_THREE_BYTES 0xFFFD #define UTF8_HIGH_FOUR_BYTES 0x1FFFFF #define UTF8_HIGH_FIVE_BYTES 0x3FFFFFF #define UTF8_HIGH_SIX_BYTES 0x7FFFFFFF #define _minimum(a, b) ((a < b) ? a : b) /* Mapping for hexadecimal conversion */ static const char _hexchar[] = "0123456789abcdef"; // Write a hexidecimal value (32 bit) to a character buffer unsigned int hexadecimal(char* output, unsigned int value){ assert(NULL != output); __uint a = (value >> 28) & 0xF; __uint b = (value >> 24) & 0xF; __uint c = (value >> 20) & 0xF; __uint d = (value >> 16) & 0xF; __uint e = (value >> 12) & 0xF; __uint f = (value >> 8) & 0xF; __uint g = (value >> 4) & 0xF; __uint h = (value & 0xF); __uint i = 0; if(a) output[i++] = _hexchar[ (int) a ]; if(b || a) output[i++] = _hexchar[ (int) b ]; if(c || b || a) output[i++] = _hexchar[ (int) c ]; if(d || c || b || a) output[i++] = _hexchar[ (int) d ]; if(e || d || c || b || a) output[i++] = _hexchar[ (int) e ]; if(f || e || d || c || b || a) output[i++] = _hexchar[ (int) f ]; if(g || f || e || d || c || b || a) output[i++] = _hexchar[ (int) g ]; output[i++] = _hexchar[ (int) h ]; return i; } // Primarily intended to aid the translation of binary MD5 digests unsigned int hexdigest(char* hex_output, unsigned char* binary, unsigned int binary_len){ unsigned int i; unsigned int o = 0; for(i = 0; i < binary_len; i++){ o += hexadecimal(hex_output + o, (unsigned int) binary[i]); } return o; } static int isASCIIcompat_char(char char_val){ return (char_val == 0x09 || char_val == 0x0A || char_val == 0x0D || (0x20 <= char_val && char_val <= 0x7F)); } size_t urlencode_put(char* result, size_t result_max, const char *multibyte_input, size_t input_len){ assert(NULL != result); assert(NULL != multibyte_input); unsigned int i = 0; unsigned int r = 0; unsigned int offset = 0; wchar_t current; mbtowc(NULL, NULL, 0); // Reset multibyte state do { // Convert the current multi-byte character into a wide representation (i.e. unsigned long int) offset += mbtowc(& current, (multibyte_input) + (offset), MB_CUR_MAX); if(current == 0){ break; // Stop on NULL termination } // Spaces are encoded as "plus" (+) else if(current == ' ' && r < result_max){ result[r++] = '+'; continue; } // These characters are allowed as literals else if((iswalnum(current) || current == '-' || current == '.' || current == '~') && r < result_max){ result[r++] = (char)current; } // Standard ASCII characters are encoded simply else if(isASCIIcompat_char((char)current) && r < result_max){ result[r++] = '%'; r += hexadecimal(result + r, (unsigned int) (current & 0xFF)); } // The method |mbtowc| (above) takes care of splitting UTF8 bytes, // so this can run directly... else if(current >= ASCII_HIGH && current <= UTF8_HIGH_SIX_BYTES && (r +2) < result_max){ result[r++] = '%'; r+= hexadecimal(result + r, (unsigned int) current); } // This would seem to be an encoding error. // Considering fall-back to "hexdigest" representation. else if(result_max > r) { result[r++] = '*'; break; } } while((i++) < input_len && (r < result_max)); return r; } size_t urlencode_put_limit(const char* mb_input, char* output, size_t output_limit){ assert(NULL != mb_input); assert(NULL != output); size_t input_len = mbstowcs(NULL, mb_input, 0) +1; memset(output, 0, output_limit); return urlencode_put(output, output_limit, mb_input, input_len); } /* Create a new character buffer, and write the given input as URL-encoded (UTF-8) */ char* urlencode(const char* mb_input){ assert(NULL != mb_input); size_t input_len = mbstowcs(NULL, mb_input, 0) +1; // Prepare the output buffer; in some cases each character input could result // in 12 characters encoded output ['%' + XX (hex), for up to 4 bytes] unsigned long int output_allocation = sizeof(char) * ((unsigned long int)input_len * 12); char* output = (char*)malloc(output_allocation); memset(output, 0, output_allocation); urlencode_put(output, output_allocation, mb_input, input_len); return output; } /* For compatibility with our former encoding model... */ unsigned int encodeURIComponent(const char input[], char output[], const unsigned int input_len, const unsigned int output_max){ return (unsigned int)urlencode_put(output, output_max, input, input_len); }