179 lines
5.1 KiB
C++
179 lines
5.1 KiB
C++
/******************************************************************************
* Universal Analytics for C
* -- URL encoding module for UTF-8 compatibility with Google Analytics
* Copyright (c) 2013, Analytics Pros
*
* This project is free software, distributed under the BSD license.
* Analytics Pros offers consulting and integration services if your firm needs
* assistance in strategy, implementation, or auditing existing work.
******************************************************************************/
|
|
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <wctype.h>
|
|
|
|
#include "encode.h"
|
|
|
|
/* The following constants are symbolized for readability in later encoding phases */
#define ASCII_HIGH 0x7F                  /* highest 7-bit ASCII code */
#define UTF8_INVALID 0xFFFF
#define UTF8_SWAPCHAR 0xFFFE
#define UTF8_HIGH_TWO_BYTES 0x7FF        /* 11 payload bits */
#define UTF8_HIGH_THREE_BYTES 0xFFFD     /* 0xFFFE and 0xFFFF are named separately above */
#define UTF8_HIGH_FOUR_BYTES 0x1FFFFF    /* 21 payload bits */
#define UTF8_HIGH_FIVE_BYTES 0x3FFFFFF   /* 26 payload bits */
#define UTF8_HIGH_SIX_BYTES 0x7FFFFFFF   /* 31 payload bits */

/*
 * Smaller of two values.
 * Every argument is parenthesized so that operands containing low-precedence
 * operators (e.g. `x & 1`) compare as written. Each argument may still be
 * evaluated twice, so do not pass expressions with side effects.
 */
#define _minimum(a, b) (((a) < (b)) ? (a) : (b))
|
|
|
|
|
|
/* Mapping for hexadecimal conversion */
static const char _hexchar[] = "0123456789abcdef";

/*
 * Write the low 32 bits of `value` to `output` as lowercase hexadecimal
 * digits, most significant digit first, without leading zeros. A value of
 * zero is written as the single digit "0".
 *
 * output: destination buffer; must have room for up to 8 characters.
 *         No NUL terminator is written.
 * value:  number to convert.
 *
 * Returns the number of characters written (1 to 8).
 */
unsigned int hexadecimal(char* output, unsigned int value){
    unsigned int written = 0;
    int shift;

    assert(NULL != output);

    /* Walk the seven upper nibbles; a digit is emitted once any non-zero
       digit has been seen, which suppresses leading zeros only. */
    for(shift = 28; shift > 0; shift -= 4){
        const unsigned int nibble = (value >> shift) & 0xFu;
        if(nibble != 0 || written != 0){
            output[written++] = _hexchar[nibble];
        }
    }

    /* The lowest nibble is always emitted, so zero still produces "0" */
    output[written++] = _hexchar[value & 0xFu];

    return written;
}
|
|
|
|
/*
 * Render a binary buffer as lowercase hexadecimal text; primarily intended to
 * aid the translation of binary MD5 digests.
 *
 * Every input byte produces exactly two digits (0x0a -> "0a"). Dropping the
 * leading zero of small bytes would shorten the digest and make different
 * inputs render identically.
 *
 * hex_output: destination; must hold at least 2 * binary_len characters.
 *             No NUL terminator is written.
 * binary:     bytes to convert (not modified).
 * binary_len: number of bytes in `binary`.
 *
 * Returns the number of characters written (2 * binary_len).
 */
unsigned int hexdigest(char* hex_output, unsigned char* binary, unsigned int binary_len){
    static const char digits[] = "0123456789abcdef";
    unsigned int i;
    unsigned int o = 0;

    assert(0 == binary_len || (NULL != hex_output && NULL != binary));

    for(i = 0; i < binary_len; i++){
        hex_output[o++] = digits[binary[i] >> 4];
        hex_output[o++] = digits[binary[i] & 0x0F];
    }
    return o;
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Non-zero for tab, line feed, carriage return and the range 0x20..0x7F */
static int isASCIIcompat_char(char char_val){
    return (char_val == 0x09 || char_val == 0x0A || char_val == 0x0D || (0x20 <= char_val && char_val <= 0x7F));
}

/* Non-zero when the byte is copied to the output unescaped: ASCII letters,
   digits, '-', '.' and '~'. Deliberately independent of the C locale. */
static int urlencode_is_literal(unsigned char byte){
    return (byte >= '0' && byte <= '9')
        || (byte >= 'A' && byte <= 'Z')
        || (byte >= 'a' && byte <= 'z')
        || byte == '-' || byte == '.' || byte == '~';
}

/* Byte length (2..4) of the multi-byte UTF-8 sequence starting at `sequence`,
   or 0 when the lead byte is not a valid lead or a continuation byte is
   missing. A NUL byte is never a continuation byte, so this cannot read past
   the terminator of a truncated string. */
static size_t urlencode_utf8_length(const unsigned char* sequence){
    const unsigned char lead = sequence[0];
    size_t length;
    size_t k;

    if(lead >= 0xC2 && lead <= 0xDF) length = 2;
    else if(lead >= 0xE0 && lead <= 0xEF) length = 3;
    else if(lead >= 0xF0 && lead <= 0xF4) length = 4;
    else return 0;

    for(k = 1; k < length; k++){
        if((sequence[k] & 0xC0) != 0x80) return 0;
    }
    return length;
}

/*
 * URL-encode UTF-8 text into a caller-supplied buffer.
 *
 *   - ASCII letters, digits, '-', '.' and '~' are copied as-is;
 *   - a space becomes '+';
 *   - every other accepted character is written as "%xx" (two lowercase hex
 *     digits) for each of its UTF-8 bytes, e.g. U+20AC -> "%e2%82%ac";
 *   - an ASCII control character other than tab/LF/CR, or a malformed UTF-8
 *     sequence, writes a single '*' marker and stops the conversion.
 *
 * result:          destination buffer; it is NOT NUL-terminated by this call.
 * result_max:      number of bytes that may be written to `result`.
 * multibyte_input: NUL-terminated UTF-8 input.
 * input_len:       maximum number of characters (not bytes) to convert.
 *
 * A character is written completely or not at all: conversion stops as soon
 * as the next character would not fit in the remaining space.
 *
 * Returns the number of bytes written to `result` (never more than result_max).
 */
size_t urlencode_put(char* result, size_t result_max, const char *multibyte_input, size_t input_len){

    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char* cursor = (const unsigned char*) multibyte_input;
    size_t consumed = 0;   /* characters converted so far */
    size_t r = 0;          /* bytes written so far */

    assert(NULL != result);
    assert(NULL != multibyte_input);

    while(consumed < input_len && *cursor != 0){
        const unsigned char lead = *cursor;
        size_t width = 1;  /* input bytes occupied by the current character */
        size_t k;

        if(lead == ' ' || urlencode_is_literal(lead)){
            if(r >= result_max) break;
            result[r++] = (lead == ' ') ? '+' : (char) lead;
        }
        else {
            if(lead < 0x80){
                /* Plain ASCII: only the "compatible" subset is escaped */
                if(! isASCIIcompat_char((char) lead)) width = 0;
            }
            else {
                width = urlencode_utf8_length(cursor);
            }

            // This would seem to be an encoding error.
            if(width == 0){
                if(r < result_max) result[r++] = '*';
                break;
            }

            /* Each input byte expands to three output bytes */
            if(result_max - r < 3 * width) break;

            for(k = 0; k < width; k++){
                result[r++] = '%';
                result[r++] = hex_digits[cursor[k] >> 4];
                result[r++] = hex_digits[cursor[k] & 0x0F];
            }
        }

        cursor += width;
        consumed++;
    }

    return r;

}
|
|
|
|
/*
 * URL-encode `mb_input` into `output`, which holds `output_limit` bytes.
 *
 * The whole buffer is zeroed first and the final byte is reserved, so the
 * result is always NUL-terminated, even when the encoded text is truncated.
 *
 * The input length handed to urlencode_put is the byte length of the string.
 * A multi-byte string never has more characters than bytes, so this is a safe
 * upper bound; unlike mbstowcs() it cannot fail on malformed input and does
 * not depend on the active locale.
 *
 * Returns the number of bytes written, excluding the terminator
 * (0 when output_limit is 0).
 */
size_t urlencode_put_limit(const char* mb_input, char* output, size_t output_limit){
    assert(NULL != mb_input);
    assert(NULL != output);

    if(0 == output_limit){
        return 0;
    }

    memset(output, 0, output_limit);

    return urlencode_put(output, output_limit - 1, mb_input, strlen(mb_input));
}
|
|
|
|
/*
 * Create a new character buffer, and write the given input as URL-encoded (UTF-8).
 *
 * mb_input: NUL-terminated text to encode.
 *
 * Returns a heap-allocated, NUL-terminated string that the caller must
 * release with free(), or NULL when the buffer cannot be allocated.
 */
char* urlencode(const char* mb_input){
    size_t input_bytes;
    size_t capacity;
    char* output;

    assert(NULL != mb_input);

    input_bytes = strlen(mb_input);

    // Prepare the output buffer; each input byte can expand to at most three
    // characters of output ['%' + XX (hex)], plus one byte for the terminator
    if(input_bytes > (((size_t) -1) - 1) / 3){
        return NULL; /* size computation would overflow */
    }
    capacity = (input_bytes * 3) + 1;

    /* calloc zero-fills, so the unused tail doubles as the NUL terminator */
    output = (char*) calloc(capacity, sizeof(char));
    if(NULL == output){
        return NULL;
    }

    urlencode_put(output, capacity - 1, mb_input, input_bytes);
    return output;
}
|
|
|
|
|
|
/*
 * Legacy entry point kept for compatibility with our former encoding model.
 * Forwards to urlencode_put with the arguments reordered and narrows the
 * resulting byte count to unsigned int.
 */
unsigned int encodeURIComponent(char input[], char output[], const unsigned int input_len, const unsigned int output_max){
    const size_t written = urlencode_put(output, output_max, input, input_len);

    return (unsigned int) written;
}
|
|
|
|
|
|
|
|
|