From 3c539e69ce6ddf6421ee8b7ee590ef6ca6e314c4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Michal=20Mal=C3=BD?=
Date: Wed, 10 Feb 2016 17:39:49 +0100
Subject: [PATCH] - Convert correctly from ISO-8859-1 to UTF-8 - Fix new format type autodetection

---
 libhpcs.c   | 86 ++++++++++++++++++++++++++++++++++++++++++++++-------
 libhpcs_p.h |  3 ++-
 2 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/libhpcs.c b/libhpcs.c
index 40baaea..7d56eaa 100644
--- a/libhpcs.c
+++ b/libhpcs.c
@@ -283,7 +283,7 @@ static enum HPCS_ParseCode autodetect_file_type(FILE* datafile, enum HPCS_FileTy
 	enum HPCS_ParseCode pret;
 	const HPCS_offset devsig_info_offset = OLD_FORMAT(gentype) ? DATA_OFFSET_DEVSIG_INFO_OLD : DATA_OFFSET_DEVSIG_INFO;
 
-	pret = read_string_at_offset(datafile, devsig_info_offset, &type_id, gentype);
+	pret = read_string_at_offset(datafile, devsig_info_offset, &type_id, OLD_FORMAT(gentype));
 	if (pret != PARSE_OK)
 		return pret;
 
@@ -790,7 +790,7 @@ static enum HPCS_ParseCode read_file_header(FILE* datafile, enum HPCS_ChemStatio
 		return pret;
 	}
 
-	pret = autodetect_file_type(datafile, &mdata->file_type, p_means_pressure(*cs_ver), old_format);
+	pret = autodetect_file_type(datafile, &mdata->file_type, p_means_pressure(*cs_ver), gentype);
 	if (pret != PARSE_OK) {
 		PR_DEBUG("Cannot determine the type of file\n");
 		return pret;
@@ -1069,6 +1069,8 @@ static enum HPCS_ParseCode __read_string_at_offset_v1(FILE* datafile, const HPCS
 {
 	size_t r;
 	char ch;
+	char* string;
+	enum HPCS_ParseCode ret;
 	size_t str_length = 0;
 
 	fseek(datafile, offset, SEEK_SET);
@@ -1090,21 +1092,28 @@ static enum HPCS_ParseCode __read_string_at_offset_v1(FILE* datafile, const HPCS
 	}
 
 	/* Allocate read buffer */
-	*result = calloc(str_length + 1, SMALL_SEGMENT_SIZE);
-	if (*result == NULL)
+	string = calloc(str_length + 1, SMALL_SEGMENT_SIZE);
+	if (string == NULL)
 		return PARSE_E_NO_MEM;
 
-	memset(*result, 0, (str_length + 1));
+	memset(string, 0, (str_length + 1));
 
 	/* Rewind the file and read the string */
 	fseek(datafile, offset, SEEK_SET);
-	r = fread(*result, SMALL_SEGMENT_SIZE, str_length, datafile);
+	r = fread(string, SMALL_SEGMENT_SIZE, str_length, datafile);
 	if (r < str_length) {
-		free(*result);
+		free(string);
 		return PARSE_E_CANT_READ;
 	}
 
-	return PARSE_OK;
+#ifdef _WIN32
+	ret = __win32_latin1_to_utf8(result, string);
+#else
+	ret = __unix_data_to_utf8(result, string, "ISO-8859-1", str_length);
+#endif
+	free(string);
+
+	return ret;
 }
 
 static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS_offset offset, char** const result)
@@ -1146,7 +1155,7 @@ static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS
 	ret = __win32_wchar_to_utf8(result, (WCHAR*)string);
 #else
 	/* Explicitly convert from UTF-16LE (internal WCHAR representation) */
-	ret = __unix_wchar_to_utf8(result, string, str_length * SEGMENT_SIZE);
+	ret = __unix_data_to_utf8(result, string, "UTF-16LE", str_length * SEGMENT_SIZE);
 #endif
 	free(string);
 
@@ -1247,6 +1256,61 @@ static enum HPCS_ParseCode __win32_parse_native_method_info_line(char** name, ch
 	return PARSE_OK;
 }
 
+/*
+ * Converts a NUL-terminated ISO-8859-1 (code page 28591) string to UTF-8.
+ * On PARSE_OK, *target is a malloc()ed string that the caller must free.
+ * On any other result every buffer allocated here has already been released.
+ */
+static enum HPCS_ParseCode __win32_latin1_to_utf8(char** target, const char *s)
+{
+	wchar_t* intermediate;
+	int mb_size;
+	int w_size;
+
+	/* Pass 1: count the wide characters needed, terminating NUL included */
+	w_size = MultiByteToWideChar(28591, MB_ERR_INVALID_CHARS, s, -1, NULL, 0);
+	if (w_size == 0) {
+		PR_DEBUGF("Count MultiByteToWideChar() error 0x%x\n", GetLastError());
+		return PARSE_E_INTERNAL;
+	}
+	PR_DEBUGF("w_size: %d\n", w_size);
+
+	intermediate = malloc(sizeof(wchar_t) * w_size);
+	if (intermediate == NULL)
+		return PARSE_E_NO_MEM;
+
+	/* Pass 2: the real buffer size is required, zero would only count again and write nothing */
+	if (MultiByteToWideChar(28591, MB_ERR_INVALID_CHARS, s, -1, intermediate, w_size) == 0) {
+		PR_DEBUGF("Convert MultiByteToWideChar() error 0x%x\n", GetLastError());
+		free(intermediate);
+		return PARSE_E_INTERNAL;
+	}
+
+	mb_size = WideCharToMultiByte(CP_UTF8, 0, intermediate, -1, NULL, 0, NULL, NULL);
+	if (mb_size == 0) {
+		PR_DEBUGF("Count WideCharToMultiByte() error: 0x%x\n", GetLastError());
+		free(intermediate);
+		return PARSE_E_INTERNAL;
+	}
+
+	*target = malloc(mb_size);
+	if (*target == NULL) {
+		free(intermediate);
+		return PARSE_E_NO_MEM;
+	}
+
+	if (WideCharToMultiByte(CP_UTF8, 0, intermediate, -1, *target, mb_size, NULL, NULL) == 0) {
+		PR_DEBUGF("Convert WideCharToMultiByte() error: 0x%x\n", GetLastError());
+		free(*target);
+		free(intermediate);
+		return PARSE_E_INTERNAL;
+	}
+
+	free(intermediate);
+
+	return PARSE_OK;
+}
+
 static bool __win32_utf8_to_wchar(wchar_t** target, const char *s)
 {
 	size_t w_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s, -1, NULL, 0);
@@ -1395,7 +1459,7 @@ static enum HPCS_ParseCode __unix_parse_native_method_info_line(char** name, cha
 	return PARSE_OK;
 }
 
-static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes, const size_t bytes_count)
+static enum HPCS_ParseCode __unix_data_to_utf8(char** target, const char* bytes, const char* encoding, const size_t bytes_count)
 {
 	int32_t u_size;
 	UChar* u_str;
@@ -1403,7 +1467,7 @@ static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes
 	enum HPCS_ParseCode ret;
 	UErrorCode uec = U_ZERO_ERROR;
 
-	cnv = ucnv_open("UTF-16LE", &uec);
+	cnv = ucnv_open(encoding, &uec);
 	if (U_FAILURE(uec)) {
 		PR_DEBUGF("Unable to create converter, error: %s\n", u_errorName(uec));
 		return PARSE_E_INTERNAL;
diff --git a/libhpcs_p.h b/libhpcs_p.h
index 5092b36..3705712 100644
--- a/libhpcs_p.h
+++ b/libhpcs_p.h
@@ -201,6 +201,7 @@ static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS
 static enum HPCS_ParseCode __win32_next_native_line(FILE* fh, WCHAR* line, int32_t length);
 static HPCS_UFH __win32_open_data_file(const char* filename);
 static enum HPCS_ParseCode __win32_parse_native_method_info_line(char** name, char** value, WCHAR* line);
+static enum HPCS_ParseCode __win32_latin1_to_utf8(char** target, const char *s);
 static bool __win32_utf8_to_wchar(wchar_t** target, const char* s);
 static enum HPCS_ParseCode __win32_wchar_to_utf8(char** target, const WCHAR* s);
 #else
@@ -210,7 +211,7 @@ static enum HPCS_ParseCode __unix_icu_to_utf8(char** target, const UChar* s);
 static HPCS_UFH __unix_open_data_file(const char* filename);
 static enum HPCS_ParseCode __unix_next_native_line(UFILE* fh, UChar* line, int32_t length);
 static enum HPCS_ParseCode __unix_parse_native_method_info_line(char** name, char** value, UChar* line);
-static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes, const size_t bytes_count);
+static enum HPCS_ParseCode __unix_data_to_utf8(char** target, const char* bytes, const char* encoding, const size_t bytes_count);
 static char* __DEFAULT_CS_REV();
 static char* __DEFAULT_CS_VER();
 
-- 
2.43.5