Devoid-pointer.net GitWeb - libHPCS.git/commitdiff
- Convert correctly from ISO-8859-1 to UTF-8
author Michal Malý <madcatxster@devoid-pointer.net>
Wed, 10 Feb 2016 16:39:49 +0000 (17:39 +0100)
committer Michal Malý <madcatxster@devoid-pointer.net>
Wed, 10 Feb 2016 16:39:49 +0000 (17:39 +0100)
- Fix new format type autodetection

libhpcs.c
libhpcs_p.h

index 40baaeafbcbe08acef0d1580d656a7802a9ff01e..7d56eaa0718a40410d1f9a26b86a1eb14b625411 100644 (file)
--- a/libhpcs.c
+++ b/libhpcs.c
@@ -283,7 +283,7 @@ static enum HPCS_ParseCode autodetect_file_type(FILE* datafile, enum HPCS_FileTy
        enum HPCS_ParseCode pret;
        const HPCS_offset devsig_info_offset = OLD_FORMAT(gentype) ? DATA_OFFSET_DEVSIG_INFO_OLD : DATA_OFFSET_DEVSIG_INFO;
 
-       pret = read_string_at_offset(datafile, devsig_info_offset, &type_id, gentype);
+       pret = read_string_at_offset(datafile, devsig_info_offset, &type_id, OLD_FORMAT(gentype));
        if (pret != PARSE_OK)
                return pret;
 
@@ -790,7 +790,7 @@ static enum HPCS_ParseCode read_file_header(FILE* datafile, enum HPCS_ChemStatio
                return pret;
        }
 
-       pret = autodetect_file_type(datafile, &mdata->file_type, p_means_pressure(*cs_ver), old_format);
+       pret = autodetect_file_type(datafile, &mdata->file_type, p_means_pressure(*cs_ver), gentype);
        if (pret != PARSE_OK) {
            PR_DEBUG("Cannot determine the type of file\n");
            return pret;
@@ -1069,6 +1069,8 @@ static enum HPCS_ParseCode __read_string_at_offset_v1(FILE* datafile, const HPCS
 {
        size_t r;
        char ch;
+       char* string;
+       enum HPCS_ParseCode ret;
        size_t str_length = 0;
 
        fseek(datafile, offset, SEEK_SET);
@@ -1090,21 +1092,28 @@ static enum HPCS_ParseCode __read_string_at_offset_v1(FILE* datafile, const HPCS
        }
 
        /* Allocate read buffer */
-       *result = calloc(str_length + 1, SMALL_SEGMENT_SIZE);
-       if (*result == NULL)
+       string = calloc(str_length + 1, SMALL_SEGMENT_SIZE);
+       if (string == NULL)
                return PARSE_E_NO_MEM;
 
-       memset(*result, 0, (str_length + 1));
+       memset(string, 0, (str_length + 1));
 
        /* Rewind the file and read the string */
        fseek(datafile, offset, SEEK_SET);
-       r = fread(*result, SMALL_SEGMENT_SIZE, str_length, datafile);
+       r = fread(string, SMALL_SEGMENT_SIZE, str_length, datafile);
        if (r < str_length) {
-               free(*result);
+               free(string);
                return PARSE_E_CANT_READ;
        }
 
-       return PARSE_OK;
+#ifdef _WIN32
+       ret = __win32_latin1_to_utf8(result, string);
+#else
+       ret = __unix_data_to_utf8(result, string, "ISO-8859-1", str_length);
+#endif
+       free(string);
+
+       return ret;
 }
 
 static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS_offset offset, char** const result)
@@ -1146,7 +1155,7 @@ static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS
        ret = __win32_wchar_to_utf8(result, (WCHAR*)string);
 #else
        /* Explicitly convert from UTF-16LE (internal WCHAR representation) */
-       ret = __unix_wchar_to_utf8(result, string, str_length * SEGMENT_SIZE);
+       ret = __unix_data_to_utf8(result, string, "UTF-16LE", str_length * SEGMENT_SIZE);
 #endif
 
        free(string);
@@ -1247,6 +1256,51 @@ static enum HPCS_ParseCode __win32_parse_native_method_info_line(char** name, ch
        return PARSE_OK;
 }
 
+/* Converts NUL-terminated ISO-8859-1 (code page 28591) "s" to UTF-8; on PARSE_OK the caller must free() *target */
+static enum HPCS_ParseCode __win32_latin1_to_utf8(char** target, const char *s)
+{
+       enum HPCS_ParseCode ret = PARSE_E_INTERNAL;
+       wchar_t* intermediate;
+       int mb_size;
+       int w_size = MultiByteToWideChar(28591, MB_ERR_INVALID_CHARS, s, -1, NULL, 0);
+       if (w_size == 0) {
+               PR_DEBUGF("Count MultiByteToWideChar() error 0x%x\n", GetLastError());
+               return PARSE_E_INTERNAL;
+       }
+       PR_DEBUGF("w_size: %d\n", w_size);
+
+       intermediate = malloc(sizeof(wchar_t) * w_size);
+       if (intermediate == NULL)
+               return PARSE_E_NO_MEM;
+
+       /* The buffer length must be passed here; with 0 the call only reports the required size */
+       if (MultiByteToWideChar(28591, MB_ERR_INVALID_CHARS, s, -1, intermediate, w_size) == 0) {
+               PR_DEBUGF("Convert MultiByteToWideChar() error 0x%x\n", GetLastError());
+               goto out;
+       }
+
+       mb_size = WideCharToMultiByte(CP_UTF8, 0, intermediate, -1, NULL, 0, NULL, NULL);
+       if (mb_size == 0) {
+               PR_DEBUGF("Count WideCharToMultiByte() error: 0x%x\n", GetLastError());
+               goto out;
+       }
+
+       *target = malloc(mb_size);
+       if (*target == NULL) {
+               ret = PARSE_E_NO_MEM;
+               goto out;
+       }
+       if (WideCharToMultiByte(CP_UTF8, 0, intermediate, -1, *target, mb_size, NULL, NULL) == 0) {
+               PR_DEBUGF("Convert WideCharToMultiByte() error: 0x%x\n", GetLastError());
+               free(*target);
+               goto out;
+       }
+       ret = PARSE_OK;
+out:
+       free(intermediate);
+       return ret;
+}
+
 static bool __win32_utf8_to_wchar(wchar_t** target, const char *s)
 {
        size_t w_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s, -1, NULL, 0);
@@ -1395,7 +1449,7 @@ static enum HPCS_ParseCode __unix_parse_native_method_info_line(char** name, cha
        return PARSE_OK;
 }
 
-static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes, const size_t bytes_count)
+static enum HPCS_ParseCode __unix_data_to_utf8(char** target, const char* bytes, const char* encoding, const size_t bytes_count)
 {
        int32_t u_size;
        UChar* u_str;
@@ -1403,7 +1457,7 @@ static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes
        enum HPCS_ParseCode ret;
        UErrorCode uec = U_ZERO_ERROR;
 
-       cnv = ucnv_open("UTF-16LE", &uec);
+       cnv = ucnv_open(encoding, &uec);
        if (U_FAILURE(uec)) {
                PR_DEBUGF("Unable to create converter, error: %s\n", u_errorName(uec));
                return PARSE_E_INTERNAL;
index 5092b36a27ce6eccffbf71ee50fa45115cff0bb3..3705712d89c28f3fa4e8568a1e4c743f076a413e 100644 (file)
--- a/libhpcs_p.h
+++ b/libhpcs_p.h
@@ -201,6 +201,7 @@ static enum HPCS_ParseCode __read_string_at_offset_v2(FILE* datafile, const HPCS
 static enum HPCS_ParseCode __win32_next_native_line(FILE* fh, WCHAR* line, int32_t length);
 static HPCS_UFH __win32_open_data_file(const char* filename);
 static enum HPCS_ParseCode __win32_parse_native_method_info_line(char** name, char** value, WCHAR* line);
+static enum HPCS_ParseCode __win32_latin1_to_utf8(char** target, const char *s); /* ISO-8859-1 -> UTF-8 */
 static bool __win32_utf8_to_wchar(wchar_t** target, const char* s);
 static enum HPCS_ParseCode __win32_wchar_to_utf8(char** target, const WCHAR* s);
 #else
@@ -210,7 +211,7 @@ static enum HPCS_ParseCode __unix_icu_to_utf8(char** target, const UChar* s);
 static HPCS_UFH __unix_open_data_file(const char* filename);
 static enum HPCS_ParseCode __unix_next_native_line(UFILE* fh, UChar* line, int32_t length);
 static enum HPCS_ParseCode __unix_parse_native_method_info_line(char** name, char** value, UChar* line);
-static enum HPCS_ParseCode __unix_wchar_to_utf8(char** target, const char* bytes, const size_t bytes_count);
+static enum HPCS_ParseCode __unix_data_to_utf8(char** target, const char* bytes, const char* encoding, const size_t bytes_count);
 
 static char* __DEFAULT_CS_REV();
 static char* __DEFAULT_CS_VER();