vc++读取文本文件并自动识别编码且转换
版权声明:
本文为博主原创文章,转载请声明原文链接...谢谢。o_0。
更新时间:
2022-10-06 11:50:44
温馨提示:
学无止境,技术类文章有它的时效性,请留意文章更新时间,如发现内容有误请留言指出,防止别人"踩坑",我会及时更新文章
窄字符到宽字符转换
// string => wstring std::wstring Utils::String2WString(const std::string& str, int m_encode) { wstring result; //获取缓冲区大小,并申请空间,缓冲区大小按字符计算 const int len = MultiByteToWideChar(m_encode, 0, str.c_str(), (int)str.size(), nullptr, 0); const auto buffer = new TCHAR[len + 1]; //多字节编码转换成宽字节编码 MultiByteToWideChar(m_encode, 0, str.c_str(), (int)str.size(), buffer, len); //添加字符串结尾 buffer[len] = '\0'; //删除缓冲区并返回值 result.append(buffer); delete[] buffer; return result; }
utf8字符串判断
读取不带 BOM 头的 UTF-8 文本时,需要用此函数来识别其编码
/// Heuristically checks whether a byte buffer is well-formed UTF-8.
///
/// @param pBuffer Start of the bytes to inspect.
/// @param size    Number of bytes available at `pBuffer`.
/// @return false as soon as a byte sequence that cannot be UTF-8 is found,
///         true otherwise. A null or empty buffer yields true (nothing
///         contradicts UTF-8), and pure ASCII input also yields true.
///
/// Only the lead-byte / continuation-byte structure is verified:
///  - 0x00..0x7F  single byte (ASCII)
///  - 0xC2..0xDF  lead byte of a 2-byte sequence
///  - 0xE0..0xEF  lead byte of a 3-byte sequence
///  - 0xF0..0xF4  lead byte of a 4-byte sequence
/// Stray continuation bytes (0x80..0xBF), the overlong lead bytes 0xC0/0xC1
/// and lead bytes above 0xF4 are rejected. A multi-byte sequence that is cut
/// off by the end of the buffer is tolerated: scanning simply stops there.
bool Utils::IsUTF8Text(const void* pBuffer, const long size)
{
    if (pBuffer == nullptr || size <= 0)
    {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* const end = cur + size;

    while (cur < end)
    {
        const unsigned char lead = *cur;
        long seqLen = 0;

        if (lead < 0x80)
        {
            seqLen = 1; // ASCII
        }
        else if (lead < 0xC2)
        {
            // 0x80..0xBF: continuation byte without a lead byte.
            // 0xC0/0xC1 : could only start an overlong encoding.
            return false;
        }
        else if (lead < 0xE0)
        {
            seqLen = 2;
        }
        else if (lead < 0xF0)
        {
            seqLen = 3;
        }
        else if (lead < 0xF5)
        {
            seqLen = 4; // code points beyond the BMP, e.g. emoji
        }
        else
        {
            return false; // 0xF5..0xFF never appear in UTF-8
        }

        if (end - cur < seqLen)
        {
            // Sequence truncated by the end of the buffer: stop scanning and
            // keep the verdict reached so far.
            break;
        }

        // Every byte after the lead byte must look like 10xxxxxx.
        for (long i = 1; i < seqLen; ++i)
        {
            if ((cur[i] & 0xC0) != 0x80)
            {
                return false;
            }
        }

        cur += seqLen;
    }

    return true;
}
读取文本
/// Reads a whole text file, detects its encoding and returns it as wide text.
///
/// Encoding detection, in order:
///  - FF FE     -> UTF-16 little endian (BOM skipped)
///  - FE FF     -> UTF-16 big endian    (BOM skipped)
///  - EF BB BF  -> UTF-8                (BOM skipped)
///  - no BOM    -> UTF-8 if IsUTF8Text accepts the bytes, otherwise CP_ACP
///
/// @param filePath Path of the file to read.
/// @return The decoded content, or an empty string when the file cannot be
///         opened, its size cannot be determined, or it is empty.
CDuiString Utils::FileGetContents(const CDuiString& filePath)
{
    FILE* pFile = _tfopen(filePath.GetData(), _T("rb"));
    if (pFile == nullptr)
    {
        //Log::Error(filePath);
        return{};
    }

    // Read the complete file into memory, then close it right away so that no
    // later return path can leak the handle.
    std::string raw;
    bool sizeKnown = false;
    if (fseek(pFile, 0L, SEEK_END) == 0)
    {
        const long total = ftell(pFile);
        if (total >= 0 && fseek(pFile, 0L, SEEK_SET) == 0)
        {
            sizeKnown = true;
            if (total > 0)
            {
                raw.resize(static_cast<size_t>(total));
                // Keep only what was actually read (short reads are possible).
                const size_t got = fread(&raw[0], 1, raw.size(), pFile);
                raw.resize(got);
            }
        }
    }
    fclose(pFile);

    if (!sizeKnown || raw.empty())
    {
        return{};
    }

    // Up to three leading bytes decide the file type; missing bytes count as 0.
    const unsigned char b0 = static_cast<unsigned char>(raw[0]);
    const unsigned char b1 = raw.size() > 1 ? static_cast<unsigned char>(raw[1]) : 0;
    const unsigned char b2 = raw.size() > 2 ? static_cast<unsigned char>(raw[2]) : 0;

    //https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
    // Code pages 1200 (UTF-16LE) and 1201 (UTF-16BE) are listed there as
    // available only to managed applications, so UTF-16 is not sent through
    // String2WString; the 16-bit code units are assembled here instead.
    const bool isUtf16LE = (b0 == 0xFF && b1 == 0xFE);
    const bool isUtf16BE = (b0 == 0xFE && b1 == 0xFF);
    if (isUtf16LE || isUtf16BE)
    {
        std::wstring wide;
        wide.reserve((raw.size() - 2) / 2);
        // Start after the 2-byte BOM; a trailing odd byte is ignored.
        for (size_t i = 2; i + 1 < raw.size(); i += 2)
        {
            const unsigned int first = static_cast<unsigned char>(raw[i]);
            const unsigned int second = static_cast<unsigned char>(raw[i + 1]);
            const unsigned int unit = isUtf16LE ? (first | (second << 8))
                                                : ((first << 8) | second);
            wide.push_back(static_cast<wchar_t>(unit));
        }
        CDuiString text = wide.c_str();
        return text;
    }

    int encode = CP_ACP;
    if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF)
    {
        encode = CP_UTF8;
        raw.erase(0, 3); // drop the UTF-8 BOM
    }

    // Without a BOM the bytes may still be UTF-8; check before falling back
    // to the system ANSI code page.
    if (encode == CP_ACP && IsUTF8Text(raw.data(), static_cast<long>(raw.length())))
    {
        encode = CP_UTF8;
    }

    CDuiString tmp = String2WString(raw, encode).c_str();
    return tmp;
}