iconv (libiconv.dylib) broken
iconv (libiconv.dylib) broken
- Subject: iconv (libiconv.dylib) broken
- From: Andreas Grosam <email@hidden>
- Date: Tue, 9 Feb 2010 20:58:11 +0100
Hi All,
after experimenting with the iconv library it seems that it is broken on Mac OS X.
<http://www.gnu.org/software/libiconv/>
The library provides functions for converting between character encodings. It is installed on Mac OS X by default, but is not normally used in Cocoa applications. So this may be of interest only for sources that get ported over to Mac OS X, or for any application or library linking against libiconv.dylib.
I would appreciate it if somebody could confirm this.
Below is a test program (console) which shows my findings.
(In order to compile, you need to link against libiconv.dylib. There is also a dependency on Boost, but this can easily be eliminated.)
Regards,
Andreas
// File: main.cpp
// preferred editor encoding: UTF-8
//
// iconv test
//
#include <errno.h>
#include <iconv.h>
#include <cassert>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/format.hpp>
/**
 * Convert a byte sequence from one character encoding into a std::wstring
 * by running it through iconv.
 *
 * As a diagnostic aid the function also dumps the input bytes, the raw
 * output bytes and the resulting wchar_t values, in hex, to std::cout.
 *
 * @param in_buffer    Start of the input bytes (need not be NUL-terminated).
 * @param in_size      Number of input bytes to convert.
 * @param fromCharset  iconv name of the input encoding, e.g. "UTF-8".
 * @param toCharset    iconv name of the output encoding. The bytes iconv
 *                     produces are stored directly in wchar_t elements, so
 *                     the encoding must use wchar_t-sized code units.
 * @return The converted characters.
 * @throws std::runtime_error if the conversion is not available or if
 *         iconv_open / iconv / iconv_close report an error.
 */
std::wstring convertToWstring(const char* in_buffer,
                              size_t in_size,
                              const char* fromCharset,
                              const char* toCharset = "WCHAR_T")
{
    // Closes the conversion descriptor on every exit path (including
    // exceptions thrown by allocations) unless it has been released.
    struct DescriptorGuard {
        iconv_t handle;
        bool active;
        explicit DescriptorGuard(iconv_t h) : handle(h), active(true) {}
        ~DescriptorGuard() { if (active) iconv_close(handle); }
    };

    const iconv_t cd = iconv_open(toCharset, fromCharset);
    if (cd == (iconv_t) -1) {   // documented failure value of iconv_open
        const int open_errno = errno;
        if (open_errno == EINVAL) {
            std::string message("conversion from '");
            message += fromCharset;
            message += "' to ";
            message += toCharset;
            message += " not available";
            throw std::runtime_error(message);
        }
        throw std::runtime_error(std::strerror(open_errno));
    }
    DescriptorGuard guard(cd);

    // Output buffer: one wchar_t per input byte is assumed to be the worst
    // case. One spare element keeps the buffer non-empty for zero-length
    // input and leaves room for a byte-order mark. std::vector owns the
    // array, aligns it for wchar_t and zero-initialises it.
    std::vector<wchar_t> wbuffer(in_size + 1);

    // iconv's C prototype takes a non-const char** although it only reads
    // from the input buffer.
    char* in_ptr = const_cast<char*>(in_buffer);
    size_t in_bytes_left = in_size;

    char* const out_begin = reinterpret_cast<char*>(&wbuffer[0]);
    char* out_ptr = out_begin;
    const size_t out_capacity = wbuffer.size() * sizeof(wchar_t);
    size_t out_bytes_left = out_capacity;

    // Convert the character sequence according to the two encodings.
    if (iconv(cd, &in_ptr, &in_bytes_left, &out_ptr, &out_bytes_left) == (size_t) -1) {
        const int err = errno;   // capture before the guard calls iconv_close
        throw std::runtime_error(std::strerror(err));
    }

    // Flush any pending shift sequence into the output buffer.
    if (iconv(cd, NULL, NULL, &out_ptr, &out_bytes_left) == (size_t) -1) {
        const int err = errno;
        throw std::runtime_error(std::strerror(err));
    }

    // Close explicitly so that a failure to close can be reported.
    guard.active = false;
    if (iconv_close(cd) != 0) {
        throw std::runtime_error(std::strerror(errno));
    }

    const size_t ob_size = out_capacity - out_bytes_left;
    const size_t nelem = ob_size / sizeof(wchar_t);

    // Sanity checks: all input consumed, output is a whole number of wchar_t.
    assert(ob_size == static_cast<size_t>(out_ptr - out_begin));
    assert(in_bytes_left == 0);
    assert(in_ptr == in_buffer + in_size);
    assert(nelem * sizeof(wchar_t) == ob_size);

#if 1
    // Diagnostic dump of what went in and what came out.
    {
        // std::hex and the fill character are sticky; remember the caller's
        // settings so they can be restored after the dump.
        const std::ios_base::fmtflags saved_flags = std::cout.flags();
        const char saved_fill = std::cout.fill();

        // Mask that keeps exactly the bits of one wchar_t, so that a signed
        // wchar_t prints as its unsigned bit pattern.
        const unsigned long wchar_mask =
            ((1UL << (sizeof(wchar_t) * 8 - 1)) << 1) - 1UL;
        const int wchar_width = static_cast<int>(sizeof(wchar_t) * 2);

        std::cout << "--- iconv start ---" << std::endl;

        std::cout << " in buffer: ";
        for (size_t i = 0; i < in_size; ++i) {
            const unsigned int byte = static_cast<unsigned char>(in_buffer[i]);
            std::cout << std::setw(2) << std::setfill('0') << std::hex << byte << " ";
        }
        std::cout << std::endl;

        std::cout << " out buffer: ";
        for (size_t i = 0; i < ob_size; ++i) {
            const unsigned int byte = static_cast<unsigned char>(out_begin[i]);
            std::cout << std::setw(2) << std::setfill('0') << std::hex << byte << " ";
        }
        std::cout << std::endl;

        std::cout << "wchar buffer: ";
        for (size_t i = 0; i < nelem; ++i) {
            // Print the numeric value; streaming a wchar_t to a narrow
            // stream is not portable across language versions.
            const unsigned long value =
                static_cast<unsigned long>(wbuffer[i]) & wchar_mask;
            std::cout << std::setw(wchar_width) << std::setfill('0') << std::hex << value << " ";
        }
        std::cout << std::endl;

        std::cout << "--- iconv end ---" << std::endl;

        std::cout.flags(saved_flags);
        std::cout.fill(saved_fill);
    }
#endif

    return std::wstring(&wbuffer[0], nelem);
}
/**
 * Compare a conversion result with the expected string and report a mismatch.
 *
 * Nothing is printed when the strings are equal. Otherwise both strings are
 * written to std::cout as hex values, one per wchar_t, zero-padded to the
 * width of a wchar_t. The stream's formatting flags and fill character are
 * restored before returning.
 *
 * @param oughtTo  NUL-terminated expected string.
 * @param test     The string actually produced.
 */
void check(const wchar_t* oughtTo, const std::wstring& test)
{
    if (test == oughtTo) {
        return;
    }

    // std::hex and the fill character are sticky on the stream; remember the
    // caller's settings so later output is not silently switched to hex.
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const char saved_fill = std::cout.fill();

    // Mask that keeps exactly the bits of one wchar_t, so that a signed
    // wchar_t prints as its unsigned bit pattern.
    const unsigned long wchar_mask =
        ((1UL << (sizeof(wchar_t) * 8 - 1)) << 1) - 1UL;
    const int wchar_width = static_cast<int>(sizeof(wchar_t) * 2);

    std::cout << "conversion failed:" << std::endl;

    std::cout << "ought to: ";
    for (const wchar_t* p = oughtTo; *p != 0; ++p) {
        // Print the numeric value; streaming a wchar_t to a narrow stream
        // is not portable across language versions.
        std::cout << std::setw(wchar_width) << std::setfill('0') << std::hex
                  << (static_cast<unsigned long>(*p) & wchar_mask) << " ";
    }
    std::cout << std::endl;

    std::cout << "test: ";
    for (std::wstring::size_type i = 0; i < test.size(); ++i) {
        std::cout << std::setw(wchar_width) << std::setfill('0') << std::hex
                  << (static_cast<unsigned long>(test[i]) & wchar_mask) << " ";
    }
    std::cout << std::endl;

    std::cout.flags(saved_flags);
    std::cout.fill(saved_fill);
}
int main (int argc, const char * argv[])
{
try {
// Test editor encoding:
std::wstring wStr = L"TüT";
std::wstring wcheck = L"T\u00fcT"; // <http://www.fileformat.info/info/unicode/char/00fc/index.htm>
if (wStr != wcheck) {
std::cout << "error: bogus string representation" << std::endl;
return -1;
}
std::string utf8Str = "TüT";
std::string utf8Check = "T\xC3\xBCT"; // <http://www.fileformat.info/info/unicode/char/00fc/index.htm>
if (utf8Str != utf8Check) {
std::cout << "error: bogus string representation" << std::endl;
return -1;
}
// Test iconv
std::wstring wtest;
wtest= convertToWstring("T\xC3\xBCT", 4, "UTF-8", "WCHAR_T"); // "TüT"
check(L"T\u00fcT", wtest);
wtest = convertToWstring("T\xC3\x83T", 4, "UTF-8", "WCHAR_T"); // "TÃT"
check(L"T\u00c3T", wtest);
wtest = convertToWstring("T\xC3\x84T", 4, "UTF-8", "WCHAR_T"); // "TÄT"
check(L"T\u00c4T", wtest);
wtest = convertToWstring("T\xC3\x85T", 4, "UTF-8", "WCHAR_T"); // "TÅT"
check(L"T\u00c5T", wtest);
/*
wtest = convertToWstring("T\xC3\xBCT", 4, "UTF-8", "UTF-32"); // "TüT"
check(L"T\u00fcT", wtest);
wtest = convertToWstring("T\xC3\xB3T", 4, "UTF-8", "UTF-32"); // "TÃT"
check(L"T\u00c3T", wtest);
*/
}
catch (std::exception& ex) {
std::cout << ex.what() << std::endl;
return -1;
}
return 0;
}
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Xcode-users mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden