2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32 * Convert data from one encoding to another. Return:
34 * -2 : memory allocation failed
35 * -1 : unknown encoding
36 * 0 : data was converted exactly
37 * 1 : data was converted inexactly
38 * 2 : data was invalid (but still converted)
40 * We convert in two steps, via UTF-8, as this is the only
41 * reliable way of distinguishing between invalid input
42 * and valid input which iconv refuses to transliterate.
43 * We convert from UTF-8 twice, because we have no way of
44 * knowing whether the conversion was exact if iconv returns
45 * E2BIG (due to a bug in the specification of iconv).
46 * An alternative approach is to assume that the output of
47 * iconv is never more than 4 times as long as the input,
48 * but I prefer to avoid that assumption if possible.
/*
 * iconvert: convert a buffer from encoding fromcode to encoding tocode,
 * going through an intermediate UTF-8 buffer (utfbuf).
 *
 * fromcode, tocode - encoding names handed to iconv_open().
 * from, fromlen    - input data and its length (their use is not visible
 *                    here; presumably fed to the first iconv pass -
 *                    TODO confirm).
 * to, tolen        - output parameters; *tolen receives the number of
 *                    bytes written into utfbuf when the target is UTF-8.
 */
51 int iconvert(const char *fromcode, const char *tocode,
52 const char *from, size_t fromlen,
53 char **to, size_t *tolen)
59 char *utfbuf = 0, *outbuf, *newbuf;
60 size_t utflen, outlen, ibl, obl, k;
/* First converter: fromcode -> UTF-8. */
63 cd1 = iconv_open("UTF-8", fromcode);
64 if (cd1 == (iconv_t)(-1))
/*
 * Each strchr() call tests one character of tocode against an
 * upper/lower-case pair, giving a case-insensitive match of the
 * first three characters against U, T, F.
 */
68 /* Don't use strcasecmp() as it's locale-dependent. */
69 if (!strchr("Uu", tocode[0]) ||
70 !strchr("Tt", tocode[1]) ||
71 !strchr("Ff", tocode[2]) ||
78 * Try using this non-standard feature of glibc and libiconv.
79 * This is deliberately not a config option as people often
80 * change their iconv library without rebuilding applications.
/* 11 extra bytes: 10 for the //TRANSLIT suffix plus the terminating NUL. */
82 tocode1 = (char *)malloc(strlen(tocode) + 11);
86 strcpy(tocode1, tocode);
87 strcat(tocode1, "//TRANSLIT");
/* Second converter, preferred form: UTF-8 -> tocode with transliteration. */
88 cd2 = iconv_open(tocode1, "UTF-8");
/*
 * Fallback when the //TRANSLIT variant could not be opened.
 * NOTE(review): this opens fromcode -> tocode, yet cd2 is later used in
 * the passes labelled as converting from UTF-8, and the open just above
 * uses UTF-8 as its source; the source encoding here probably should be
 * UTF-8 as well - confirm before relying on this path.
 */
91 if (cd2 == (iconv_t)(-1))
92 cd2 = iconv_open(tocode, fromcode);
94 if (cd2 == (iconv_t)(-1)) {
/*
 * Start with a 1-byte UTF-8 buffer (the fromlen-based estimate is
 * commented out); it is resized by the realloc under the
 * Enlarge-the-buffer step below.
 */
100 utflen = 1; /*fromlen * 2 + 1; XXX */
101 utfbuf = (char *)malloc(utflen);
105 /* Convert to UTF-8 */
111 k = iconv(cd1, &ib, &ibl, &ob, &obl);
/*
 * Accepted outcomes: success with all input consumed; E2BIG with input
 * left and fewer than 6 bytes of output space; or EILSEQ/EINVAL with
 * input left.
 */
112 assert((k != (size_t)(-1) && !ibl) ||
113 (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
114 (k == (size_t)(-1) &&
115 (errno == EILSEQ || errno == EINVAL) && ibl));
119 /* Enlarge the buffer */
121 newbuf = (char *)realloc(utfbuf, utflen);
/*
 * realloc may move the block: rebase the write pointer onto newbuf at
 * the same offset, then recompute the free space from that offset.
 */
124 ob = (ob - utfbuf) + newbuf;
125 obl = utflen - (ob - newbuf);
133 /* iconv(cd1, 0, 0, 0, 0); -- call left disabled: per the original note, commenting this line out "in theory" prevents a segfault */
137 if (cd2 == (iconv_t)(-1)) {
138 /* The target encoding was UTF-8 */
/* Output length is the number of bytes written into utfbuf. */
140 *tolen = ob - utfbuf;
/*
 * Resize to the bytes used plus one extra byte.
 * NOTE(review): presumably room for a terminator - the store itself is
 * not visible here; confirm.
 */
146 newbuf = (char *)realloc(utfbuf, (ob - utfbuf) + 1);
149 ob = (ob - utfbuf) + newbuf;
156 /* Truncate the buffer to be tidy */
157 utflen = ob - utfbuf;
158 newbuf = (char *)realloc(utfbuf, utflen);
163 /* Convert from UTF-8 to discover how long the output is */
170 k = iconv(cd2, &ib, &ibl, &ob, &obl);
/* In this sizing pass E2BIG is an accepted outcome alongside EILSEQ. */
171 assert((k != (size_t)(-1) && !ibl) ||
172 (k == (size_t)(-1) && errno == E2BIG && ibl) ||
173 (k == (size_t)(-1) && errno == EILSEQ && ibl));
/* Input remains and the failure was not E2BIG: substitute a replacement. */
174 if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
175 /* Replace one character */
182 k = iconv(cd2, &tb, &tbl, &ob, &obl);
183 assert((k != (size_t)(-1) && !tbl) ||
184 (k == (size_t)(-1) && errno == EILSEQ && tbl));
/*
 * Step past the first byte of the rejected sequence, then keep advancing
 * while input remains and the next byte has its high bit set.
 */
185 for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
/*
 * iconv() with no input buffer but an output buffer.
 * NOTE(review): per POSIX this writes any sequence needed to return the
 * output to its initial shift state - confirm for the iconv in use.
 */
192 k = iconv(cd2, 0, 0, &ob, &obl);
193 assert(k != (size_t)(-1));
196 /* Convert from UTF-8 for real */
/* Real output buffer: one byte larger than the measured outlen. */
197 outbuf = (char *)malloc(outlen + 1);
205 k = iconv(cd2, &ib, &ibl, &ob, &obl);
/* Unlike the sizing pass, E2BIG is not an accepted outcome here. */
206 assert((k != (size_t)(-1) && !ibl) ||
207 (k == (size_t)(-1) && errno == EILSEQ && ibl));
/*
 * Same replacement logic as the sizing pass. The E2BIG test is kept even
 * though the assert above does not permit E2BIG at this point.
 */
210 if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
211 /* Replace one character */
215 k = iconv(cd2, &tb, &tbl, &ob, &obl);
216 assert((k != (size_t)(-1) && !tbl) ||
217 (k == (size_t)(-1) && errno == EILSEQ && tbl));
/* Skip the rest of the rejected sequence, as in the sizing pass. */
218 for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
222 k = iconv(cd2, 0, 0, &ob, &obl);
223 assert(k != (size_t)(-1));
/*
 * Guard on the second converter having been opened.
 * NOTE(review): presumably protects an iconv_close(cd2) - the guarded
 * statement is not visible here; confirm.
 */
242 if (cd2 != (iconv_t)(-1))
247 #endif /* HAVE_ICONV */