2011-11-07 13:46:58 -06:00
|
|
|
/*
|
|
|
|
* smart-converter.c
|
2011-11-07 16:52:18 -06:00
|
|
|
* This file is part of pluma
|
2011-11-07 13:46:58 -06:00
|
|
|
*
|
|
|
|
* Copyright (C) 2009 - Ignacio Casal Quinteiro
|
|
|
|
*
|
2011-11-07 16:52:18 -06:00
|
|
|
* pluma is free software; you can redistribute it and/or modify
|
2011-11-07 13:46:58 -06:00
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
2011-11-07 16:52:18 -06:00
|
|
|
* pluma is distributed in the hope that it will be useful,
|
2011-11-07 13:46:58 -06:00
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
2011-11-07 16:52:18 -06:00
|
|
|
* along with pluma; if not, write to the Free Software
|
2011-11-07 13:46:58 -06:00
|
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
|
|
* Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
#include "pluma-smart-charset-converter.h"
|
|
|
|
#include "pluma-encodings.h"
|
2011-11-07 13:46:58 -06:00
|
|
|
#include <gio/gio.h>
|
|
|
|
#include <glib.h>
|
2015-08-03 14:32:35 -05:00
|
|
|
#include <glib/gprintf.h>
|
2011-11-07 13:46:58 -06:00
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#define TEXT_TO_CONVERT "this is some text to make the tests"
|
|
|
|
#define TEXT_TO_GUESS "hello \xe6\x96\x87 world"
|
|
|
|
|
|
|
|
static void
|
|
|
|
print_hex (gchar *ptr, gint len)
|
|
|
|
{
|
|
|
|
gint i;
|
|
|
|
|
|
|
|
for (i = 0; i < len; ++i)
|
|
|
|
{
|
|
|
|
g_printf ("\\x%02x", (unsigned char)ptr[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
g_printf ("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static gchar *
|
|
|
|
get_encoded_text (const gchar *text,
|
|
|
|
gsize nread,
|
2011-11-07 16:52:18 -06:00
|
|
|
const PlumaEncoding *to,
|
|
|
|
const PlumaEncoding *from,
|
2011-11-07 13:46:58 -06:00
|
|
|
gsize *bytes_written_aux,
|
|
|
|
gboolean care_about_error)
|
|
|
|
{
|
|
|
|
GCharsetConverter *converter;
|
|
|
|
gchar *out, *out_aux;
|
|
|
|
gsize bytes_read, bytes_read_aux;
|
|
|
|
gsize bytes_written;
|
|
|
|
GConverterResult res;
|
|
|
|
GError *err;
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
converter = g_charset_converter_new (pluma_encoding_get_charset (to),
|
|
|
|
pluma_encoding_get_charset (from),
|
2011-11-07 13:46:58 -06:00
|
|
|
NULL);
|
|
|
|
|
|
|
|
out = g_malloc (200);
|
|
|
|
out_aux = g_malloc (200);
|
|
|
|
err = NULL;
|
|
|
|
bytes_read_aux = 0;
|
|
|
|
*bytes_written_aux = 0;
|
|
|
|
|
|
|
|
if (nread == -1)
|
|
|
|
{
|
|
|
|
nread = strlen (text);
|
|
|
|
}
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
res = g_converter_convert (G_CONVERTER (converter),
|
|
|
|
text + bytes_read_aux,
|
|
|
|
nread,
|
|
|
|
out_aux,
|
|
|
|
200,
|
|
|
|
G_CONVERTER_INPUT_AT_END,
|
|
|
|
&bytes_read,
|
|
|
|
&bytes_written,
|
|
|
|
&err);
|
|
|
|
memcpy (out + *bytes_written_aux, out_aux, bytes_written);
|
|
|
|
bytes_read_aux += bytes_read;
|
|
|
|
*bytes_written_aux += bytes_written;
|
|
|
|
nread -= bytes_read;
|
|
|
|
} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR);
|
|
|
|
|
|
|
|
if (care_about_error)
|
|
|
|
{
|
|
|
|
g_assert_no_error (err);
|
|
|
|
}
|
|
|
|
else if (err)
|
|
|
|
{
|
|
|
|
g_printf ("** You don't care, but there was an error: %s", err->message);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
out[*bytes_written_aux] = '\0';
|
|
|
|
|
|
|
|
if (!g_utf8_validate (out, *bytes_written_aux, NULL) && !care_about_error)
|
|
|
|
{
|
|
|
|
if (!care_about_error)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
g_assert_not_reached ();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static GSList *
|
|
|
|
get_all_encodings ()
|
|
|
|
{
|
|
|
|
GSList *encs = NULL;
|
|
|
|
gint i = 0;
|
|
|
|
|
|
|
|
while (TRUE)
|
|
|
|
{
|
2011-11-07 16:52:18 -06:00
|
|
|
const PlumaEncoding *enc;
|
2011-11-07 13:46:58 -06:00
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
enc = pluma_encoding_get_from_index (i);
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
if (enc == NULL)
|
|
|
|
break;
|
|
|
|
|
|
|
|
encs = g_slist_prepend (encs, (gpointer)enc);
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return encs;
|
|
|
|
}
|
|
|
|
|
|
|
|
static gchar *
|
|
|
|
do_test (const gchar *test_in,
|
|
|
|
const gchar *enc,
|
|
|
|
GSList *encodings,
|
|
|
|
gsize nread,
|
2011-11-07 16:52:18 -06:00
|
|
|
const PlumaEncoding **guessed)
|
2011-11-07 13:46:58 -06:00
|
|
|
{
|
2011-11-07 16:52:18 -06:00
|
|
|
PlumaSmartCharsetConverter *converter;
|
2011-11-07 13:46:58 -06:00
|
|
|
gchar *out, *out_aux;
|
|
|
|
gsize bytes_read, bytes_read_aux;
|
|
|
|
gsize bytes_written, bytes_written_aux;
|
|
|
|
GConverterResult res;
|
|
|
|
GError *err;
|
|
|
|
|
|
|
|
if (enc != NULL)
|
|
|
|
{
|
|
|
|
encodings = NULL;
|
2011-11-07 16:52:18 -06:00
|
|
|
encodings = g_slist_prepend (encodings, (gpointer)pluma_encoding_get_from_charset (enc));
|
2011-11-07 13:46:58 -06:00
|
|
|
}
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
converter = pluma_smart_charset_converter_new (encodings);
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
out = g_malloc (200);
|
|
|
|
out_aux = g_malloc (200);
|
|
|
|
err = NULL;
|
|
|
|
bytes_read_aux = 0;
|
|
|
|
bytes_written_aux = 0;
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
res = g_converter_convert (G_CONVERTER (converter),
|
|
|
|
test_in + bytes_read_aux,
|
|
|
|
nread,
|
|
|
|
out_aux,
|
|
|
|
200,
|
|
|
|
G_CONVERTER_INPUT_AT_END,
|
|
|
|
&bytes_read,
|
|
|
|
&bytes_written,
|
|
|
|
&err);
|
|
|
|
memcpy (out + bytes_written_aux, out_aux, bytes_written);
|
|
|
|
bytes_read_aux += bytes_read;
|
|
|
|
bytes_written_aux += bytes_written;
|
|
|
|
nread -= bytes_read;
|
|
|
|
} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR);
|
|
|
|
|
|
|
|
g_assert_no_error (err);
|
|
|
|
out[bytes_written_aux] = '\0';
|
|
|
|
|
|
|
|
if (guessed != NULL)
|
2011-11-07 16:52:18 -06:00
|
|
|
*guessed = pluma_smart_charset_converter_get_guessed (converter);
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
do_test_roundtrip (const char *str, const char *charset)
|
|
|
|
{
|
|
|
|
gsize len;
|
|
|
|
gchar *buf, *p;
|
|
|
|
GInputStream *in, *tmp;
|
|
|
|
GCharsetConverter *c1;
|
2011-11-07 16:52:18 -06:00
|
|
|
PlumaSmartCharsetConverter *c2;
|
2011-11-07 13:46:58 -06:00
|
|
|
gsize n, tot;
|
|
|
|
GError *err;
|
|
|
|
GSList *enc = NULL;
|
|
|
|
|
|
|
|
len = strlen(str);
|
|
|
|
buf = g_new0 (char, len);
|
|
|
|
|
|
|
|
in = g_memory_input_stream_new_from_data (str, -1, NULL);
|
|
|
|
|
|
|
|
c1 = g_charset_converter_new (charset, "UTF-8", NULL);
|
|
|
|
|
|
|
|
tmp = in;
|
|
|
|
in = g_converter_input_stream_new (in, G_CONVERTER (c1));
|
|
|
|
g_object_unref (tmp);
|
|
|
|
g_object_unref (c1);
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
enc = g_slist_prepend (enc, (gpointer)pluma_encoding_get_from_charset (charset));
|
|
|
|
c2 = pluma_smart_charset_converter_new (enc);
|
2011-11-07 13:46:58 -06:00
|
|
|
g_slist_free (enc);
|
|
|
|
|
|
|
|
tmp = in;
|
|
|
|
in = g_converter_input_stream_new (in, G_CONVERTER (c2));
|
|
|
|
g_object_unref (tmp);
|
|
|
|
g_object_unref (c2);
|
|
|
|
|
|
|
|
tot = 0;
|
|
|
|
p = buf;
|
|
|
|
n = len;
|
|
|
|
while (TRUE)
|
|
|
|
{
|
|
|
|
gssize res;
|
|
|
|
|
|
|
|
err = NULL;
|
|
|
|
res = g_input_stream_read (in, p, n, NULL, &err);
|
|
|
|
g_assert_no_error (err);
|
|
|
|
if (res == 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
p += res;
|
|
|
|
n -= res;
|
|
|
|
tot += res;
|
|
|
|
}
|
|
|
|
|
|
|
|
g_assert_cmpint (tot, ==, len);
|
|
|
|
g_assert_cmpstr (str, ==, buf);
|
|
|
|
|
|
|
|
g_free (buf);
|
|
|
|
g_object_unref (in);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
test_utf8_utf8 ()
|
|
|
|
{
|
|
|
|
gchar *aux;
|
|
|
|
|
|
|
|
aux = do_test (TEXT_TO_CONVERT, "UTF-8", NULL, strlen (TEXT_TO_CONVERT), NULL);
|
|
|
|
g_assert_cmpstr (aux, ==, TEXT_TO_CONVERT);
|
|
|
|
|
|
|
|
aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 18, NULL);
|
|
|
|
g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz");
|
|
|
|
|
|
|
|
aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 9, NULL);
|
|
|
|
g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3");
|
|
|
|
|
|
|
|
/* FIXME: Use the utf8 stream for a fallback? */
|
|
|
|
//do_test_with_error ("\xef\xbf\xbezzzzzz", encs, G_IO_ERROR_FAILED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
test_xxx_xxx ()
|
|
|
|
{
|
|
|
|
GSList *encs, *l;
|
|
|
|
|
|
|
|
encs = get_all_encodings ();
|
|
|
|
|
|
|
|
/* Here we just test all encodings it is just to know that the conversions
|
|
|
|
are done ok */
|
|
|
|
for (l = encs; l != NULL; l = g_slist_next (l))
|
|
|
|
{
|
2011-11-07 16:52:18 -06:00
|
|
|
do_test_roundtrip (TEXT_TO_CONVERT, pluma_encoding_get_charset ((const PlumaEncoding *)l->data));
|
2011-11-07 13:46:58 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
g_slist_free (encs);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
test_empty ()
|
|
|
|
{
|
2011-11-07 16:52:18 -06:00
|
|
|
const PlumaEncoding *guessed;
|
2011-11-07 13:46:58 -06:00
|
|
|
gchar *out;
|
|
|
|
GSList *encodings = NULL;
|
|
|
|
|
|
|
|
/* testing the case of an empty file and list of encodings with no
|
|
|
|
utf-8. In this case, the smart converter cannot determine the right
|
|
|
|
encoding (because there is no input), but should still default to
|
|
|
|
utf-8 for the detection */
|
2011-11-07 16:52:18 -06:00
|
|
|
encodings = g_slist_prepend (encodings, (gpointer)pluma_encoding_get_from_charset ("UTF-16"));
|
|
|
|
encodings = g_slist_prepend (encodings, (gpointer)pluma_encoding_get_from_charset ("ISO-8859-15"));
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
out = do_test ("", NULL, encodings, 0, &guessed);
|
|
|
|
|
|
|
|
g_assert_cmpstr (out, ==, "");
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
g_assert (guessed == pluma_encoding_get_utf8 ());
|
2011-11-07 13:46:58 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
test_guessed ()
|
|
|
|
{
|
|
|
|
GSList *encs = NULL;
|
|
|
|
gchar *aux, *aux2, *fail;
|
|
|
|
gsize aux_len, fail_len;
|
2011-11-07 16:52:18 -06:00
|
|
|
const PlumaEncoding *guessed;
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
aux = get_encoded_text (TEXT_TO_GUESS, -1,
|
2011-11-07 16:52:18 -06:00
|
|
|
pluma_encoding_get_from_charset ("UTF-16"),
|
|
|
|
pluma_encoding_get_from_charset ("UTF-8"),
|
2011-11-07 13:46:58 -06:00
|
|
|
&aux_len,
|
|
|
|
TRUE);
|
|
|
|
|
|
|
|
fail = get_encoded_text (aux, aux_len,
|
2011-11-07 16:52:18 -06:00
|
|
|
pluma_encoding_get_from_charset ("UTF-8"),
|
|
|
|
pluma_encoding_get_from_charset ("ISO-8859-15"),
|
2011-11-07 13:46:58 -06:00
|
|
|
&fail_len,
|
|
|
|
FALSE);
|
|
|
|
|
|
|
|
g_assert (fail == NULL);
|
|
|
|
|
|
|
|
/* ISO-8859-15 should fail */
|
2011-11-07 16:52:18 -06:00
|
|
|
encs = g_slist_append (encs, (gpointer)pluma_encoding_get_from_charset ("ISO-8859-15"));
|
|
|
|
encs = g_slist_append (encs, (gpointer)pluma_encoding_get_from_charset ("UTF-16"));
|
2011-11-07 13:46:58 -06:00
|
|
|
|
|
|
|
aux2 = do_test (aux, NULL, encs, aux_len, &guessed);
|
|
|
|
|
2011-11-07 16:52:18 -06:00
|
|
|
g_assert (guessed == pluma_encoding_get_from_charset ("UTF-16"));
|
2011-11-07 13:46:58 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
int main (int argc,
|
|
|
|
char *argv[])
|
|
|
|
{
|
|
|
|
g_test_init (&argc, &argv, NULL);
|
|
|
|
|
|
|
|
g_test_add_func ("/smart-converter/utf8-utf8", test_utf8_utf8);
|
|
|
|
//g_test_add_func ("/smart-converter/xxx-xxx", test_xxx_xxx);
|
|
|
|
g_test_add_func ("/smart-converter/guessed", test_guessed);
|
|
|
|
g_test_add_func ("/smart-converter/empty", test_empty);
|
|
|
|
|
|
|
|
return g_test_run ();
|
|
|
|
}
|