• Skip to content
  • Skip to link menu
KDE 4.7 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KDECore

kcharsets.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries
00002     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
00003     Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
00004     Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net>
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 */
00021 #include "kcharsets.h"
00022 
00023 #include "kfilterdev.h"
00024 #include "kentities.c"
00025 
00026 #include "kconfig.h"
00027 #include "kdebug.h"
00028 #include "kglobal.h"
00029 #include "klocale.h"
00030 
00031 #include <QtCore/QDir>
00032 #include <QtCore/QRegExp>
00033 #include <QtCore/QCharRef>
00034 #include <QtCore/QMutableStringListIterator>
00035 #include <QtCore/QTextCodec>
00036 
00037 #include <assert.h>
00038 #include <QHash>
00039 
00040 /*
00041  * ### FIXME KDE4: the name of the encodings should mostly be uppercase
00042  * The names of this list are user-visible
00043  * Generate with generate_string_table.pl, input data:
00044 ISO 8859-1
00045 i18n:Western European
00046 ISO 8859-15
00047 i18n:Western European
00048 ISO 8859-14
00049 i18n:Western European
00050 cp 1252
00051 i18n:Western European
00052 IBM850
00053 i18n:Western European
00054 ISO 8859-2
00055 i18n:Central European
00056 ISO 8859-3
00057 i18n:Central European
00058 ISO 8859-4
00059 i18n:Baltic
00060 ISO 8859-13
00061 i18n:Baltic
00062 ISO 8859-16
00063 i18n:South-Eastern Europe
00064 cp 1250
00065 i18n:Central European
00066 cp 1254
00067 i18n:Turkish
00068 cp 1257
00069 i18n:Baltic
00070 KOI8-R
00071 i18n:Cyrillic
00072 ISO 8859-5
00073 i18n:Cyrillic
00074 cp 1251
00075 i18n:Cyrillic
00076 KOI8-U
00077 i18n:Cyrillic
00078 IBM866
00079 i18n:Cyrillic
00080 Big5
00081 i18n:Chinese Traditional
00082 Big5-HKSCS
00083 i18n:Chinese Traditional
00084 GB18030
00085 i18n:Chinese Simplified
00086 GBK
00087 i18n:Chinese Simplified
00088 GB2312
00089 i18n:Chinese Simplified
00090 EUC-KR
00091 i18n:Korean
00092 sjis
00093 i18n:Japanese
00094 jis7
00095 i18n:Japanese
00096 EUC-JP
00097 i18n:Japanese
00098 ISO 8859-7
00099 i18n:Greek
00100 cp 1253
00101 i18n:Greek
00102 ISO 8859-6
00103 i18n:Arabic
00104 cp 1256
00105 i18n:Arabic
00106 ISO 8859-8
00107 i18n:Hebrew
00108 ISO 8859-8-I
00109 i18n:Hebrew
00110 cp 1255
00111 i18n:Hebrew
00112 ISO 8859-9
00113 i18n:Turkish
00114 TIS620
00115 i18n:Thai
00116 ISO 8859-11
00117 i18n:Thai
00118 UTF-8
00119 i18n:Unicode
00120 UTF-16
00121 i18n:Unicode
00122 utf7
00123 i18n:Unicode
00124 ucs2
00125 i18n:Unicode
00126 ISO 10646-UCS-2
00127 i18n:Unicode
00128 winsami2
00129 i18n:Northern Saami
00130 windows-1258
00131 i18n:Other
00132 IBM874
00133 i18n:Other
00134 TSCII
00135 i18n:Other
00136  */
00137 /*
00138  * Notes about the table:
00139  *
00140  * - The following entries were disabled and removed from the table:
00141 ibm852
00142 i18n:Central European
00143 pt 154
00144 i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
00145  *
00146  * - ISO 8859-11 is the deprecated name of TIS-620
00147  * - utf7 is not in Qt
00148  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00149  * - windows-1258: TODO
00150  * - IBM874: TODO
00151  * - TSCII: TODO
00152  */
// String pool for the list of known encodings: NUL-terminated encoding names
// and character-set descriptions laid out back to back. Entries are addressed
// by byte offset through language_for_encoding_indices, so the order and the
// exact spelling of every literal below must stay in sync with that table.
// Each description is stored once, right after the first encoding name that
// uses it.
00153 static const char language_for_encoding_string[] =
00154     "ISO 8859-1\0"
00155     I18N_NOOP2("@item Text character set", "Western European")"\0"
00156     "ISO 8859-15\0"
00157     "ISO 8859-14\0"
00158     "cp 1252\0"
00159     "IBM850\0"
00160     "ISO 8859-2\0"
00161     I18N_NOOP2("@item Text character set", "Central European")"\0"
00162     "ISO 8859-3\0"
00163     "ISO 8859-4\0"
00164     I18N_NOOP2("@item Text character set", "Baltic")"\0"
00165     "ISO 8859-13\0"
00166     "ISO 8859-16\0"
00167     I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
00168     "cp 1250\0"
00169     "cp 1254\0"
00170     I18N_NOOP2("@item Text character set", "Turkish")"\0"
00171     "cp 1257\0"
00172     "KOI8-R\0"
00173     I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
00174     "ISO 8859-5\0"
00175     "cp 1251\0"
00176     "KOI8-U\0"
00177     "IBM866\0"
00178     "Big5\0"
00179     I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
00180     "Big5-HKSCS\0"
00181     "GB18030\0"
00182     I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
00183     "GBK\0"
00184     "GB2312\0"
00185     "EUC-KR\0"
00186     I18N_NOOP2("@item Text character set", "Korean")"\0"
00187     "sjis\0"
00188     I18N_NOOP2("@item Text character set", "Japanese")"\0"
00189     "jis7\0"
00190     "EUC-JP\0"
00191     "ISO 8859-7\0"
00192     I18N_NOOP2("@item Text character set", "Greek")"\0"
00193     "cp 1253\0"
00194     "ISO 8859-6\0"
00195     I18N_NOOP2("@item Text character set", "Arabic")"\0"
00196     "cp 1256\0"
00197     "ISO 8859-8\0"
00198     I18N_NOOP2("@item Text character set", "Hebrew")"\0"
00199     "ISO 8859-8-I\0"
00200     "cp 1255\0"
00201     "ISO 8859-9\0"
00202     "TIS620\0"
00203     I18N_NOOP2("@item Text character set", "Thai")"\0"
00204     "ISO 8859-11\0"
00205     "UTF-8\0"
00206     I18N_NOOP2("@item Text character set", "Unicode")"\0"
00207     "UTF-16\0"
00208     "utf7\0"
00209     "ucs2\0"
00210     "ISO 10646-UCS-2\0"
00211     "winsami2\0"
00212     I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
00213     "windows-1258\0"
00214     I18N_NOOP2("@item Text character set", "Other")"\0"
00215     "IBM874\0"
00216     "TSCII\0"
00217     "\0";
00218 
// Pairs of byte offsets into language_for_encoding_string: the first value of
// each pair locates an encoding name, the second the description of its
// character set. The list is terminated by a single -1.
00219 static const int language_for_encoding_indices[] = {
00220        0,   11,   28,   11,   40,   11,   52,   11,
00221       60,   11,   67,   78,   95,   78,  106,  117,
00222      124,  117,  136,  148,  169,   78,  177,  185,
00223      193,  117,  201,  208,  217,  208,  228,  208,
00224      236,  208,  243,  208,  250,  255,  275,  255,
00225      286,  294,  313,  294,  317,  294,  324,  331,
00226      338,  343,  352,  343,  357,  343,  364,  375,
00227      381,  375,  389,  400,  407,  400,  415,  426,
00228      433,  426,  446,  426,  454,  185,  465,  472,
00229      477,  472,  489,  495,  503,  495,  510,  495,
00230      515,  495,  520,  495,  536,  545,  560,  573,
00231      579,  573,  586,  573,   -1
00232 };
00233 
00234 /*
00235  * defines some different names for codecs that are built into Qt.
00236  * The names in this list must be lower-case.
00237  * input data for generate_string_table.pl:
00238 iso-ir-111
00239 koi8-r
00240 koi unified
00241 koi8-r
00242 us-ascii
00243 iso 8859-1
00244 usascii
00245 iso 8859-1
00246 ascii
00247 iso 8859-1
00248 unicode-1-1-utf-7
00249 utf-7
00250 ucs2
00251 iso-10646-ucs-2
00252 iso10646-1
00253 iso-10646-ucs-2
00254 gb18030.2000-1
00255 gb18030
00256 gb18030.2000-0
00257 gb18030
00258 gbk-0
00259 gbk
00260 gb2312
00261 gbk
00262 gb2312.1980-0
00263 gbk
00264 big5-0
00265 big5
00266 euc-kr
00267 euckr
00268 euc-jp
00269 eucjp
00270 jisx0201.1976-0
00271 eucjp
00272 jisx0208.1983-0
00273 eucjp
00274 jisx0208.1990-0
00275 eucjp
00276 jisx0208.1997-0
00277 eucjp
00278 jisx0212.1990-0
00279 eucjp
00280 jisx0213.2000-1
00281 eucjp
00282 jisx0213.2000-2
00283 eucjp
00284 shift_jis
00285 sjis
00286 shift-jis
00287 sjis
00288 sjis
00289 sjis
00290 iso-2022-jp
00291 jis7
00292 windows850
00293 ibm850
00294 windows866
00295 ibm866
00296 windows-850
00297 ibm850
00298 windows-866
00299 ibm866
00300 cp-10000
00301 apple roman
00302 thai-tis620
00303 iso 8859-11
00304 windows-874
00305 ibm874
00306 windows874
00307 ibm874
00308 cp-874
00309 ibm874
00310 ksc5601.1987-0
00311 euckr
00312 ks_c_5601-1987
00313 euckr
00314 mac-roman
00315 apple roman
00316 macintosh
00317 apple roman
00318 mac
00319 apple roman
00320 csiso2022jp
00321 iso-2022-jp
00322 */
00323 /*
00324  * Notes about the table:
00325  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
00326  * - utf7 is not in Qt
00327  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00328  * - sjis: appears on the table for x-sjis
00329  * - jis7: ISO-2022-JP is now the default name in Qt4
00330  * - cp-874: is it really needed?
00331  * - mac-roman: appears on the table for x-mac-roman
00332  * - csiso2022jp: See bug #77243
00333  */
// String pool of lower-case codec name aliases and the codec names they stand
// for, stored as consecutive NUL-terminated strings. Entries are addressed by
// byte offset through builtin_indices, so the literals below must not be
// reordered or respelled without regenerating that table.
00334 static const char builtin_string[] =
00335     "iso-ir-111\0"
00336     "koi8-r\0"
00337     "koi unified\0"
00338     "us-ascii\0"
00339     "iso 8859-1\0"
00340     "usascii\0"
00341     "ascii\0"
00342     "unicode-1-1-utf-7\0"
00343     "utf-7\0"
00344     "ucs2\0"
00345     "iso-10646-ucs-2\0"
00346     "iso10646-1\0"
00347     "gb18030.2000-1\0"
00348     "gb18030\0"
00349     "gb18030.2000-0\0"
00350     "gbk-0\0"
00351     "gbk\0"
00352     "gb2312\0"
00353     "gb2312.1980-0\0"
00354     "big5-0\0"
00355     "big5\0"
00356     "euc-kr\0"
00357     "euckr\0"
00358     "euc-jp\0"
00359     "eucjp\0"
00360     "jisx0201.1976-0\0"
00361     "jisx0208.1983-0\0"
00362     "jisx0208.1990-0\0"
00363     "jisx0208.1997-0\0"
00364     "jisx0212.1990-0\0"
00365     "jisx0213.2000-1\0"
00366     "jisx0213.2000-2\0"
00367     "shift_jis\0"
00368     "sjis\0"
00369     "shift-jis\0"
00370     "iso-2022-jp\0"
00371     "jis7\0"
00372     "windows850\0"
00373     "ibm850\0"
00374     "windows866\0"
00375     "ibm866\0"
00376     "windows-850\0"
00377     "windows-866\0"
00378     "cp-10000\0"
00379     "apple roman\0"
00380     "thai-tis620\0"
00381     "iso 8859-11\0"
00382     "windows-874\0"
00383     "ibm874\0"
00384     "windows874\0"
00385     "cp-874\0"
00386     "ksc5601.1987-0\0"
00387     "ks_c_5601-1987\0"
00388     "mac-roman\0"
00389     "macintosh\0"
00390     "mac\0"
00391     "csiso2022jp\0"
00392     "\0";
00393 
// Pairs of byte offsets into builtin_string: the first value of each pair
// locates an alias, the second the codec name that alias maps to. The list
// is terminated by a single -1.
00394 static const int builtin_indices[] = {
00395        0,   11,   18,   11,   30,   39,   50,   39,
00396       58,   39,   64,   82,   88,   93,  109,   93,
00397      120,  135,  143,  135,  158,  164,  168,  164,
00398      175,  164,  189,  196,  201,  208,  214,  221,
00399      227,  221,  243,  221,  259,  221,  275,  221,
00400      291,  221,  307,  221,  323,  221,  339,  349,
00401      354,  349,  349,  349,  364,  376,  381,  392,
00402      399,  410,  417,  392,  429,  410,  441,  450,
00403      462,  474,  486,  498,  505,  498,  516,  498,
00404      523,  208,  538,  208,  553,  450,  563,  450,
00405      573,  450,  577,  364,   -1
00406 };
00407 
00408 /*
00409  * some last resort hints in case the charmap file couldn't be found.
00410  * This gives at least a partial conversion and helps making things readable.
00411  *
00412  * the name used as input here is already converted to the more canonical
00413  * name as defined in the aliases array.
00414  *
00415  * Input data:
00416 cp1250
00417 iso-8859-2
00418 koi8-r
00419 iso-8859-5
00420 koi8-u
00421 koi8-r
00422 pt 154
00423 windows-1251
00424 paratype-154
00425 windows-1251
00426 pt-154
00427 windows-1251
00428  */
00429 /* Notes:
00430  * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
00431  */
// String pool for the last-resort fallback table: codec names and the
// substitute codec names to try instead, stored as consecutive NUL-terminated
// strings and addressed by byte offset through conversion_hints_indices.
00432 static const char conversion_hints_string[] =
00433     "cp1250\0"
00434     "iso-8859-2\0"
00435     "koi8-r\0"
00436     "iso-8859-5\0"
00437     "koi8-u\0"
00438     "pt 154\0"
00439     "windows-1251\0"
00440     "paratype-154\0"
00441     "pt-154\0"
00442     "\0";
00443 
// Pairs of byte offsets into conversion_hints_string: (requested codec name,
// fallback codec name). The list is terminated by a single -1.
00444 static const int conversion_hints_indices[] = {
00445        0,    7,   18,   25,   36,   18,   43,   50,
00446       63,   50,   76,   50,   -1
00447 };
00448 
00449 // search an array of items index/data, find first matching index
00450 // and return data, or return 0
00451 static inline
00452 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
00453 {
00454     for (int i = 0; indices[i] != -1; i += 2)
00455         if (qstrcmp(start + indices[i], entry) == 0)
00456             return start + indices[i + 1];
00457     return 0;
00458 }
00459 
00460 
00461 class KCharsetsPrivate
00462 {
00463 public:
00464     KCharsetsPrivate(KCharsets* _kc)
00465     {
00466         kc = _kc;
00467         codecForNameDict.reserve( 43 );
00468     }
00469     // Hash for the encoding names (sensitive case)
00470     QHash<QByteArray,QTextCodec*> codecForNameDict;
00471     KCharsets* kc;
00472 
00473     //Cache list so QStrings can be implicitly shared
00474     QList<QStringList> encodingsByScript;
00475 };
00476 
00477 // --------------------------------------------------------------------------
00478 
00479 KCharsets::KCharsets()
00480     :d(new KCharsetsPrivate(this))
00481 {
00482 }
00483 
00484 KCharsets::~KCharsets()
00485 {
00486     delete d;
00487 }
00488 
00489 QChar KCharsets::fromEntity(const QString &str)
00490 {
00491     QChar res = QChar::Null;
00492 
00493     if ( str.isEmpty() )
00494         return QChar::Null;
00495 
00496     int pos = 0;
00497     if(str[pos] == QLatin1Char('&')) pos++;
00498 
00499     // Check for '&#000' or '&#x0000' sequence
00500     if (str[pos] == QLatin1Char('#') && str.length()-pos > 1) {
00501         bool ok;
00502         pos++;
00503         if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
00504             pos++;
00505             // '&#x0000', hexadecimal character reference
00506             const QString tmp( str.mid( pos ) );
00507             res = tmp.toInt(&ok, 16);
00508         } else {
00509             //  '&#0000', decimal character reference
00510             const QString tmp( str.mid( pos ) );
00511             res = tmp.toInt(&ok, 10);
00512         }
00513         if ( ok )
00514             return res;
00515         else
00516             return QChar::Null;
00517     }
00518 
00519     const QByteArray raw ( str.toLatin1() );
00520     const entity *e = kde_findEntity( raw, raw.length() );
00521 
00522     if(!e)
00523     {
00524         //kDebug( 0 ) << "unknown entity " << str <<", len = " << str.length();
00525         return QChar::Null;
00526     }
00527     //kDebug() << "got entity " << str << " = " << e->code;
00528 
00529     return QChar(e->code);
00530 }
00531 
00532 QChar KCharsets::fromEntity(const QString &str, int &len)
00533 {
00534     // entities are never longer than 8 chars... we start from
00535     // that length and work backwards...
00536     len = 8;
00537     while(len > 0)
00538     {
00539         QString tmp = str.left(len);
00540         QChar res = fromEntity(tmp);
00541         if( res != QChar::Null ) return res;
00542         len--;
00543     }
00544     return QChar::Null;
00545 }
00546 
00547 
00548 QString KCharsets::toEntity(const QChar &ch)
00549 {
00550     QString ent;
00551     ent.sprintf("&#0x%x;", ch.unicode());
00552     return ent;
00553 }
00554 
00555 QString KCharsets::resolveEntities( const QString &input )
00556 {
00557     QString text = input;
00558     const QChar *p = text.unicode();
00559     const QChar *end = p + text.length();
00560     const QChar *ampersand = 0;
00561     bool scanForSemicolon = false;
00562 
00563     for ( ; p < end; ++p ) {
00564         const QChar ch = *p;
00565 
00566         if ( ch == QLatin1Char('&') ) {
00567             ampersand = p;
00568             scanForSemicolon = true;
00569             continue;
00570         }
00571 
00572         if ( ch != QLatin1Char(';') || scanForSemicolon == false )
00573             continue;
00574 
00575         assert( ampersand );
00576 
00577         scanForSemicolon = false;
00578 
00579         const QChar *entityBegin = ampersand + 1;
00580 
00581         const uint entityLength = p - entityBegin;
00582         if ( entityLength == 0 )
00583             continue;
00584 
00585         const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, entityLength ) );
00586         if ( entityValue.isNull() )
00587             continue;
00588 
00589         const uint ampersandPos = ampersand - text.unicode();
00590 
00591         text[ (int)ampersandPos ] = entityValue;
00592         text.remove( ampersandPos + 1, entityLength + 1 );
00593         p = text.unicode() + ampersandPos;
00594         end = text.unicode() + text.length();
00595         ampersand = 0;
00596     }
00597 
00598     return text;
00599 }
00600 
00601 QStringList KCharsets::availableEncodingNames() const
00602 {
00603     QStringList available;
00604     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2)
00605         available.append( QString::fromUtf8( language_for_encoding_string + *p ) );
00606     available.sort();
00607     return available;
00608 }
00609 
00610 #ifndef KDE_NO_DEPRECATED
00611 QString KCharsets::languageForEncoding( const QString &encoding ) const
00612 {
00613     const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string,
00614                                                language_for_encoding_indices,
00615                                                encoding.toUtf8().constData() );
00616     if ( lang )
00617         return i18nc( "@item Text character set", lang );
00618     else
00619         return i18nc( "@item Text character set", "Other" );
00620 }
00621 #endif
00622 
00623 QString KCharsets::descriptionForEncoding( const QString& encoding ) const
00624 {
00625     const char* lang = kcharsets_array_search( language_for_encoding_string,
00626                                                language_for_encoding_indices,
00627                                                encoding.toUtf8() );
00628     if ( lang )
00629         return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )",
00630                       i18nc( "@item Text character set", lang ), encoding );
00631     else
00632         return i18nc( "@item", "Other encoding (%1)", encoding );
00633 }
00634 
00635 QString KCharsets::encodingForName( const QString &descriptiveName ) const
00636 {
00637     const int left = descriptiveName.lastIndexOf( QLatin1Char('(') );
00638 
00639     if (left<0) // No parenthesis, so assume it is a normal encoding name
00640     return descriptiveName.trimmed();
00641 
00642     QString name(descriptiveName.mid(left+1));
00643 
00644     const int right = name.lastIndexOf( QLatin1Char(')') );
00645 
00646     if (right<0)
00647         return name;
00648 
00649     return name.left(right).trimmed();
00650 }
00651 
00652 QStringList KCharsets::descriptiveEncodingNames() const
00653 {
00654     QStringList encodings;
00655     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00656         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00657         const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] );
00658         encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )",
00659                                  description, name ) );
00660     }
00661     encodings.sort();
00662     return encodings;
00663 }
00664 
00665 QList<QStringList> KCharsets::encodingsByScript() const
00666 {
00667     if (!d->encodingsByScript.isEmpty())
00668         return d->encodingsByScript;
00669     int i;
00670     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00671         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00672         const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] );
00673 
00674         for (i=0; i<d->encodingsByScript.size(); ++i) {
00675             if (d->encodingsByScript.at(i).at(0) == description) {
00676                 d->encodingsByScript[i].append(name);
00677                 break;
00678             }
00679         }
00680 
00681         if (i==d->encodingsByScript.size()) {
00682             d->encodingsByScript.append(QStringList() << description << name);
00683         }
00684 
00685     }
00686     return d->encodingsByScript;
00687 }
00688 
00689 QTextCodec* KCharsets::codecForName(const QString &n) const
00690 {
00691     if ( n == QLatin1String("gb2312") || n == QLatin1String("gbk") )
00692         return QTextCodec::codecForName( "gb18030" );
00693     const QByteArray name( n.toLatin1() );
00694     QTextCodec* codec = codecForNameOrNull( name );
00695     if ( codec )
00696         return codec;
00697     else
00698         return QTextCodec::codecForName( "iso-8859-1" );
00699 }
00700 
00701 QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
00702 {
00703     if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
00704         ok = true;
00705         return QTextCodec::codecForName( "gb18030" );
00706     }
00707     const QByteArray name( n.toLatin1() );
00708     QTextCodec* codec = codecForNameOrNull( name );
00709     if ( codec )
00710     {
00711         ok = true;
00712         return codec;
00713     }
00714     else
00715     {
00716         ok = false;
00717         return QTextCodec::codecForName( "iso-8859-1" );
00718     }
00719 }
00720 
00721 QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
00722 {
00723     QTextCodec* codec = 0;
00724 
00725     if (n.isEmpty()) {
00726         // No name, assume locale (KDE's, not Qt's)
00727         const QByteArray locale = "->locale<-";
00728         if ( d->codecForNameDict.contains( locale ) )
00729             return d->codecForNameDict.value( locale );
00730         codec = KGlobal::locale()->codecForEncoding();
00731         d->codecForNameDict.insert("->locale<-", codec);
00732         return codec;
00733     }
00734     // For a non-empty name, lookup the "dictionnary", in a case-sensitive way.
00735     else if ( d->codecForNameDict.contains( n ) ) {
00736         return d->codecForNameDict.value( n );
00737     }
00738 
00739     // If the name is not in the hash table, call directly QTextCoded::codecForName.
00740     // We assume that QTextCodec is smarter and more maintained than this code.
00741     codec = QTextCodec::codecForName( n );
00742     if ( codec ) {
00743         d->codecForNameDict.insert( n, codec );
00744         return codec;
00745     }
00746 
00747     // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it.
00748 
00749     QByteArray name = n.toLower();
00750     bool changed = false;
00751     if (name.endsWith("_charset")) { // krazy:exclude=strings
00752        name.chop( 8 );
00753        changed = true;
00754     }
00755     if ( name.startsWith( "x-" ) ) { // krazy:exclude=strings
00756        name.remove( 0, 2 ); // remove x- at start
00757        changed = true;
00758     }
00759 
00760     if (name.isEmpty()) {
00761       // We have no name anymore, therefore the name is invalid.
00762       return 0;
00763     }
00764 
00765     // We only need to check changed names.
00766     if ( changed ) {
00767         codec = QTextCodec::codecForName(name);
00768         if (codec) {
00769             d->codecForNameDict.insert( n, codec );
00770             return codec;
00771         }
00772         changed = false;
00773     }
00774 
00775     // these codecs are built into Qt, but the name given for the codec is different,
00776     // so QTextCodec did not recognize it.
00777     QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name);
00778 
00779     if(!cname.isEmpty())
00780         codec = QTextCodec::codecForName(cname);
00781 
00782     if (codec)
00783     {
00784         d->codecForNameDict.insert( n, codec );
00785         return codec;
00786     }
00787 
00788     // this also failed, the last resort is now to take some compatibility charmap
00789     // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write.
00790     cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name );
00791 
00792     if (!cname.isEmpty()) {
00793         codec = QTextCodec::codecForName(cname);
00794         if (codec) {
00795             d->codecForNameDict.insert( n, codec );
00796             return codec;
00797         }
00798     }
00799 
00800     // we could not assign a codec, therefore return NULL
00801     return 0;
00802 }

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.5
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal