• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KDECore

kcharsets.cpp

Go to the documentation of this file.
00001 /* This file is part of the KDE libraries
00002     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
00003     Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
00004     Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net>
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 */
00021 #include "kcharsets.h"
00022 
00023 #include "kfilterdev.h"
00024 #include "kentities.c"
00025 
00026 #include "kconfig.h"
00027 #include "kdebug.h"
00028 #include "kglobal.h"
00029 #include "klocale.h"
00030 
00031 #include <QtCore/QDir>
00032 #include <QtCore/QRegExp>
00033 #include <QtCore/QCharRef>
00034 #include <QtCore/QMutableStringListIterator>
00035 #include <QtCore/QTextCodec>
00036 
00037 #include <assert.h>
00038 #include <QHash>
00039 
00040 /*
00041  * ### FIXME KDE4: the name of the encodings should mostly be uppercase
00042  * The names of this list are user-visible
00043  * Generate with generate_string_table.pl, input data:
00044 ISO 8859-1
00045 i18n:Western European
00046 ISO 8859-15
00047 i18n:Western European
00048 ISO 8859-14
00049 i18n:Western European
00050 cp 1252
00051 i18n:Western European
00052 IBM850
00053 i18n:Western European
00054 ISO 8859-2
00055 i18n:Central European
00056 ISO 8859-3
00057 i18n:Central European
00058 ISO 8859-4
00059 i18n:Baltic
00060 ISO 8859-13
00061 i18n:Baltic
00062 ISO 8859-16
00063 i18n:South-Eastern Europe
00064 cp 1250
00065 i18n:Central European
00066 cp 1254
00067 i18n:Turkish
00068 cp 1257
00069 i18n:Baltic
00070 KOI8-R
00071 i18n:Cyrillic
00072 ISO 8859-5
00073 i18n:Cyrillic
00074 cp 1251
00075 i18n:Cyrillic
00076 KOI8-U
00077 i18n:Cyrillic
00078 IBM866
00079 i18n:Cyrillic
00080 Big5
00081 i18n:Chinese Traditional
00082 Big5-HKSCS
00083 i18n:Chinese Traditional
00084 GB18030
00085 i18n:Chinese Simplified
00086 GBK
00087 i18n:Chinese Simplified
00088 GB2312
00089 i18n:Chinese Simplified
00090 EUC-KR
00091 i18n:Korean
00092 sjis
00093 i18n:Japanese
00094 jis7
00095 i18n:Japanese
00096 EUC-JP
00097 i18n:Japanese
00098 ISO 8859-7
00099 i18n:Greek
00100 cp 1253
00101 i18n:Greek
00102 ISO 8859-6
00103 i18n:Arabic
00104 cp 1256
00105 i18n:Arabic
00106 ISO 8859-8
00107 i18n:Hebrew
00108 ISO 8859-8-I
00109 i18n:Hebrew
00110 cp 1255
00111 i18n:Hebrew
00112 ISO 8859-9
00113 i18n:Turkish
00114 TIS620
00115 i18n:Thai
00116 ISO 8859-11
00117 i18n:Thai
00118 UTF-8
00119 i18n:Unicode
00120 UTF-16
00121 i18n:Unicode
00122 utf7
00123 i18n:Unicode
00124 ucs2
00125 i18n:Unicode
00126 ISO 10646-UCS-2
00127 i18n:Unicode
00128 winsami2
00129 i18n:Northern Saami
00130 windows-1258
00131 i18n:Other
00132 IBM874
00133 i18n:Other
00134 TSCII
00135 i18n:Other
00136  */
00137 /*
00138  * Notes about the table:
00139  *
00140  * - The following entries were disabled and removed from the table:
00141 ibm852
00142 i18n:Central European
00143 pt 154
00144 i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
00145  *
00146  * - ISO 8859-11 is the deprecated name of TIS-620
00147  * - utf7 is not in Qt
00148  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00149  * - windows-1258: TODO
00150  * - IBM874: TODO
00151  * - TSCII: TODO
00152  */
// String pool for the encoding -> script/language table. All strings are
// stored back to back, each terminated by '\0'; an extra empty string ends
// the pool. Encoding names are interleaved with the descriptions (wrapped in
// I18N_NOOP2 and later passed to i18nc() by the lookup functions); each
// description appears only once, after the first encoding that uses it.
// language_for_encoding_indices addresses this pool by byte offset, so any
// change to a string here requires the offsets to be regenerated.
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    I18N_NOOP2("@item Text character set", "Western European")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    I18N_NOOP2("@item Text character set", "Central European")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    I18N_NOOP2("@item Text character set", "Baltic")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
    "cp 1250\0"
    "cp 1254\0"
    I18N_NOOP2("@item Text character set", "Turkish")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    I18N_NOOP2("@item Text character set", "Korean")"\0"
    "sjis\0"
    I18N_NOOP2("@item Text character set", "Japanese")"\0"
    "jis7\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    I18N_NOOP2("@item Text character set", "Greek")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    I18N_NOOP2("@item Text character set", "Arabic")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    I18N_NOOP2("@item Text character set", "Hebrew")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    I18N_NOOP2("@item Text character set", "Thai")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    I18N_NOOP2("@item Text character set", "Unicode")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "winsami2\0"
    I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
    "windows-1258\0"
    I18N_NOOP2("@item Text character set", "Other")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";
00218 
// Offset pairs into language_for_encoding_string: each pair is
// (offset of the encoding name, offset of its description). Several encodings
// share one description offset. The list is terminated by a single -1 in the
// position where the next encoding-name offset would be.
static const int language_for_encoding_indices[] = {
       0,   11,   28,   11,   40,   11,   52,   11,
      60,   11,   67,   78,   95,   78,  106,  117,
     124,  117,  136,  148,  169,   78,  177,  185,
     193,  117,  201,  208,  217,  208,  228,  208,
     236,  208,  243,  208,  250,  255,  275,  255,
     286,  294,  313,  294,  317,  294,  324,  331,
     338,  343,  352,  343,  357,  343,  364,  375,
     381,  375,  389,  400,  407,  400,  415,  426,
     433,  426,  446,  426,  454,  185,  465,  472,
     477,  472,  489,  495,  503,  495,  510,  495,
     515,  495,  520,  495,  536,  545,  560,  573,
     579,  573,  586,  573,   -1
};
00233 
00234 /*
00235  * defines some different names for codecs that are built into Qt.
00236  * The names in this list must be lower-case.
00237  * input data for generate_string_table.pl:
00238 iso-ir-111
00239 koi8-r
00240 koi unified
00241 koi8-r
00242 us-ascii
00243 iso 8859-1
00244 usascii
00245 iso 8859-1
00246 ascii
00247 iso 8859-1
00248 unicode-1-1-utf-7
00249 utf-7
00250 ucs2
00251 iso-10646-ucs-2
00252 iso10646-1
00253 iso-10646-ucs-2
00254 gb18030.2000-1
00255 gb18030
00256 gb18030.2000-0
00257 gb18030
00258 gbk-0
00259 gbk
00260 gb2312
00261 gbk
00262 gb2312.1980-0
00263 gbk
00264 big5-0
00265 big5
00266 euc-kr
00267 euckr
00268 euc-jp
00269 eucjp
00270 jisx0201.1976-0
00271 eucjp
00272 jisx0208.1983-0
00273 eucjp
00274 jisx0208.1990-0
00275 eucjp
00276 jisx0208.1997-0
00277 eucjp
00278 jisx0212.1990-0
00279 eucjp
00280 jisx0213.2000-1
00281 eucjp
00282 jisx0213.2000-2
00283 eucjp
00284 shift_jis
00285 sjis
00286 shift-jis
00287 sjis
00288 sjis
00289 sjis
00290 iso-2022-jp
00291 jis7
00292 windows850
00293 ibm850
00294 windows866
00295 ibm866
00296 windows-850
00297 ibm850
00298 windows-866
00299 ibm866
00300 cp-10000
00301 apple roman
00302 thai-tis620
00303 iso 8859-11
00304 windows-874
00305 ibm874
00306 windows874
00307 ibm874
00308 cp-874
00309 ibm874
00310 ksc5601.1987-0
00311 euckr
00312 ks_c_5601-1987
00313 euckr
00314 mac-roman
00315 apple roman
00316 macintosh
00317 apple roman
00318 mac
00319 apple roman
00320 csiso2022jp
00321 iso-2022-jp
00322 */
00323 /*
00324  * Notes about the table:
00325  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
00326  * - utf7 is not in Qt
00327  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00328  * - sjis: appears on the table for x-sjis
00329  * - jis7: ISO-2022-JP is now the default name in Qt4
00330  * - cp-874: is it really needed?
00331  * - mac-roman: appears on the table for x-mac-roman
00332  * - csiso2022jp: See bug #77243
00333  */
// String pool for the alias table consulted by codecForNameOrNull() when
// QTextCodec does not recognize a (lower-cased) name directly. Alias names and
// the codec names they map to are stored back to back, each terminated by
// '\0', with every distinct string stored once; an extra empty string ends
// the pool. builtin_indices addresses this pool by byte offset, so the two
// must be regenerated together.
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";
00393 
// Offset pairs into builtin_string: each pair is
// (offset of the alias, offset of the codec name it maps to).
// The list is terminated by a single -1 in the alias position.
static const int builtin_indices[] = {
       0,   11,   18,   11,   30,   39,   50,   39,
      58,   39,   64,   82,   88,   93,  109,   93,
     120,  135,  143,  135,  158,  164,  168,  164,
     175,  164,  189,  196,  201,  208,  214,  221,
     227,  221,  243,  221,  259,  221,  275,  221,
     291,  221,  307,  221,  323,  221,  339,  349,
     354,  349,  349,  349,  364,  376,  381,  392,
     399,  410,  417,  392,  429,  410,  441,  450,
     462,  474,  486,  498,  505,  498,  516,  498,
     523,  208,  538,  208,  553,  450,  563,  450,
     573,  450,  577,  364,   -1
};
00407 
// Disabled alias table (string pool plus offset pairs, same layout as the
// other tables in this file). Every alias listed here maps to "ibm852".
// It is compiled out; the only reference to aliases in this file is inside
// the equally disabled charmap-loading section of codecForNameOrNull().
#if 0
// some different names for the encodings defined in the charmaps files.
// even though the charmap file names are all uppercase, the names are all lowercase here.
/* input data for generate_string_table.pl:
cp852
ibm852
cp-852
ibm852
x-cp-852
ibm852
windows852
ibm852
windows-852
ibm852
x-windows-852
ibm852
 */
static const char aliases_string[] =
    "cp852\0"
    "ibm852\0"
    "cp-852\0"
    "x-cp-852\0"
    "windows852\0"
    "windows-852\0"
    "x-windows-852\0"
    "\0";

static const int aliases_indices[] = {
       0,    6,   13,    6,   20,    6,   29,    6,
      40,    6,   52,    6,   -1
};
#endif
00440 
00441 /*
00442  * some last resort hints in case the charmap file couldn't be found.
00443  * This gives at least a partial conversion and helps making things readable.
00444  *
00445  * the name used as input here is already converted to the more canonical
00446  * name as defined in the aliases array.
00447  *
00448  * Input data:
00449 cp1250
00450 iso-8859-2
00451 koi8-r
00452 iso-8859-5
00453 koi8-u
00454 koi8-r
00455 pt 154
00456 windows-1251
00457 paratype-154
00458 windows-1251
00459 pt-154
00460 windows-1251
00461  */
00462 /* Notes:
00463  * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
00464  */
// String pool for the last-resort fallback table used at the end of
// codecForNameOrNull(): encoding names and the substitute codec names tried
// for them, stored back to back and each terminated by '\0', with an extra
// empty string ending the pool. conversion_hints_indices addresses this pool
// by byte offset, so the two must be regenerated together.
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";
00476 
// Offset pairs into conversion_hints_string: each pair is
// (offset of the requested name, offset of the fallback codec name).
// The list is terminated by a single -1 in the requested-name position.
static const int conversion_hints_indices[] = {
       0,    7,   18,   25,   36,   18,   43,   50,
      63,   50,   76,   50,   -1
};
00481 
00482 // search an array of items index/data, find first matching index
00483 // and return data, or return 0
00484 static inline
00485 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
00486 {
00487     for (int i = 0; indices[i] != -1; i += 2)
00488         if (qstrcmp(start + indices[i], entry) == 0)
00489             return start + indices[i + 1];
00490     return 0;
00491 }
00492 
00493 
00494 class KCharsetsPrivate
00495 {
00496 public:
00497     KCharsetsPrivate(KCharsets* _kc)
00498     {
00499         kc = _kc;
00500         codecForNameDict.reserve( 43 );
00501     }
00502     // Hash for the encoding names (sensitive case)
00503     QHash<QByteArray,QTextCodec*> codecForNameDict;
00504     KCharsets* kc;
00505 
00506     //Cache list so QStrings can be implicitly shared
00507     QList<QStringList> encodingsByScript;
00508 };
00509 
00510 // --------------------------------------------------------------------------
00511 
// Constructs the object and allocates its private data, passing the private
// object a pointer back to this instance.
KCharsets::KCharsets()
    :d(new KCharsetsPrivate(this))
{
}
00516 
// Destroys the object and frees the private data held in d.
KCharsets::~KCharsets()
{
    delete d;
}
00521 
00522 QChar KCharsets::fromEntity(const QString &str)
00523 {
00524     QChar res = QChar::Null;
00525 
00526     if ( str.isEmpty() )
00527         return QChar::Null;
00528 
00529     int pos = 0;
00530     if(str[pos] == QLatin1Char('&')) pos++;
00531 
00532     // Check for '&#000' or '&#x0000' sequence
00533     if (str[pos] == QLatin1Char('#') && str.length()-pos > 1) {
00534         bool ok;
00535         pos++;
00536         if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
00537             pos++;
00538             // '&#x0000', hexadecimal character reference
00539             const QString tmp( str.mid( pos ) );
00540             res = tmp.toInt(&ok, 16);
00541         } else {
00542             //  '&#0000', decimal character reference
00543             const QString tmp( str.mid( pos ) );
00544             res = tmp.toInt(&ok, 10);
00545         }
00546         if ( ok )
00547             return res;
00548         else
00549             return QChar::Null;
00550     }
00551 
00552     const QByteArray raw ( str.toLatin1() );
00553     const entity *e = kde_findEntity( raw, raw.length() );
00554 
00555     if(!e)
00556     {
00557         //kDebug( 0 ) << "unknown entity " << str <<", len = " << str.length();
00558         return QChar::Null;
00559     }
00560     //kDebug() << "got entity " << str << " = " << e->code;
00561 
00562     return QChar(e->code);
00563 }
00564 
00565 QChar KCharsets::fromEntity(const QString &str, int &len)
00566 {
00567     // entities are never longer than 8 chars... we start from
00568     // that length and work backwards...
00569     len = 8;
00570     while(len > 0)
00571     {
00572         QString tmp = str.left(len);
00573         QChar res = fromEntity(tmp);
00574         if( res != QChar::Null ) return res;
00575         len--;
00576     }
00577     return QChar::Null;
00578 }
00579 
00580 
00581 QString KCharsets::toEntity(const QChar &ch)
00582 {
00583     QString ent;
00584     ent.sprintf("&#0x%x;", ch.unicode());
00585     return ent;
00586 }
00587 
00588 QString KCharsets::resolveEntities( const QString &input )
00589 {
00590     QString text = input;
00591     const QChar *p = text.unicode();
00592     const QChar *end = p + text.length();
00593     const QChar *ampersand = 0;
00594     bool scanForSemicolon = false;
00595 
00596     for ( ; p < end; ++p ) {
00597         const QChar ch = *p;
00598 
00599         if ( ch == QLatin1Char('&') ) {
00600             ampersand = p;
00601             scanForSemicolon = true;
00602             continue;
00603         }
00604 
00605         if ( ch != QLatin1Char(';') || scanForSemicolon == false )
00606             continue;
00607 
00608         assert( ampersand );
00609 
00610         scanForSemicolon = false;
00611 
00612         const QChar *entityBegin = ampersand + 1;
00613 
00614         const uint entityLength = p - entityBegin;
00615         if ( entityLength == 0 )
00616             continue;
00617 
00618         const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, entityLength ) );
00619         if ( entityValue.isNull() )
00620             continue;
00621 
00622         const uint ampersandPos = ampersand - text.unicode();
00623 
00624         text[ (int)ampersandPos ] = entityValue;
00625         text.remove( ampersandPos + 1, entityLength + 1 );
00626         p = text.unicode() + ampersandPos;
00627         end = text.unicode() + text.length();
00628         ampersand = 0;
00629     }
00630 
00631     return text;
00632 }
00633 
00634 QStringList KCharsets::availableEncodingNames() const
00635 {
00636     QStringList available;
00637     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2)
00638         available.append( QString::fromUtf8( language_for_encoding_string + *p ) );
00639     available.sort();
00640     return available;
00641 }
00642 
00643 #ifndef KDE_NO_DEPRECATED
00644 QString KCharsets::languageForEncoding( const QString &encoding ) const
00645 {
00646     const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string,
00647                                                language_for_encoding_indices,
00648                                                encoding.toUtf8().constData() );
00649     if ( lang )
00650         return i18nc( "@item Text character set", lang );
00651     else
00652         return i18nc( "@item Text character set", "Other" );
00653 }
00654 #endif
00655 
00656 QString KCharsets::descriptionForEncoding( const QString& encoding ) const
00657 {
00658     const char* lang = kcharsets_array_search( language_for_encoding_string,
00659                                                language_for_encoding_indices,
00660                                                encoding.toUtf8() );
00661     if ( lang )
00662         return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )",
00663                       i18nc( "@item Text character set", lang ), encoding );
00664     else
00665         return i18nc( "@item", "Other encoding (%1)", encoding );
00666 }
00667 
00668 QString KCharsets::encodingForName( const QString &descriptiveName ) const
00669 {
00670     const int left = descriptiveName.lastIndexOf( QLatin1Char('(') );
00671 
00672     if (left<0) // No parenthesis, so assume it is a normal encoding name
00673     return descriptiveName.trimmed();
00674 
00675     QString name(descriptiveName.mid(left+1));
00676 
00677     const int right = name.lastIndexOf( QLatin1Char(')') );
00678 
00679     if (right<0)
00680         return name;
00681 
00682     return name.left(right).trimmed();
00683 }
00684 
00685 QStringList KCharsets::descriptiveEncodingNames() const
00686 {
00687     QStringList encodings;
00688     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00689         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00690         const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] );
00691         encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )",
00692                                  description, name ) );
00693     }
00694     encodings.sort();
00695     return encodings;
00696 }
00697 
00698 QList<QStringList> KCharsets::encodingsByScript() const
00699 {
00700     if (!d->encodingsByScript.isEmpty())
00701         return d->encodingsByScript;
00702     int i;
00703     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00704         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00705         const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] );
00706 
00707         for (i=0; i<d->encodingsByScript.size(); ++i) {
00708             if (d->encodingsByScript.at(i).at(0) == description) {
00709                 d->encodingsByScript[i].append(name);
00710                 break;
00711             }
00712         }
00713 
00714         if (i==d->encodingsByScript.size()) {
00715             d->encodingsByScript.append(QStringList() << description << name);
00716         }
00717 
00718     }
00719     return d->encodingsByScript;
00720 }
00721 
00722 QTextCodec* KCharsets::codecForName(const QString &n) const
00723 {
00724     if ( n == QLatin1String("gb2312") || n == QLatin1String("gbk") )
00725         return QTextCodec::codecForName( "gb18030" );
00726     const QByteArray name( n.toLatin1() );
00727     QTextCodec* codec = codecForNameOrNull( name );
00728     if ( codec )
00729         return codec;
00730     else
00731         return QTextCodec::codecForName( "iso-8859-1" );
00732 }
00733 
00734 QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
00735 {
00736     if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
00737         ok = true;
00738         return QTextCodec::codecForName( "gb18030" );
00739     }
00740     const QByteArray name( n.toLatin1() );
00741     QTextCodec* codec = codecForNameOrNull( name );
00742     if ( codec )
00743     {
00744         ok = true;
00745         return codec;
00746     }
00747     else
00748     {
00749         ok = false;
00750         return QTextCodec::codecForName( "iso-8859-1" );
00751     }
00752 }
00753 
00754 QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
00755 {
00756     QTextCodec* codec = 0;
00757 
00758     if (n.isEmpty()) {
00759         // No name, assume locale (KDE's, not Qt's)
00760         const QByteArray locale = "->locale<-";
00761         if ( d->codecForNameDict.contains( locale ) )
00762             return d->codecForNameDict.value( locale );
00763         codec = KGlobal::locale()->codecForEncoding();
00764         d->codecForNameDict.insert("->locale<-", codec);
00765         return codec;
00766     }
00767     // For a non-empty name, lookup the "dictionnary", in a case-sensitive way.
00768     else if ( d->codecForNameDict.contains( n ) ) {
00769         return d->codecForNameDict.value( n );
00770     }
00771 
00772     // If the name is not in the hash table, call directly QTextCoded::codecForName.
00773     // We assume that QTextCodec is smarter and more maintained than this code.
00774     codec = QTextCodec::codecForName( n );
00775     if ( codec ) {
00776         d->codecForNameDict.insert( n, codec );
00777         return codec;
00778     }
00779 
00780     // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it.
00781 
00782     QByteArray name = n.toLower();
00783     bool changed = false;
00784     if (name.endsWith("_charset")) { // krazy:exclude=strings
00785        name.chop( 8 );
00786        changed = true;
00787     }
00788     if ( name.startsWith( "x-" ) ) { // krazy:exclude=strings
00789        name.remove( 0, 2 ); // remove x- at start
00790        changed = true;
00791     }
00792 
00793     if (name.isEmpty()) {
00794       // We have no name anymore, therefore the name is invalid.
00795       return 0;
00796     }
00797 
00798     // We only need to check changed names.
00799     if ( changed ) {
00800         codec = QTextCodec::codecForName(name);
00801         if (codec) {
00802             d->codecForNameDict.insert( n, codec );
00803             return codec;
00804         }
00805         changed = false;
00806     }
00807 
00808     // these codecs are built into Qt, but the name given for the codec is different,
00809     // so QTextCodec did not recognize it.
00810     QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name);
00811 
00812     if(!cname.isEmpty())
00813         codec = QTextCodec::codecForName(cname);
00814 
00815     if (codec)
00816     {
00817         d->codecForNameDict.insert( n, codec );
00818         return codec;
00819     }
00820 
00821 #ifdef __GNUC__
00822 #warning is it still useful with Qt4 ?
00823 #endif
00824     //don't forget to remove the #if 0 on a few structs at the top also if you reenable that ;)  (search for 852 )
00825     //from what I understood, one needs to create a QTextCodecPlugin in order to be able to support a new Codec, but I do not
00826     //know how to convert a charmap to a QTextCodec and the real big question is whether we need that at all ...  (mikmak)
00827         // Yes, it is useful (for examples EBCDIC in Kate or codepages for KOffice filters from/to MS formats) (goutte)
00828 #if 0
00829     QString dir;
00830     {
00831     KConfigGroup cg( KGlobal::config(), "i18n" );
00832     dir = cg.readPathEntry("i18ndir", QLatin1String("/usr/share/i18n/charmaps"));
00833     }
00834 
00835     // these are codecs not included in Qt. They can be build up if the corresponding charmap
00836     // is available in the charmap directory.
00837     cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00838 
00839     if(cname.isEmpty())
00840         cname = name;
00841     cname = cname.toUpper();
00842 
00843     const QString basicName = QLatin1String(cname);
00844     kDebug() << endl << " Trying to find " << cname << " in " << dir;
00845 
00846     QString charMapFileName;
00847     bool gzipped = false;
00848     QDir qdir(dir);
00849     if (!qdir.exists()) {
00850         // The directory for the charmaps does not even exist... (That is common!)
00851     }
00852     else if (qdir.exists(basicName, false)) {
00853         charMapFileName = basicName;
00854     }
00855     else if (qdir.exists(basicName+".gz", false)) {
00856         charMapFileName = basicName + ".gz";
00857         gzipped = true;
00858     }
00859     else {
00860         // Check if we are asking a code page
00861         // If yes, then check "CP99999" and "IBM99999"
00862         // First we need to find the number of the codepage
00863         QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
00864         if ( regexp.search(basicName) != -1) {
00865             const QString num = regexp.cap(4);
00866             if (num.isEmpty()) {
00867                 // No number, not a code page (or something went wrong)
00868             }
00869             else if (qdir.exists("IBM"+num)) {
00870                 charMapFileName = "IBM"+num;
00871             }
00872             else if (qdir.exists("IBM"+num+".gz")) {
00873                 charMapFileName = "IBM"+num+".gz";
00874                 gzipped = true;
00875             }
00876             else if (qdir.exists("CP"+num)) {
00877                 charMapFileName = "CP"+num;
00878             }
00879             else if (qdir.exists("CP"+num+".gz")) {
00880                 charMapFileName = "CP"+num+".gz";
00881                 gzipped = true;
00882             }
00883         }
00884     }
00885 
00886     if (gzipped && !charMapFileName.isEmpty()) {
00887         KFilterDev gzip(dir + '/' + charMapFileName);
00888         if (gzip.open(QIODevice::ReadOnly)) {
00889             kDebug() << "Loading gzipped charset...";
00890             codec = QTextCodec::loadCharmap(&gzip);
00891             gzip.close();
00892         }
00893         else
00894             kWarning() << "Could not open gzipped charset!";
00895     }
00896     else if (!charMapFileName.isEmpty()) {
00897         codec = QTextCodec::loadCharmapFile(dir + '/' + charMapFileName);
00898     }
00899 
00900     if(codec) {
00901         d->codecForNameDict.insert( n, codec );
00902         return codec;
00903     }
00904 #endif
00905 
00906     // this also failed, the last resort is now to take some compatibility charmap
00907     // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write.
00908     cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name );
00909 
00910     if (!cname.isEmpty()) {
00911         codec = QTextCodec::codecForName(cname);
00912         if (codec) {
00913             d->codecForNameDict.insert( n, codec );
00914             return codec;
00915         }
00916     }
00917 
00918     // we could not assign a codec, therefore return NULL
00919     return 0;
00920 }

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal