KDECore
kcharsets.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 1999 Lars Knoll (knoll@kde.org) 00003 Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> 00004 Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net> 00005 00006 This library is free software; you can redistribute it and/or 00007 modify it under the terms of the GNU Library General Public 00008 License as published by the Free Software Foundation; either 00009 version 2 of the License, or (at your option) any later version. 00010 00011 This library is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 Library General Public License for more details. 00015 00016 You should have received a copy of the GNU Library General Public License 00017 along with this library; see the file COPYING.LIB. If not, write to 00018 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 Boston, MA 02110-1301, USA. 00020 */ 00021 #include "kcharsets.h" 00022 00023 #include "kfilterdev.h" 00024 #include "kentities.c" 00025 00026 #include "kconfig.h" 00027 #include "kdebug.h" 00028 #include "kglobal.h" 00029 #include "klocale.h" 00030 00031 #include <QtCore/QDir> 00032 #include <QtCore/QRegExp> 00033 #include <QtCore/QCharRef> 00034 #include <QtCore/QMutableStringListIterator> 00035 #include <QtCore/QTextCodec> 00036 00037 #include <assert.h> 00038 #include <QHash> 00039 00040 /* 00041 * ### FIXME KDE4: the name of the encodings should mostly be uppercase 00042 * The names of this list are user-visible 00043 * Generate with generate_string_table.pl, input data: 00044 ISO 8859-1 00045 i18n:Western European 00046 ISO 8859-15 00047 i18n:Western European 00048 ISO 8859-14 00049 i18n:Western European 00050 cp 1252 00051 i18n:Western European 00052 IBM850 00053 i18n:Western European 00054 ISO 8859-2 00055 i18n:Central European 00056 ISO 8859-3 00057 i18n:Central European 00058 ISO 8859-4 00059 i18n:Baltic 00060 ISO 8859-13 00061 i18n:Baltic 00062 ISO 8859-16 00063 i18n:South-Eastern Europe 00064 cp 1250 00065 i18n:Central European 00066 cp 1254 00067 i18n:Turkish 00068 cp 1257 00069 i18n:Baltic 00070 KOI8-R 00071 i18n:Cyrillic 00072 ISO 8859-5 00073 i18n:Cyrillic 00074 cp 1251 00075 i18n:Cyrillic 00076 KOI8-U 00077 i18n:Cyrillic 00078 IBM866 00079 i18n:Cyrillic 00080 Big5 00081 i18n:Chinese Traditional 00082 Big5-HKSCS 00083 i18n:Chinese Traditional 00084 GB18030 00085 i18n:Chinese Simplified 00086 GBK 00087 i18n:Chinese Simplified 00088 GB2312 00089 i18n:Chinese Simplified 00090 EUC-KR 00091 i18n:Korean 00092 sjis 00093 i18n:Japanese 00094 jis7 00095 i18n:Japanese 00096 EUC-JP 00097 i18n:Japanese 00098 ISO 8859-7 00099 i18n:Greek 00100 cp 1253 00101 i18n:Greek 00102 ISO 8859-6 00103 i18n:Arabic 00104 cp 1256 00105 i18n:Arabic 00106 ISO 8859-8 00107 i18n:Hebrew 00108 ISO 8859-8-I 00109 i18n:Hebrew 00110 cp 1255 00111 i18n:Hebrew 00112 ISO 8859-9 00113 i18n:Turkish 00114 TIS620 00115 i18n:Thai 00116 ISO 8859-11 00117 i18n:Thai 00118 UTF-8 00119 i18n:Unicode 00120 UTF-16 00121 i18n:Unicode 00122 utf7 00123 i18n:Unicode 00124 ucs2 00125 i18n:Unicode 00126 ISO 10646-UCS-2 00127 i18n:Unicode 00128 winsami2 00129 i18n:Northern Saami 00130 windows-1258 00131 i18n:Other 00132 IBM874 00133 i18n:Other 00134 TSCII 00135 i18n:Other 00136 */ 00137 /* 00138 * Notes about the table: 00139 * 00140 * - The following entries were disabled and removed from the table: 00141 ibm852 00142 i18n:Central European 00143 pt 154 00144 i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt 00145 * 00146 * - ISO 8559-11 is the deprecated name of TIS-620 00147 * - utf7 is not in Qt 00148 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 00149 * - windows-1258: TODO 00150 * - IBM874: TODO 00151 * - TSCII: TODO 00152 */ 00153 static const char language_for_encoding_string[] = 00154 "ISO 8859-1\0" 00155 I18N_NOOP2("@item Text character set", "Western European")"\0" 00156 "ISO 8859-15\0" 00157 "ISO 8859-14\0" 00158 "cp 1252\0" 00159 "IBM850\0" 00160 "ISO 8859-2\0" 00161 I18N_NOOP2("@item Text character set", "Central European")"\0" 00162 "ISO 8859-3\0" 00163 "ISO 8859-4\0" 00164 I18N_NOOP2("@item Text character set", "Baltic")"\0" 00165 "ISO 8859-13\0" 00166 "ISO 8859-16\0" 00167 I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0" 00168 "cp 1250\0" 00169 "cp 1254\0" 00170 I18N_NOOP2("@item Text character set", "Turkish")"\0" 00171 "cp 1257\0" 00172 "KOI8-R\0" 00173 I18N_NOOP2("@item Text character set", "Cyrillic")"\0" 00174 "ISO 8859-5\0" 00175 "cp 1251\0" 00176 "KOI8-U\0" 00177 "IBM866\0" 00178 "Big5\0" 00179 I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0" 00180 "Big5-HKSCS\0" 00181 "GB18030\0" 00182 I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0" 00183 "GBK\0" 00184 "GB2312\0" 00185 "EUC-KR\0" 00186 I18N_NOOP2("@item Text character set", "Korean")"\0" 00187 "sjis\0" 00188 I18N_NOOP2("@item Text character set", "Japanese")"\0" 00189 "jis7\0" 00190 "EUC-JP\0" 00191 "ISO 8859-7\0" 00192 I18N_NOOP2("@item Text character set", "Greek")"\0" 00193 "cp 1253\0" 00194 "ISO 8859-6\0" 00195 I18N_NOOP2("@item Text character set", "Arabic")"\0" 00196 "cp 1256\0" 00197 "ISO 8859-8\0" 00198 I18N_NOOP2("@item Text character set", "Hebrew")"\0" 00199 "ISO 8859-8-I\0" 00200 "cp 1255\0" 00201 "ISO 8859-9\0" 00202 "TIS620\0" 00203 I18N_NOOP2("@item Text character set", "Thai")"\0" 00204 "ISO 8859-11\0" 00205 "UTF-8\0" 00206 I18N_NOOP2("@item Text character set", "Unicode")"\0" 00207 "UTF-16\0" 00208 "utf7\0" 00209 "ucs2\0" 00210 "ISO 10646-UCS-2\0" 00211 "winsami2\0" 00212 I18N_NOOP2("@item Text character set", "Northern Saami")"\0" 00213 "windows-1258\0" 00214 I18N_NOOP2("@item Text character set", "Other")"\0" 00215 "IBM874\0" 00216 "TSCII\0" 00217 "\0"; 00218 00219 static const int language_for_encoding_indices[] = { 00220 0, 11, 28, 11, 40, 11, 52, 11, 00221 60, 11, 67, 78, 95, 78, 106, 117, 00222 124, 117, 136, 148, 169, 78, 177, 185, 00223 193, 117, 201, 208, 217, 208, 228, 208, 00224 236, 208, 243, 208, 250, 255, 275, 255, 00225 286, 294, 313, 294, 317, 294, 324, 331, 00226 338, 343, 352, 343, 357, 343, 364, 375, 00227 381, 375, 389, 400, 407, 400, 415, 426, 00228 433, 426, 446, 426, 454, 185, 465, 472, 00229 477, 472, 489, 495, 503, 495, 510, 495, 00230 515, 495, 520, 495, 536, 545, 560, 573, 00231 579, 573, 586, 573, -1 00232 }; 00233 00234 /* 00235 * defines some different names for codecs that are built into Qt. 00236 * The names in this list must be lower-case. 00237 * input data for generate_string_table.pl: 00238 iso-ir-111 00239 koi8-r 00240 koi unified 00241 koi8-r 00242 us-ascii 00243 iso 8859-1 00244 usascii 00245 iso 8859-1 00246 ascii 00247 iso 8859-1 00248 unicode-1-1-utf-7 00249 utf-7 00250 ucs2 00251 iso-10646-ucs-2 00252 iso10646-1 00253 iso-10646-ucs-2 00254 gb18030.2000-1 00255 gb18030 00256 gb18030.2000-0 00257 gb18030 00258 gbk-0 00259 gbk 00260 gb2312 00261 gbk 00262 gb2312.1980-0 00263 gbk 00264 big5-0 00265 big5 00266 euc-kr 00267 euckr 00268 euc-jp 00269 eucjp 00270 jisx0201.1976-0 00271 eucjp 00272 jisx0208.1983-0 00273 eucjp 00274 jisx0208.1990-0 00275 eucjp 00276 jisx0208.1997-0 00277 eucjp 00278 jisx0212.1990-0 00279 eucjp 00280 jisx0213.2000-1 00281 eucjp 00282 jisx0213.2000-2 00283 eucjp 00284 shift_jis 00285 sjis 00286 shift-jis 00287 sjis 00288 sjis 00289 sjis 00290 iso-2022-jp 00291 jis7 00292 windows850 00293 ibm850 00294 windows866 00295 ibm866 00296 windows-850 00297 ibm850 00298 windows-866 00299 ibm866 00300 cp-10000 00301 apple roman 00302 thai-tis620 00303 iso 8859-11 00304 windows-874 00305 ibm874 00306 windows874 00307 ibm874 00308 cp-874 00309 ibm874 00310 ksc5601.1987-0 00311 euckr 00312 ks_c_5601-1987 00313 euckr 00314 mac-roman 00315 apple roman 00316 macintosh 00317 apple roman 00318 mac 00319 apple roman 00320 csiso2022jp 00321 iso-2022-jp 00322 */ 00323 /* 00324 * Notes about the table: 00325 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set) 00326 * - utf7 is not in Qt 00327 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 00328 * - sjis: appears on the table for x-sjis 00329 * - jis7: ISO-2022-JP is now the default name in Qt4 00330 * - cp-874: is it really needed? 00331 * - mac-roman: appears on the table for x-mac-roman 00332 * - csiso2022jp: See bug #77243 00333 */ 00334 static const char builtin_string[] = 00335 "iso-ir-111\0" 00336 "koi8-r\0" 00337 "koi unified\0" 00338 "us-ascii\0" 00339 "iso 8859-1\0" 00340 "usascii\0" 00341 "ascii\0" 00342 "unicode-1-1-utf-7\0" 00343 "utf-7\0" 00344 "ucs2\0" 00345 "iso-10646-ucs-2\0" 00346 "iso10646-1\0" 00347 "gb18030.2000-1\0" 00348 "gb18030\0" 00349 "gb18030.2000-0\0" 00350 "gbk-0\0" 00351 "gbk\0" 00352 "gb2312\0" 00353 "gb2312.1980-0\0" 00354 "big5-0\0" 00355 "big5\0" 00356 "euc-kr\0" 00357 "euckr\0" 00358 "euc-jp\0" 00359 "eucjp\0" 00360 "jisx0201.1976-0\0" 00361 "jisx0208.1983-0\0" 00362 "jisx0208.1990-0\0" 00363 "jisx0208.1997-0\0" 00364 "jisx0212.1990-0\0" 00365 "jisx0213.2000-1\0" 00366 "jisx0213.2000-2\0" 00367 "shift_jis\0" 00368 "sjis\0" 00369 "shift-jis\0" 00370 "iso-2022-jp\0" 00371 "jis7\0" 00372 "windows850\0" 00373 "ibm850\0" 00374 "windows866\0" 00375 "ibm866\0" 00376 "windows-850\0" 00377 "windows-866\0" 00378 "cp-10000\0" 00379 "apple roman\0" 00380 "thai-tis620\0" 00381 "iso 8859-11\0" 00382 "windows-874\0" 00383 "ibm874\0" 00384 "windows874\0" 00385 "cp-874\0" 00386 "ksc5601.1987-0\0" 00387 "ks_c_5601-1987\0" 00388 "mac-roman\0" 00389 "macintosh\0" 00390 "mac\0" 00391 "csiso2022jp\0" 00392 "\0"; 00393 00394 static const int builtin_indices[] = { 00395 0, 11, 18, 11, 30, 39, 50, 39, 00396 58, 39, 64, 82, 88, 93, 109, 93, 00397 120, 135, 143, 135, 158, 164, 168, 164, 00398 175, 164, 189, 196, 201, 208, 214, 221, 00399 227, 221, 243, 221, 259, 221, 275, 221, 00400 291, 221, 307, 221, 323, 221, 339, 349, 00401 354, 349, 349, 349, 364, 376, 381, 392, 00402 399, 410, 417, 392, 429, 410, 441, 450, 00403 462, 474, 486, 498, 505, 498, 516, 498, 00404 523, 208, 538, 208, 553, 450, 563, 450, 00405 573, 450, 577, 364, -1 00406 }; 00407 00408 /* 00409 * some last resort hints in case the charmap file couldn't be found. 00410 * This gives at least a partial conversion and helps making things readable. 00411 * 00412 * the name used as input here is already converted to the more canonical 00413 * name as defined in the aliases array. 00414 * 00415 * Input data: 00416 cp1250 00417 iso-8859-2 00418 koi8-r 00419 iso-8859-5 00420 koi8-u 00421 koi8-r 00422 pt 154 00423 windows-1251 00424 paratype-154 00425 windows-1251 00426 pt-154 00427 windows-1251 00428 */ 00429 /* Notes: 00430 * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback. 00431 */ 00432 static const char conversion_hints_string[] = 00433 "cp1250\0" 00434 "iso-8859-2\0" 00435 "koi8-r\0" 00436 "iso-8859-5\0" 00437 "koi8-u\0" 00438 "pt 154\0" 00439 "windows-1251\0" 00440 "paratype-154\0" 00441 "pt-154\0" 00442 "\0"; 00443 00444 static const int conversion_hints_indices[] = { 00445 0, 7, 18, 25, 36, 18, 43, 50, 00446 63, 50, 76, 50, -1 00447 }; 00448 00449 // search an array of items index/data, find first matching index 00450 // and return data, or return 0 00451 static inline 00452 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) 00453 { 00454 for (int i = 0; indices[i] != -1; i += 2) 00455 if (qstrcmp(start + indices[i], entry) == 0) 00456 return start + indices[i + 1]; 00457 return 0; 00458 } 00459 00460 00461 class KCharsetsPrivate 00462 { 00463 public: 00464 KCharsetsPrivate(KCharsets* _kc) 00465 { 00466 kc = _kc; 00467 codecForNameDict.reserve( 43 ); 00468 } 00469 // Hash for the encoding names (sensitive case) 00470 QHash<QByteArray,QTextCodec*> codecForNameDict; 00471 KCharsets* kc; 00472 00473 //Cache list so QStrings can be implicitly shared 00474 QList<QStringList> encodingsByScript; 00475 }; 00476 00477 // -------------------------------------------------------------------------- 00478 00479 KCharsets::KCharsets() 00480 :d(new KCharsetsPrivate(this)) 00481 { 00482 } 00483 00484 KCharsets::~KCharsets() 00485 { 00486 delete d; 00487 } 00488 00489 QChar KCharsets::fromEntity(const QString &str) 00490 { 00491 QChar res = QChar::Null; 00492 00493 if ( str.isEmpty() ) 00494 return QChar::Null; 00495 00496 int pos = 0; 00497 if(str[pos] == QLatin1Char('&')) pos++; 00498 00499 // Check for '�' or '�' sequence 00500 if (str[pos] == QLatin1Char('#') && str.length()-pos > 1) { 00501 bool ok; 00502 pos++; 00503 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { 00504 pos++; 00505 // '�', hexadecimal character reference 00506 const QString tmp( str.mid( pos ) ); 00507 res = tmp.toInt(&ok, 16); 00508 } else { 00509 // '�', decimal character reference 00510 const QString tmp( str.mid( pos ) ); 00511 res = tmp.toInt(&ok, 10); 00512 } 00513 if ( ok ) 00514 return res; 00515 else 00516 return QChar::Null; 00517 } 00518 00519 const QByteArray raw ( str.toLatin1() ); 00520 const entity *e = kde_findEntity( raw, raw.length() ); 00521 00522 if(!e) 00523 { 00524 //kDebug( 0 ) << "unknown entity " << str <<", len = " << str.length(); 00525 return QChar::Null; 00526 } 00527 //kDebug() << "got entity " << str << " = " << e->code; 00528 00529 return QChar(e->code); 00530 } 00531 00532 QChar KCharsets::fromEntity(const QString &str, int &len) 00533 { 00534 // entities are never longer than 8 chars... we start from 00535 // that length and work backwards... 00536 len = 8; 00537 while(len > 0) 00538 { 00539 QString tmp = str.left(len); 00540 QChar res = fromEntity(tmp); 00541 if( res != QChar::Null ) return res; 00542 len--; 00543 } 00544 return QChar::Null; 00545 } 00546 00547 00548 QString KCharsets::toEntity(const QChar &ch) 00549 { 00550 QString ent; 00551 ent.sprintf("�x%x;", ch.unicode()); 00552 return ent; 00553 } 00554 00555 QString KCharsets::resolveEntities( const QString &input ) 00556 { 00557 QString text = input; 00558 const QChar *p = text.unicode(); 00559 const QChar *end = p + text.length(); 00560 const QChar *ampersand = 0; 00561 bool scanForSemicolon = false; 00562 00563 for ( ; p < end; ++p ) { 00564 const QChar ch = *p; 00565 00566 if ( ch == QLatin1Char('&') ) { 00567 ampersand = p; 00568 scanForSemicolon = true; 00569 continue; 00570 } 00571 00572 if ( ch != QLatin1Char(';') || scanForSemicolon == false ) 00573 continue; 00574 00575 assert( ampersand ); 00576 00577 scanForSemicolon = false; 00578 00579 const QChar *entityBegin = ampersand + 1; 00580 00581 const uint entityLength = p - entityBegin; 00582 if ( entityLength == 0 ) 00583 continue; 00584 00585 const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, entityLength ) ); 00586 if ( entityValue.isNull() ) 00587 continue; 00588 00589 const uint ampersandPos = ampersand - text.unicode(); 00590 00591 text[ (int)ampersandPos ] = entityValue; 00592 text.remove( ampersandPos + 1, entityLength + 1 ); 00593 p = text.unicode() + ampersandPos; 00594 end = text.unicode() + text.length(); 00595 ampersand = 0; 00596 } 00597 00598 return text; 00599 } 00600 00601 QStringList KCharsets::availableEncodingNames() const 00602 { 00603 QStringList available; 00604 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) 00605 available.append( QString::fromUtf8( language_for_encoding_string + *p ) ); 00606 available.sort(); 00607 return available; 00608 } 00609 00610 #ifndef KDE_NO_DEPRECATED 00611 QString KCharsets::languageForEncoding( const QString &encoding ) const 00612 { 00613 const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string, 00614 language_for_encoding_indices, 00615 encoding.toUtf8().constData() ); 00616 if ( lang ) 00617 return i18nc( "@item Text character set", lang ); 00618 else 00619 return i18nc( "@item Text character set", "Other" ); 00620 } 00621 #endif 00622 00623 QString KCharsets::descriptionForEncoding( const QString& encoding ) const 00624 { 00625 const char* lang = kcharsets_array_search( language_for_encoding_string, 00626 language_for_encoding_indices, 00627 encoding.toUtf8() ); 00628 if ( lang ) 00629 return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )", 00630 i18nc( "@item Text character set", lang ), encoding ); 00631 else 00632 return i18nc( "@item", "Other encoding (%1)", encoding ); 00633 } 00634 00635 QString KCharsets::encodingForName( const QString &descriptiveName ) const 00636 { 00637 const int left = descriptiveName.lastIndexOf( QLatin1Char('(') ); 00638 00639 if (left<0) // No parenthesis, so assume it is a normal encoding name 00640 return descriptiveName.trimmed(); 00641 00642 QString name(descriptiveName.mid(left+1)); 00643 00644 const int right = name.lastIndexOf( QLatin1Char(')') ); 00645 00646 if (right<0) 00647 return name; 00648 00649 return name.left(right).trimmed(); 00650 } 00651 00652 QStringList KCharsets::descriptiveEncodingNames() const 00653 { 00654 QStringList encodings; 00655 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) { 00656 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] ); 00657 const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] ); 00658 encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )", 00659 description, name ) ); 00660 } 00661 encodings.sort(); 00662 return encodings; 00663 } 00664 00665 QList<QStringList> KCharsets::encodingsByScript() const 00666 { 00667 if (!d->encodingsByScript.isEmpty()) 00668 return d->encodingsByScript; 00669 int i; 00670 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) { 00671 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] ); 00672 const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] ); 00673 00674 for (i=0; i<d->encodingsByScript.size(); ++i) { 00675 if (d->encodingsByScript.at(i).at(0) == description) { 00676 d->encodingsByScript[i].append(name); 00677 break; 00678 } 00679 } 00680 00681 if (i==d->encodingsByScript.size()) { 00682 d->encodingsByScript.append(QStringList() << description << name); 00683 } 00684 00685 } 00686 return d->encodingsByScript; 00687 } 00688 00689 QTextCodec* KCharsets::codecForName(const QString &n) const 00690 { 00691 if ( n == QLatin1String("gb2312") || n == QLatin1String("gbk") ) 00692 return QTextCodec::codecForName( "gb18030" ); 00693 const QByteArray name( n.toLatin1() ); 00694 QTextCodec* codec = codecForNameOrNull( name ); 00695 if ( codec ) 00696 return codec; 00697 else 00698 return QTextCodec::codecForName( "iso-8859-1" ); 00699 } 00700 00701 QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const 00702 { 00703 if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) { 00704 ok = true; 00705 return QTextCodec::codecForName( "gb18030" ); 00706 } 00707 const QByteArray name( n.toLatin1() ); 00708 QTextCodec* codec = codecForNameOrNull( name ); 00709 if ( codec ) 00710 { 00711 ok = true; 00712 return codec; 00713 } 00714 else 00715 { 00716 ok = false; 00717 return QTextCodec::codecForName( "iso-8859-1" ); 00718 } 00719 } 00720 00721 QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const 00722 { 00723 QTextCodec* codec = 0; 00724 00725 if (n.isEmpty()) { 00726 // No name, assume locale (KDE's, not Qt's) 00727 const QByteArray locale = "->locale<-"; 00728 if ( d->codecForNameDict.contains( locale ) ) 00729 return d->codecForNameDict.value( locale ); 00730 codec = KGlobal::locale()->codecForEncoding(); 00731 d->codecForNameDict.insert("->locale<-", codec); 00732 return codec; 00733 } 00734 // For a non-empty name, lookup the "dictionnary", in a case-sensitive way. 00735 else if ( d->codecForNameDict.contains( n ) ) { 00736 return d->codecForNameDict.value( n ); 00737 } 00738 00739 // If the name is not in the hash table, call directly QTextCoded::codecForName. 00740 // We assume that QTextCodec is smarter and more maintained than this code. 00741 codec = QTextCodec::codecForName( n ); 00742 if ( codec ) { 00743 d->codecForNameDict.insert( n, codec ); 00744 return codec; 00745 } 00746 00747 // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it. 00748 00749 QByteArray name = n.toLower(); 00750 bool changed = false; 00751 if (name.endsWith("_charset")) { // krazy:exclude=strings 00752 name.chop( 8 ); 00753 changed = true; 00754 } 00755 if ( name.startsWith( "x-" ) ) { // krazy:exclude=strings 00756 name.remove( 0, 2 ); // remove x- at start 00757 changed = true; 00758 } 00759 00760 if (name.isEmpty()) { 00761 // We have no name anymore, therefore the name is invalid. 00762 return 0; 00763 } 00764 00765 // We only need to check changed names. 00766 if ( changed ) { 00767 codec = QTextCodec::codecForName(name); 00768 if (codec) { 00769 d->codecForNameDict.insert( n, codec ); 00770 return codec; 00771 } 00772 changed = false; 00773 } 00774 00775 // these codecs are built into Qt, but the name given for the codec is different, 00776 // so QTextCodec did not recognize it. 00777 QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name); 00778 00779 if(!cname.isEmpty()) 00780 codec = QTextCodec::codecForName(cname); 00781 00782 if (codec) 00783 { 00784 d->codecForNameDict.insert( n, codec ); 00785 return codec; 00786 } 00787 00788 // this also failed, the last resort is now to take some compatibility charmap 00789 // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write. 00790 cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name ); 00791 00792 if (!cname.isEmpty()) { 00793 codec = QTextCodec::codecForName(cname); 00794 if (codec) { 00795 d->codecForNameDict.insert( n, codec ); 00796 return codec; 00797 } 00798 } 00799 00800 // we could not assign a codec, therefore return NULL 00801 return 0; 00802 }
KDE 4.7 API Reference