KDECore
kcharsets.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 1999 Lars Knoll (knoll@kde.org) 00003 Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> 00004 Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net> 00005 00006 This library is free software; you can redistribute it and/or 00007 modify it under the terms of the GNU Library General Public 00008 License as published by the Free Software Foundation; either 00009 version 2 of the License, or (at your option) any later version. 00010 00011 This library is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 Library General Public License for more details. 00015 00016 You should have received a copy of the GNU Library General Public License 00017 along with this library; see the file COPYING.LIB. If not, write to 00018 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 Boston, MA 02110-1301, USA. 
*/
#include "kcharsets.h"

#include "kfilterdev.h"
#include "kentities.c"

#include "kconfig.h"
#include "kdebug.h"
#include "kglobal.h"
#include "klocale.h"

#include <QtCore/QDir>
#include <QtCore/QRegExp>
#include <QtCore/QCharRef>
#include <QtCore/QMutableStringListIterator>
#include <QtCore/QTextCodec>

#include <assert.h>
#include <QHash>

/*
 * ### FIXME KDE4: the name of the encodings should mostly be uppercase
 * The names of this list are user-visible
 * Generate with generate_string_table.pl, input data:
ISO 8859-1
i18n:Western European
ISO 8859-15
i18n:Western European
ISO 8859-14
i18n:Western European
cp 1252
i18n:Western European
IBM850
i18n:Western European
ISO 8859-2
i18n:Central European
ISO 8859-3
i18n:Central European
ISO 8859-4
i18n:Baltic
ISO 8859-13
i18n:Baltic
ISO 8859-16
i18n:South-Eastern Europe
cp 1250
i18n:Central European
cp 1254
i18n:Turkish
cp 1257
i18n:Baltic
KOI8-R
i18n:Cyrillic
ISO 8859-5
i18n:Cyrillic
cp 1251
i18n:Cyrillic
KOI8-U
i18n:Cyrillic
IBM866
i18n:Cyrillic
Big5
i18n:Chinese Traditional
Big5-HKSCS
i18n:Chinese Traditional
GB18030
i18n:Chinese Simplified
GBK
i18n:Chinese Simplified
GB2312
i18n:Chinese Simplified
EUC-KR
i18n:Korean
sjis
i18n:Japanese
jis7
i18n:Japanese
EUC-JP
i18n:Japanese
ISO 8859-7
i18n:Greek
cp 1253
i18n:Greek
ISO 8859-6
i18n:Arabic
cp 1256
i18n:Arabic
ISO 8859-8
i18n:Hebrew
ISO 8859-8-I
i18n:Hebrew
cp 1255
i18n:Hebrew
ISO 8859-9
i18n:Turkish
TIS620
i18n:Thai
ISO 8859-11
i18n:Thai
UTF-8
i18n:Unicode
UTF-16
i18n:Unicode
utf7
i18n:Unicode
ucs2
i18n:Unicode
ISO 10646-UCS-2
i18n:Unicode
winsami2
i18n:Northern Saami
windows-1258
i18n:Other
IBM874
i18n:Other
TSCII
i18n:Other
 */
/*
 * Notes about the table:
 *
 * - The following entries were disabled and removed from the table:
ibm852
i18n:Central European
pt 154
i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt
 *
 * - ISO 8859-11 is the deprecated name of TIS-620
 * - utf7 is not in Qt
 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
 * - windows-1258: TODO
 * - IBM874: TODO
 * - TSCII: TODO
 */
/*
 * Layout of the generated table:
 * language_for_encoding_string is one packed buffer of NUL-terminated strings.
 * It holds the user-visible encoding names and, interleaved with them, the
 * untranslated script descriptions. Each description is stored only once,
 * right after the first encoding that uses it, and later entries refer back
 * to it by offset.
 * The descriptions are wrapped in I18N_NOOP2 with the context
 * "@item Text character set"; the lookup code in this file passes them through
 * i18nc() with the same context when they are displayed.
 * The table is terminated by an empty string.
 * Do not edit a string without regenerating the offsets below.
 */
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    I18N_NOOP2("@item Text character set", "Western European")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    I18N_NOOP2("@item Text character set", "Central European")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    I18N_NOOP2("@item Text character set", "Baltic")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
    "cp 1250\0"
    "cp 1254\0"
    I18N_NOOP2("@item Text character set", "Turkish")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    I18N_NOOP2("@item Text character set", "Korean")"\0"
    "sjis\0"
    I18N_NOOP2("@item Text character set", "Japanese")"\0"
    "jis7\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    I18N_NOOP2("@item Text character set", "Greek")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    I18N_NOOP2("@item Text character set", "Arabic")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    I18N_NOOP2("@item Text character set", "Hebrew")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    I18N_NOOP2("@item Text character set", "Thai")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    I18N_NOOP2("@item Text character set", "Unicode")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "winsami2\0"
    I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
    "windows-1258\0"
    I18N_NOOP2("@item Text character set", "Other")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";

/*
 * Pairs of byte offsets into language_for_encoding_string:
 * (offset of encoding name, offset of its description). A single -1 ends the list.
 * Example: the pair (28, 11) maps "ISO 8859-15" to "Western European".
 */
static const int language_for_encoding_indices[] = {
       0,   11,   28,   11,   40,   11,   52,   11,
      60,   11,   67,   78,   95,   78,  106,  117,
     124,  117,  136,  148,  169,   78,  177,  185,
     193,  117,  201,  208,  217,  208,  228,  208,
     236,  208,  243,  208,  250,  255,  275,  255,
     286,  294,  313,  294,  317,  294,  324,  331,
     338,  343,  352,  343,  357,  343,  364,  375,
     381,  375,  389,  400,  407,  400,  415,  426,
     433,  426,  446,  426,  454,  185,  465,  472,
     477,  472,  489,  495,  503,  495,  510,  495,
     515,  495,  520,  495,  536,  545,  560,  573,
     579,  573,  586,  573,   -1
};

/*
 * defines some different names for codecs that are built into Qt.
 * The names in this list must be lower-case.
 * input data for generate_string_table.pl:
iso-ir-111
koi8-r
koi unified
koi8-r
us-ascii
iso 8859-1
usascii
iso 8859-1
ascii
iso 8859-1
unicode-1-1-utf-7
utf-7
ucs2
iso-10646-ucs-2
iso10646-1
iso-10646-ucs-2
gb18030.2000-1
gb18030
gb18030.2000-0
gb18030
gbk-0
gbk
gb2312
gbk
gb2312.1980-0
gbk
big5-0
big5
euc-kr
euckr
euc-jp
eucjp
jisx0201.1976-0
eucjp
jisx0208.1983-0
eucjp
jisx0208.1990-0
eucjp
jisx0208.1997-0
eucjp
jisx0212.1990-0
eucjp
jisx0213.2000-1
eucjp
jisx0213.2000-2
eucjp
shift_jis
sjis
shift-jis
sjis
sjis
sjis
iso-2022-jp
jis7
windows850
ibm850
windows866
ibm866
windows-850
ibm850
windows-866
ibm866
cp-10000
apple roman
thai-tis620
iso 8859-11
windows-874
ibm874
windows874
ibm874
cp-874
ibm874
ksc5601.1987-0
euckr
ks_c_5601-1987
euckr
mac-roman
apple roman
macintosh
apple roman
mac
apple roman
csiso2022jp
iso-2022-jp
 */
/*
 * Notes about the table:
 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
 * - utf7 is not in Qt
 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
 * - sjis: appears on the table for x-sjis
 * - jis7: ISO-2022-JP is now the default name in Qt4
 * - cp-874: is it really needed?
 * - mac-roman: appears on the table for x-mac-roman
 * - csiso2022jp: See bug #77243
 */
/*
 * Layout of the generated table:
 * builtin_string is one packed buffer of NUL-terminated, lower-case names.
 * It holds both the alternative spellings and the codec names they map to.
 * Each distinct string is stored once and shared by offset.
 * The table is terminated by an empty string.
 * Do not edit a string without regenerating the offsets below.
 */
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";

/*
 * Pairs of byte offsets into builtin_string:
 * (offset of alternative name, offset of the codec name to try instead).
 * A single -1 ends the list.
 * Example: the pair (18, 11) maps "koi unified" to "koi8-r".
 */
static const int builtin_indices[] = {
       0,   11,   18,   11,   30,   39,   50,   39,
      58,   39,   64,   82,   88,   93,  109,   93,
     120,  135,  143,  135,  158,  164,  168,  164,
     175,  164,  189,  196,  201,  208,  214,  221,
     227,  221,  243,  221,  259,  221,  275,  221,
     291,  221,  307,  221,  323,  221,  339,  349,
     354,  349,  349,  349,  364,  376,  381,  392,
     399,  410,  417,  392,  429,  410,  441,  450,
     462,  474,  486,  498,  505,  498,  516,  498,
     523,  208,  538,  208,  553,  450,  563,  450,
     573,  450,  577,  364,   -1
};

#if 0
// some different names for the encodings defined in the charmaps files.
// even though the charmap file names are all uppercase, the names are all lowercase here.
/* input data for generate_string_table.pl:
cp852
ibm852
cp-852
ibm852
x-cp-852
ibm852
windows852
ibm852
windows-852
ibm852
x-windows-852
ibm852
*/
// Packed buffer of NUL-terminated names, terminated by an empty string.
// "ibm852" (offset 6) is the only mapping target; every other string is an alias for it.
// This table sits inside the preprocessor-disabled region that is closed by the #endif below.
static const char aliases_string[] =
    "cp852\0"
    "ibm852\0"
    "cp-852\0"
    "x-cp-852\0"
    "windows852\0"
    "windows-852\0"
    "x-windows-852\0"
    "\0";

// Pairs of byte offsets into aliases_string: (alias, canonical name); a single -1 ends the list.
static const int aliases_indices[] = {
       0,    6,   13,    6,   20,    6,   29,    6,
      40,    6,   52,    6,   -1
};
#endif

/*
 * some last resort hints in case the charmap file couldn't be found.
 * This gives at least a partial conversion and helps making things readable.
 *
 * the name used as input here is already converted to the more canonical
 * name as defined in the aliases array.
 *
 * Input data:
cp1250
iso-8859-2
koi8-r
iso-8859-5
koi8-u
koi8-r
pt 154
windows-1251
paratype-154
windows-1251
pt-154
windows-1251
*/
/* Notes:
 * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
00464 */ 00465 static const char conversion_hints_string[] = 00466 "cp1250\0" 00467 "iso-8859-2\0" 00468 "koi8-r\0" 00469 "iso-8859-5\0" 00470 "koi8-u\0" 00471 "pt 154\0" 00472 "windows-1251\0" 00473 "paratype-154\0" 00474 "pt-154\0" 00475 "\0"; 00476 00477 static const int conversion_hints_indices[] = { 00478 0, 7, 18, 25, 36, 18, 43, 50, 00479 63, 50, 76, 50, -1 00480 }; 00481 00482 // search an array of items index/data, find first matching index 00483 // and return data, or return 0 00484 static inline 00485 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) 00486 { 00487 for (int i = 0; indices[i] != -1; i += 2) 00488 if (qstrcmp(start + indices[i], entry) == 0) 00489 return start + indices[i + 1]; 00490 return 0; 00491 } 00492 00493 00494 class KCharsetsPrivate 00495 { 00496 public: 00497 KCharsetsPrivate(KCharsets* _kc) 00498 { 00499 kc = _kc; 00500 codecForNameDict.reserve( 43 ); 00501 } 00502 // Hash for the encoding names (sensitive case) 00503 QHash<QByteArray,QTextCodec*> codecForNameDict; 00504 KCharsets* kc; 00505 00506 //Cache list so QStrings can be implicitly shared 00507 QList<QStringList> encodingsByScript; 00508 }; 00509 00510 // -------------------------------------------------------------------------- 00511 00512 KCharsets::KCharsets() 00513 :d(new KCharsetsPrivate(this)) 00514 { 00515 } 00516 00517 KCharsets::~KCharsets() 00518 { 00519 delete d; 00520 } 00521 00522 QChar KCharsets::fromEntity(const QString &str) 00523 { 00524 QChar res = QChar::Null; 00525 00526 if ( str.isEmpty() ) 00527 return QChar::Null; 00528 00529 int pos = 0; 00530 if(str[pos] == QLatin1Char('&')) pos++; 00531 00532 // Check for '�' or '�' sequence 00533 if (str[pos] == QLatin1Char('#') && str.length()-pos > 1) { 00534 bool ok; 00535 pos++; 00536 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { 00537 pos++; 00538 // '�', hexadecimal character reference 00539 const QString tmp( str.mid( pos ) ); 00540 res = 
tmp.toInt(&ok, 16); 00541 } else { 00542 // '�', decimal character reference 00543 const QString tmp( str.mid( pos ) ); 00544 res = tmp.toInt(&ok, 10); 00545 } 00546 if ( ok ) 00547 return res; 00548 else 00549 return QChar::Null; 00550 } 00551 00552 const QByteArray raw ( str.toLatin1() ); 00553 const entity *e = kde_findEntity( raw, raw.length() ); 00554 00555 if(!e) 00556 { 00557 //kDebug( 0 ) << "unknown entity " << str <<", len = " << str.length(); 00558 return QChar::Null; 00559 } 00560 //kDebug() << "got entity " << str << " = " << e->code; 00561 00562 return QChar(e->code); 00563 } 00564 00565 QChar KCharsets::fromEntity(const QString &str, int &len) 00566 { 00567 // entities are never longer than 8 chars... we start from 00568 // that length and work backwards... 00569 len = 8; 00570 while(len > 0) 00571 { 00572 QString tmp = str.left(len); 00573 QChar res = fromEntity(tmp); 00574 if( res != QChar::Null ) return res; 00575 len--; 00576 } 00577 return QChar::Null; 00578 } 00579 00580 00581 QString KCharsets::toEntity(const QChar &ch) 00582 { 00583 QString ent; 00584 ent.sprintf("�x%x;", ch.unicode()); 00585 return ent; 00586 } 00587 00588 QString KCharsets::resolveEntities( const QString &input ) 00589 { 00590 QString text = input; 00591 const QChar *p = text.unicode(); 00592 const QChar *end = p + text.length(); 00593 const QChar *ampersand = 0; 00594 bool scanForSemicolon = false; 00595 00596 for ( ; p < end; ++p ) { 00597 const QChar ch = *p; 00598 00599 if ( ch == QLatin1Char('&') ) { 00600 ampersand = p; 00601 scanForSemicolon = true; 00602 continue; 00603 } 00604 00605 if ( ch != QLatin1Char(';') || scanForSemicolon == false ) 00606 continue; 00607 00608 assert( ampersand ); 00609 00610 scanForSemicolon = false; 00611 00612 const QChar *entityBegin = ampersand + 1; 00613 00614 const uint entityLength = p - entityBegin; 00615 if ( entityLength == 0 ) 00616 continue; 00617 00618 const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, 
entityLength ) ); 00619 if ( entityValue.isNull() ) 00620 continue; 00621 00622 const uint ampersandPos = ampersand - text.unicode(); 00623 00624 text[ (int)ampersandPos ] = entityValue; 00625 text.remove( ampersandPos + 1, entityLength + 1 ); 00626 p = text.unicode() + ampersandPos; 00627 end = text.unicode() + text.length(); 00628 ampersand = 0; 00629 } 00630 00631 return text; 00632 } 00633 00634 QStringList KCharsets::availableEncodingNames() const 00635 { 00636 QStringList available; 00637 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) 00638 available.append( QString::fromUtf8( language_for_encoding_string + *p ) ); 00639 available.sort(); 00640 return available; 00641 } 00642 00643 #ifndef KDE_NO_DEPRECATED 00644 QString KCharsets::languageForEncoding( const QString &encoding ) const 00645 { 00646 const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string, 00647 language_for_encoding_indices, 00648 encoding.toUtf8().constData() ); 00649 if ( lang ) 00650 return i18nc( "@item Text character set", lang ); 00651 else 00652 return i18nc( "@item Text character set", "Other" ); 00653 } 00654 #endif 00655 00656 QString KCharsets::descriptionForEncoding( const QString& encoding ) const 00657 { 00658 const char* lang = kcharsets_array_search( language_for_encoding_string, 00659 language_for_encoding_indices, 00660 encoding.toUtf8() ); 00661 if ( lang ) 00662 return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )", 00663 i18nc( "@item Text character set", lang ), encoding ); 00664 else 00665 return i18nc( "@item", "Other encoding (%1)", encoding ); 00666 } 00667 00668 QString KCharsets::encodingForName( const QString &descriptiveName ) const 00669 { 00670 const int left = descriptiveName.lastIndexOf( QLatin1Char('(') ); 00671 00672 if (left<0) // No parenthesis, so assume it is a normal encoding name 00673 return descriptiveName.trimmed(); 00674 00675 QString name(descriptiveName.mid(left+1)); 00676 00677 const 
int right = name.lastIndexOf( QLatin1Char(')') ); 00678 00679 if (right<0) 00680 return name; 00681 00682 return name.left(right).trimmed(); 00683 } 00684 00685 QStringList KCharsets::descriptiveEncodingNames() const 00686 { 00687 QStringList encodings; 00688 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) { 00689 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] ); 00690 const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] ); 00691 encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )", 00692 description, name ) ); 00693 } 00694 encodings.sort(); 00695 return encodings; 00696 } 00697 00698 QList<QStringList> KCharsets::encodingsByScript() const 00699 { 00700 if (!d->encodingsByScript.isEmpty()) 00701 return d->encodingsByScript; 00702 int i; 00703 for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) { 00704 const QString name = QString::fromUtf8( language_for_encoding_string + p[0] ); 00705 const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] ); 00706 00707 for (i=0; i<d->encodingsByScript.size(); ++i) { 00708 if (d->encodingsByScript.at(i).at(0) == description) { 00709 d->encodingsByScript[i].append(name); 00710 break; 00711 } 00712 } 00713 00714 if (i==d->encodingsByScript.size()) { 00715 d->encodingsByScript.append(QStringList() << description << name); 00716 } 00717 00718 } 00719 return d->encodingsByScript; 00720 } 00721 00722 QTextCodec* KCharsets::codecForName(const QString &n) const 00723 { 00724 if ( n == QLatin1String("gb2312") || n == QLatin1String("gbk") ) 00725 return QTextCodec::codecForName( "gb18030" ); 00726 const QByteArray name( n.toLatin1() ); 00727 QTextCodec* codec = codecForNameOrNull( name ); 00728 if ( codec ) 00729 return codec; 00730 else 00731 return QTextCodec::codecForName( "iso-8859-1" ); 00732 } 00733 00734 QTextCodec* 
KCharsets::codecForName(const QString &n, bool &ok) const 00735 { 00736 if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) { 00737 ok = true; 00738 return QTextCodec::codecForName( "gb18030" ); 00739 } 00740 const QByteArray name( n.toLatin1() ); 00741 QTextCodec* codec = codecForNameOrNull( name ); 00742 if ( codec ) 00743 { 00744 ok = true; 00745 return codec; 00746 } 00747 else 00748 { 00749 ok = false; 00750 return QTextCodec::codecForName( "iso-8859-1" ); 00751 } 00752 } 00753 00754 QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const 00755 { 00756 QTextCodec* codec = 0; 00757 00758 if (n.isEmpty()) { 00759 // No name, assume locale (KDE's, not Qt's) 00760 const QByteArray locale = "->locale<-"; 00761 if ( d->codecForNameDict.contains( locale ) ) 00762 return d->codecForNameDict.value( locale ); 00763 codec = KGlobal::locale()->codecForEncoding(); 00764 d->codecForNameDict.insert("->locale<-", codec); 00765 return codec; 00766 } 00767 // For a non-empty name, lookup the "dictionnary", in a case-sensitive way. 00768 else if ( d->codecForNameDict.contains( n ) ) { 00769 return d->codecForNameDict.value( n ); 00770 } 00771 00772 // If the name is not in the hash table, call directly QTextCoded::codecForName. 00773 // We assume that QTextCodec is smarter and more maintained than this code. 00774 codec = QTextCodec::codecForName( n ); 00775 if ( codec ) { 00776 d->codecForNameDict.insert( n, codec ); 00777 return codec; 00778 } 00779 00780 // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it. 
00781 00782 QByteArray name = n.toLower(); 00783 bool changed = false; 00784 if (name.endsWith("_charset")) { // krazy:exclude=strings 00785 name.chop( 8 ); 00786 changed = true; 00787 } 00788 if ( name.startsWith( "x-" ) ) { // krazy:exclude=strings 00789 name.remove( 0, 2 ); // remove x- at start 00790 changed = true; 00791 } 00792 00793 if (name.isEmpty()) { 00794 // We have no name anymore, therefore the name is invalid. 00795 return 0; 00796 } 00797 00798 // We only need to check changed names. 00799 if ( changed ) { 00800 codec = QTextCodec::codecForName(name); 00801 if (codec) { 00802 d->codecForNameDict.insert( n, codec ); 00803 return codec; 00804 } 00805 changed = false; 00806 } 00807 00808 // these codecs are built into Qt, but the name given for the codec is different, 00809 // so QTextCodec did not recognize it. 00810 QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name); 00811 00812 if(!cname.isEmpty()) 00813 codec = QTextCodec::codecForName(cname); 00814 00815 if (codec) 00816 { 00817 d->codecForNameDict.insert( n, codec ); 00818 return codec; 00819 } 00820 00821 #ifdef __GNUC__ 00822 #warning is it still useful with Qt4 ? 00823 #endif 00824 //don't forget to remove the #if 0 on a few structs at the top also if you reenable that ;) (search for 852 ) 00825 //from what I understood, one needs to create a QTextCodecPlugin in order to be able to support a new Codec, but I do not 00826 //know how to convert a charmap to a QTextCodec and the real big question is whether we need that at all ... (mikmak) 00827 // Yes, it is useful (for examples EBCDIC in Kate or codepages for KOffice filters from/to MS formats) (goutte) 00828 #if 0 00829 QString dir; 00830 { 00831 KConfigGroup cg( KGlobal::config(), "i18n" ); 00832 dir = cg.readPathEntry("i18ndir", QLatin1String("/usr/share/i18n/charmaps")); 00833 } 00834 00835 // these are codecs not included in Qt. 
They can be build up if the corresponding charmap 00836 // is available in the charmap directory. 00837 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data()); 00838 00839 if(cname.isEmpty()) 00840 cname = name; 00841 cname = cname.toUpper(); 00842 00843 const QString basicName = QLatin1String(cname); 00844 kDebug() << endl << " Trying to find " << cname << " in " << dir; 00845 00846 QString charMapFileName; 00847 bool gzipped = false; 00848 QDir qdir(dir); 00849 if (!qdir.exists()) { 00850 // The directory for the charmaps does not even exist... (That is common!) 00851 } 00852 else if (qdir.exists(basicName, false)) { 00853 charMapFileName = basicName; 00854 } 00855 else if (qdir.exists(basicName+".gz", false)) { 00856 charMapFileName = basicName + ".gz"; 00857 gzipped = true; 00858 } 00859 else { 00860 // Check if we are asking a code page 00861 // If yes, then check "CP99999" and "IBM99999" 00862 // First we need to find the number of the codepage 00863 QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+"); 00864 if ( regexp.search(basicName) != -1) { 00865 const QString num = regexp.cap(4); 00866 if (num.isEmpty()) { 00867 // No number, not a code page (or something went wrong) 00868 } 00869 else if (qdir.exists("IBM"+num)) { 00870 charMapFileName = "IBM"+num; 00871 } 00872 else if (qdir.exists("IBM"+num+".gz")) { 00873 charMapFileName = "IBM"+num+".gz"; 00874 gzipped = true; 00875 } 00876 else if (qdir.exists("CP"+num)) { 00877 charMapFileName = "CP"+num; 00878 } 00879 else if (qdir.exists("CP"+num+".gz")) { 00880 charMapFileName = "CP"+num+".gz"; 00881 gzipped = true; 00882 } 00883 } 00884 } 00885 00886 if (gzipped && !charMapFileName.isEmpty()) { 00887 KFilterDev gzip(dir + '/' + charMapFileName); 00888 if (gzip.open(QIODevice::ReadOnly)) { 00889 kDebug() << "Loading gzipped charset..."; 00890 codec = QTextCodec::loadCharmap(&gzip); 00891 gzip.close(); 00892 } 00893 else 00894 kWarning() << "Could not open gzipped charset!"; 00895 } 00896 else 
if (!charMapFileName.isEmpty()) { 00897 codec = QTextCodec::loadCharmapFile(dir + '/' + charMapFileName); 00898 } 00899 00900 if(codec) { 00901 d->codecForNameDict.insert( n, codec ); 00902 return codec; 00903 } 00904 #endif 00905 00906 // this also failed, the last resort is now to take some compatibility charmap 00907 // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write. 00908 cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name ); 00909 00910 if (!cname.isEmpty()) { 00911 codec = QTextCodec::codecForName(cname); 00912 if (codec) { 00913 d->codecForNameDict.insert( n, codec ); 00914 return codec; 00915 } 00916 } 00917 00918 // we could not assign a codec, therefore return NULL 00919 return 0; 00920 }
KDE 4.6 API Reference