KDECore
kencodingdetector.cpp
Go to the documentation of this file.
00001 /* 00002 This file is part of the KDE libraries 00003 00004 Copyright (C) 1999 Lars Knoll (knoll@kde.org) 00005 Copyright (C) 2003 Dirk Mueller (mueller@kde.org) 00006 Copyright (C) 2003 Apple Computer, Inc. 00007 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) 00008 00009 This library is free software; you can redistribute it and/or 00010 modify it under the terms of the GNU Library General Public 00011 License as published by the Free Software Foundation; either 00012 version 2 of the License, or (at your option) any later version. 00013 00014 This library is distributed in the hope that it will be useful, 00015 but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00017 Library General Public License for more details. 00018 00019 You should have received a copy of the GNU Library General Public License 00020 along with this library; see the file COPYING.LIB. If not, write to 00021 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00022 Boston, MA 02110-1301, USA. 00023 */ 00024 //---------------------------------------------------------------------------- 00025 // 00026 // decoder for input stream 00027 00028 #include "kencodingdetector.h" 00029 00030 #undef DECODE_DEBUG 00031 //#define DECODE_DEBUG 00032 00033 #define MAX_BUFFER 16*1024 00034 00035 #include <assert.h> 00036 00037 #include "guess_ja_p.h" 00038 00039 #include <QRegExp> 00040 #include <QTextCodec> 00041 00042 #include <kglobal.h> 00043 #include <kcharsets.h> 00044 #include <kdebug.h> 00045 #include <klocale.h> 00046 00047 #include <ctype.h> 00048 00049 enum MIB 00050 { 00051 MibLatin1 = 4, 00052 Mib8859_8 = 85, 00053 MibUtf8 = 106, 00054 MibUcs2 = 1000, 00055 MibUtf16 = 1015, 00056 MibUtf16BE = 1013, 00057 MibUtf16LE = 1014 00058 }; 00059 00060 static bool is16Bit(QTextCodec* codec) 00061 { 00062 switch (codec->mibEnum()) 00063 { 00064 case MibUtf16: 00065 case MibUtf16BE: 00066 case MibUtf16LE: 00067 case MibUcs2: 00068 return true; 00069 default: 00070 return false; 00071 } 00072 } 00073 00074 class KEncodingDetectorPrivate 00075 { 00076 public: 00077 QTextCodec *m_codec; 00078 QTextDecoder *m_decoder; // utf16 00079 QTextCodec *m_defaultCodec; 00080 QByteArray m_storeDecoderName; 00081 00082 KEncodingDetector::EncodingChoiceSource m_source; 00083 KEncodingDetector::AutoDetectScript m_autoDetectLanguage; 00084 00085 bool m_visualRTL : 1; 00086 bool m_seenBody : 1; 00087 bool m_writtingHappened : 1; 00088 bool m_analyzeCalled : 1; //for decode() 00089 int m_multiByte; 00090 00091 QByteArray m_bufferForDefferedEncDetection; 00092 00093 KEncodingDetectorPrivate() 00094 : m_codec(QTextCodec::codecForMib(MibLatin1)) 00095 , m_decoder(m_codec->makeDecoder()) 00096 , m_defaultCodec(m_codec) 00097 , m_source(KEncodingDetector::DefaultEncoding) 00098 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection) 00099 , m_visualRTL(false) 00100 , m_seenBody(false) 00101 , m_writtingHappened(false) 00102 , m_analyzeCalled(false) 00103 , m_multiByte(0) 00104 { 00105 } 00106 00107 KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script) 00108 : m_codec(codec) 00109 , m_decoder(m_codec->makeDecoder()) 00110 , m_defaultCodec(m_codec) 00111 , m_source(source) 00112 , m_autoDetectLanguage(script) 00113 , m_visualRTL(false) 00114 , m_seenBody(false) 00115 , m_writtingHappened(false) 00116 , m_analyzeCalled(false) 00117 , m_multiByte(0) 00118 { 00119 } 00120 00121 ~KEncodingDetectorPrivate() 00122 { 00123 delete m_decoder; 00124 } 00125 00126 // Returns true if the encoding was explicitly specified someplace. 00127 bool isExplicitlySpecifiedEncoding() 00128 { 00129 return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding; 00130 } 00131 }; 00132 00133 00134 static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size ) 00135 { 00136 for ( int i = 0; i < size; ++i ) { 00137 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 00138 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) 00139 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 00140 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { 00141 return "cp1256"; 00142 } 00143 } 00144 00145 return "iso-8859-6"; 00146 } 00147 00148 static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size ) 00149 { 00150 for ( int i = 0; i < size; ++i ) { 00151 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) 00152 return "cp1257"; 00153 00154 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) 00155 return "iso-8859-13"; 00156 } 00157 00158 return "iso-8859-13"; 00159 } 00160 00161 static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) 00162 { 00163 QByteArray charset = QByteArray(); 00164 for ( int i = 0; i < size; ++i ) { 00165 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { 00166 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) 00167 return "ibm852"; 00168 00169 if ( i + 1 > size ) 00170 return "cp1250"; 00171 else { // maybe ibm852 ? 00172 charset = "cp1250"; 00173 continue; 00174 } 00175 } 00176 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { 00177 if ( i + 1 > size ) 00178 return "iso-8859-2"; 00179 else { // maybe ibm852 ? 00180 if ( charset.isNull() ) 00181 charset = "iso-8859-2"; 00182 continue; 00183 } 00184 } 00185 } 00186 00187 if ( charset.isNull() ) 00188 charset = "iso-8859-3"; 00189 00190 return charset.data(); 00191 } 00192 00193 static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size) 00194 { 00195 #ifdef DECODE_DEBUG 00196 kWarning() << "KEncodingDetector: Cyr heuristics"; 00197 #endif 00198 00199 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) 00200 // return "utf8"; 00201 int utf8_mark=0; 00202 int koi_score=0; 00203 int cp1251_score=0; 00204 00205 int koi_st=0; 00206 int cp1251_st=0; 00207 00208 // int koi_na=0; 00209 // int cp1251_na=0; 00210 00211 int koi_o_capital=0; 00212 int koi_o=0; 00213 int cp1251_o_capital=0; 00214 int cp1251_o=0; 00215 00216 int koi_a_capital=0; 00217 int koi_a=0; 00218 int cp1251_a_capital=0; 00219 int cp1251_a=0; 00220 00221 int koi_s_capital=0; 00222 int koi_s=0; 00223 int cp1251_s_capital=0; 00224 int cp1251_s=0; 00225 00226 int koi_i_capital=0; 00227 int koi_i=0; 00228 int cp1251_i_capital=0; 00229 int cp1251_i=0; 00230 00231 int cp1251_small_range=0; 00232 int koi_small_range=0; 00233 int ibm866_small_range=0; 00234 00235 int i; 00236 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i) 00237 { 00238 if (ptr[i]>0xdf) 00239 { 00240 ++cp1251_small_range; 00241 00242 if (ptr[i]==0xee)//small o 00243 ++cp1251_o; 00244 else if (ptr[i]==0xe0)//small a 00245 ++cp1251_a; 00246 else if (ptr[i]==0xe8)//small i 00247 ++cp1251_i; 00248 else if (ptr[i]==0xf1)//small s 00249 ++cp1251_s; 00250 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st 00251 ++cp1251_st; 00252 00253 else if (ptr[i]==0xef) 00254 ++koi_o_capital; 00255 else if (ptr[i]==0xe1) 00256 ++koi_a_capital; 00257 else if (ptr[i]==0xe9) 00258 ++koi_i_capital; 00259 else if (ptr[i]==0xf3) 00260 ++koi_s_capital; 00261 00262 } 00263 else if (ptr[i]>0xbf) 00264 { 00265 ++koi_small_range; 00266 00267 if (ptr[i]==0xd0||ptr[i]==0xd1)//small o 00268 ++utf8_mark; 00269 else if (ptr[i]==0xcf)//small o 00270 ++koi_o; 00271 else if (ptr[i]==0xc1)//small a 00272 ++koi_a; 00273 else if (ptr[i]==0xc9)//small i 00274 ++koi_i; 00275 else if (ptr[i]==0xd3)//small s 00276 ++koi_s; 00277 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st 00278 ++koi_st; 00279 00280 else if (ptr[i]==0xce) 00281 ++cp1251_o_capital; 00282 else if (ptr[i]==0xc0) 00283 ++cp1251_a_capital; 00284 else if (ptr[i]==0xc8) 00285 ++cp1251_i_capital; 00286 else if (ptr[i]==0xd1) 00287 ++cp1251_s_capital; 00288 } 00289 else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60% 00290 ++ibm866_small_range; 00291 00292 } 00293 00294 //cannot decide? 00295 if (cp1251_small_range+koi_small_range+ibm866_small_range<8) 00296 { 00297 return ""; 00298 } 00299 00300 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range) 00301 { 00302 #ifdef DECODE_DEBUG 00303 kWarning() << "Cyr Enc Detection: UTF8"; 00304 #endif 00305 return "UTF-8"; 00306 } 00307 00308 if (ibm866_small_range>cp1251_small_range+koi_small_range) 00309 return "ibm866"; 00310 00311 // QByteArray koi_string = "koi8-u"; 00312 // QByteArray cp1251_string = "cp1251"; 00313 00314 if (cp1251_st==0 && koi_st>1) 00315 koi_score+=10; 00316 else if (koi_st==0 && cp1251_st>1) 00317 cp1251_score+=10; 00318 00319 if (cp1251_st && koi_st) 00320 { 00321 if (cp1251_st/koi_st>2) 00322 cp1251_score+=20; 00323 else if (koi_st/cp1251_st>2) 00324 koi_score+=20; 00325 } 00326 00327 if (cp1251_a>koi_a) 00328 cp1251_score+=10; 00329 else if (cp1251_a || koi_a) 00330 koi_score+=10; 00331 00332 if (cp1251_o>koi_o) 00333 cp1251_score+=10; 00334 else if (cp1251_o || koi_o) 00335 koi_score+=10; 00336 00337 if (cp1251_i>koi_i) 00338 cp1251_score+=10; 00339 else if (cp1251_i || koi_i) 00340 koi_score+=10; 00341 00342 if (cp1251_s>koi_s) 00343 cp1251_score+=10; 00344 else if (cp1251_s || koi_s) 00345 koi_score+=10; 00346 00347 if (cp1251_a_capital>koi_a_capital) 00348 cp1251_score+=9; 00349 else if (cp1251_a_capital || koi_a_capital) 00350 koi_score+=9; 00351 00352 if (cp1251_o_capital>koi_o_capital) 00353 cp1251_score+=9; 00354 else if (cp1251_o_capital || koi_o_capital) 00355 koi_score+=9; 00356 00357 if (cp1251_i_capital>koi_i_capital) 00358 cp1251_score+=9; 00359 else if (cp1251_i_capital || koi_i_capital) 00360 koi_score+=9; 00361 00362 if (cp1251_s_capital>koi_s_capital) 00363 cp1251_score+=9; 00364 else if (cp1251_s_capital || koi_s_capital) 00365 koi_score+=9; 00366 #ifdef DECODE_DEBUG 00367 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score; 00368 #endif 00369 if (abs(koi_score-cp1251_score)<10) 00370 { 00371 //fallback... 00372 cp1251_score=cp1251_small_range; 00373 koi_score=koi_small_range; 00374 } 00375 if (cp1251_score>koi_score) 00376 return "cp1251"; 00377 else 00378 return "koi8-u"; 00379 00380 00381 // if (cp1251_score>koi_score) 00382 // setEncoding("cp1251",AutoDetectedEncoding); 00383 // else 00384 // setEncoding("koi8-u",AutoDetectedEncoding); 00385 // return true; 00386 00387 } 00388 00389 static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size ) 00390 { 00391 for ( int i = 0; i < size; ++i ) { 00392 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B 00393 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 00394 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { 00395 return "cp1253"; 00396 } 00397 } 00398 00399 return "iso-8859-7"; 00400 } 00401 00402 static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size ) 00403 { 00404 for ( int i = 0; i < size; ++i ) { 00405 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B 00406 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) 00407 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { 00408 return "cp1255"; 00409 } 00410 00411 if ( ptr[ i ] == 0xDF ) 00412 return "iso-8859-8-i"; 00413 } 00414 00415 return "iso-8859-8-i"; 00416 } 00417 00418 static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size ) 00419 { 00420 JapaneseCode kc; 00421 00422 switch ( kc.guess_jp( (const char*)ptr, size ) ) { 00423 case JapaneseCode::JIS: 00424 return "jis7"; 00425 case JapaneseCode::EUC: 00426 return "eucjp"; 00427 case JapaneseCode::SJIS: 00428 return "sjis"; 00429 case JapaneseCode::UTF8: 00430 return "utf8"; 00431 default: 00432 break; 00433 } 00434 00435 return ""; 00436 } 00437 00438 static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size ) 00439 { 00440 for ( int i = 0; i < size; ++i ) { 00441 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { 00442 return "cp1254"; 00443 } 00444 } 00445 00446 return "iso-8859-9"; 00447 } 00448 00449 static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) 00450 { 00451 --size; 00452 uint nonansi_count=0; 00453 for (int i=0; i<size; ++i) 00454 { 00455 if (ptr[i]>0x79) 00456 { 00457 ++nonansi_count; 00458 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0) 00459 { 00460 return "UTF-8"; 00461 } 00462 if (ptr[i] >= 0x78 && ptr[i]<=0x9F ) 00463 { 00464 return "cp1252"; 00465 } 00466 } 00467 00468 } 00469 00470 if (nonansi_count>0) 00471 return "iso-8859-15"; 00472 00473 return ""; 00474 } 00475 00476 // Other browsers allow comments in the head section, so we need to also. 00477 // It's important not to look for tags inside the comments. 00478 static void skipComment(const char *&ptr, const char *pEnd) 00479 { 00480 const char *p = ptr; 00481 // Allow <!-->; other browsers do. 00482 if (*p=='>') 00483 { 00484 p++; 00485 } 00486 else 00487 { 00488 while (p!=pEnd) 00489 { 00490 if (*p=='-') 00491 { 00492 // This is the real end of comment, "-->". 00493 if (p[1]=='-' && p[2]=='>') 00494 { 00495 p += 3; 00496 break; 00497 } 00498 // This is the incorrect end of comment that other browsers allow, "--!>". 00499 if (p[1] == '-' && p[2] == '!' && p[3] == '>') 00500 { 00501 p += 4; 00502 break; 00503 } 00504 } 00505 p++; 00506 } 00507 } 00508 ptr=p; 00509 } 00510 00511 // Returns the position of the encoding string. 00512 static int findXMLEncoding(const QByteArray &str, int &encodingLength) 00513 { 00514 int len = str.length(); 00515 int pos = str.indexOf("encoding"); 00516 if (pos == -1) 00517 return -1; 00518 pos += 8; 00519 00520 // Skip spaces and stray control characters. 00521 while (pos<len && str[pos]<=' ') 00522 ++pos; 00523 00524 //Bail out if nothing after 00525 // Skip equals sign. 00526 if (pos>=len || str[pos] != '=') 00527 return -1; 00528 ++pos; 00529 00530 // Skip spaces and stray control characters. 00531 while (pos<len && str[pos]<=' ') 00532 ++pos; 00533 00534 //Bail out if nothing after 00535 if (pos >= len) 00536 return -1; 00537 00538 // Skip quotation mark. 00539 char quoteMark = str[pos]; 00540 if (quoteMark != '"' && quoteMark != '\'') 00541 return -1; 00542 ++pos; 00543 00544 // Find the trailing quotation mark. 00545 int end=pos; 00546 while (end<len && str[end]!=quoteMark) 00547 ++end; 00548 00549 if (end>=len) 00550 return -1; 00551 00552 encodingLength = end-pos; 00553 return pos; 00554 } 00555 00556 bool KEncodingDetector::processNull(char *data, int len) 00557 { 00558 bool bin=false; 00559 if(is16Bit(d->m_codec)) 00560 { 00561 for (int i=1; i < len; i+=2) 00562 { 00563 if ((data[i]=='\0') && (data[i-1]=='\0')) 00564 { 00565 bin=true; 00566 data[i]=' '; 00567 } 00568 } 00569 return bin; 00570 } 00571 // replace '\0' by spaces, for buggy pages 00572 int i = len-1; 00573 while(--i>=0) 00574 { 00575 if(data[i]==0) 00576 { 00577 bin=true; 00578 data[i]=' '; 00579 } 00580 } 00581 return bin; 00582 } 00583 00584 00585 bool KEncodingDetector::errorsIfUtf8 (const char* data, int length) 00586 { 00587 if (d->m_codec->mibEnum()!=MibUtf8) 00588 return false; //means no errors 00589 // #define highest1Bits (unsigned char)0x80 00590 // #define highest2Bits (unsigned char)0xC0 00591 // #define highest3Bits (unsigned char)0xE0 00592 // #define highest4Bits (unsigned char)0xF0 00593 // #define highest5Bits (unsigned char)0xF8 00594 static const unsigned char highest1Bits = 0x80; 00595 static const unsigned char highest2Bits = 0xC0; 00596 static const unsigned char highest3Bits = 0xE0; 00597 static const unsigned char highest4Bits = 0xF0; 00598 static const unsigned char highest5Bits = 0xF8; 00599 00600 for (int i=0; i<length; ++i) 00601 { 00602 unsigned char c = data[i]; 00603 00604 if (d->m_multiByte>0) 00605 { 00606 if ((c & highest2Bits) == 0x80) 00607 { 00608 --(d->m_multiByte); 00609 continue; 00610 } 00611 #ifdef DECODE_DEBUG 00612 kWarning() << "EncDetector: Broken UTF8"; 00613 #endif 00614 return true; 00615 } 00616 00617 // most significant bit zero, single char 00618 if ((c & highest1Bits) == 0x00) 00619 continue; 00620 00621 // 110xxxxx => init 1 following bytes 00622 if ((c & highest3Bits) == 0xC0) 00623 { 00624 d->m_multiByte = 1; 00625 continue; 00626 } 00627 00628 // 1110xxxx => init 2 following bytes 00629 if ((c & highest4Bits) == 0xE0) 00630 { 00631 d->m_multiByte = 2; 00632 continue; 00633 } 00634 00635 // 11110xxx => init 3 following bytes 00636 if ((c & highest5Bits) == 0xF0) 00637 { 00638 d->m_multiByte = 3; 00639 continue; 00640 } 00641 #ifdef DECODE_DEBUG 00642 kWarning() << "EncDetector:_Broken UTF8"; 00643 #endif 00644 return true; 00645 } 00646 return false; 00647 } 00648 00649 00650 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate) 00651 { 00652 } 00653 00654 KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) : 00655 d(new KEncodingDetectorPrivate(codec,source,script)) 00656 { 00657 } 00658 00659 KEncodingDetector::~KEncodingDetector() 00660 { 00661 delete d; 00662 } 00663 00664 void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang) 00665 { 00666 d->m_autoDetectLanguage=lang; 00667 } 00668 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const 00669 { 00670 return d->m_autoDetectLanguage; 00671 } 00672 00673 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const 00674 { 00675 return d->m_source; 00676 } 00677 00678 const char* KEncodingDetector::encoding() const 00679 { 00680 d->m_storeDecoderName = d->m_codec->name(); 00681 return d->m_storeDecoderName.constData(); 00682 } 00683 00684 bool KEncodingDetector::visuallyOrdered() const 00685 { 00686 return d->m_visualRTL; 00687 } 00688 00689 // const QTextCodec* KEncodingDetector::codec() const 00690 // { 00691 // return d->m_codec; 00692 // } 00693 00694 QTextDecoder* KEncodingDetector::decoder() 00695 { 00696 return d->m_decoder; 00697 } 00698 00699 void KEncodingDetector::resetDecoder() 00700 { 00701 assert(d->m_defaultCodec); 00702 d->m_bufferForDefferedEncDetection.clear(); 00703 d->m_writtingHappened = false; 00704 d->m_analyzeCalled = false; 00705 d->m_multiByte = 0; 00706 delete d->m_decoder; 00707 if (!d->m_codec) 00708 d->m_codec = d->m_defaultCodec; 00709 d->m_decoder = d->m_codec->makeDecoder(); 00710 } 00711 00712 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) 00713 { 00714 QTextCodec *codec; 00715 QByteArray enc(_encoding); 00716 if(/*enc.isNull() || */enc.isEmpty()) 00717 { 00718 if (type==DefaultEncoding) 00719 codec=d->m_defaultCodec; 00720 else 00721 return false; 00722 } 00723 else 00724 { 00725 //QString->QTextCodec 00726 00727 enc = enc.toLower(); 00728 // hebrew visually ordered 00729 if(enc=="visual") 00730 enc="iso8859-8"; 00731 bool b; 00732 codec = KGlobal::charsets()->codecForName(QLatin1String(enc), b); 00733 if (!b) 00734 return false; 00735 } 00736 00737 if (d->m_codec->mibEnum()==codec->mibEnum()) 00738 { 00739 // We already have the codec, but we still want to re-set the type, 00740 // as we may have overwritten a default with a detected 00741 d->m_source = type; 00742 return true; 00743 } 00744 00745 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) 00746 { 00747 //Sometimes the codec specified is absurd, i.e. UTF-16 despite 00748 //us decoding a meta tag as ASCII. In that case, ignore it. 00749 return false; 00750 } 00751 00752 if (codec->mibEnum() == Mib8859_8) 00753 { 00754 //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. 00755 codec = QTextCodec::codecForName("iso8859-8-i"); 00756 00757 // visually ordered unless one of the following 00758 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) 00759 d->m_visualRTL = true; 00760 } 00761 00762 d->m_codec = codec; 00763 d->m_source = type; 00764 delete d->m_decoder; 00765 d->m_decoder = d->m_codec->makeDecoder(); 00766 #ifdef DECODE_DEBUG 00767 kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name(); 00768 #endif 00769 return true; 00770 } 00771 00772 QString KEncodingDetector::decode(const char *data, int len) 00773 { 00774 processNull(const_cast<char *>(data),len); 00775 if (!d->m_analyzeCalled) 00776 { 00777 analyze(data,len); 00778 d->m_analyzeCalled=true; 00779 } 00780 00781 return d->m_decoder->toUnicode(data,len); 00782 } 00783 00784 QString KEncodingDetector::decode(const QByteArray &data) 00785 { 00786 processNull(const_cast<char *>(data.data()),data.size()); 00787 if (!d->m_analyzeCalled) 00788 { 00789 analyze(data.data(),data.size()); 00790 d->m_analyzeCalled=true; 00791 } 00792 00793 return d->m_decoder->toUnicode(data); 00794 } 00795 00796 QString KEncodingDetector::decodeWithBuffering(const char *data, int len) 00797 { 00798 #ifdef DECODE_DEBUG 00799 kWarning() << "KEncodingDetector: decoding "<<len<<" bytes"; 00800 #endif 00801 if (d->m_writtingHappened) 00802 { 00803 #ifdef DECODE_DEBUG 00804 kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name(); 00805 #endif 00806 processNull(const_cast<char *>(data),len); 00807 return d->m_decoder->toUnicode(data, len); 00808 } 00809 else 00810 { 00811 if (d->m_bufferForDefferedEncDetection.isEmpty()) 00812 { 00813 // If encoding detection produced something, and we either got to the body or 00814 // actually saw the encoding explicitly, we're done. 00815 if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) 00816 { 00817 #ifdef DECODE_DEBUG 00818 kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name(); 00819 #endif 00820 processNull(const_cast<char *>(data),len); 00821 d->m_writtingHappened=true; 00822 return d->m_decoder->toUnicode(data, len); 00823 } 00824 else 00825 { 00826 #ifdef DECODE_DEBUG 00827 kWarning() << "KEncodingDetector: begin deffer"; 00828 #endif 00829 d->m_bufferForDefferedEncDetection=data; 00830 } 00831 } 00832 else 00833 { 00834 d->m_bufferForDefferedEncDetection+=data; 00835 // As above, but also limit the buffer size. We must use the entire buffer here, 00836 // since the boundaries might split the meta tag, etc. 00837 bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length()); 00838 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) || 00839 d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) 00840 { 00841 d->m_writtingHappened=true; 00842 d->m_bufferForDefferedEncDetection.replace('\0',' '); 00843 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); 00844 d->m_bufferForDefferedEncDetection.clear(); 00845 #ifdef DECODE_DEBUG 00846 kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name(); 00847 #endif 00848 return result; 00849 } 00850 } 00851 } 00852 00853 return QString(); 00854 } 00855 00856 bool KEncodingDetector::decodedInvalidCharacters() const 00857 { 00858 return d->m_decoder ? d->m_decoder->hasFailure() : false; 00859 } 00860 00861 QString KEncodingDetector::flush() 00862 { 00863 if (d->m_bufferForDefferedEncDetection.isEmpty()) 00864 return QString(); 00865 00866 d->m_bufferForDefferedEncDetection.replace('\0',' '); 00867 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection)); 00868 d->m_bufferForDefferedEncDetection.clear(); 00869 #ifdef DECODE_DEBUG 00870 kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name(); 00871 #endif 00872 return result; 00873 } 00874 00875 bool KEncodingDetector::analyze(const char *data, int len) 00876 { 00877 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 00878 // maximumBOMLength = 10 00879 // Even if the user has chosen utf16 we still need to auto-detect the endianness 00880 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) 00881 { 00882 // Extract the first three bytes. 00883 const uchar *udata = (const uchar *)data; 00884 uchar c1 = *udata++; 00885 uchar c2 = *udata++; 00886 uchar c3 = *udata++; 00887 00888 // Check for the BOM 00889 const char *autoDetectedEncoding; 00890 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) 00891 { 00892 autoDetectedEncoding = "UTF-16"; 00893 } 00894 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) 00895 { 00896 autoDetectedEncoding = "UTF-8"; 00897 } 00898 else if (c1 == 0x00 || c2 == 0x00) 00899 { 00900 uchar c4 = *udata++; 00901 uchar c5 = *udata++; 00902 uchar c6 = *udata++; 00903 uchar c7 = *udata++; 00904 uchar c8 = *udata++; 00905 uchar c9 = *udata++; 00906 uchar c10 = *udata++; 00907 00908 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); 00909 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); 00910 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) 00911 autoDetectedEncoding = "UTF-16"; 00912 else 00913 autoDetectedEncoding = 0; 00914 } 00915 else 00916 { 00917 autoDetectedEncoding = 0; 00918 } 00919 00920 // If we found a BOM, use the encoding it implies. 00921 if (autoDetectedEncoding != 0) 00922 { 00923 d->m_source = BOM; 00924 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); 00925 assert(d->m_codec); 00926 //enc = d->m_codec->name(); 00927 delete d->m_decoder; 00928 d->m_decoder = d->m_codec->makeDecoder(); 00929 #ifdef DECODE_DEBUG 00930 kWarning() << "Detection by BOM"; 00931 #endif 00932 if (is16Bit(d->m_codec) && c2==0x00) 00933 { 00934 // utf16LE, we need to put the decoder in LE mode 00935 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; 00936 d->m_decoder->toUnicode(reverseUtf16, 2); 00937 } 00938 return true; 00939 } 00940 } 00941 00942 //exit from routine in case it was called to only detect byte order for utf-16 00943 if (d->m_source==UserChosenEncoding) 00944 { 00945 #ifdef DECODE_DEBUG 00946 kWarning() << "KEncodingDetector: UserChosenEncoding exit "; 00947 #endif 00948 00949 if (errorsIfUtf8(data, len)) 00950 setEncoding("",DefaultEncoding); 00951 return true; 00952 } 00953 00954 // HTTP header takes precedence over meta-type stuff 00955 if (d->m_source==EncodingFromHTTPHeader) 00956 return true; 00957 00958 if (!d->m_seenBody) 00959 { 00960 // we still don't have an encoding, and are in the head 00961 // the following tags are allowed in <head>: 00962 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 00963 const char *ptr = data; 00964 const char *pEnd = data+len; 00965 00966 while(ptr != pEnd) 00967 { 00968 if(*ptr!='<') 00969 { 00970 ++ptr; 00971 continue; 00972 } 00973 ++ptr; 00974 // Handle comments. 00975 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') 00976 { 00977 ptr += 3; 00978 skipComment(ptr, pEnd); 00979 continue; 00980 } 00981 00982 // Handle XML header, which can have encoding in it. 00983 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') 00984 { 00985 const char *end = ptr; 00986 while (*end != '>' && end < pEnd) 00987 end++; 00988 if (*end == '\0' || end == pEnd) 00989 break; 00990 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator 00991 int length; 00992 int pos = findXMLEncoding(str, length); 00993 // also handles the case when specified encoding aint correct 00994 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) 00995 { 00996 return true; 00997 } 00998 } 00999 01000 //look for <meta>, stop if we reach <body> 01001 while ( 01002 !(((*ptr >= 'a') && (*ptr <= 'z')) || 01003 ((*ptr >= 'A') && (*ptr <= 'Z'))) 01004 && ptr < pEnd 01005 ) 01006 ++ptr; 01007 01008 char tmp[5]; 01009 int length=0; 01010 const char* max=ptr+4; 01011 if (pEnd<max) 01012 max=pEnd; 01013 while ( 01014 (((*ptr >= 'a') && (*ptr <= 'z')) || 01015 ((*ptr >= 'A') && (*ptr <= 'Z')) || 01016 ((*ptr >= '0') && (*ptr <= '9'))) 01017 && ptr < max 01018 ) 01019 { 01020 tmp[length] = tolower( *ptr ); 01021 ++ptr; 01022 ++length; 01023 } 01024 tmp[length] = 0; 01025 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') 01026 { 01027 // found a meta tag... 01028 const char* end = ptr; 01029 while(*end != '>' && *end != '\0' && end<pEnd) 01030 end++; 01031 //if ( *end == '\0' ) break; 01032 QByteArray str( ptr, (end-ptr)+1); 01033 str = str.toLower(); 01034 int pos=0; 01035 //if( (pos = str.find("http-equiv", pos)) == -1) break; 01036 //if( (pos = str.find("content-type", pos)) == -1) break; 01037 if( (pos = str.indexOf("charset")) == -1) 01038 continue; 01039 pos+=6; 01040 // skip to '=' 01041 if( (pos = str.indexOf("=", pos)) == -1) 01042 continue; 01043 01044 // skip '=' 01045 ++pos; 01046 01047 // skip whitespace before encoding itself 01048 while (pos < (int)str.length() && str[pos] <= ' ') 01049 ++pos; 01050 01051 // there may also be an opening quote, if this is a charset= and not 01052 // a http-equiv. 01053 if (pos < (int)str.length() && str[pos] == '"') 01054 ++pos; 01055 01056 if ( pos == (int)str.length()) 01057 continue; 01058 01059 int endpos = pos; 01060 while( endpos < str.length() && 01061 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' 01062 && str[endpos] != ';' && str[endpos] != '>') ) 01063 ++endpos; 01064 #ifdef DECODE_DEBUG 01065 kDebug( 6005 ) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data(); 01066 #endif 01067 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) 01068 return true; 01069 } 01070 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') 01071 { 01072 d->m_seenBody=true; 01073 break; 01074 } 01075 } 01076 } 01077 01078 if (len<20) 01079 return false; 01080 01081 #ifdef DECODE_DEBUG 01082 kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")"; 01083 #endif 01084 01085 switch ( d->m_autoDetectLanguage) 01086 { 01087 case KEncodingDetector::Arabic: 01088 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); 01089 // break; 01090 case KEncodingDetector::Baltic: 01091 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); 01092 // break; 01093 case KEncodingDetector::CentralEuropean: 01094 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); 01095 break; 01096 case KEncodingDetector::Cyrillic: 01097 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); 01098 // break; 01099 case KEncodingDetector::Greek: 01100 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); 01101 // break; 01102 case KEncodingDetector::Hebrew: 01103 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); 01104 // break; 01105 case KEncodingDetector::Japanese: 01106 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); 01107 // break; 01108 case KEncodingDetector::Turkish: 01109 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); 01110 // break; 01111 case KEncodingDetector::WesternEuropean: 01112 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) 01113 return true; 01114 else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml 01115 { 01116 return setEncoding("iso-8859-15",AutoDetectedEncoding); 01117 } 01118 else //use default provided by eg katepart 01119 { 01120 return setEncoding("",DefaultEncoding); 01121 } 01122 // break; 01123 case KEncodingDetector::SemiautomaticDetection: 01124 case KEncodingDetector::ChineseSimplified: 01125 case KEncodingDetector::ChineseTraditional: 01126 case KEncodingDetector::Korean: 01127 case KEncodingDetector::Thai: 01128 case KEncodingDetector::Unicode: 01129 case KEncodingDetector::NorthernSaami: 01130 case KEncodingDetector::SouthEasternEurope: 01131 case KEncodingDetector::None: 01132 // huh. somethings broken in this code ### FIXME 01133 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. 01134 break; 01135 } 01136 01137 return true; 01138 } 01139 01140 01141 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang) 01142 { 01143 if (lang.isEmpty()) 01144 return KEncodingDetector::None; 01145 else if (lang==i18nc("@item Text character set", "Unicode")) 01146 return KEncodingDetector::Unicode; 01147 else if (lang==i18nc("@item Text character set", "Cyrillic")) 01148 return KEncodingDetector::Cyrillic; 01149 else if (lang==i18nc("@item Text character set", "Western European")) 01150 return KEncodingDetector::WesternEuropean; 01151 else if (lang==i18nc("@item Text character set", "Central European")) 01152 return KEncodingDetector::CentralEuropean; 01153 else if (lang==i18nc("@item Text character set", "Greek")) 01154 return KEncodingDetector::Greek; 01155 else if (lang==i18nc("@item Text character set", "Hebrew")) 01156 return KEncodingDetector::Hebrew; 01157 else if (lang==i18nc("@item Text character set", "Turkish")) 01158 return KEncodingDetector::Turkish; 01159 else if (lang==i18nc("@item Text character set", "Japanese")) 01160 return KEncodingDetector::Japanese; 01161 else if (lang==i18nc("@item Text character set", "Baltic")) 01162 return KEncodingDetector::Baltic; 01163 else if (lang==i18nc("@item Text character set", "Arabic")) 01164 return KEncodingDetector::Arabic; 01165 01166 return KEncodingDetector::None; 01167 } 01168 01169 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script) 01170 { 01171 switch (script) 01172 { 01173 case KEncodingDetector::Arabic: 01174 return true; 01175 case KEncodingDetector::Baltic: 01176 return true; 01177 case KEncodingDetector::CentralEuropean: 01178 return true; 01179 case KEncodingDetector::Cyrillic: 01180 return true; 01181 case KEncodingDetector::Greek: 01182 return true; 01183 case KEncodingDetector::Hebrew: 01184 return true; 01185 case KEncodingDetector::Japanese: 01186 return true; 01187 case KEncodingDetector::Turkish: 01188 return true; 01189 case KEncodingDetector::WesternEuropean: 01190 return true; 01191 case KEncodingDetector::ChineseTraditional: 01192 return true; 01193 case KEncodingDetector::ChineseSimplified: 01194 return true; 01195 case KEncodingDetector::Unicode: 01196 return true; 01197 break; 01198 default: 01199 return false; 01200 } 01201 } 01202 01203 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script) 01204 { 01205 switch (script) 01206 { 01207 case KEncodingDetector::Arabic: 01208 return i18nc("@item Text character set", "Arabic"); 01209 break; 01210 case KEncodingDetector::Baltic: 01211 return i18nc("@item Text character set", "Baltic"); 01212 break; 01213 case KEncodingDetector::CentralEuropean: 01214 return i18nc("@item Text character set", "Central European"); 01215 break; 01216 case KEncodingDetector::Cyrillic: 01217 return i18nc("@item Text character set", "Cyrillic"); 01218 break; 01219 case KEncodingDetector::Greek: 01220 return i18nc("@item Text character set", "Greek"); 01221 break; 01222 case KEncodingDetector::Hebrew: 01223 return i18nc("@item Text character set", "Hebrew"); 01224 break; 01225 case KEncodingDetector::Japanese: 01226 return i18nc("@item Text character set", "Japanese"); 01227 break; 01228 case KEncodingDetector::Turkish: 01229 return i18nc("@item Text character set", "Turkish"); 01230 break; 01231 case KEncodingDetector::WesternEuropean: 01232 return i18nc("@item Text character set", "Western European"); 01233 break; 01234 case KEncodingDetector::ChineseTraditional: 01235 return i18nc("@item Text character set", "Chinese Traditional"); 01236 break; 01237 case KEncodingDetector::ChineseSimplified: 01238 return i18nc("@item Text character set", "Chinese Simplified"); 01239 break; 01240 case KEncodingDetector::Korean: 01241 return i18nc("@item Text character set", "Korean"); 01242 break; 01243 case KEncodingDetector::Thai: 01244 return i18nc("@item Text character set", "Thai"); 01245 break; 01246 case KEncodingDetector::Unicode: 01247 return i18nc("@item Text character set", "Unicode"); 01248 break; 01249 //case KEncodingDetector::SemiautomaticDetection: 01250 default: 01251 return QString(); 01252 01253 } 01254 } 01255 01256 #undef DECODE_DEBUG 01257
KDE 4.6 API Reference