KDECore
kencodingprober.cpp
Go to the documentation of this file.
00001 /* 00002 This file is part of the KDE libraries 00003 00004 Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com) 00005 00006 This library is free software; you can redistribute it and/or 00007 modify it under the terms of the GNU Library General Public 00008 License as published by the Free Software Foundation; either 00009 version 2 of the License, or (at your option) any later version. 00010 00011 This library is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 Library General Public License for more details. 00015 00016 You should have received a copy of the GNU Library General Public License 00017 along with this library; see the file COPYING.LIB. If not, write to 00018 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 Boston, MA 02110-1301, USA. 00020 00021 */ 00022 00023 #include "kencodingprober.h" 00024 00025 #include "klocale.h" 00026 00027 #include "probers/nsCharSetProber.h" 00028 #include "probers/nsUniversalDetector.h" 00029 #include "probers/ChineseGroupProber.h" 00030 #include "probers/JapaneseGroupProber.h" 00031 #include "probers/UnicodeGroupProber.h" 00032 #include "probers/nsSBCSGroupProber.h" 00033 #include "probers/nsMBCSGroupProber.h" 00034 00035 #include <string.h> 00036 00037 class KEncodingProberPrivate 00038 { 00039 public: 00040 KEncodingProberPrivate(): prober(NULL), mStart(true) {}; 00041 ~KEncodingProberPrivate() 00042 { 00043 delete prober; 00044 } 00045 void setProberType(KEncodingProber::ProberType pType) 00046 { 00047 proberType = pType; 00048 /* handle multi-byte encodings carefully , because they're hard to detect, 00049 * and have to use some Stastics methods. 00050 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok, 00051 * because encoding state machine can detect many such encodings. 00052 */ 00053 00054 delete prober; 00055 00056 switch (proberType) { 00057 case KEncodingProber::None: 00058 prober = NULL; 00059 break; 00060 case KEncodingProber::Arabic: 00061 case KEncodingProber::Baltic: 00062 case KEncodingProber::CentralEuropean: 00063 case KEncodingProber::Cyrillic: 00064 case KEncodingProber::Greek: 00065 case KEncodingProber::Hebrew: 00066 case KEncodingProber::NorthernSaami: 00067 case KEncodingProber::Other: 00068 case KEncodingProber::SouthEasternEurope: 00069 case KEncodingProber::Thai: 00070 case KEncodingProber::Turkish: 00071 case KEncodingProber::WesternEuropean: 00072 prober = new kencodingprober::nsSBCSGroupProber(); 00073 break; 00074 case KEncodingProber::ChineseSimplified: 00075 case KEncodingProber::ChineseTraditional: 00076 prober = new kencodingprober::ChineseGroupProber(); 00077 break; 00078 case KEncodingProber::Japanese: 00079 prober = new kencodingprober::JapaneseGroupProber(); 00080 break; 00081 case KEncodingProber::Korean: 00082 prober = new kencodingprober::nsMBCSGroupProber(); 00083 break; 00084 case KEncodingProber::Unicode: 00085 prober = new kencodingprober::UnicodeGroupProber(); 00086 break; 00087 case KEncodingProber::Universal: 00088 prober = new kencodingprober::nsUniversalDetector(); 00089 break; 00090 default: 00091 prober = NULL; 00092 } 00093 } 00094 void unicodeTest(const char *aBuf, int aLen) 00095 { 00096 if (mStart) 00097 { 00098 mStart = false; 00099 if (aLen > 3) 00100 switch (aBuf[0]) 00101 { 00102 case '\xEF': 00103 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 00104 // EF BB BF UTF-8 encoded BOM 00105 proberState = KEncodingProber::FoundIt; 00106 break; 00107 case '\xFE': 00108 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00109 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 00110 proberState = KEncodingProber::FoundIt; 00111 else if ('\xFF' == aBuf[1]) 00112 // FE FF UTF-16, big endian BOM 00113 proberState = KEncodingProber::FoundIt; 00114 break; 00115 case '\x00': 00116 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 00117 // 00 00 FE FF UTF-32, big-endian BOM 00118 proberState = KEncodingProber::FoundIt; 00119 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 00120 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 00121 proberState = KEncodingProber::FoundIt; 00122 break; 00123 case '\xFF': 00124 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00125 // FF FE 00 00 UTF-32, little-endian BOM 00126 proberState = KEncodingProber::FoundIt; 00127 else if ('\xFE' == aBuf[1]) 00128 // FF FE UTF-16, little endian BOM 00129 proberState = KEncodingProber::FoundIt; 00130 break; 00131 } // switch 00132 00133 } 00134 } 00135 KEncodingProber::ProberType proberType; 00136 KEncodingProber::ProberState proberState; 00137 kencodingprober::nsCharSetProber *prober; 00138 bool mStart; 00139 }; 00140 00141 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate()) 00142 { 00143 setProberType(proberType); 00144 } 00145 00146 KEncodingProber::~KEncodingProber() 00147 { 00148 delete d; 00149 } 00150 00151 void KEncodingProber::reset() 00152 { 00153 d->proberState = KEncodingProber::Probing; 00154 d->mStart = true; 00155 } 00156 00157 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data) 00158 { 00159 return feed(data.data(), data.size()); 00160 } 00161 00162 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len) 00163 { 00164 if (!d->prober) 00165 return d->proberState; 00166 if (d->proberState == Probing) { 00167 if (d->mStart) { 00168 d->unicodeTest(data, len); 00169 if (d->proberState == FoundIt) 00170 return d->proberState; 00171 } 00172 d->prober->HandleData(data, len); 00173 switch (d->prober->GetState()) 00174 { 00175 case kencodingprober::eNotMe: 00176 d->proberState = NotMe; 00177 break; 00178 case kencodingprober::eFoundIt: 00179 d->proberState = FoundIt; 00180 break; 00181 default: 00182 d->proberState = Probing; 00183 break; 00184 } 00185 } 00186 #ifdef DEBUG_PROBE 00187 d->prober->DumpStatus(); 00188 #endif 00189 return d->proberState; 00190 } 00191 00192 KEncodingProber::ProberState KEncodingProber::state() const 00193 { 00194 return d->proberState; 00195 } 00196 00197 //DEPRECATED, do *not* use 00198 #ifndef KDE_NO_DEPRECATED 00199 const char* KEncodingProber::encodingName() const 00200 { 00201 return qstrdup(encoding().constData()); 00202 } 00203 #endif 00204 00205 QByteArray KEncodingProber::encoding() const 00206 { 00207 if (!d->prober) 00208 return QByteArray("UTF-8"); 00209 00210 return QByteArray(d->prober->GetCharSetName()); 00211 } 00212 00213 float KEncodingProber::confidence() const 00214 { 00215 if (!d->prober) 00216 return 0.0; 00217 00218 return d->prober->GetConfidence(); 00219 } 00220 00221 KEncodingProber::ProberType KEncodingProber::proberType() const 00222 { 00223 return d->proberType; 00224 } 00225 00226 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType) 00227 { 00228 d->setProberType(proberType); 00229 reset(); 00230 } 00231 00232 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang) 00233 { 00234 if (lang.isEmpty()) 00235 return KEncodingProber::Universal; 00236 else if (lang==i18nc("@item Text character set", "Disabled")) 00237 return KEncodingProber::None; 00238 else if (lang==i18nc("@item Text character set", "Universal")) 00239 return KEncodingProber::Universal; 00240 else if (lang==i18nc("@item Text character set", "Unicode")) 00241 return KEncodingProber::Unicode; 00242 else if (lang==i18nc("@item Text character set", "Cyrillic")) 00243 return KEncodingProber::Cyrillic; 00244 else if (lang==i18nc("@item Text character set", "Western European")) 00245 return KEncodingProber::WesternEuropean; 00246 else if (lang==i18nc("@item Text character set", "Central European")) 00247 return KEncodingProber::CentralEuropean; 00248 else if (lang==i18nc("@item Text character set", "Greek")) 00249 return KEncodingProber::Greek; 00250 else if (lang==i18nc("@item Text character set", "Hebrew")) 00251 return KEncodingProber::Hebrew; 00252 else if (lang==i18nc("@item Text character set", "Turkish")) 00253 return KEncodingProber::Turkish; 00254 else if (lang==i18nc("@item Text character set", "Japanese")) 00255 return KEncodingProber::Japanese; 00256 else if (lang==i18nc("@item Text character set", "Baltic")) 00257 return KEncodingProber::Baltic; 00258 else if (lang==i18nc("@item Text character set", "Chinese Traditional")) 00259 return KEncodingProber::ChineseTraditional; 00260 else if (lang==i18nc("@item Text character set", "Chinese Simplified")) 00261 return KEncodingProber::ChineseSimplified; 00262 else if (lang==i18nc("@item Text character set", "Arabic")) 00263 return KEncodingProber::Arabic; 00264 00265 return KEncodingProber::Universal; 00266 } 00267 00268 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType) 00269 { 00270 switch (proberType) 00271 { 00272 case KEncodingProber::None: 00273 return i18nc("@item Text character set", "Disabled"); 00274 break; 00275 case KEncodingProber::Universal: 00276 return i18nc("@item Text character set", "Universal"); 00277 break; 00278 case KEncodingProber::Arabic: 00279 return i18nc("@item Text character set", "Arabic"); 00280 break; 00281 case KEncodingProber::Baltic: 00282 return i18nc("@item Text character set", "Baltic"); 00283 break; 00284 case KEncodingProber::CentralEuropean: 00285 return i18nc("@item Text character set", "Central European"); 00286 break; 00287 case KEncodingProber::Cyrillic: 00288 return i18nc("@item Text character set", "Cyrillic"); 00289 break; 00290 case KEncodingProber::Greek: 00291 return i18nc("@item Text character set", "Greek"); 00292 break; 00293 case KEncodingProber::Hebrew: 00294 return i18nc("@item Text character set", "Hebrew"); 00295 break; 00296 case KEncodingProber::Japanese: 00297 return i18nc("@item Text character set", "Japanese"); 00298 break; 00299 case KEncodingProber::Turkish: 00300 return i18nc("@item Text character set", "Turkish"); 00301 break; 00302 case KEncodingProber::WesternEuropean: 00303 return i18nc("@item Text character set", "Western European"); 00304 break; 00305 case KEncodingProber::ChineseTraditional: 00306 return i18nc("@item Text character set", "Chinese Traditional"); 00307 break; 00308 case KEncodingProber::ChineseSimplified: 00309 return i18nc("@item Text character set", "Chinese Simplified"); 00310 break; 00311 case KEncodingProber::Korean: 00312 return i18nc("@item Text character set", "Korean"); 00313 break; 00314 case KEncodingProber::Thai: 00315 return i18nc("@item Text character set", "Thai"); 00316 break; 00317 case KEncodingProber::Unicode: 00318 return i18nc("@item Text character set", "Unicode"); 00319 break; 00320 default: 00321 return QString(); 00322 } 00323 }
KDE 4.6 API Reference