• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KDECore

kencodingprober.cpp

Go to the documentation of this file.
00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 
00021 */
00022 
00023 #include "kencodingprober.h"
00024 
00025 #include "klocale.h"
00026 
00027 #include "probers/nsCharSetProber.h"
00028 #include "probers/nsUniversalDetector.h"
00029 #include "probers/ChineseGroupProber.h"
00030 #include "probers/JapaneseGroupProber.h"
00031 #include "probers/UnicodeGroupProber.h"
00032 #include "probers/nsSBCSGroupProber.h"
00033 #include "probers/nsMBCSGroupProber.h"
00034 
00035 #include <string.h>
00036 
00037 class KEncodingProberPrivate
00038 {
00039 public:
00040     KEncodingProberPrivate(): prober(NULL), mStart(true) {};
00041     ~KEncodingProberPrivate()
00042     {
00043         delete prober;
00044     }
00045     void setProberType(KEncodingProber::ProberType pType)
00046     {
00047         proberType = pType;
00048         /* handle multi-byte encodings carefully , because they're hard to detect,
00049         *   and have to use some Stastics methods.
00050         * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
00051         *   because encoding state machine can detect many such encodings.
00052         */
00053 
00054         delete prober;
00055 
00056         switch (proberType) {
00057             case KEncodingProber::None:
00058                 prober = NULL;
00059                 break;
00060             case KEncodingProber::Arabic:
00061             case KEncodingProber::Baltic:
00062             case KEncodingProber::CentralEuropean:
00063             case KEncodingProber::Cyrillic:
00064             case KEncodingProber::Greek:
00065             case KEncodingProber::Hebrew:
00066             case KEncodingProber::NorthernSaami:
00067             case KEncodingProber::Other:
00068             case KEncodingProber::SouthEasternEurope:
00069             case KEncodingProber::Thai:
00070             case KEncodingProber::Turkish:
00071             case KEncodingProber::WesternEuropean:
00072                 prober = new kencodingprober::nsSBCSGroupProber();
00073                 break;
00074             case KEncodingProber::ChineseSimplified:
00075             case KEncodingProber::ChineseTraditional:
00076                 prober = new kencodingprober::ChineseGroupProber();
00077                 break;
00078             case KEncodingProber::Japanese:
00079                 prober = new kencodingprober::JapaneseGroupProber();
00080                 break;
00081             case KEncodingProber::Korean:
00082                 prober = new kencodingprober::nsMBCSGroupProber();
00083                 break;
00084             case KEncodingProber::Unicode:
00085                 prober = new kencodingprober::UnicodeGroupProber();
00086                 break;
00087             case KEncodingProber::Universal:
00088                 prober = new kencodingprober::nsUniversalDetector();
00089                 break;
00090             default:
00091                 prober = NULL;
00092         }
00093     }
00094     void unicodeTest(const char *aBuf, int aLen)
00095     {
00096         if (mStart)
00097         {
00098             mStart = false;
00099             if (aLen > 3)
00100             switch (aBuf[0])
00101             {
00102                 case '\xEF':
00103                     if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00104                     // EF BB BF  UTF-8 encoded BOM
00105                     proberState = KEncodingProber::FoundIt;
00106                     break;
00107                 case '\xFE':
00108                     if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00109                         // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
00110                         proberState = KEncodingProber::FoundIt;
00111                     else if ('\xFF' == aBuf[1])
00112                         // FE FF  UTF-16, big endian BOM
00113                         proberState = KEncodingProber::FoundIt;
00114                         break;
00115                 case '\x00':
00116                     if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00117                         // 00 00 FE FF  UTF-32, big-endian BOM
00118                         proberState = KEncodingProber::FoundIt;
00119                     else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00120                         // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
00121                         proberState = KEncodingProber::FoundIt;
00122                         break;
00123                 case '\xFF':
00124                     if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00125                         // FF FE 00 00  UTF-32, little-endian BOM
00126                         proberState = KEncodingProber::FoundIt;
00127                     else if ('\xFE' == aBuf[1])
00128                         // FF FE  UTF-16, little endian BOM
00129                         proberState = KEncodingProber::FoundIt;
00130                         break;
00131             }  // switch
00132 
00133         }
00134     }
00135     KEncodingProber::ProberType proberType;
00136     KEncodingProber::ProberState proberState;
00137     kencodingprober::nsCharSetProber *prober;
00138     bool mStart;
00139 };
00140 
00141 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
00142 {
00143     setProberType(proberType);
00144 }
00145 
00146 KEncodingProber::~KEncodingProber()
00147 {
00148     delete d;
00149 }
00150 
00151 void KEncodingProber::reset()
00152 {
00153     d->proberState = KEncodingProber::Probing;
00154     d->mStart = true;
00155 }
00156 
00157 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
00158 {
00159     return feed(data.data(), data.size());
00160 }
00161 
00162 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
00163 {
00164     if (!d->prober)
00165         return d->proberState;
00166     if (d->proberState == Probing) {
00167         if (d->mStart) {
00168             d->unicodeTest(data, len);
00169             if (d->proberState == FoundIt)
00170                 return d->proberState;
00171         }
00172         d->prober->HandleData(data, len);
00173         switch (d->prober->GetState())
00174         {
00175             case kencodingprober::eNotMe:
00176                 d->proberState = NotMe;
00177                 break;
00178             case kencodingprober::eFoundIt:
00179                 d->proberState = FoundIt;
00180                 break;
00181             default:
00182                 d->proberState = Probing;
00183                 break;
00184         }
00185     }
00186 #ifdef DEBUG_PROBE
00187     d->prober->DumpStatus();
00188 #endif
00189     return d->proberState;
00190 }
00191 
00192 KEncodingProber::ProberState KEncodingProber::state() const
00193 {
00194     return d->proberState;
00195 }
00196 
00197 //DEPRECATED, do *not* use
00198 #ifndef KDE_NO_DEPRECATED
00199 const char* KEncodingProber::encodingName() const
00200 {
00201     return qstrdup(encoding().constData());
00202 }
00203 #endif
00204 
00205 QByteArray KEncodingProber::encoding() const
00206 {
00207     if (!d->prober)
00208         return QByteArray("UTF-8");
00209 
00210     return QByteArray(d->prober->GetCharSetName());
00211 }
00212 
00213 float KEncodingProber::confidence() const
00214 {
00215     if (!d->prober)
00216         return 0.0;
00217 
00218     return d->prober->GetConfidence();
00219 }
00220 
00221 KEncodingProber::ProberType KEncodingProber::proberType() const
00222 {
00223     return d->proberType;
00224 }
00225 
00226 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
00227 {
00228     d->setProberType(proberType);
00229     reset();
00230 }
00231 
00232 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
00233 {
00234     if (lang.isEmpty())
00235         return KEncodingProber::Universal;
00236     else if (lang==i18nc("@item Text character set", "Disabled"))
00237         return KEncodingProber::None;
00238     else if (lang==i18nc("@item Text character set", "Universal"))
00239         return KEncodingProber::Universal;
00240     else if (lang==i18nc("@item Text character set", "Unicode"))
00241         return KEncodingProber::Unicode;
00242     else if (lang==i18nc("@item Text character set", "Cyrillic"))
00243         return KEncodingProber::Cyrillic;
00244     else if (lang==i18nc("@item Text character set", "Western European"))
00245         return KEncodingProber::WesternEuropean;
00246     else if (lang==i18nc("@item Text character set", "Central European"))
00247         return KEncodingProber::CentralEuropean;
00248     else if (lang==i18nc("@item Text character set", "Greek"))
00249         return KEncodingProber::Greek;
00250     else if (lang==i18nc("@item Text character set", "Hebrew"))
00251         return KEncodingProber::Hebrew;
00252     else if (lang==i18nc("@item Text character set", "Turkish"))
00253         return KEncodingProber::Turkish;
00254     else if (lang==i18nc("@item Text character set", "Japanese"))
00255         return KEncodingProber::Japanese;
00256     else if (lang==i18nc("@item Text character set", "Baltic"))
00257         return KEncodingProber::Baltic;
00258     else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
00259         return KEncodingProber::ChineseTraditional;
00260     else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
00261         return KEncodingProber::ChineseSimplified;
00262     else if (lang==i18nc("@item Text character set", "Arabic"))
00263         return KEncodingProber::Arabic;
00264 
00265     return KEncodingProber::Universal;
00266 }
00267 
00268 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
00269 {
00270     switch (proberType)
00271     {
00272         case KEncodingProber::None:
00273             return i18nc("@item Text character set", "Disabled");
00274             break;
00275         case KEncodingProber::Universal:
00276             return i18nc("@item Text character set", "Universal");
00277             break;
00278         case KEncodingProber::Arabic:
00279             return i18nc("@item Text character set", "Arabic");
00280             break;
00281         case KEncodingProber::Baltic:
00282             return i18nc("@item Text character set", "Baltic");
00283             break;
00284         case KEncodingProber::CentralEuropean:
00285             return i18nc("@item Text character set", "Central European");
00286             break;
00287         case KEncodingProber::Cyrillic:
00288             return i18nc("@item Text character set", "Cyrillic");
00289             break;
00290         case KEncodingProber::Greek:
00291             return i18nc("@item Text character set", "Greek");
00292             break;
00293         case KEncodingProber::Hebrew:
00294             return i18nc("@item Text character set", "Hebrew");
00295             break;
00296         case KEncodingProber::Japanese:
00297             return i18nc("@item Text character set", "Japanese");
00298             break;
00299         case KEncodingProber::Turkish:
00300             return i18nc("@item Text character set", "Turkish");
00301             break;
00302         case KEncodingProber::WesternEuropean:
00303             return i18nc("@item Text character set", "Western European");
00304             break;
00305         case KEncodingProber::ChineseTraditional:
00306             return i18nc("@item Text character set", "Chinese Traditional");
00307             break;
00308         case KEncodingProber::ChineseSimplified:
00309             return i18nc("@item Text character set", "Chinese Simplified");
00310             break;
00311         case KEncodingProber::Korean:
00312             return i18nc("@item Text character set", "Korean");
00313             break;
00314         case KEncodingProber::Thai:
00315             return i18nc("@item Text character set", "Thai");
00316             break;
00317         case KEncodingProber::Unicode:
00318             return i18nc("@item Text character set", "Unicode");
00319             break;
00320         default:
00321             return QString();
00322         }
00323 }

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal