• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KDECore

nsUniversalDetector.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003 *  Copyright (C) 1998 <developer@mozilla.org>
00004 *  Copyright (C) 2008 <wkai@gmail.com>
00005 *
00006 *  Permission is hereby granted, free of charge, to any person obtaining
00007 *  a copy of this software and associated documentation files (the
00008 *  "Software"), to deal in the Software without restriction, including
00009 *  without limitation the rights to use, copy, modify, merge, publish,
00010 *  distribute, sublicense, and/or sell copies of the Software, and to
00011 *  permit persons to whom the Software is furnished to do so, subject to
00012 *  the following conditions:
00013 *
00014 *  The above copyright notice and this permission notice shall be included 
00015 *  in all copies or substantial portions of the Software.
00016 *
00017 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024 */
00025 
00026 #include "nsUniversalDetector.h"
00027 
00028 #include "nsMBCSGroupProber.h"
00029 #include "nsSBCSGroupProber.h"
00030 #include "nsEscCharsetProber.h"
00031 #include "nsLatin1Prober.h"
00032 
00033 namespace kencodingprober {
00034 nsUniversalDetector::nsUniversalDetector()
00035 {
00036   mDone = false;
00037   mBestGuess = -1;   //illegal value as signal
00038   mInTag = false;
00039   mEscCharSetProber = 0;
00040 
00041   mStart = true;
00042   mDetectedCharset = 0;
00043   mGotData = false;
00044   mInputState = ePureAscii;
00045   mLastChar = '\0';
00046 
00047   unsigned int i;
00048   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00049     mCharSetProbers[i] = 0;
00050 }
00051 
00052 nsUniversalDetector::~nsUniversalDetector()
00053 {
00054   for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00055     delete mCharSetProbers[i];
00056   delete mEscCharSetProber;
00057 }
00058 
00059 void
00060 nsUniversalDetector::Reset()
00061 {
00062   mDone = false;
00063   mBestGuess = -1;   //illegal value as signal
00064   mInTag = false;
00065 
00066   mStart = true;
00067   mDetectedCharset = 0;
00068   mGotData = false;
00069   mInputState = ePureAscii;
00070   mLastChar = '\0';
00071 
00072   if (mEscCharSetProber)
00073     mEscCharSetProber->Reset();
00074 
00075   unsigned int i;
00076   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00077     if (mCharSetProbers[i])
00078       mCharSetProbers[i]->Reset();
00079 }
00080 
00081 //---------------------------------------------------------------------
// SHORTCUT_THRESHOLD is not referenced by the code visible in this file.
#define SHORTCUT_THRESHOLD     (static_cast<float>(0.95))
// A prober's confidence must exceed MINIMUM_THRESHOLD before its answer is
// reported; the same value is returned as the confidence when there is
// nothing better to report.
#define MINIMUM_THRESHOLD      (static_cast<float>(0.20))
00084 
00085 nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen)
00086 {
00087   if(mDone) 
00088     return eFoundIt;
00089 
00090   if (aLen > 0)
00091     mGotData = true;
00092 
00093   unsigned int i;
00094   for (i = 0; i < aLen; i++)
00095   {
00096     //other than 0xa0, if every othe character is ascii, the page is ascii
00097     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
00098     {
00099       //we got a non-ascii byte (high-byte)
00100       if (mInputState != eHighbyte)
00101       {
00102         //adjust state
00103         mInputState = eHighbyte;
00104 
00105         //kill mEscCharSetProber if it is active
00106         delete mEscCharSetProber;
00107         mEscCharSetProber = 0;
00108 
00109         //start multibyte and singlebyte charset prober
00110         if (0 == mCharSetProbers[0])
00111           mCharSetProbers[0] = new nsMBCSGroupProber;
00112         if (0 == mCharSetProbers[1])
00113           mCharSetProbers[1] = new nsSBCSGroupProber;
00114         if (0 == mCharSetProbers[2])
00115           mCharSetProbers[2] = new nsLatin1Prober; 
00116       }
00117     }
00118     else
00119     {
00120       //ok, just pure ascii so far
00121       if ( ePureAscii == mInputState &&
00122         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
00123       {
00124         //found escape character or HZ "~{"
00125         mInputState = eEscAscii;
00126       }
00127           
00128       mLastChar = aBuf[i];
00129     }
00130   }
00131 
00132   nsProbingState st = eDetecting;
00133   switch (mInputState)
00134   {
00135   case eEscAscii:
00136     if (0 == mEscCharSetProber) {
00137       mEscCharSetProber = new nsEscCharSetProber;
00138     }
00139     st = mEscCharSetProber->HandleData(aBuf, aLen);
00140     if (st == eFoundIt)
00141     {
00142       mDone = true;
00143       mDetectedCharset = mEscCharSetProber->GetCharSetName();
00144     }
00145     break;
00146   case eHighbyte:
00147     for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
00148     {
00149       st = mCharSetProbers[i]->HandleData(aBuf, aLen);
00150       if (st == eFoundIt) 
00151       {
00152         mDone = true;
00153         mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
00154       } 
00155     }
00156     break;
00157 
00158   default:  //pure ascii
00159     mDetectedCharset = "UTF-8";
00160   }
00161   return st;
00162 }
00163 
00164 
00165 //---------------------------------------------------------------------
00166 const char* nsUniversalDetector::GetCharSetName()
00167 {
00168   if (mDetectedCharset)
00169     return mDetectedCharset;
00170   switch (mInputState)
00171   {
00172   case eHighbyte:
00173     {
00174       float proberConfidence;
00175       float maxProberConfidence = (float)0.0;
00176       int maxProber = 0;
00177 
00178       for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00179       {
00180         proberConfidence = mCharSetProbers[i]->GetConfidence();
00181         if (proberConfidence > maxProberConfidence)
00182         {
00183           maxProberConfidence = proberConfidence;
00184           maxProber = i;
00185         }
00186       }
00187       //do not report anything because we are not confident of it, that's in fact a negative answer
00188       if (maxProberConfidence > MINIMUM_THRESHOLD)
00189         return mCharSetProbers[maxProber]->GetCharSetName();
00190     }
00191   case eEscAscii:
00192     break;
00193   default:           // pure ascii
00194       ;
00195   }
00196   return "UTF-8";
00197 
00198 }
00199 
00200 //---------------------------------------------------------------------
00201 float nsUniversalDetector::GetConfidence()
00202 {
00203   if (!mGotData)
00204   {
00205     // we haven't got any data yet, return immediately
00206     // caller program sometimes call DataEnd before anything has been sent to detector
00207     return MINIMUM_THRESHOLD;
00208   }
00209   if (mDetectedCharset)
00210     return 0.99f;
00211   switch (mInputState)
00212   {
00213   case eHighbyte:
00214     {
00215       float proberConfidence;
00216       float maxProberConfidence = (float)0.0;
00217       int maxProber = 0;
00218 
00219       for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00220       {
00221         proberConfidence = mCharSetProbers[i]->GetConfidence();
00222         if (proberConfidence > maxProberConfidence)
00223         {
00224           maxProberConfidence = proberConfidence;
00225           maxProber = i;
00226         }
00227       }
00228       //do not report anything because we are not confident of it, that's in fact a negative answer
00229       if (maxProberConfidence > MINIMUM_THRESHOLD)
00230         return mCharSetProbers[maxProber]->GetConfidence();
00231     }
00232   case eEscAscii:
00233     break;
00234   default:           // pure ascii
00235       ;
00236   }
00237   return MINIMUM_THRESHOLD;
00238 }
00239 
00240 nsProbingState nsUniversalDetector::GetState()
00241 {
00242     if (mDone)
00243         return eFoundIt;
00244     else
00245         return eDetecting;
00246 }
00247 }
00248 
00249 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal