KDECore
nsUniversalDetector.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * Copyright (C) 2008 <wkai@gmail.com> 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "nsUniversalDetector.h" 00027 00028 #include "nsMBCSGroupProber.h" 00029 #include "nsSBCSGroupProber.h" 00030 #include "nsEscCharsetProber.h" 00031 #include "nsLatin1Prober.h" 00032 00033 namespace kencodingprober { 00034 nsUniversalDetector::nsUniversalDetector() 00035 { 00036 mDone = false; 00037 mBestGuess = -1; //illegal value as signal 00038 mInTag = false; 00039 mEscCharSetProber = 0; 00040 00041 mStart = true; 00042 mDetectedCharset = 0; 00043 mGotData = false; 00044 mInputState = ePureAscii; 00045 mLastChar = '\0'; 00046 00047 unsigned int i; 00048 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00049 mCharSetProbers[i] = 0; 00050 } 00051 00052 nsUniversalDetector::~nsUniversalDetector() 00053 { 00054 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00055 delete mCharSetProbers[i]; 00056 delete mEscCharSetProber; 00057 } 00058 00059 void 00060 nsUniversalDetector::Reset() 00061 { 00062 mDone = false; 00063 mBestGuess = -1; //illegal value as signal 00064 mInTag = false; 00065 00066 mStart = true; 00067 mDetectedCharset = 0; 00068 mGotData = false; 00069 mInputState = ePureAscii; 00070 mLastChar = '\0'; 00071 00072 if (mEscCharSetProber) 00073 mEscCharSetProber->Reset(); 00074 00075 unsigned int i; 00076 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00077 if (mCharSetProbers[i]) 00078 mCharSetProbers[i]->Reset(); 00079 } 00080 00081 //--------------------------------------------------------------------- 00082 #define SHORTCUT_THRESHOLD (float)0.95 00083 #define MINIMUM_THRESHOLD (float)0.20 00084 00085 nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen) 00086 { 00087 if(mDone) 00088 return eFoundIt; 00089 00090 if (aLen > 0) 00091 mGotData = true; 00092 00093 unsigned int i; 00094 for (i = 0; i < aLen; i++) 00095 { 00096 //other than 0xa0, if every othe character is ascii, the page is ascii 00097 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 00098 { 00099 //we got a non-ascii byte (high-byte) 00100 if (mInputState != eHighbyte) 00101 { 00102 //adjust state 00103 mInputState = eHighbyte; 00104 00105 //kill mEscCharSetProber if it is active 00106 delete mEscCharSetProber; 00107 mEscCharSetProber = 0; 00108 00109 //start multibyte and singlebyte charset prober 00110 if (0 == mCharSetProbers[0]) 00111 mCharSetProbers[0] = new nsMBCSGroupProber; 00112 if (0 == mCharSetProbers[1]) 00113 mCharSetProbers[1] = new nsSBCSGroupProber; 00114 if (0 == mCharSetProbers[2]) 00115 mCharSetProbers[2] = new nsLatin1Prober; 00116 } 00117 } 00118 else 00119 { 00120 //ok, just pure ascii so far 00121 if ( ePureAscii == mInputState && 00122 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 00123 { 00124 //found escape character or HZ "~{" 00125 mInputState = eEscAscii; 00126 } 00127 00128 mLastChar = aBuf[i]; 00129 } 00130 } 00131 00132 nsProbingState st = eDetecting; 00133 switch (mInputState) 00134 { 00135 case eEscAscii: 00136 if (0 == mEscCharSetProber) { 00137 mEscCharSetProber = new nsEscCharSetProber; 00138 } 00139 st = mEscCharSetProber->HandleData(aBuf, aLen); 00140 if (st == eFoundIt) 00141 { 00142 mDone = true; 00143 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 00144 } 00145 break; 00146 case eHighbyte: 00147 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) 00148 { 00149 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 00150 if (st == eFoundIt) 00151 { 00152 mDone = true; 00153 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 00154 } 00155 } 00156 break; 00157 00158 default: //pure ascii 00159 mDetectedCharset = "UTF-8"; 00160 } 00161 return st; 00162 } 00163 00164 00165 //--------------------------------------------------------------------- 00166 const char* nsUniversalDetector::GetCharSetName() 00167 { 00168 if (mDetectedCharset) 00169 return mDetectedCharset; 00170 switch (mInputState) 00171 { 00172 case eHighbyte: 00173 { 00174 float proberConfidence; 00175 float maxProberConfidence = (float)0.0; 00176 int maxProber = 0; 00177 00178 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00179 { 00180 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00181 if (proberConfidence > maxProberConfidence) 00182 { 00183 maxProberConfidence = proberConfidence; 00184 maxProber = i; 00185 } 00186 } 00187 //do not report anything because we are not confident of it, that's in fact a negative answer 00188 if (maxProberConfidence > MINIMUM_THRESHOLD) 00189 return mCharSetProbers[maxProber]->GetCharSetName(); 00190 } 00191 case eEscAscii: 00192 break; 00193 default: // pure ascii 00194 ; 00195 } 00196 return "UTF-8"; 00197 00198 } 00199 00200 //--------------------------------------------------------------------- 00201 float nsUniversalDetector::GetConfidence() 00202 { 00203 if (!mGotData) 00204 { 00205 // we haven't got any data yet, return immediately 00206 // caller program sometimes call DataEnd before anything has been sent to detector 00207 return MINIMUM_THRESHOLD; 00208 } 00209 if (mDetectedCharset) 00210 return 0.99f; 00211 switch (mInputState) 00212 { 00213 case eHighbyte: 00214 { 00215 float proberConfidence; 00216 float maxProberConfidence = (float)0.0; 00217 int maxProber = 0; 00218 00219 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00220 { 00221 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00222 if (proberConfidence > maxProberConfidence) 00223 { 00224 maxProberConfidence = proberConfidence; 00225 maxProber = i; 00226 } 00227 } 00228 //do not report anything because we are not confident of it, that's in fact a negative answer 00229 if (maxProberConfidence > MINIMUM_THRESHOLD) 00230 return mCharSetProbers[maxProber]->GetConfidence(); 00231 } 00232 case eEscAscii: 00233 break; 00234 default: // pure ascii 00235 ; 00236 } 00237 return MINIMUM_THRESHOLD; 00238 } 00239 00240 nsProbingState nsUniversalDetector::GetState() 00241 { 00242 if (mDone) 00243 return eFoundIt; 00244 else 00245 return eDetecting; 00246 } 00247 } 00248 00249
KDE 4.6 API Reference