KDECore
CharDistribution.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #ifndef CharDistribution_h__ 00027 #define CharDistribution_h__ 00028 00029 #include "kdemacros.h" 00030 00031 #define ENOUGH_DATA_THRESHOLD 256 00032 00033 namespace kencodingprober { 00034 class KDE_NO_EXPORT CharDistributionAnalysis 00035 { 00036 public: 00037 CharDistributionAnalysis() {Reset();}; 00038 virtual ~CharDistributionAnalysis() {}; 00039 00040 //feed a block of data and do distribution analysis 00041 void HandleData(const char* /* aBuf */, unsigned int /* aLen */) {}; 00042 00043 //Feed a character with known length 00044 void HandleOneChar(const char* aStr, unsigned int aCharLen) 00045 { 00046 int order; 00047 00048 //we only care about 2-bytes character in our distribution analysis 00049 order = (aCharLen == 2) ? GetOrder(aStr) : -1; 00050 00051 if (order >= 0) 00052 { 00053 mTotalChars++; 00054 //order is valid 00055 if ((unsigned int)order < mTableSize) 00056 { 00057 if (512 > mCharToFreqOrder[order]) 00058 mFreqChars++; 00059 } 00060 } 00061 }; 00062 00063 //return confidence base on existing data 00064 float GetConfidence(); 00065 00066 //Reset analyser, clear any state 00067 void Reset(void) 00068 { 00069 mDone = false; 00070 mTotalChars = 0; 00071 mFreqChars = 0; 00072 }; 00073 00074 //This function is for future extension. Caller can use this function to control 00075 //analyser's behavior 00076 void SetOpion(){}; 00077 00078 //It is not necessary to receive all data to draw conclusion. For charset detection, 00079 // certain amount of data is enough 00080 bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}; 00081 00082 protected: 00083 //we do not handle character base on its original encoding string, but 00084 //convert this encoding string to a number, here called order. 00085 //This allow multiple encoding of a language to share one frequency table 00086 virtual int GetOrder(const char* /* str */) {return -1;}; 00087 00088 //If this flag is set to true, detection is done and conclusion has been made 00089 bool mDone; 00090 00091 //The number of characters whose frequency order is less than 512 00092 unsigned int mFreqChars; 00093 00094 //Total character encounted. 00095 unsigned int mTotalChars; 00096 00097 //Mapping table to get frequency order from char order (get from GetOrder()) 00098 const short *mCharToFreqOrder; 00099 00100 //Size of above table 00101 unsigned int mTableSize; 00102 00103 //This is a constant value varies from language to language, it is used in 00104 //calculating confidence. See my paper for further detail. 00105 float mTypicalDistributionRatio; 00106 }; 00107 00108 00109 class KDE_NO_EXPORT EUCTWDistributionAnalysis: public CharDistributionAnalysis 00110 { 00111 public: 00112 EUCTWDistributionAnalysis(); 00113 protected: 00114 00115 //for euc-TW encoding, we are interested 00116 // first byte range: 0xc4 -- 0xfe 00117 // second byte range: 0xa1 -- 0xfe 00118 //no validation needed here. State machine has done that 00119 int GetOrder(const char* str) 00120 { if ((unsigned char)*str >= (unsigned char)0xc4) 00121 return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; 00122 else 00123 return -1; 00124 }; 00125 }; 00126 00127 00128 class KDE_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis 00129 { 00130 public: 00131 EUCKRDistributionAnalysis(); 00132 protected: 00133 //for euc-KR encoding, we are interested 00134 // first byte range: 0xb0 -- 0xfe 00135 // second byte range: 0xa1 -- 0xfe 00136 //no validation needed here. State machine has done that 00137 int GetOrder(const char* str) 00138 { if ((unsigned char)*str >= (unsigned char)0xb0) 00139 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 00140 else 00141 return -1; 00142 }; 00143 }; 00144 00145 class KDE_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis 00146 { 00147 public: 00148 GB2312DistributionAnalysis(); 00149 protected: 00150 //for GB2312 encoding, we are interested 00151 // first byte range: 0xb0 -- 0xfe 00152 // second byte range: 0xa1 -- 0xfe 00153 //no validation needed here. State machine has done that 00154 int GetOrder(const char* str) 00155 { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) 00156 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 00157 else 00158 return -1; 00159 }; 00160 }; 00161 00162 00163 class KDE_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis 00164 { 00165 public: 00166 Big5DistributionAnalysis(); 00167 protected: 00168 //for big5 encoding, we are interested 00169 // first byte range: 0xa4 -- 0xfe 00170 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 00171 //no validation needed here. State machine has done that 00172 int GetOrder(const char* str) 00173 { if ((unsigned char)*str >= (unsigned char)0xa4) 00174 if ((unsigned char)str[1] >= (unsigned char)0xa1) 00175 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; 00176 else 00177 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 00178 else 00179 return -1; 00180 }; 00181 }; 00182 00183 class KDE_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis 00184 { 00185 public: 00186 SJISDistributionAnalysis(); 00187 protected: 00188 //for sjis encoding, we are interested 00189 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 00190 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 00191 //no validation needed here. State machine has done that 00192 int GetOrder(const char* str) 00193 { 00194 int order; 00195 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) 00196 order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); 00197 else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) 00198 order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); 00199 else 00200 return -1; 00201 order += (unsigned char)*(str+1) - 0x40; 00202 if ((unsigned char)str[1] > (unsigned char)0x7f) 00203 order--; 00204 return order; 00205 }; 00206 }; 00207 00208 class KDE_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis 00209 { 00210 public: 00211 EUCJPDistributionAnalysis(); 00212 protected: 00213 //for euc-JP encoding, we are interested 00214 // first byte range: 0xa0 -- 0xfe 00215 // second byte range: 0xa1 -- 0xfe 00216 //no validation needed here. State machine has done that 00217 int GetOrder(const char* str) 00218 { if ((unsigned char)*str >= (unsigned char)0xa0) 00219 return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 00220 else 00221 return -1; 00222 }; 00223 }; 00224 } 00225 #endif //CharDistribution_h__ 00226
KDE 4.6 API Reference