KDECore
nsLatin1Prober.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "nsLatin1Prober.h" 00027 #include <stdio.h> 00028 #include <stdlib.h> 00029 00030 #define UDF 0 // undefined 00031 #define OTH 1 //other 00032 #define ASC 2 // ascii capital letter 00033 #define ASS 3 // ascii small letter 00034 #define ACV 4 // accent capital vowel 00035 #define ACO 5 // accent capital other 00036 #define ASV 6 // accent small vowel 00037 #define ASO 7 // accent small other 00038 #define CLASS_NUM 8 // total classes 00039 00040 namespace kencodingprober { 00041 static unsigned char Latin1_CharToClass[] = 00042 { 00043 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 00044 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 00045 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 00046 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 00047 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 00048 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 00049 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 00050 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 00051 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 00052 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 00053 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 00054 ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 00055 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 00056 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 00057 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 00058 ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 00059 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 00060 OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 00061 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 00062 OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 00063 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 00064 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 00065 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 00066 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 00067 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 00068 ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 00069 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 00070 ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 00071 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 00072 ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 00073 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 00074 ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 00075 }; 00076 00077 00078 /* 0 : illegal 00079 1 : very unlikely 00080 2 : normal 00081 3 : very likely 00082 */ 00083 static unsigned char Latin1ClassModel[] = 00084 { 00085 /* UDF OTH ASC ASS ACV ACO ASV ASO */ 00086 /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 00087 /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 00088 /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 00089 /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 00090 /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 00091 /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 00092 /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 00093 /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 00094 }; 00095 00096 void nsLatin1Prober::Reset(void) 00097 { 00098 mState = eDetecting; 00099 mLastCharClass = OTH; 00100 for (int i = 0; i < FREQ_CAT_NUM; i++) 00101 mFreqCounter[i] = 0; 00102 } 00103 00104 00105 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, unsigned int aLen) 00106 { 00107 char *newBuf1 = 0; 00108 unsigned int newLen1 = 0; 00109 00110 if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 00111 newBuf1 = (char*)aBuf; 00112 newLen1 = aLen; 00113 } 00114 00115 unsigned char charClass; 00116 unsigned char freq; 00117 for (unsigned int i = 0; i < newLen1; i++) 00118 { 00119 charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 00120 freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; 00121 if (freq == 0) { 00122 mState = eNotMe; 00123 break; 00124 } 00125 mFreqCounter[freq]++; 00126 mLastCharClass = charClass; 00127 } 00128 00129 if (newBuf1 != aBuf) 00130 free(newBuf1); 00131 00132 return mState; 00133 } 00134 00135 float nsLatin1Prober::GetConfidence(void) 00136 { 00137 if (mState == eNotMe) 00138 return 0.01f; 00139 00140 float confidence; 00141 unsigned int total = 0; 00142 for (int i = 0; i < FREQ_CAT_NUM; i++) 00143 total += mFreqCounter[i]; 00144 00145 if(!total) 00146 confidence = 0.0f; 00147 else 00148 { 00149 confidence = mFreqCounter[3]*1.0f / total; 00150 confidence -= mFreqCounter[1]*20.0f/total; 00151 } 00152 00153 if (confidence < 0.0f) 00154 confidence = 0.0f; 00155 00156 // lower the confidence of latin1 so that other more accurate detector 00157 // can take priority. 00158 confidence *= 0.50f; 00159 00160 return confidence; 00161 } 00162 00163 #ifdef DEBUG_PROBE 00164 void nsLatin1Prober::DumpStatus() 00165 { 00166 printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 00167 } 00168 #endif 00169 } 00170 00171
KDE 4.6 API Reference