KHTML
khtml_filter.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE project 00002 00003 Copyright (C) 2005 Ivor Hewitt <ivor@kde.org> 00004 Copyright (C) 2008 Maksim Orlovich <maksim@kde.org> 00005 Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com> 00006 00007 This library is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU Library General Public 00009 License as published by the Free Software Foundation; either 00010 version 2 of the License, or (at your option) any later version. 00011 00012 This library is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 Library General Public License for more details. 00016 00017 You should have received a copy of the GNU Library General Public License 00018 along with this library; see the file COPYING.LIB. If not, write to 00019 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00020 Boston, MA 02110-1301, USA. 00021 */ 00022 00023 #include "khtml_filter_p.h" 00024 #include <QDebug> 00025 00026 // rolling hash parameters 00027 #define HASH_P (1997) 00028 #define HASH_Q (17509) 00029 // HASH_MOD = (HASH_P^7) % HASH_Q 00030 #define HASH_MOD (523) 00031 00032 namespace khtml { 00033 00034 // We only want a subset of features of wildcards -- just the 00035 // star, so we escape the rest before passing to QRegExp. 00036 // The \ is escaped due to a QRegExp bug. 00037 // ### we really should rather parse it ourselves, in order to 00038 // handle adblock-special things like | and ^ properly. 00039 static QRegExp fromAdBlockWildcard(const QString& wcStr) { 00040 QRegExp rx; 00041 rx.setPatternSyntax(QRegExp::Wildcard); 00042 00043 QString out; 00044 for (int p = 0; p < wcStr.length(); ++p) { 00045 QChar c = wcStr[p]; 00046 if (c == QLatin1Char('?')) 00047 out += QLatin1String("[?]"); 00048 else if (c == QLatin1Char('[')) 00049 out += QLatin1String("[[]"); 00050 else if (c == QLatin1Char('\\')) 00051 out += QLatin1String("[\\]"); 00052 else 00053 out += c; 00054 } 00055 00056 rx.setPattern(out); 00057 return rx; 00058 } 00059 00060 void FilterSet::addFilter(const QString& filterStr) 00061 { 00062 QString filter = filterStr; 00063 00065 QChar firstChar = filter.at(0); 00066 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) 00067 return; 00068 00069 // Strip leading @@ 00070 int first = 0; 00071 int last = filter.length() - 1; 00072 if (filter.startsWith(QLatin1String("@@"))) 00073 first = 2; 00074 00075 // Strip options, we ignore them for now. 00076 int dollar = filter.lastIndexOf(QLatin1Char('$')); 00077 if (dollar != -1) { 00078 last = dollar - 1; 00079 // If only "*" is left after ignoring the options, disregard the rule. 00080 if (first == last && firstChar == QLatin1Char('*')) 00081 return; 00082 } 00083 00084 // Perhaps nothing left? 00085 if (first > last) 00086 return; 00087 00088 filter = filter.mid(first, last - first + 1); 00089 00090 // Is it a regexp filter? 00091 if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) 00092 { 00093 QString inside = filter.mid(1, filter.length()-2); 00094 QRegExp rx(inside); 00095 reFilters.append(rx); 00096 // qDebug() << "R:" << inside; 00097 } 00098 else 00099 { 00100 // Nope, a wildcard one. 00101 // Note: For these, we also need to handle |. 00102 00103 // Strip wildcards at the ends 00104 first = 0; 00105 last = filter.length() - 1; 00106 00107 while (first < filter.length() && filter[first] == QLatin1Char('*')) 00108 ++first; 00109 00110 while (last >= 0 && filter[last] == QLatin1Char('*')) 00111 --last; 00112 00113 if (first > last) 00114 filter = QLatin1String("*"); // erm... Well, they asked for it. 00115 else 00116 filter = filter.mid(first, last - first + 1); 00117 00118 // Now, do we still have any wildcard stuff left? 00119 if (filter.contains("*")) 00120 { 00121 // check if we can use RK first (and then check full RE for the rest) for better performance 00122 int aPos = filter.indexOf('*'); 00123 if (aPos < 0) 00124 aPos = filter.length(); 00125 if (aPos > 7) { 00126 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*')); 00127 // We pad the final r.e. with * so we can check for an exact match 00128 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx); 00129 } else { 00130 QRegExp rx = fromAdBlockWildcard(filter); 00131 reFilters.append(rx); 00132 } 00133 } 00134 else 00135 { 00136 // Fast path 00137 stringFiltersMatcher.addString(filter); 00138 } 00139 } 00140 } 00141 00142 bool FilterSet::isUrlMatched(const QString& url) 00143 { 00144 if (stringFiltersMatcher.isMatched(url)) 00145 return true; 00146 00147 for (int c = 0; c < reFilters.size(); ++c) 00148 { 00149 if (url.contains(reFilters[c])) 00150 return true; 00151 } 00152 00153 return false; 00154 } 00155 00156 QString FilterSet::urlMatchedBy(const QString& url) 00157 { 00158 QString by; 00159 00160 if (stringFiltersMatcher.isMatched(url, &by)) 00161 return by; 00162 00163 for (int c = 0; c < reFilters.size(); ++c) 00164 { 00165 if (url.contains(reFilters[c])) 00166 { 00167 by = reFilters[c].pattern(); 00168 break; 00169 } 00170 } 00171 00172 return by; 00173 } 00174 00175 void FilterSet::clear() 00176 { 00177 reFilters.clear(); 00178 stringFiltersMatcher.clear(); 00179 } 00180 00181 00182 void StringsMatcher::addString(const QString& pattern) 00183 { 00184 if (pattern.length() < 8) { 00185 // handle short string differently 00186 shortStringFilters.append(pattern); 00187 } else { 00188 // use modified Rabin-Karp's algorithm with 8-length string hash 00189 // i.e. store hash of first 8 chars in the HashMap for fast look-up 00190 stringFilters.append(pattern); 00191 int ind = stringFilters.size() - 1; 00192 int current = 0; 00193 00194 // compute hash using rolling hash 00195 // hash for string: x0,x1,x2...xn-1 will be: 00196 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q 00197 // where p and q some wisely-chosen integers 00198 /*for (int k = 0; k < 8; ++k)*/ 00199 int len = pattern.length(); 00200 for (int k = len - 8; k < len; ++k) 00201 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q; 00202 00203 // insert computed hash value into HashMap 00204 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 00205 if (it == stringFiltersHash.end()) { 00206 QVector<int> list; 00207 list.append(ind); 00208 stringFiltersHash.add(current + 1, list); 00209 fastLookUp.setBit(current); 00210 } else { 00211 it->second.append(ind); 00212 } 00213 } 00214 } 00215 00216 void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx) 00217 { 00218 rePrefixes.append(prefix); 00219 reFilters.append(rx); 00220 int index = -rePrefixes.size(); 00221 00222 int current = 0; 00223 for (int k = 0; k < 8; ++k) 00224 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q; 00225 00226 // insert computed hash value into HashMap 00227 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 00228 if (it == stringFiltersHash.end()) { 00229 QVector<int> list; 00230 list.append(index); 00231 stringFiltersHash.add(current + 1, list); 00232 fastLookUp.setBit(current); 00233 } else { 00234 it->second.append(index); 00235 } 00236 } 00237 00238 bool StringsMatcher::isMatched(const QString& str, QString *by) const 00239 { 00240 // check short strings first 00241 for (int i = 0; i < shortStringFilters.size(); ++i) { 00242 if (str.contains(shortStringFilters[i])) 00243 { 00244 if (by != 0) *by = shortStringFilters[i]; 00245 return true; 00246 } 00247 } 00248 00249 int len = str.length(); 00250 int k; 00251 00252 int current = 0; 00253 int next = 0; 00254 // compute hash for first 8 characters 00255 for (k = 0; k < 8 && k < len; ++k) 00256 current = (current * HASH_P + str[k].unicode()) % HASH_Q; 00257 00258 WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end(); 00259 // main Rabin-Karp's algorithm loop 00260 for (k = 7; k < len; ++k, current = next) { 00261 // roll the hash if not at the end 00262 // (calculate hash for the next iteration) 00263 if (k + 1 < len) 00264 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q; 00265 00266 if (!fastLookUp.testBit(current)) 00267 continue; 00268 00269 // look-up the hash in the HashMap and check all strings 00270 WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1); 00271 00272 // check possible strings 00273 if (it != hashEnd) { 00274 for (int j = 0; j < it->second.size(); ++j) { 00275 int index = it->second[j]; 00276 // check if we got simple string or REs prefix 00277 if (index >= 0) { 00278 int flen = stringFilters[index].length(); 00279 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen)) 00280 { 00281 if (by != 0) *by = stringFilters[index]; 00282 return true; 00283 } 00284 } else { 00285 index = -index - 1; 00286 int flen = rePrefixes[index].length(); 00287 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) 00288 { 00289 int remStart = k - 7 + flen; 00290 QString remainder = QString::fromRawData(str.unicode() + remStart, 00291 str.length() - remStart); 00292 if (reFilters[index].exactMatch(remainder)) { 00293 if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern(); 00294 return true; 00295 } 00296 } 00297 } 00298 } 00299 } 00300 } 00301 00302 return false; 00303 } 00304 00305 void StringsMatcher::clear() 00306 { 00307 stringFilters.clear(); 00308 shortStringFilters.clear(); 00309 reFilters.clear(); 00310 rePrefixes.clear(); 00311 stringFiltersHash.clear(); 00312 fastLookUp.resize(HASH_Q); 00313 fastLookUp.fill(0, 0, HASH_Q); 00314 } 00315 00316 } 00317 00318 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;
KDE 4.6 API Reference