• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KHTML

khtml_filter.cpp

Go to the documentation of this file.
00001 /* This file is part of the KDE project
00002 
00003    Copyright (C) 2005 Ivor Hewitt     <ivor@kde.org>
00004    Copyright (C) 2008 Maksim Orlovich <maksim@kde.org>
00005    Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Library General Public
00009    License as published by the Free Software Foundation; either
00010    version 2 of the License, or (at your option) any later version.
00011 
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Library General Public License for more details.
00016 
00017    You should have received a copy of the GNU Library General Public License
00018    along with this library; see the file COPYING.LIB.  If not, write to
00019    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00020    Boston, MA 02110-1301, USA.
00021 */
00022 
00023 #include "khtml_filter_p.h"
00024 #include <QDebug>
00025 
00026 // rolling hash parameters
00027 #define HASH_P (1997)
00028 #define HASH_Q (17509)
00029 // HASH_MOD = (HASH_P^7) % HASH_Q
00030 #define HASH_MOD (523)
00031 
00032 namespace khtml {
00033 
00034 // We only want a subset of features of wildcards -- just the 
00035 // star, so we escape the rest before passing to QRegExp. 
00036 // The \ is escaped due to a QRegExp bug.
00037 // ### we really should rather parse it ourselves, in order to 
00038 // handle adblock-special things like | and ^ properly.
00039 static QRegExp fromAdBlockWildcard(const QString& wcStr) {
00040     QRegExp rx;
00041     rx.setPatternSyntax(QRegExp::Wildcard);
00042 
00043     QString out;
00044     for (int p = 0; p < wcStr.length(); ++p) {
00045         QChar c = wcStr[p];
00046         if (c == QLatin1Char('?'))
00047             out += QLatin1String("[?]");
00048         else if (c == QLatin1Char('['))
00049             out += QLatin1String("[[]");
00050         else if (c == QLatin1Char('\\'))
00051             out += QLatin1String("[\\]");
00052         else
00053             out += c;
00054     }
00055     
00056     rx.setPattern(out);
00057     return rx;
00058 }
00059 
00060 void FilterSet::addFilter(const QString& filterStr)
00061 {
00062     QString filter = filterStr;
00063 
00065     QChar firstChar = filter.at(0);
00066     if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#')))
00067         return;
00068 
00069     // Strip leading @@
00070     int first = 0;
00071     int last  = filter.length() - 1;
00072     if (filter.startsWith(QLatin1String("@@")))
00073         first = 2;
00074 
00075     // Strip options, we ignore them for now.
00076     int dollar = filter.lastIndexOf(QLatin1Char('$'));
00077     if (dollar != -1) {
00078         last = dollar - 1;
00079         // If only "*" is left after ignoring the options, disregard the rule.
00080         if (first == last && firstChar == QLatin1Char('*'))
00081             return;
00082     }
00083 
00084     // Perhaps nothing left?
00085     if (first > last)
00086         return;
00087 
00088     filter = filter.mid(first, last - first + 1);
00089 
00090     // Is it a regexp filter?
00091     if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/')))
00092     {
00093         QString inside = filter.mid(1, filter.length()-2);
00094         QRegExp rx(inside);
00095         reFilters.append(rx);
00096 //         qDebug() << "R:" << inside;
00097     }
00098     else
00099     {
00100         // Nope, a wildcard one.
00101         // Note: For these, we also need to handle |.
00102 
00103         // Strip wildcards at the ends
00104         first = 0;
00105         last  = filter.length() - 1;
00106 
00107         while (first < filter.length() && filter[first] == QLatin1Char('*'))
00108             ++first;
00109 
00110         while (last >= 0 && filter[last] == QLatin1Char('*'))
00111             --last;
00112 
00113         if (first > last)
00114             filter = QLatin1String("*"); // erm... Well, they asked for it.
00115         else
00116             filter = filter.mid(first, last - first + 1);
00117 
00118         // Now, do we still have any wildcard stuff left?
00119         if (filter.contains("*"))
00120         {
00121             // check if we can use RK first (and then check full RE for the rest) for better performance
00122             int aPos = filter.indexOf('*');
00123             if (aPos < 0)
00124                 aPos = filter.length();
00125             if (aPos > 7) {
00126                 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
00127                 // We pad the final r.e. with * so we can check for an exact match
00128                 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
00129             } else {
00130                 QRegExp rx = fromAdBlockWildcard(filter);
00131                 reFilters.append(rx);
00132             }
00133         }
00134         else
00135         {
00136             // Fast path
00137             stringFiltersMatcher.addString(filter);
00138         }
00139     }
00140 }
00141 
00142 bool FilterSet::isUrlMatched(const QString& url)
00143 {
00144     if (stringFiltersMatcher.isMatched(url))
00145         return true;
00146 
00147     for (int c = 0; c < reFilters.size(); ++c)
00148     {
00149         if (url.contains(reFilters[c]))
00150             return true;
00151     }
00152 
00153     return false;
00154 }
00155 
00156 QString FilterSet::urlMatchedBy(const QString& url)
00157 {
00158     QString by;
00159 
00160     if (stringFiltersMatcher.isMatched(url, &by))
00161         return by;
00162 
00163     for (int c = 0; c < reFilters.size(); ++c)
00164     {
00165         if (url.contains(reFilters[c]))
00166         {
00167             by = reFilters[c].pattern();
00168             break;
00169         }
00170     }
00171 
00172     return by;
00173 }
00174 
00175 void FilterSet::clear()
00176 {
00177     reFilters.clear();
00178     stringFiltersMatcher.clear();
00179 }
00180 
00181 
00182 void StringsMatcher::addString(const QString& pattern)
00183 {
00184     if (pattern.length() < 8) {
00185         // handle short string differently
00186         shortStringFilters.append(pattern);
00187     } else {
00188         // use modified Rabin-Karp's algorithm with 8-length string hash
00189         // i.e. store hash of first 8 chars in the HashMap for fast look-up
00190         stringFilters.append(pattern);
00191         int ind = stringFilters.size() - 1;
00192         int current = 0;
00193 
00194         // compute hash using rolling hash
00195         // hash for string: x0,x1,x2...xn-1 will be:
00196         // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
00197         // where p and q some wisely-chosen integers
00198         /*for (int k = 0; k < 8; ++k)*/
00199         int len = pattern.length();
00200         for (int k = len - 8; k < len; ++k)
00201             current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
00202 
00203         // insert computed hash value into HashMap
00204         WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
00205         if (it == stringFiltersHash.end()) {
00206             QVector<int> list;
00207             list.append(ind);
00208             stringFiltersHash.add(current + 1, list);
00209             fastLookUp.setBit(current);
00210         } else {
00211             it->second.append(ind);
00212         }
00213     }
00214 }
00215 
00216 void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx)
00217 {
00218     rePrefixes.append(prefix);
00219     reFilters.append(rx);
00220     int index = -rePrefixes.size();
00221 
00222     int current = 0;
00223     for (int k = 0; k < 8; ++k)
00224         current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
00225 
00226     // insert computed hash value into HashMap
00227     WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
00228     if (it == stringFiltersHash.end()) {
00229         QVector<int> list;
00230         list.append(index);
00231         stringFiltersHash.add(current + 1, list);
00232         fastLookUp.setBit(current);
00233     } else {
00234         it->second.append(index);
00235     }
00236 }
00237 
00238 bool StringsMatcher::isMatched(const QString& str, QString *by) const
00239 {
00240     // check short strings first
00241     for (int i = 0; i < shortStringFilters.size(); ++i) {
00242         if (str.contains(shortStringFilters[i]))
00243         {
00244             if (by != 0) *by = shortStringFilters[i];
00245             return true;
00246         }
00247     }
00248 
00249     int len = str.length();
00250     int k;
00251 
00252     int current = 0;
00253     int next = 0;
00254     // compute hash for first 8 characters
00255     for (k = 0; k < 8 && k < len; ++k)
00256         current = (current * HASH_P + str[k].unicode()) % HASH_Q;
00257 
00258     WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
00259     // main Rabin-Karp's algorithm loop
00260     for (k = 7; k < len; ++k, current = next) {
00261         // roll the hash if not at the end
00262         // (calculate hash for the next iteration)
00263         if (k + 1 < len)
00264             next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
00265 
00266         if (!fastLookUp.testBit(current))
00267             continue;
00268 
00269         // look-up the hash in the HashMap and check all strings
00270         WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
00271 
00272         // check possible strings
00273         if (it != hashEnd) {
00274             for (int j = 0; j < it->second.size(); ++j) {
00275                 int index = it->second[j];
00276                 // check if we got simple string or REs prefix
00277                 if (index >= 0) {
00278                     int flen = stringFilters[index].length();
00279                     if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen))
00280                     {
00281                         if (by != 0) *by = stringFilters[index];
00282                         return true;
00283                     }
00284                 } else {
00285                     index = -index - 1;
00286                     int flen = rePrefixes[index].length();
00287                     if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen))
00288                     {
00289                         int remStart = k - 7 + flen;
00290                         QString remainder = QString::fromRawData(str.unicode() + remStart,
00291                                                                  str.length() - remStart);
00292                         if (reFilters[index].exactMatch(remainder)) {
00293                             if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern();
00294                             return true;
00295                         }
00296                     }
00297                 }
00298             }
00299         }
00300     }
00301 
00302     return false;
00303 }
00304 
00305 void StringsMatcher::clear()
00306 {
00307     stringFilters.clear();
00308     shortStringFilters.clear();
00309     reFilters.clear();
00310     rePrefixes.clear();
00311     stringFiltersHash.clear();
00312     fastLookUp.resize(HASH_Q);
00313     fastLookUp.fill(0, 0, HASH_Q);
00314 }
00315 
00316 }
00317 
00318 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal