• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KHTML

tokenizer.cpp

Go to the documentation of this file.
00001 /*
00002  * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
00003  *
00004  * Redistribution and use in source and binary forms, with or without
00005  * modification, are permitted provided that the following conditions
00006  * are met:
00007  *
00008  * 1. Redistributions of source code must retain the above copyright
00009  *    notice, this list of conditions and the following disclaimer.
00010  * 2. Redistributions in binary form must reproduce the above copyright
00011  *    notice, this list of conditions and the following disclaimer in the
00012  *    documentation and/or other materials provided with the distribution.
00013  *
00014  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00015  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00016  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00017  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00018  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00019  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00020  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00021  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00022  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00023  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00024  */
00025 #include "tokenizer.h"
00026 
00027 #include "xml/dom_stringimpl.h"
00028 #include "xml/dom3_xpathimpl.h"
00029 #include "dom/dom3_xpath.h"
00030 
00031 #include <cstdio>
00032 
00033 using namespace std;
00034 
00035 using namespace DOM;
00036 using namespace DOM::XPath;
00037 using namespace khtml;
00038 using namespace khtml::XPath;
00039 
00040 namespace khtml {
00041 namespace XPath {
00042 
00043 struct AxisNameMapping
00044 {
00045     const char *name;
00046     Step::AxisType type;
00047 };
00048 
00049 static AxisNameMapping axisNames[] = {
00050     { "ancestor", Step::AncestorAxis },
00051     { "ancestor-or-self", Step::AncestorOrSelfAxis },
00052     { "attribute", Step::AttributeAxis },
00053     { "child", Step::ChildAxis },
00054     { "descendant", Step::DescendantAxis },
00055     { "descendant-or-self", Step::DescendantOrSelfAxis },
00056     { "following", Step::FollowingAxis },
00057     { "following-sibling", Step::FollowingSiblingAxis },
00058     { "namespace", Step::NamespaceAxis },
00059     { "parent", Step::ParentAxis },
00060     { "preceding", Step::PrecedingAxis },
00061     { "preceding-sibling", Step::PrecedingSiblingAxis },
00062     { "self", Step::SelfAxis }
00063 };
00064 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
00065 
// Names accepted as XPath node-type tests. The list ends with a null
// sentinel so it can be walked without a separate element count.
static const char* const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    0 // end-of-list sentinel
};
00073 
00074 QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict     = 0;
00075 QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;
00076 
00077 Tokenizer &Tokenizer::self()
00078 {
00079     static Tokenizer instance;
00080     return instance;
00081 }
00082 
00083 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
00084 {
00085     //### might need to add some special cases from the XML spec.
00086 
00087     if (aChar.unicode() == '_')
00088         return NameStart;
00089 
00090     if (aChar.unicode() == '.' || aChar.unicode() == '-')
00091         return NameCont;
00092 
00093     switch (aChar.category()) {
00094         case QChar::Letter_Lowercase: //Ll
00095         case QChar::Letter_Uppercase: //Lu
00096         case QChar::Letter_Other:     //Lo
00097         case QChar::Letter_Titlecase: //Lt
00098         case QChar::Number_Letter:    //Nl
00099             return NameStart;
00100 
00101         case QChar::Mark_SpacingCombining: //Mc
00102         case QChar::Mark_Enclosing:        //Me
00103         case QChar::Mark_NonSpacing:       //Mn
00104         case QChar::Letter_Modifier:       //Lm
00105         case QChar::Number_DecimalDigit:   //Nd
00106             return NameCont;
00107 
00108         default:
00109             return NotPartOfName;
00110     }
00111 }
00112 
00113 bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
00114 {
00115     if (!s_axisNamesDict) {
00116         s_axisNamesDict = new QHash<QString, Step::AxisType>;
00117         for (unsigned int p = 0; p < axisNamesCount; ++p)
00118             s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
00119                                     axisNames[p].type);
00120     }
00121 
00122     QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
00123     if ( it != s_axisNamesDict->constEnd() ) {
00124         *type = *it;
00125     }
00126     return it != s_axisNamesDict->constEnd();
00127 }
00128 
00129 bool Tokenizer::isNodeTypeName(QString name)
00130 {
00131     if (!s_nodeTypeNamesDict) {
00132         s_nodeTypeNamesDict = new QSet<QString>;
00133         for (int p = 0; nodeTypeNames[p]; ++p)
00134             s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
00135     }
00136     return s_nodeTypeNamesDict->contains(name);
00137 }
00138 
00139 /* Returns whether the last parsed token matches the [32] Operator rule
00140  * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
00141  * the tokens.
00142  */
00143 bool Tokenizer::isOperatorContext()
00144 {
00145     if ( m_nextPos == 0 ) {
00146         return false;
00147     }
00148 
00149     switch ( m_lastTokenType ) {
00150         case AND: case OR: case MULOP:
00151         case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
00152         case EQOP: case RELOP:
00153         case '@': case AXISNAME:   case '(': case '[':
00154             return false;
00155         default:
00156             return true;
00157     }
00158 }
00159 
00160 void Tokenizer::skipWS()
00161 {
00162     while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace())
00163         ++m_nextPos;
00164 }
00165 
00166 Token Tokenizer::makeTokenAndAdvance(int code, int advance)
00167 {
00168     m_nextPos += advance;
00169     return Token(code);
00170 }
00171 
00172 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
00173 {
00174     m_nextPos += advance;
00175     return Token(code, val);
00176 }
00177 
00178 //Returns next char if it's there and interesting, 0 otherwise
00179 char Tokenizer::peekAheadHelper()
00180 {
00181     if (m_nextPos + 1 >= m_data.length())
00182         return 0;
00183     QChar next = m_data[m_nextPos + 1];
00184     if (next.row() != 0)
00185         return 0;
00186     else
00187         return next.cell();
00188 }
00189 
00190 char Tokenizer::peekCurHelper()
00191 {
00192     if (m_nextPos >= m_data.length())
00193         return 0;
00194     QChar next = m_data[m_nextPos];
00195     if (next.row() != 0)
00196         return 0;
00197     else
00198         return next.cell();
00199 }
00200 
00201 Token Tokenizer::lexString()
00202 {
00203     QChar delimiter = m_data[m_nextPos];
00204     int   startPos  = m_nextPos + 1;
00205 
00206     for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
00207         if (m_data[m_nextPos] == delimiter) {
00208             QString value = m_data.mid(startPos, m_nextPos - startPos);
00209             ++m_nextPos; //Consume the char;
00210             return Token(LITERAL, value);
00211         }
00212     }
00213 
00214     //Ouch, went off the end -- report error
00215     return Token(ERROR);
00216 }
00217 
00218 Token Tokenizer::lexNumber()
00219 {
00220     int startPos = m_nextPos;
00221     bool seenDot = false;
00222 
00223     //Go until end or a non-digits character
00224     for (; m_nextPos < m_data.length(); ++m_nextPos) {
00225         QChar aChar = m_data[m_nextPos];
00226         if (aChar.row() != 0) break;
00227 
00228         if (aChar.cell() < '0' || aChar.cell() > '9') {
00229             if (aChar.cell() == '.' && !seenDot)
00230                 seenDot = true;
00231             else
00232                 break;
00233         }
00234     }
00235 
00236     QString value = m_data.mid(startPos, m_nextPos - startPos);
00237     return Token(NUMBER, value);
00238 }
00239 
00240 Token Tokenizer::lexNCName()
00241 {
00242     int startPos = m_nextPos;
00243     if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart)
00244     {
00245         //Keep going until we get a character that's not good for names.
00246         for (; m_nextPos < m_data.length(); ++m_nextPos) {
00247             if (charCat(m_data[m_nextPos]) == NotPartOfName)
00248                 break;
00249         }
00250 
00251         QString value = m_data.mid(startPos, m_nextPos - startPos);
00252         return Token(value);
00253     }
00254     else
00255         return makeTokenAndAdvance(ERROR);
00256 }
00257 
00258 Token Tokenizer::lexQName()
00259 {
00260     Token t1 = lexNCName();
00261     if (t1.type == ERROR) return t1;
00262     skipWS();
00263     //If the next character is :, what we just got it the prefix, if not,
00264     //it's the whole thing
00265     if (peekAheadHelper() != ':')
00266         return t1;
00267 
00268     Token t2 = lexNCName();
00269     if (t2.type == ERROR) return t2;
00270 
00271     return Token(t1.value + ":" + t2.value);
00272 }
00273 
00274 Token Tokenizer::nextTokenInternal()
00275 {
00276     skipWS();
00277 
00278     if (m_nextPos >= m_data.length()) {
00279         return Token(0);
00280     }
00281 
00282     char code = peekCurHelper();
00283     switch (code) {
00284         case '(': case ')': case '[': case ']':
00285         case '@': case ',': case '|':
00286             return makeTokenAndAdvance(code);
00287         case '\'':
00288         case '\"':
00289             return lexString();
00290         case '0': case '1': case '2': case '3': case '4':
00291         case '5': case '6': case '7': case '8': case '9':
00292             return lexNumber();
00293         case '.': {
00294             char next = peekAheadHelper();
00295             if (next == '.')
00296                 return makeTokenAndAdvance(DOTDOT, 2);
00297             else if (next >= '0' && next <= '9')
00298                 return lexNumber();
00299             else
00300                 return makeTokenAndAdvance('.');
00301         }
00302         case '/':
00303             if (peekAheadHelper() == '/')
00304                 return makeTokenAndAdvance(SLASHSLASH, 2);
00305             else
00306                 return makeTokenAndAdvance('/');
00307         case '+':
00308             return makeTokenAndAdvance(PLUS);
00309         case '-':
00310             return makeTokenAndAdvance(MINUS);
00311         case '=':
00312             return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
00313         case '!':
00314             if (peekAheadHelper() == '=')
00315                 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
00316             else {
00317                 return Token(ERROR);
00318             }
00319         case '<':
00320             if (peekAheadHelper() == '=')
00321                 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
00322             else
00323                 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
00324         case '>':
00325             if (peekAheadHelper() == '=')
00326                 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
00327             else
00328                 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
00329         case '*':
00330             if (isOperatorContext())
00331                 return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
00332             else {
00333                 ++m_nextPos;
00334                 return Token(NAMETEST, "*");
00335             }
00336         case '$': {//$ QName
00337             m_nextPos++;
00338             Token par = lexQName();
00339             if (par.type == ERROR)
00340                 return par;
00341             else
00342                 return Token(VARIABLEREFERENCE, par.value);
00343         }
00344     }
00345 
00346     Token t1 = lexNCName();
00347     if (t1.type == ERROR) return t1;
00348 
00349     skipWS();
00350 
00351     //If we're in an operator context, check for any operator names
00352     if (isOperatorContext()) {
00353         if (t1.value == QLatin1String("and")) //### hash?
00354             return Token(AND);
00355         if (t1.value == QLatin1String("or"))
00356             return Token(OR);
00357         if (t1.value == QLatin1String("mod"))
00358             return Token(MULOP, NumericOp::OP_Mod);
00359         if (t1.value == QLatin1String("div"))
00360             return Token(MULOP, NumericOp::OP_Div);
00361     }
00362 
00363     //See whether we are at a :
00364     if (peekCurHelper() == ':') {
00365         m_nextPos++;
00366         //Any chance it's an axis name?
00367         if (peekCurHelper() == ':') {
00368             m_nextPos++;
00369 
00370             //It might be an axis name.
00371             Step::AxisType axisType;
00372             if (isAxisName(t1.value, &axisType))
00373                 return Token(AXISNAME, axisType);
00374             //Ugh, :: is only valid in axis names -> error
00375             return Token(ERROR);
00376         }
00377 
00378         //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
00379         skipWS();
00380         if (peekCurHelper() == '*') {
00381             m_nextPos++;
00382             return Token(NAMETEST, t1.value + ":*");
00383         }
00384 
00385         //Make a full qname..
00386         Token t2 = lexNCName();
00387         if (t2.type == ERROR) return t2;
00388 
00389         t1.value = t1.value + ':' + t2.value;
00390     }
00391 
00392     skipWS();
00393     if (peekCurHelper() == '(') {
00394         //note: we don't swallow the ( here!
00395 
00396         //either node type of function name
00397         if (isNodeTypeName(t1.value)) {
00398             if (t1.value == "processing-instruction")
00399                 return Token(PI, t1.value);
00400             else
00401                 return Token(NODETYPE, t1.value);
00402         }
00403         //must be a function name.
00404         return Token(FUNCTIONNAME, t1.value);
00405     }
00406 
00407     //At this point, it must be NAMETEST
00408     return Token(NAMETEST, t1.value);
00409 }
00410 
00411 Token Tokenizer::nextToken()
00412 {
00413     Token toRet = nextTokenInternal();
00414     m_lastTokenType = toRet.type;
00415     return toRet;
00416 }
00417 
00418 Tokenizer::Tokenizer()
00419 {
00420     reset(QString());
00421 }
00422 
00423 Tokenizer::~Tokenizer()
00424 {
00425     delete s_axisNamesDict;
00426     delete s_nodeTypeNamesDict;
00427 }
00428 
00429 void Tokenizer::reset(QString data)
00430 {
00431     m_nextPos = 0;
00432     m_data = data;
00433     m_lastTokenType = 0;
00434 }
00435 
00436 int khtmlxpathyylex()
00437 {
00438     Token tok = Tokenizer::self().nextToken();
00439     if (tok.hasString) {
00440         khtmlxpathyylval.str = new DOMString(tok.value);
00441     } else if (tok.intValue) {
00442         khtmlxpathyylval.num = tok.intValue;
00443     }
00444     return tok.type;
00445 }
00446 
00447 void initTokenizer(const DOM::DOMString& string)
00448 {
00449     Tokenizer::self().reset(string.string());
00450 }
00451 
00452 } // namespace XPath
00453 } // namespace khtml
00454 
00455 // kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal