KHTML
tokenizer.cpp
Go to the documentation of this file.
00001 /* 00002 * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org> 00003 * 00004 * Redistribution and use in source and binary forms, with or without 00005 * modification, are permitted provided that the following conditions 00006 * are met: 00007 * 00008 * 1. Redistributions of source code must retain the above copyright 00009 * notice, this list of conditions and the following disclaimer. 00010 * 2. Redistributions in binary form must reproduce the above copyright 00011 * notice, this list of conditions and the following disclaimer in the 00012 * documentation and/or other materials provided with the distribution. 00013 * 00014 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 00015 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00016 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 00017 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 00018 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 00019 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00020 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00021 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00022 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 00023 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00024 */ 00025 #include "tokenizer.h" 00026 00027 #include "xml/dom_stringimpl.h" 00028 #include "xml/dom3_xpathimpl.h" 00029 #include "dom/dom3_xpath.h" 00030 00031 #include <cstdio> 00032 00033 using namespace std; 00034 00035 using namespace DOM; 00036 using namespace DOM::XPath; 00037 using namespace khtml; 00038 using namespace khtml::XPath; 00039 00040 namespace khtml { 00041 namespace XPath { 00042 00043 struct AxisNameMapping 00044 { 00045 const char *name; 00046 Step::AxisType type; 00047 }; 00048 00049 static AxisNameMapping axisNames[] = { 00050 { "ancestor", Step::AncestorAxis }, 00051 { "ancestor-or-self", Step::AncestorOrSelfAxis }, 00052 { "attribute", Step::AttributeAxis }, 00053 { "child", Step::ChildAxis }, 00054 { "descendant", Step::DescendantAxis }, 00055 { "descendant-or-self", Step::DescendantOrSelfAxis }, 00056 { "following", Step::FollowingAxis }, 00057 { "following-sibling", Step::FollowingSiblingAxis }, 00058 { "namespace", Step::NamespaceAxis }, 00059 { "parent", Step::ParentAxis }, 00060 { "preceding", Step::PrecedingAxis }, 00061 { "preceding-sibling", Step::PrecedingSiblingAxis }, 00062 { "self", Step::SelfAxis } 00063 }; 00064 static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]); 00065 00066 static const char* const nodeTypeNames[] = { 00067 "comment", 00068 "text", 00069 "processing-instruction", 00070 "node", 00071 0 00072 }; 00073 00074 QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict = 0; 00075 QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0; 00076 00077 Tokenizer &Tokenizer::self() 00078 { 00079 static Tokenizer instance; 00080 return instance; 00081 } 00082 00083 Tokenizer::XMLCat Tokenizer::charCat(QChar aChar) 00084 { 00085 //### might need to add some special cases from the XML spec. 00086 00087 if (aChar.unicode() == '_') 00088 return NameStart; 00089 00090 if (aChar.unicode() == '.' || aChar.unicode() == '-') 00091 return NameCont; 00092 00093 switch (aChar.category()) { 00094 case QChar::Letter_Lowercase: //Ll 00095 case QChar::Letter_Uppercase: //Lu 00096 case QChar::Letter_Other: //Lo 00097 case QChar::Letter_Titlecase: //Lt 00098 case QChar::Number_Letter: //Nl 00099 return NameStart; 00100 00101 case QChar::Mark_SpacingCombining: //Mc 00102 case QChar::Mark_Enclosing: //Me 00103 case QChar::Mark_NonSpacing: //Mn 00104 case QChar::Letter_Modifier: //Lm 00105 case QChar::Number_DecimalDigit: //Nd 00106 return NameCont; 00107 00108 default: 00109 return NotPartOfName; 00110 } 00111 } 00112 00113 bool Tokenizer::isAxisName(QString name, Step::AxisType *type) 00114 { 00115 if (!s_axisNamesDict) { 00116 s_axisNamesDict = new QHash<QString, Step::AxisType>; 00117 for (unsigned int p = 0; p < axisNamesCount; ++p) 00118 s_axisNamesDict->insert(QLatin1String(axisNames[p].name), 00119 axisNames[p].type); 00120 } 00121 00122 QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name); 00123 if ( it != s_axisNamesDict->constEnd() ) { 00124 *type = *it; 00125 } 00126 return it != s_axisNamesDict->constEnd(); 00127 } 00128 00129 bool Tokenizer::isNodeTypeName(QString name) 00130 { 00131 if (!s_nodeTypeNamesDict) { 00132 s_nodeTypeNamesDict = new QSet<QString>; 00133 for (int p = 0; nodeTypeNames[p]; ++p) 00134 s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p])); 00135 } 00136 return s_nodeTypeNamesDict->contains(name); 00137 } 00138 00139 /* Returns whether the last parsed token matches the [32] Operator rule 00140 * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate 00141 * the tokens. 00142 */ 00143 bool Tokenizer::isOperatorContext() 00144 { 00145 if ( m_nextPos == 0 ) { 00146 return false; 00147 } 00148 00149 switch ( m_lastTokenType ) { 00150 case AND: case OR: case MULOP: 00151 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: 00152 case EQOP: case RELOP: 00153 case '@': case AXISNAME: case '(': case '[': 00154 return false; 00155 default: 00156 return true; 00157 } 00158 } 00159 00160 void Tokenizer::skipWS() 00161 { 00162 while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace()) 00163 ++m_nextPos; 00164 } 00165 00166 Token Tokenizer::makeTokenAndAdvance(int code, int advance) 00167 { 00168 m_nextPos += advance; 00169 return Token(code); 00170 } 00171 00172 Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance) 00173 { 00174 m_nextPos += advance; 00175 return Token(code, val); 00176 } 00177 00178 //Returns next char if it's there and interesting, 0 otherwise 00179 char Tokenizer::peekAheadHelper() 00180 { 00181 if (m_nextPos + 1 >= m_data.length()) 00182 return 0; 00183 QChar next = m_data[m_nextPos + 1]; 00184 if (next.row() != 0) 00185 return 0; 00186 else 00187 return next.cell(); 00188 } 00189 00190 char Tokenizer::peekCurHelper() 00191 { 00192 if (m_nextPos >= m_data.length()) 00193 return 0; 00194 QChar next = m_data[m_nextPos]; 00195 if (next.row() != 0) 00196 return 0; 00197 else 00198 return next.cell(); 00199 } 00200 00201 Token Tokenizer::lexString() 00202 { 00203 QChar delimiter = m_data[m_nextPos]; 00204 int startPos = m_nextPos + 1; 00205 00206 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { 00207 if (m_data[m_nextPos] == delimiter) { 00208 QString value = m_data.mid(startPos, m_nextPos - startPos); 00209 ++m_nextPos; //Consume the char; 00210 return Token(LITERAL, value); 00211 } 00212 } 00213 00214 //Ouch, went off the end -- report error 00215 return Token(ERROR); 00216 } 00217 00218 Token Tokenizer::lexNumber() 00219 { 00220 int startPos = m_nextPos; 00221 bool seenDot = false; 00222 00223 //Go until end or a non-digits character 00224 for (; m_nextPos < m_data.length(); ++m_nextPos) { 00225 QChar aChar = m_data[m_nextPos]; 00226 if (aChar.row() != 0) break; 00227 00228 if (aChar.cell() < '0' || aChar.cell() > '9') { 00229 if (aChar.cell() == '.' && !seenDot) 00230 seenDot = true; 00231 else 00232 break; 00233 } 00234 } 00235 00236 QString value = m_data.mid(startPos, m_nextPos - startPos); 00237 return Token(NUMBER, value); 00238 } 00239 00240 Token Tokenizer::lexNCName() 00241 { 00242 int startPos = m_nextPos; 00243 if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart) 00244 { 00245 //Keep going until we get a character that's not good for names. 00246 for (; m_nextPos < m_data.length(); ++m_nextPos) { 00247 if (charCat(m_data[m_nextPos]) == NotPartOfName) 00248 break; 00249 } 00250 00251 QString value = m_data.mid(startPos, m_nextPos - startPos); 00252 return Token(value); 00253 } 00254 else 00255 return makeTokenAndAdvance(ERROR); 00256 } 00257 00258 Token Tokenizer::lexQName() 00259 { 00260 Token t1 = lexNCName(); 00261 if (t1.type == ERROR) return t1; 00262 skipWS(); 00263 //If the next character is :, what we just got it the prefix, if not, 00264 //it's the whole thing 00265 if (peekAheadHelper() != ':') 00266 return t1; 00267 00268 Token t2 = lexNCName(); 00269 if (t2.type == ERROR) return t2; 00270 00271 return Token(t1.value + ":" + t2.value); 00272 } 00273 00274 Token Tokenizer::nextTokenInternal() 00275 { 00276 skipWS(); 00277 00278 if (m_nextPos >= m_data.length()) { 00279 return Token(0); 00280 } 00281 00282 char code = peekCurHelper(); 00283 switch (code) { 00284 case '(': case ')': case '[': case ']': 00285 case '@': case ',': case '|': 00286 return makeTokenAndAdvance(code); 00287 case '\'': 00288 case '\"': 00289 return lexString(); 00290 case '0': case '1': case '2': case '3': case '4': 00291 case '5': case '6': case '7': case '8': case '9': 00292 return lexNumber(); 00293 case '.': { 00294 char next = peekAheadHelper(); 00295 if (next == '.') 00296 return makeTokenAndAdvance(DOTDOT, 2); 00297 else if (next >= '0' && next <= '9') 00298 return lexNumber(); 00299 else 00300 return makeTokenAndAdvance('.'); 00301 } 00302 case '/': 00303 if (peekAheadHelper() == '/') 00304 return makeTokenAndAdvance(SLASHSLASH, 2); 00305 else 00306 return makeTokenAndAdvance('/'); 00307 case '+': 00308 return makeTokenAndAdvance(PLUS); 00309 case '-': 00310 return makeTokenAndAdvance(MINUS); 00311 case '=': 00312 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ); 00313 case '!': 00314 if (peekAheadHelper() == '=') 00315 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2); 00316 else { 00317 return Token(ERROR); 00318 } 00319 case '<': 00320 if (peekAheadHelper() == '=') 00321 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2); 00322 else 00323 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT); 00324 case '>': 00325 if (peekAheadHelper() == '=') 00326 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2); 00327 else 00328 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT); 00329 case '*': 00330 if (isOperatorContext()) 00331 return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul); 00332 else { 00333 ++m_nextPos; 00334 return Token(NAMETEST, "*"); 00335 } 00336 case '$': {//$ QName 00337 m_nextPos++; 00338 Token par = lexQName(); 00339 if (par.type == ERROR) 00340 return par; 00341 else 00342 return Token(VARIABLEREFERENCE, par.value); 00343 } 00344 } 00345 00346 Token t1 = lexNCName(); 00347 if (t1.type == ERROR) return t1; 00348 00349 skipWS(); 00350 00351 //If we're in an operator context, check for any operator names 00352 if (isOperatorContext()) { 00353 if (t1.value == QLatin1String("and")) //### hash? 00354 return Token(AND); 00355 if (t1.value == QLatin1String("or")) 00356 return Token(OR); 00357 if (t1.value == QLatin1String("mod")) 00358 return Token(MULOP, NumericOp::OP_Mod); 00359 if (t1.value == QLatin1String("div")) 00360 return Token(MULOP, NumericOp::OP_Div); 00361 } 00362 00363 //See whether we are at a : 00364 if (peekCurHelper() == ':') { 00365 m_nextPos++; 00366 //Any chance it's an axis name? 00367 if (peekCurHelper() == ':') { 00368 m_nextPos++; 00369 00370 //It might be an axis name. 00371 Step::AxisType axisType; 00372 if (isAxisName(t1.value, &axisType)) 00373 return Token(AXISNAME, axisType); 00374 //Ugh, :: is only valid in axis names -> error 00375 return Token(ERROR); 00376 } 00377 00378 //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest 00379 skipWS(); 00380 if (peekCurHelper() == '*') { 00381 m_nextPos++; 00382 return Token(NAMETEST, t1.value + ":*"); 00383 } 00384 00385 //Make a full qname.. 00386 Token t2 = lexNCName(); 00387 if (t2.type == ERROR) return t2; 00388 00389 t1.value = t1.value + ':' + t2.value; 00390 } 00391 00392 skipWS(); 00393 if (peekCurHelper() == '(') { 00394 //note: we don't swallow the ( here! 00395 00396 //either node type of function name 00397 if (isNodeTypeName(t1.value)) { 00398 if (t1.value == "processing-instruction") 00399 return Token(PI, t1.value); 00400 else 00401 return Token(NODETYPE, t1.value); 00402 } 00403 //must be a function name. 00404 return Token(FUNCTIONNAME, t1.value); 00405 } 00406 00407 //At this point, it must be NAMETEST 00408 return Token(NAMETEST, t1.value); 00409 } 00410 00411 Token Tokenizer::nextToken() 00412 { 00413 Token toRet = nextTokenInternal(); 00414 m_lastTokenType = toRet.type; 00415 return toRet; 00416 } 00417 00418 Tokenizer::Tokenizer() 00419 { 00420 reset(QString()); 00421 } 00422 00423 Tokenizer::~Tokenizer() 00424 { 00425 delete s_axisNamesDict; 00426 delete s_nodeTypeNamesDict; 00427 } 00428 00429 void Tokenizer::reset(QString data) 00430 { 00431 m_nextPos = 0; 00432 m_data = data; 00433 m_lastTokenType = 0; 00434 } 00435 00436 int khtmlxpathyylex() 00437 { 00438 Token tok = Tokenizer::self().nextToken(); 00439 if (tok.hasString) { 00440 khtmlxpathyylval.str = new DOMString(tok.value); 00441 } else if (tok.intValue) { 00442 khtmlxpathyylval.num = tok.intValue; 00443 } 00444 return tok.type; 00445 } 00446 00447 void initTokenizer(const DOM::DOMString& string) 00448 { 00449 Tokenizer::self().reset(string.string()); 00450 } 00451 00452 } // namespace XPath 00453 } // namespace khtml 00454 00455 // kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;
KDE 4.6 API Reference