KIOSlave
parsinghelpers.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com> 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public License 00015 along with this library; see the file COPYING.LIB. If not, write to 00016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00017 Boston, MA 02110-1301, USA. 00018 */ 00019 00020 #include <QDir> 00021 #include <QMap> 00022 #include <QTextCodec> 00023 #include <QUrl> 00024 00025 #include <kcodecs.h> 00026 00027 // Advance *pos beyond spaces / tabs 00028 static void skipSpace(const char input[], int *pos, int end) 00029 { 00030 int idx = *pos; 00031 while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) { 00032 idx++; 00033 } 00034 *pos = idx; 00035 return; 00036 } 00037 00038 // Advance *pos to start of next line while being forgiving about line endings. 00039 // Return false if the end of the header has been reached, true otherwise. 00040 static bool nextLine(const char input[], int *pos, int end) 00041 { 00042 int idx = *pos; 00043 while (idx < end && input[idx] != '\r' && input[idx] != '\n') { 00044 idx++; 00045 } 00046 int rCount = 0; 00047 int nCount = 0; 00048 while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) { 00049 input[idx] == '\r' ? rCount++ : nCount++; 00050 idx++; 00051 } 00052 if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) { 00053 // if just one of the others is missing eat it too. 00054 // this ensures that conforming headers using the proper 00055 // \r\n sequence (and also \n\r) will be parsed correctly. 00056 if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) { 00057 idx++; 00058 } 00059 } 00060 00061 *pos = idx; 00062 return idx < end && rCount < 2 && nCount < 2; 00063 } 00064 00065 //Return true if the term was found, false otherwise. Advance *pos. 00066 //If (*pos + strlen(term) >= end) just advance *pos to end and return false. 00067 //This means that users should always search for the shortest terms first. 00068 static bool consume(const char input[], int *pos, int end, const char *term) 00069 { 00070 // note: gcc/g++ is quite good at optimizing away redundant strlen()s 00071 int idx = *pos; 00072 if (idx + (int)strlen(term) >= end) { 00073 *pos = end; 00074 return false; 00075 } 00076 if (strncasecmp(&input[idx], term, strlen(term)) == 0) { 00077 *pos = idx + strlen(term); 00078 return true; 00079 } 00080 return false; 00081 } 00082 00083 00084 QByteArray TokenIterator::next() 00085 { 00086 QPair<int, int> token = m_tokens[m_currentToken++]; 00087 //fromRawData brings some speed advantage but also the requirement to keep the text buffer 00088 //around. this together with implicit sharing (you don't know where copies end up) 00089 //is dangerous! 00090 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00091 return QByteArray(&m_buffer[token.first], token.second - token.first); 00092 } 00093 00094 QByteArray TokenIterator::current() const 00095 { 00096 QPair<int, int> token = m_tokens[m_currentToken - 1]; 00097 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00098 return QByteArray(&m_buffer[token.first], token.second - token.first); 00099 } 00100 00101 QList<QByteArray> TokenIterator::all() const 00102 { 00103 QList<QByteArray> ret; 00104 for (int i = 0; i < m_tokens.count(); i++) { 00105 QPair<int, int> token = m_tokens[i]; 00106 ret.append(QByteArray(&m_buffer[token.first], token.second - token.first)); 00107 } 00108 return ret; 00109 } 00110 00111 00112 HeaderTokenizer::HeaderTokenizer(char *buffer) 00113 : m_buffer(buffer) 00114 { 00115 // add information about available headers and whether they have one or multiple, 00116 // comma-separated values. 00117 00118 //The following response header fields are from RFC 2616 unless otherwise specified. 00119 //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about 00120 //a header field. 00121 static const HeaderFieldTemplate headerFieldTemplates[] = { 00122 {"accept-ranges", false}, 00123 {"age", false}, 00124 {"cache-control", true}, 00125 {"connection", true}, 00126 {"content-disposition", false}, //is multi-valued in a way, but with ";" separator! 00127 {"content-encoding", true}, 00128 {"content-language", true}, 00129 {"content-length", false}, 00130 {"content-location", false}, 00131 {"content-md5", false}, 00132 {"content-type", false}, 00133 {"date", false}, 00134 {"dav", true}, //RFC 2518 00135 {"etag", false}, 00136 {"expires", false}, 00137 {"keep-alive", false}, //RFC 2068 00138 {"last-modified", false}, 00139 {"link", false}, //RFC 2068, multi-valued with ";" separator 00140 {"location", false}, 00141 {"p3p", true}, // http://www.w3.org/TR/P3P/ 00142 {"pragma", true}, 00143 {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate 00144 //multiple values. we handle this at a higher level. 00145 {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings 00146 //when using "connection" when talking to a proxy. 00147 {"refresh", false}, //not sure, only found some mailing list posts mentioning it 00148 {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved 00149 //by sending several instances of this field as opposed to 00150 //usually comma-separated lists with maybe multiple instances. 00151 {"transfer-encoding", true}, 00152 {"upgrade", true}, 00153 {"warning", true}, 00154 {"www-authenticate", false} //see proxy-authenticate 00155 }; 00156 00157 for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) { 00158 const HeaderFieldTemplate &ft = headerFieldTemplates[i]; 00159 insert(QByteArray(ft.name), HeaderField(ft.isMultiValued)); 00160 } 00161 } 00162 00163 int HeaderTokenizer::tokenize(int begin, int end) 00164 { 00165 char *buf = m_buffer; //keep line length in check :/ 00166 int idx = begin; 00167 int startIdx = begin; //multi-purpose start of current token 00168 bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma? 00169 QByteArray headerKey; 00170 do { 00171 00172 if (buf[idx] == ' ' || buf [idx] == '\t') { 00173 // line continuation; preserve startIdx except (see below) 00174 if (headerKey.isEmpty()) { 00175 continue; 00176 } 00177 // turn CR/LF into spaces for later parsing convenience 00178 int backIdx = idx - 1; 00179 while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) { 00180 buf[backIdx--] = ' '; 00181 } 00182 00183 // multiple values, comma-separated: add new value or continue previous? 00184 if (operator[](headerKey).isMultiValued) { 00185 if (multiValuedEndedWithComma) { 00186 // start new value; this is almost like no line continuation 00187 skipSpace(buf, &idx, end); 00188 startIdx = idx; 00189 } else { 00190 // continue previous value; this is tricky. unit tests to the rescue! 00191 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00192 // remove entry, it will be re-added because already idx != startIdx 00193 operator[](headerKey).beginEnd.removeLast(); 00194 } else { 00195 // no comma, no entry: the prev line was whitespace only - start new value 00196 skipSpace(buf, &idx, end); 00197 startIdx = idx; 00198 } 00199 } 00200 } 00201 00202 } else { 00203 // new field 00204 startIdx = idx; 00205 // also make sure that there is at least one char after the colon 00206 while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') { 00207 buf[idx] = tolower(buf[idx]); 00208 idx++; 00209 } 00210 if (buf[idx] != ':') { 00211 //malformed line: no colon 00212 headerKey.clear(); 00213 continue; 00214 } 00215 headerKey = QByteArray(&buf[startIdx], idx - startIdx); 00216 if (!contains(headerKey)) { 00217 //we don't recognize this header line 00218 headerKey.clear(); 00219 continue; 00220 } 00221 // skip colon & leading whitespace 00222 idx++; 00223 skipSpace(buf, &idx, end); 00224 startIdx = idx; 00225 } 00226 00227 // we have the name/key of the field, now parse the value 00228 if (!operator[](headerKey).isMultiValued) { 00229 00230 // scan to end of line 00231 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') { 00232 idx++; 00233 } 00234 if (!operator[](headerKey).beginEnd.isEmpty()) { 00235 // there already is an entry; are we just in a line continuation? 00236 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00237 // line continuation: delete previous entry and later insert a new, longer one. 00238 operator[](headerKey).beginEnd.removeLast(); 00239 } 00240 } 00241 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00242 00243 } else { 00244 00245 // comma-separated list 00246 while (true) { 00247 //skip one value 00248 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') { 00249 idx++; 00250 } 00251 if (idx != startIdx) { 00252 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00253 } 00254 multiValuedEndedWithComma = buf[idx] == ','; 00255 //skip comma(s) and leading whitespace, if any respectively 00256 while (idx < end && buf[idx] == ',') { 00257 idx++; 00258 } 00259 skipSpace(buf, &idx, end); 00260 //next value or end-of-line / end of header? 00261 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') { 00262 break; 00263 } 00264 //next value 00265 startIdx = idx; 00266 } 00267 } 00268 } while (nextLine(buf, &idx, end)); 00269 return idx; 00270 } 00271 00272 00273 TokenIterator HeaderTokenizer::iterator(const char *key) const 00274 { 00275 QByteArray keyBa = QByteArray::fromRawData(key, strlen(key)); 00276 if (contains(keyBa)) { 00277 return TokenIterator(value(keyBa).beginEnd, m_buffer); 00278 } else { 00279 return TokenIterator(m_nullTokens, m_buffer); 00280 } 00281 } 00282 00283 static void skipLWS(const QString &str, int &pos) 00284 { 00285 while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) 00286 ++pos; 00287 } 00288 00289 // keep the common ending, this allows the compiler to join them 00290 static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?="; 00291 static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?="; 00292 static const char valueSpecials[] = "()<>@,;:\\\"/[]?="; 00293 00294 static bool specialChar(const QChar &ch, const char *specials) 00295 { 00296 // WORKAROUND: According to RFC 2616, any character other than ascii 00297 // characters should NOT be allowed in unquoted content-disposition file 00298 // names. However, since none of the major browsers follow this rule, we do 00299 // the same thing here and allow all printable unicode characters. See 00300 // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials. 00301 if(!ch.isPrint()) 00302 return true; 00303 00304 for( int i = qstrlen(specials) - 1; i>= 0; i--) 00305 if( ch == QLatin1Char(specials[i]) ) 00306 return true; 00307 00308 return false; 00309 } 00310 00326 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials) 00327 { 00328 QString out; 00329 skipLWS(str, pos); 00330 bool valid = true; 00331 00332 while (pos < str.length() && (str[pos] != term)) { 00333 out += str[pos]; 00334 valid = (valid && !specialChar(str[pos], specials)); 00335 ++pos; 00336 } 00337 00338 if (pos < str.length()) // Stopped due to finding term 00339 ++pos; 00340 00341 if( !valid ) 00342 return QString(); 00343 00344 // Remove trailing linear whitespace... 00345 while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) 00346 out.chop(1); 00347 00348 if( out.contains(QLatin1Char(' ')) ) 00349 out.clear(); 00350 00351 return out; 00352 } 00353 00354 // As above, but also handles quotes.. 00355 // pos is set to -1 on parse error 00356 static QString extractMaybeQuotedUntil(const QString &str, int &pos) 00357 { 00358 const QChar term = QLatin1Char(';'); 00359 00360 skipLWS(str, pos); 00361 00362 // Are we quoted? 00363 if (pos < str.length() && str[pos] == QLatin1Char('"')) { 00364 QString out; 00365 00366 // Skip the quote... 00367 ++pos; 00368 00369 // when quoted we also need an end-quote 00370 bool endquote = false; 00371 00372 // Parse until trailing quote... 00373 while (pos < str.length()) { 00374 if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) { 00375 // quoted-pair = "\" CHAR 00376 out += str[pos + 1]; 00377 pos += 2; // Skip both... 00378 } else if (str[pos] == QLatin1Char('"')) { 00379 ++pos; 00380 endquote = true; 00381 break; 00382 } else { 00383 out += str[pos]; 00384 ++pos; 00385 } 00386 } 00387 00388 if( !endquote ) { 00389 pos = -1; 00390 return QString(); 00391 } 00392 00393 // Skip until term.. 00394 while (pos < str.length() && (str[pos] != term)) { 00395 if( (str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t')) ) { 00396 pos = -1; 00397 return QString(); 00398 } 00399 ++pos; 00400 } 00401 00402 if (pos < str.length()) // Stopped due to finding term 00403 ++pos; 00404 00405 return out; 00406 } else { 00407 return extractUntil(str, term, pos, valueSpecials); 00408 } 00409 } 00410 00411 static QMap<QString, QString> contentDispositionParser(const QString &disposition) 00412 { 00413 kDebug(7113) << "disposition: " << disposition; 00414 int pos = 0; 00415 const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower(); 00416 00417 QMap<QString, QString> parameters; 00418 QMap<QString, QString> contparams; // all parameters that contain continuations 00419 QMap<QString, QString> encparams; // all parameters that have character encoding 00420 00421 // the type is invalid, the complete header is junk 00422 if( strDisposition.isEmpty() ) 00423 return parameters; 00424 00425 parameters.insert(QLatin1String("type"), strDisposition); 00426 00427 while (pos < disposition.length()) { 00428 QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower(); 00429 00430 if( key.isEmpty() ) { 00431 // parse error in this key: do not parse more, but add up 00432 // everything we already got 00433 kDebug(7113) << "parse error, abort parsing"; 00434 break; 00435 } 00436 00437 QString val; 00438 if( key.endsWith(QLatin1Char('*')) ) 00439 val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials).toLower(); 00440 else 00441 val = extractMaybeQuotedUntil(disposition, pos); 00442 00443 if( val.isEmpty() ) { 00444 if( pos == -1 ) { 00445 kDebug(7113) << "parse error, abort parsing"; 00446 break; 00447 } 00448 continue; 00449 } 00450 00451 const int spos = key.indexOf(QLatin1Char('*')); 00452 if( spos == key.length() - 1 ) { 00453 key.chop(1); 00454 encparams.insert(key, val); 00455 } else if( spos >= 0 ) { 00456 contparams.insert(key, val); 00457 } else if( parameters.contains(key) ) { 00458 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00459 parameters.remove(key); 00460 return parameters; 00461 } else { 00462 parameters.insert(key, val); 00463 } 00464 } 00465 00466 QMap<QString, QString>::iterator i = contparams.begin(); 00467 while( i != contparams.end() ) { 00468 QString key = i.key(); 00469 int spos = key.indexOf(QLatin1Char('*')); 00470 bool hasencoding = false; 00471 00472 if( key.at(spos + 1) != QLatin1Char('0') ) { 00473 ++i; 00474 continue; 00475 } 00476 00477 // no leading zeros allowed, so delete the junk 00478 int klen = key.length(); 00479 if( klen > spos + 2 ) { 00480 // nothing but continuations and encodings may insert * into parameter name 00481 if( (klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*'))) ) { 00482 kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2); 00483 i = contparams.erase(i); 00484 continue; 00485 } 00486 hasencoding = true; 00487 } 00488 00489 int seqnum = 1; 00490 QMap<QString, QString>::iterator partsi; 00491 // we do not need to care about encoding specifications: only the first 00492 // part is allowed to have one 00493 QString val = i.value(); 00494 00495 if (hasencoding) 00496 key.chop(2); 00497 else 00498 key.chop(1); 00499 00500 while( (partsi = contparams.find(key + QString::number(seqnum))) != contparams.end() ) 00501 { 00502 val += partsi.value(); 00503 contparams.erase(partsi); 00504 } 00505 00506 i = contparams.erase(i); 00507 00508 key.chop(1); 00509 if (hasencoding) { 00510 encparams.insert(key, val); 00511 } else { 00512 if( parameters.contains(key) ) { 00513 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00514 parameters.remove(key); 00515 return parameters; 00516 } 00517 00518 parameters.insert(key, val); 00519 } 00520 } 00521 00522 for( QMap<QString, QString>::iterator i = encparams.begin(), iEnd = encparams.end(); 00523 i != encparams.end(); ++i ) 00524 { 00525 QString val = i.value(); 00526 00527 // RfC 2231 encoded character set in filename 00528 int spos = val.indexOf(QLatin1Char('\'')); 00529 if (spos == -1) 00530 continue; 00531 int npos = val.indexOf(QLatin1Char('\''), spos + 1); 00532 if (npos == -1) 00533 continue; 00534 00535 const QString charset = val.left( spos ); 00536 const QString lang = val.mid( spos + 1, npos - spos - 1 ); 00537 const QByteArray rawval = QByteArray::fromPercentEncoding( val.mid(npos + 1).toAscii() ); 00538 if( charset.isEmpty() || (charset == QLatin1String("us-ascii")) ) { 00539 bool valid = true; 00540 for( int j = rawval.length() - 1; (j >= 0) && valid; j-- ) 00541 valid = (rawval.at(j) >= 32); 00542 00543 if( valid ) 00544 val = QString::fromAscii(rawval.constData()); 00545 else 00546 val.clear(); 00547 } else { 00548 QTextCodec *codec = QTextCodec::codecForName( charset.toAscii() ); 00549 if( codec ) 00550 val = codec->toUnicode( rawval ); 00551 else 00552 val.clear(); 00553 } 00554 00555 if( !val.isEmpty() ) { 00556 parameters.insert( i.key(), val ); 00557 } 00558 } 00559 00560 const QLatin1String fn("filename"); 00561 if( parameters.contains(fn) ) { 00562 // Content-Disposition is not allowed to dictate directory 00563 // path, thus we extract the filename only. 00564 const QString val = QDir::toNativeSeparators( parameters[fn] ); 00565 int slpos = val.lastIndexOf( QDir::separator() ); 00566 00567 if( slpos > -1 ) 00568 parameters.insert(fn, val.mid( slpos + 1 )); 00569 } 00570 00571 return parameters; 00572 }
KDE 4.6 API Reference