KIOSlave
parsinghelpers.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com> 00003 Copyright (C) 2010,2011 Rolf Eike Beer <kde@opensource.sf-tec.de> 00004 00005 This library is free software; you can redistribute it and/or 00006 modify it under the terms of the GNU Library General Public 00007 License as published by the Free Software Foundation; either 00008 version 2 of the License, or (at your option) any later version. 00009 00010 This library is distributed in the hope that it will be useful, 00011 but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 Library General Public License for more details. 00014 00015 You should have received a copy of the GNU Library General Public License 00016 along with this library; see the file COPYING.LIB. If not, write to 00017 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00018 Boston, MA 02110-1301, USA. 00019 */ 00020 00021 #include <QDir> 00022 #include <QMap> 00023 #include <QTextCodec> 00024 #include <QUrl> 00025 00026 #include <kcodecs.h> 00027 #include <kdebug.h> 00028 00029 // Advance *pos beyond spaces / tabs 00030 static void skipSpace(const char input[], int *pos, int end) 00031 { 00032 int idx = *pos; 00033 while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) { 00034 idx++; 00035 } 00036 *pos = idx; 00037 return; 00038 } 00039 00040 // Advance *pos to start of next line while being forgiving about line endings. 00041 // Return false if the end of the header has been reached, true otherwise. 00042 static bool nextLine(const char input[], int *pos, int end) 00043 { 00044 int idx = *pos; 00045 while (idx < end && input[idx] != '\r' && input[idx] != '\n') { 00046 idx++; 00047 } 00048 int rCount = 0; 00049 int nCount = 0; 00050 while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) { 00051 input[idx] == '\r' ? rCount++ : nCount++; 00052 idx++; 00053 } 00054 if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) { 00055 // if just one of the others is missing eat it too. 00056 // this ensures that conforming headers using the proper 00057 // \r\n sequence (and also \n\r) will be parsed correctly. 00058 if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) { 00059 idx++; 00060 } 00061 } 00062 00063 *pos = idx; 00064 return idx < end && rCount < 2 && nCount < 2; 00065 } 00066 00067 QByteArray TokenIterator::next() 00068 { 00069 QPair<int, int> token = m_tokens[m_currentToken++]; 00070 //fromRawData brings some speed advantage but also the requirement to keep the text buffer 00071 //around. this together with implicit sharing (you don't know where copies end up) 00072 //is dangerous! 00073 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00074 return QByteArray(&m_buffer[token.first], token.second - token.first); 00075 } 00076 00077 QByteArray TokenIterator::current() const 00078 { 00079 QPair<int, int> token = m_tokens[m_currentToken - 1]; 00080 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00081 return QByteArray(&m_buffer[token.first], token.second - token.first); 00082 } 00083 00084 QList<QByteArray> TokenIterator::all() const 00085 { 00086 QList<QByteArray> ret; 00087 for (int i = 0; i < m_tokens.count(); i++) { 00088 QPair<int, int> token = m_tokens[i]; 00089 ret.append(QByteArray(&m_buffer[token.first], token.second - token.first)); 00090 } 00091 return ret; 00092 } 00093 00094 00095 HeaderTokenizer::HeaderTokenizer(char *buffer) 00096 : m_buffer(buffer) 00097 { 00098 // add information about available headers and whether they have one or multiple, 00099 // comma-separated values. 00100 00101 //The following response header fields are from RFC 2616 unless otherwise specified. 00102 //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about 00103 //a header field. 00104 static const HeaderFieldTemplate headerFieldTemplates[] = { 00105 {"accept-ranges", false}, 00106 {"age", false}, 00107 {"cache-control", true}, 00108 {"connection", true}, 00109 {"content-disposition", false}, //is multi-valued in a way, but with ";" separator! 00110 {"content-encoding", true}, 00111 {"content-language", true}, 00112 {"content-length", false}, 00113 {"content-location", false}, 00114 {"content-md5", false}, 00115 {"content-type", false}, 00116 {"date", false}, 00117 {"dav", true}, //RFC 2518 00118 {"etag", false}, 00119 {"expires", false}, 00120 {"keep-alive", false}, //RFC 2068 00121 {"last-modified", false}, 00122 {"link", false}, //RFC 2068, multi-valued with ";" separator 00123 {"location", false}, 00124 {"p3p", true}, // http://www.w3.org/TR/P3P/ 00125 {"pragma", true}, 00126 {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate 00127 //multiple values. we handle this at a higher level. 00128 {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings 00129 //when using "connection" when talking to a proxy. 00130 {"refresh", false}, //not sure, only found some mailing list posts mentioning it 00131 {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved 00132 //by sending several instances of this field as opposed to 00133 //usually comma-separated lists with maybe multiple instances. 00134 {"transfer-encoding", true}, 00135 {"upgrade", true}, 00136 {"warning", true}, 00137 {"www-authenticate", false} //see proxy-authenticate 00138 }; 00139 00140 for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) { 00141 const HeaderFieldTemplate &ft = headerFieldTemplates[i]; 00142 insert(QByteArray(ft.name), HeaderField(ft.isMultiValued)); 00143 } 00144 } 00145 00146 int HeaderTokenizer::tokenize(int begin, int end) 00147 { 00148 char *buf = m_buffer; //keep line length in check :/ 00149 int idx = begin; 00150 int startIdx = begin; //multi-purpose start of current token 00151 bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma? 00152 QByteArray headerKey; 00153 do { 00154 00155 if (buf[idx] == ' ' || buf [idx] == '\t') { 00156 // line continuation; preserve startIdx except (see below) 00157 if (headerKey.isEmpty()) { 00158 continue; 00159 } 00160 // turn CR/LF into spaces for later parsing convenience 00161 int backIdx = idx - 1; 00162 while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) { 00163 buf[backIdx--] = ' '; 00164 } 00165 00166 // multiple values, comma-separated: add new value or continue previous? 00167 if (operator[](headerKey).isMultiValued) { 00168 if (multiValuedEndedWithComma) { 00169 // start new value; this is almost like no line continuation 00170 skipSpace(buf, &idx, end); 00171 startIdx = idx; 00172 } else { 00173 // continue previous value; this is tricky. unit tests to the rescue! 00174 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00175 // remove entry, it will be re-added because already idx != startIdx 00176 operator[](headerKey).beginEnd.removeLast(); 00177 } else { 00178 // no comma, no entry: the prev line was whitespace only - start new value 00179 skipSpace(buf, &idx, end); 00180 startIdx = idx; 00181 } 00182 } 00183 } 00184 00185 } else { 00186 // new field 00187 startIdx = idx; 00188 // also make sure that there is at least one char after the colon 00189 while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') { 00190 buf[idx] = tolower(buf[idx]); 00191 idx++; 00192 } 00193 if (buf[idx] != ':') { 00194 //malformed line: no colon 00195 headerKey.clear(); 00196 continue; 00197 } 00198 headerKey = QByteArray(&buf[startIdx], idx - startIdx); 00199 if (!contains(headerKey)) { 00200 //we don't recognize this header line 00201 headerKey.clear(); 00202 continue; 00203 } 00204 // skip colon & leading whitespace 00205 idx++; 00206 skipSpace(buf, &idx, end); 00207 startIdx = idx; 00208 } 00209 00210 // we have the name/key of the field, now parse the value 00211 if (!operator[](headerKey).isMultiValued) { 00212 00213 // scan to end of line 00214 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') { 00215 idx++; 00216 } 00217 if (!operator[](headerKey).beginEnd.isEmpty()) { 00218 // there already is an entry; are we just in a line continuation? 00219 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00220 // line continuation: delete previous entry and later insert a new, longer one. 00221 operator[](headerKey).beginEnd.removeLast(); 00222 } 00223 } 00224 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00225 00226 } else { 00227 00228 // comma-separated list 00229 while (true) { 00230 //skip one value 00231 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') { 00232 idx++; 00233 } 00234 if (idx != startIdx) { 00235 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00236 } 00237 multiValuedEndedWithComma = buf[idx] == ','; 00238 //skip comma(s) and leading whitespace, if any respectively 00239 while (idx < end && buf[idx] == ',') { 00240 idx++; 00241 } 00242 skipSpace(buf, &idx, end); 00243 //next value or end-of-line / end of header? 00244 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') { 00245 break; 00246 } 00247 //next value 00248 startIdx = idx; 00249 } 00250 } 00251 } while (nextLine(buf, &idx, end)); 00252 return idx; 00253 } 00254 00255 00256 TokenIterator HeaderTokenizer::iterator(const char *key) const 00257 { 00258 QByteArray keyBa = QByteArray::fromRawData(key, strlen(key)); 00259 if (contains(keyBa)) { 00260 return TokenIterator(value(keyBa).beginEnd, m_buffer); 00261 } else { 00262 return TokenIterator(m_nullTokens, m_buffer); 00263 } 00264 } 00265 00266 static void skipLWS(const QString &str, int &pos) 00267 { 00268 while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) { 00269 ++pos; 00270 } 00271 } 00272 00273 // keep the common ending, this allows the compiler to join them 00274 static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?="; 00275 static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?="; 00276 static const char valueSpecials[] = "()<>@,;:\\\"/[]?="; 00277 00278 static bool specialChar(const QChar &ch, const char *specials) 00279 { 00280 // WORKAROUND: According to RFC 2616, any character other than ascii 00281 // characters should NOT be allowed in unquoted content-disposition file 00282 // names. However, since none of the major browsers follow this rule, we do 00283 // the same thing here and allow all printable unicode characters. See 00284 // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials. 00285 if (!ch.isPrint()) { 00286 return true; 00287 } 00288 00289 for (int i = qstrlen(specials) - 1; i >= 0; i--) { 00290 if (ch == QLatin1Char(specials[i])) { 00291 return true; 00292 } 00293 } 00294 00295 return false; 00296 } 00297 00313 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials) 00314 { 00315 QString out; 00316 skipLWS(str, pos); 00317 bool valid = true; 00318 00319 while (pos < str.length() && (str[pos] != term)) { 00320 out += str[pos]; 00321 valid = (valid && !specialChar(str[pos], specials)); 00322 ++pos; 00323 } 00324 00325 if (pos < str.length()) { // Stopped due to finding term 00326 ++pos; 00327 } 00328 00329 if (!valid) { 00330 return QString(); 00331 } 00332 00333 // Remove trailing linear whitespace... 00334 while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) { 00335 out.chop(1); 00336 } 00337 00338 if (out.contains(QLatin1Char(' '))) { 00339 out.clear(); 00340 } 00341 00342 return out; 00343 } 00344 00345 // As above, but also handles quotes.. 00346 // pos is set to -1 on parse error 00347 static QString extractMaybeQuotedUntil(const QString &str, int &pos) 00348 { 00349 const QChar term = QLatin1Char(';'); 00350 00351 skipLWS(str, pos); 00352 00353 // Are we quoted? 00354 if (pos < str.length() && str[pos] == QLatin1Char('"')) { 00355 QString out; 00356 00357 // Skip the quote... 00358 ++pos; 00359 00360 // when quoted we also need an end-quote 00361 bool endquote = false; 00362 00363 // Parse until trailing quote... 00364 while (pos < str.length()) { 00365 if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) { 00366 // quoted-pair = "\" CHAR 00367 out += str[pos + 1]; 00368 pos += 2; // Skip both... 00369 } else if (str[pos] == QLatin1Char('"')) { 00370 ++pos; 00371 endquote = true; 00372 break; 00373 } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2 00374 break; 00375 } else { 00376 out += str[pos]; 00377 ++pos; 00378 } 00379 } 00380 00381 if (!endquote) { 00382 pos = -1; 00383 return QString(); 00384 } 00385 00386 // Skip until term.. 00387 while (pos < str.length() && (str[pos] != term)) { 00388 if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) { 00389 pos = -1; 00390 return QString(); 00391 } 00392 ++pos; 00393 } 00394 00395 if (pos < str.length()) { // Stopped due to finding term 00396 ++pos; 00397 } 00398 00399 return out; 00400 } else { 00401 return extractUntil(str, term, pos, valueSpecials); 00402 } 00403 } 00404 00405 static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition) 00406 { 00407 kDebug(7113) << "disposition: " << disposition; 00408 int pos = 0; 00409 const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower(); 00410 00411 QMap<QString, QString> parameters; 00412 QMap<QString, QString> contparams; // all parameters that contain continuations 00413 QMap<QString, QString> encparams; // all parameters that have character encoding 00414 00415 // the type is invalid, the complete header is junk 00416 if (strDisposition.isEmpty()) { 00417 return parameters; 00418 } 00419 00420 parameters.insert(QLatin1String("type"), strDisposition); 00421 00422 while (pos < disposition.length()) { 00423 QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower(); 00424 00425 if (key.isEmpty()) { 00426 // parse error in this key: do not parse more, but add up 00427 // everything we already got 00428 kDebug(7113) << "parse error in key, abort parsing"; 00429 break; 00430 } 00431 00432 QString val; 00433 if (key.endsWith(QLatin1Char('*'))) { 00434 val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials); 00435 } else { 00436 val = extractMaybeQuotedUntil(disposition, pos); 00437 } 00438 00439 if (val.isEmpty()) { 00440 if (pos == -1) { 00441 kDebug(7113) << "parse error in value, abort parsing"; 00442 break; 00443 } 00444 continue; 00445 } 00446 00447 const int spos = key.indexOf(QLatin1Char('*')); 00448 if (spos == key.length() - 1) { 00449 key.chop(1); 00450 encparams.insert(key, val); 00451 } else if (spos >= 0) { 00452 contparams.insert(key, val); 00453 } else if (parameters.contains(key)) { 00454 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00455 parameters.remove(key); 00456 return parameters; 00457 } else { 00458 parameters.insert(key, val); 00459 } 00460 } 00461 00462 QMap<QString, QString>::iterator i = contparams.begin(); 00463 while (i != contparams.end()) { 00464 QString key = i.key(); 00465 int spos = key.indexOf(QLatin1Char('*')); 00466 bool hasencoding = false; 00467 00468 if (key.at(spos + 1) != QLatin1Char('0')) { 00469 ++i; 00470 continue; 00471 } 00472 00473 // no leading zeros allowed, so delete the junk 00474 int klen = key.length(); 00475 if (klen > spos + 2) { 00476 // nothing but continuations and encodings may insert * into parameter name 00477 if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) { 00478 kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2); 00479 i = contparams.erase(i); 00480 continue; 00481 } 00482 hasencoding = true; 00483 } 00484 00485 int seqnum = 1; 00486 QMap<QString, QString>::iterator partsi; 00487 // we do not need to care about encoding specifications: only the first 00488 // part is allowed to have one 00489 QString val = i.value(); 00490 00491 key.chop(hasencoding ? 2 : 1); 00492 00493 while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) { 00494 val += partsi.value(); 00495 contparams.erase(partsi); 00496 } 00497 00498 i = contparams.erase(i); 00499 00500 key.chop(1); 00501 if (hasencoding) { 00502 encparams.insert(key, val); 00503 } else { 00504 if (parameters.contains(key)) { 00505 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00506 parameters.remove(key); 00507 return parameters; 00508 } 00509 00510 parameters.insert(key, val); 00511 } 00512 } 00513 00514 for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) { 00515 QString val = i.value(); 00516 00517 // RfC 2231 encoded character set in filename 00518 int spos = val.indexOf(QLatin1Char('\'')); 00519 if (spos == -1) { 00520 continue; 00521 } 00522 int npos = val.indexOf(QLatin1Char('\''), spos + 1); 00523 if (npos == -1) { 00524 continue; 00525 } 00526 00527 const QString charset = val.left(spos); 00528 const QString lang = val.mid(spos + 1, npos - spos - 1); 00529 const QByteArray rawval = QByteArray::fromPercentEncoding(val.mid(npos + 1).toAscii()); 00530 00531 if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) { 00532 bool valid = true; 00533 for (int j = rawval.length() - 1; (j >= 0) && valid; j--) { 00534 valid = (rawval.at(j) >= 32); 00535 } 00536 00537 if (!valid) 00538 continue; 00539 val = QString::fromAscii(rawval.constData()); 00540 } else { 00541 QTextCodec *codec = QTextCodec::codecForName(charset.toAscii()); 00542 if (!codec) 00543 continue; 00544 val = codec->toUnicode(rawval); 00545 } 00546 00547 parameters.insert(i.key(), val); 00548 } 00549 00550 return parameters; 00551 } 00552 00553 static QMap<QString, QString> contentDispositionParser(const QString &disposition) 00554 { 00555 QMap<QString, QString> parameters = contentDispositionParserInternal(disposition); 00556 00557 const QLatin1String fn("filename"); 00558 if (parameters.contains(fn)) { 00559 // Content-Disposition is not allowed to dictate directory 00560 // path, thus we extract the filename only. 00561 const QString val = QDir::toNativeSeparators(parameters[fn]); 00562 int slpos = val.lastIndexOf(QDir::separator()); 00563 00564 if (slpos > -1) { 00565 parameters.insert(fn, val.mid(slpos + 1)); 00566 } 00567 } 00568 00569 return parameters; 00570 }
KDE 4.7 API Reference