• Skip to content
  • Skip to link menu
KDE 4.6 API Reference
  • KDE API Reference
  • kdelibs
  • KDE Home
  • Contact Us
 

KIOSlave

parsinghelpers.cpp

Go to the documentation of this file.
00001 /* This file is part of the KDE libraries
00002     Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com>
00003 
00004     This library is free software; you can redistribute it and/or
00005     modify it under the terms of the GNU Library General Public
00006     License as published by the Free Software Foundation; either
00007     version 2 of the License, or (at your option) any later version.
00008 
00009     This library is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012     Library General Public License for more details.
00013 
00014     You should have received a copy of the GNU Library General Public License
00015     along with this library; see the file COPYING.LIB.  If not, write to
00016     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017     Boston, MA 02110-1301, USA.
00018 */
00019 
00020 #include <QDir>
00021 #include <QMap>
00022 #include <QTextCodec>
00023 #include <QUrl>
00024 
00025 #include <kcodecs.h>
00026 
// Move *pos forward over a run of blanks (' ' or '\t').
// Scanning never proceeds past index end; *pos is left untouched when it does
// not point at a blank.
static void skipSpace(const char input[], int *pos, int end)
{
    int cursor = *pos;
    for (; cursor < end; ++cursor) {
        const char c = input[cursor];
        if (c != ' ' && c != '\t') {
            break;
        }
    }
    *pos = cursor;
}
00037 
// Move *pos to the first character of the following line, tolerating any mix
// of '\r' and '\n' as line terminator.
// Returns true while more header lines follow; returns false once the input is
// exhausted or an empty line (two '\r' or two '\n' in the terminator run) has
// been seen, i.e. the end of the header was reached.
static bool nextLine(const char input[], int *pos, int end)
{
    int cursor = *pos;

    // run forward to the first line-break character of the current line
    for (; cursor < end; ++cursor) {
        const char c = input[cursor];
        if (c == '\r' || c == '\n') {
            break;
        }
    }

    // count the line-break characters; stop as soon as either kind occurred twice
    int crSeen = 0;
    int lfSeen = 0;
    while (cursor < end && crSeen < 2 && lfSeen < 2) {
        if (input[cursor] == '\r') {
            ++crSeen;
        } else if (input[cursor] == '\n') {
            ++lfSeen;
        } else {
            break;
        }
        ++cursor;
    }

    // One kind was seen twice, the other only once: swallow the missing partner
    // too, so that a well-formed "\r\n\r\n" (and also "\n\r\n\r") is consumed
    // completely.
    const bool onePartnerMissing = (crSeen == 2 && lfSeen == 1) || (crSeen == 1 && lfSeen == 2);
    if (cursor < end && onePartnerMissing) {
        const char partner = (crSeen == 1) ? '\r' : '\n';
        if (input[cursor] == partner) {
            ++cursor;
        }
    }

    *pos = cursor;
    return cursor < end && crSeen < 2 && lfSeen < 2;
}
00064 
// Case-insensitively match term at input[*pos].
// On a match *pos is moved just past the term and true is returned; on a
// mismatch *pos stays where it was and false is returned.
// If the term would reach index end or beyond (*pos + strlen(term) >= end),
// *pos is set to end and false is returned. Because of that, callers trying
// several terms in a row should try the shortest ones first.
static bool consume(const char input[], int *pos, int end, const char *term)
{
    const int start = *pos;
    const int termLength = static_cast<int>(strlen(term));

    if (start + termLength >= end) {
        *pos = end;
        return false;
    }
    if (strncasecmp(input + start, term, termLength) != 0) {
        return false;
    }
    *pos = start + termLength;
    return true;
}
00082 
00083 
00084 QByteArray TokenIterator::next()
00085 {
00086     QPair<int, int> token = m_tokens[m_currentToken++];
00087     //fromRawData brings some speed advantage but also the requirement to keep the text buffer
00088     //around. this together with implicit sharing (you don't know where copies end up)
00089     //is dangerous!
00090     //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
00091     return QByteArray(&m_buffer[token.first], token.second - token.first);
00092 }
00093 
00094 QByteArray TokenIterator::current() const
00095 {
00096     QPair<int, int> token = m_tokens[m_currentToken - 1];
00097     //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
00098     return QByteArray(&m_buffer[token.first], token.second - token.first);
00099 }
00100 
00101 QList<QByteArray> TokenIterator::all() const
00102 {
00103     QList<QByteArray> ret;
00104     for (int i = 0; i < m_tokens.count(); i++) {
00105         QPair<int, int> token = m_tokens[i];
00106         ret.append(QByteArray(&m_buffer[token.first], token.second - token.first));
00107     }
00108     return ret;
00109 }
00110 
00111 
00112 HeaderTokenizer::HeaderTokenizer(char *buffer)
00113  : m_buffer(buffer)
00114 {
00115     // add information about available headers and whether they have one or multiple,
00116     // comma-separated values.
00117 
00118     //The following response header fields are from RFC 2616 unless otherwise specified.
00119     //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about
00120     //a header field.
00121     static const HeaderFieldTemplate headerFieldTemplates[] = {
00122         {"accept-ranges", false},
00123         {"age", false},
00124         {"cache-control", true},
00125         {"connection", true},
00126         {"content-disposition", false}, //is multi-valued in a way, but with ";" separator!
00127         {"content-encoding", true},
00128         {"content-language", true},
00129         {"content-length", false},
00130         {"content-location", false},
00131         {"content-md5", false},
00132         {"content-type", false},
00133         {"date", false},
00134         {"dav", true}, //RFC 2518
00135         {"etag", false},
00136         {"expires", false},
00137         {"keep-alive", false}, //RFC 2068
00138         {"last-modified", false},
00139         {"link", false}, //RFC 2068, multi-valued with ";" separator
00140         {"location", false},
00141         {"p3p", true}, // http://www.w3.org/TR/P3P/
00142         {"pragma", true},
00143         {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate
00144                                        //multiple values. we handle this at a higher level.
00145         {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings
00146                                     //when using "connection" when talking to a proxy.
00147         {"refresh", false}, //not sure, only found some mailing list posts mentioning it
00148         {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved
00149                                //by sending several instances of this field as opposed to
00150                                //usually comma-separated lists with maybe multiple instances.
00151         {"transfer-encoding", true},
00152         {"upgrade", true},
00153         {"warning", true},
00154         {"www-authenticate", false} //see proxy-authenticate
00155     };
00156 
00157     for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) {
00158         const HeaderFieldTemplate &ft = headerFieldTemplates[i];
00159         insert(QByteArray(ft.name), HeaderField(ft.isMultiValued));
00160     }
00161 }
00162 
00163 int HeaderTokenizer::tokenize(int begin, int end)
00164 {
00165     char *buf = m_buffer;  //keep line length in check :/
00166     int idx = begin;
00167     int startIdx = begin; //multi-purpose start of current token
00168     bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma?
00169     QByteArray headerKey;
00170     do {
00171 
00172         if (buf[idx] == ' ' || buf [idx] == '\t') {
00173             // line continuation; preserve startIdx except (see below)
00174             if (headerKey.isEmpty()) {
00175                 continue;
00176             }
00177             // turn CR/LF into spaces for later parsing convenience
00178             int backIdx = idx - 1;
00179             while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) {
00180                 buf[backIdx--] = ' ';
00181             }
00182 
00183             // multiple values, comma-separated: add new value or continue previous?
00184             if (operator[](headerKey).isMultiValued) {
00185                 if (multiValuedEndedWithComma) {
00186                     // start new value; this is almost like no line continuation
00187                     skipSpace(buf, &idx, end);
00188                     startIdx = idx;
00189                 } else {
00190                     // continue previous value; this is tricky. unit tests to the rescue!
00191                     if (operator[](headerKey).beginEnd.last().first == startIdx) {
00192                         // remove entry, it will be re-added because already idx != startIdx
00193                         operator[](headerKey).beginEnd.removeLast();
00194                     } else {
00195                         // no comma, no entry: the prev line was whitespace only - start new value
00196                         skipSpace(buf, &idx, end);
00197                         startIdx = idx;
00198                     }
00199                 }
00200             }
00201 
00202         } else {
00203             // new field
00204             startIdx = idx;
00205             // also make sure that there is at least one char after the colon
00206             while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') {
00207                 buf[idx] = tolower(buf[idx]);
00208                 idx++;
00209             }
00210             if (buf[idx] != ':') {
00211                 //malformed line: no colon
00212                 headerKey.clear();
00213                 continue;
00214             }
00215             headerKey = QByteArray(&buf[startIdx], idx - startIdx);
00216             if (!contains(headerKey)) {
00217                 //we don't recognize this header line
00218                 headerKey.clear();
00219                 continue;
00220             }
00221             // skip colon & leading whitespace
00222             idx++;
00223             skipSpace(buf, &idx, end);
00224             startIdx = idx;
00225         }
00226 
00227         // we have the name/key of the field, now parse the value
00228         if (!operator[](headerKey).isMultiValued) {
00229 
00230             // scan to end of line
00231             while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') {
00232                 idx++;
00233             }
00234             if (!operator[](headerKey).beginEnd.isEmpty()) {
00235                 // there already is an entry; are we just in a line continuation?
00236                 if (operator[](headerKey).beginEnd.last().first == startIdx) {
00237                     // line continuation: delete previous entry and later insert a new, longer one.
00238                     operator[](headerKey).beginEnd.removeLast();
00239                 }
00240             }
00241             operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));
00242 
00243         } else {
00244 
00245             // comma-separated list
00246             while (true) {
00247                 //skip one value
00248                 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') {
00249                     idx++;
00250                 }
00251                 if (idx != startIdx) {
00252                     operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx));
00253                 }
00254                 multiValuedEndedWithComma = buf[idx] == ',';
00255                 //skip comma(s) and leading whitespace, if any respectively
00256                 while (idx < end && buf[idx] == ',') {
00257                     idx++;
00258                 }
00259                 skipSpace(buf, &idx, end);
00260                 //next value or end-of-line / end of header?
00261                 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') {
00262                     break;
00263                 }
00264                 //next value
00265                 startIdx = idx;
00266             }
00267         }
00268     } while (nextLine(buf, &idx, end));
00269     return idx;
00270 }
00271 
00272 
00273 TokenIterator HeaderTokenizer::iterator(const char *key) const
00274 {
00275     QByteArray keyBa = QByteArray::fromRawData(key, strlen(key));
00276     if (contains(keyBa)) {
00277         return TokenIterator(value(keyBa).beginEnd, m_buffer);
00278     } else {
00279         return TokenIterator(m_nullTokens, m_buffer);
00280     }
00281 }
00282 
00283 static void skipLWS(const QString &str, int &pos)
00284 {
00285     while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t')))
00286         ++pos;
00287 }
00288 
00289 // keep the common ending, this allows the compiler to join them
00290 static const char typeSpecials[] =  "{}*'%()<>@,;:\\\"/[]?=";
00291 static const char attrSpecials[] =     "'%()<>@,;:\\\"/[]?=";
00292 static const char valueSpecials[] =      "()<>@,;:\\\"/[]?=";
00293 
00294 static bool specialChar(const QChar &ch, const char *specials)
00295 {
00296     // WORKAROUND: According to RFC 2616, any character other than ascii
00297     // characters should NOT be allowed in unquoted content-disposition file
00298     // names. However, since none of the major browsers follow this rule, we do
00299     // the same thing here and allow all printable unicode characters. See
00300     // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials.
00301     if(!ch.isPrint())
00302         return true;
00303 
00304     for( int i = qstrlen(specials) - 1; i>= 0; i--)
00305        if( ch == QLatin1Char(specials[i]) )
00306            return true;
00307 
00308     return false;
00309 }
00310 
00326 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials)
00327 {
00328     QString out;
00329     skipLWS(str, pos);
00330     bool valid = true;
00331 
00332     while (pos < str.length() && (str[pos] != term)) {
00333         out += str[pos];
00334         valid = (valid && !specialChar(str[pos], specials));
00335         ++pos;
00336     }
00337 
00338     if (pos < str.length()) // Stopped due to finding term
00339         ++pos;
00340 
00341     if( !valid )
00342         return QString();
00343 
00344     // Remove trailing linear whitespace...
00345     while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t')))
00346         out.chop(1);
00347 
00348     if( out.contains(QLatin1Char(' ')) )
00349         out.clear();
00350 
00351     return out;
00352 }
00353 
00354 // As above, but also handles quotes..
00355 // pos is set to -1 on parse error
00356 static QString extractMaybeQuotedUntil(const QString &str, int &pos)
00357 {
00358     const QChar term = QLatin1Char(';');
00359 
00360     skipLWS(str, pos);
00361 
00362     // Are we quoted?
00363     if (pos < str.length() && str[pos] == QLatin1Char('"')) {
00364         QString out;
00365 
00366         // Skip the quote...
00367         ++pos;
00368 
00369         // when quoted we also need an end-quote
00370         bool endquote = false;
00371 
00372         // Parse until trailing quote...
00373         while (pos < str.length()) {
00374             if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) {
00375                 // quoted-pair = "\" CHAR
00376                 out += str[pos + 1];
00377                 pos += 2; // Skip both...
00378             } else if (str[pos] == QLatin1Char('"')) {
00379                 ++pos;
00380                 endquote = true;
00381                 break;
00382             }  else {
00383                 out += str[pos];
00384                 ++pos;
00385             }
00386         }
00387 
00388         if( !endquote ) {
00389             pos = -1;
00390             return QString();
00391         }
00392 
00393         // Skip until term..
00394         while (pos < str.length() && (str[pos] != term)) {
00395             if( (str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t')) ) {
00396               pos = -1;
00397               return QString();
00398             }
00399             ++pos;
00400         }
00401 
00402         if (pos < str.length()) // Stopped due to finding term
00403             ++pos;
00404 
00405         return out;
00406     } else {
00407         return extractUntil(str, term, pos, valueSpecials);
00408     }
00409 }
00410 
00411 static QMap<QString, QString> contentDispositionParser(const QString &disposition)
00412 {
00413     kDebug(7113) << "disposition: " << disposition;
00414     int pos = 0;
00415     const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower();
00416 
00417     QMap<QString, QString> parameters;
00418     QMap<QString, QString> contparams;   // all parameters that contain continuations
00419     QMap<QString, QString> encparams;    // all parameters that have character encoding
00420 
00421     // the type is invalid, the complete header is junk
00422     if( strDisposition.isEmpty() )
00423         return parameters;
00424 
00425     parameters.insert(QLatin1String("type"), strDisposition);
00426 
00427     while (pos < disposition.length()) {
00428         QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower();
00429 
00430         if( key.isEmpty() ) {
00431             // parse error in this key: do not parse more, but add up
00432             // everything we already got
00433             kDebug(7113) << "parse error, abort parsing";
00434             break;
00435         }
00436 
00437         QString val;
00438         if( key.endsWith(QLatin1Char('*')) )
00439             val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials).toLower();
00440         else
00441             val = extractMaybeQuotedUntil(disposition, pos);
00442 
00443         if( val.isEmpty() ) {
00444             if( pos == -1 ) {
00445                 kDebug(7113) << "parse error, abort parsing";
00446                 break;
00447             }
00448             continue;
00449         }
00450 
00451         const int spos = key.indexOf(QLatin1Char('*'));
00452         if( spos == key.length() - 1 ) {
00453             key.chop(1);
00454             encparams.insert(key, val);
00455         } else if( spos >= 0 ) {
00456             contparams.insert(key, val);
00457         } else if( parameters.contains(key) ) {
00458             kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
00459             parameters.remove(key);
00460             return parameters;
00461         } else {
00462             parameters.insert(key, val);
00463         }
00464     }
00465 
00466     QMap<QString, QString>::iterator i = contparams.begin();
00467     while( i != contparams.end() ) {
00468         QString key = i.key();
00469         int spos = key.indexOf(QLatin1Char('*'));
00470         bool hasencoding = false;
00471 
00472         if( key.at(spos + 1) != QLatin1Char('0') ) {
00473             ++i;
00474             continue;
00475         }
00476 
00477         // no leading zeros allowed, so delete the junk
00478         int klen = key.length();
00479         if( klen > spos + 2 ) {
00480             // nothing but continuations and encodings may insert * into parameter name
00481             if( (klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*'))) ) {
00482                 kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2);
00483                 i = contparams.erase(i);
00484                 continue;
00485             }
00486             hasencoding = true;
00487         }
00488 
00489         int seqnum = 1;
00490         QMap<QString, QString>::iterator partsi;
00491         // we do not need to care about encoding specifications: only the first
00492         // part is allowed to have one
00493         QString val = i.value();
00494 
00495         if (hasencoding)
00496             key.chop(2);
00497         else
00498             key.chop(1);
00499 
00500         while( (partsi = contparams.find(key + QString::number(seqnum))) != contparams.end() )
00501         {
00502             val += partsi.value();
00503             contparams.erase(partsi);
00504         }
00505 
00506         i = contparams.erase(i);
00507 
00508         key.chop(1);
00509         if (hasencoding) {
00510             encparams.insert(key, val);
00511         } else {
00512             if( parameters.contains(key) ) {
00513                 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more";
00514                 parameters.remove(key);
00515                 return parameters;
00516             }
00517 
00518             parameters.insert(key, val);
00519         }
00520     }
00521 
00522     for( QMap<QString, QString>::iterator i = encparams.begin(), iEnd = encparams.end();
00523          i != encparams.end(); ++i )
00524     {
00525         QString val = i.value();
00526 
00527         // RfC 2231 encoded character set in filename
00528         int spos = val.indexOf(QLatin1Char('\''));
00529         if (spos == -1)
00530             continue;
00531         int npos = val.indexOf(QLatin1Char('\''), spos + 1);
00532         if (npos == -1)
00533             continue;
00534 
00535         const QString charset = val.left( spos );
00536         const QString lang = val.mid( spos + 1, npos - spos - 1 );
00537         const QByteArray rawval = QByteArray::fromPercentEncoding( val.mid(npos + 1).toAscii() );
00538         if( charset.isEmpty() || (charset == QLatin1String("us-ascii")) ) {
00539             bool valid = true;
00540             for( int j = rawval.length() - 1; (j >= 0) && valid; j-- )
00541                 valid = (rawval.at(j) >= 32);
00542 
00543             if( valid )
00544                 val = QString::fromAscii(rawval.constData());
00545             else
00546                 val.clear();
00547         } else {
00548             QTextCodec *codec = QTextCodec::codecForName( charset.toAscii() );
00549             if( codec )
00550                 val = codec->toUnicode( rawval );
00551             else
00552                 val.clear();
00553         }
00554 
00555         if( !val.isEmpty() ) {
00556             parameters.insert( i.key(), val );
00557         }
00558     }
00559 
00560     const QLatin1String fn("filename");
00561     if( parameters.contains(fn) ) {
00562         // Content-Disposition is not allowed to dictate directory
00563         // path, thus we extract the filename only.
00564         const QString val = QDir::toNativeSeparators( parameters[fn] );
00565         int slpos = val.lastIndexOf( QDir::separator() );
00566 
00567         if( slpos > -1 )
00568             parameters.insert(fn, val.mid( slpos + 1 ));
00569     }
00570 
00571     return parameters;
00572 }

KIOSlave

Skip menu "KIOSlave"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.7.3
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal