KIOSlave
parsinghelpers.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 2008 Andreas Hartmetz <ahartmetz@gmail.com> 00003 Copyright (C) 2010,2011 Rolf Eike Beer <kde@opensource.sf-tec.de> 00004 00005 This library is free software; you can redistribute it and/or 00006 modify it under the terms of the GNU Library General Public 00007 License as published by the Free Software Foundation; either 00008 version 2 of the License, or (at your option) any later version. 00009 00010 This library is distributed in the hope that it will be useful, 00011 but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 Library General Public License for more details. 00014 00015 You should have received a copy of the GNU Library General Public License 00016 along with this library; see the file COPYING.LIB. If not, write to 00017 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00018 Boston, MA 02110-1301, USA. 00019 */ 00020 00021 #include <ctype.h> 00022 00023 #include <QDir> 00024 #include <QMap> 00025 #include <QTextCodec> 00026 #include <QUrl> 00027 00028 #include <kcodecs.h> 00029 #include <kdebug.h> 00030 00031 // Advance *pos beyond spaces / tabs 00032 static void skipSpace(const char input[], int *pos, int end) 00033 { 00034 int idx = *pos; 00035 while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) { 00036 idx++; 00037 } 00038 *pos = idx; 00039 return; 00040 } 00041 00042 // Advance *pos to start of next line while being forgiving about line endings. 00043 // Return false if the end of the header has been reached, true otherwise. 00044 static bool nextLine(const char input[], int *pos, int end) 00045 { 00046 int idx = *pos; 00047 while (idx < end && input[idx] != '\r' && input[idx] != '\n') { 00048 idx++; 00049 } 00050 int rCount = 0; 00051 int nCount = 0; 00052 while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) { 00053 input[idx] == '\r' ? rCount++ : nCount++; 00054 idx++; 00055 } 00056 if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) { 00057 // if just one of the others is missing eat it too. 00058 // this ensures that conforming headers using the proper 00059 // \r\n sequence (and also \n\r) will be parsed correctly. 00060 if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) { 00061 idx++; 00062 } 00063 } 00064 00065 *pos = idx; 00066 return idx < end && rCount < 2 && nCount < 2; 00067 } 00068 00069 // QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need 00070 // to check here if this is valid at all. 00071 static bool isValidPercentEncoding(const QByteArray &data) 00072 { 00073 int i = 0; 00074 const int last = data.length() - 1; 00075 const char *d = data.constData(); 00076 00077 while ( (i = data.indexOf('%', i)) != -1) { 00078 if ( i >= last - 2 ) 00079 return false; 00080 if ( ! isxdigit(d[i + 1]) ) 00081 return false; 00082 if ( ! isxdigit(d[i + 2]) ) 00083 return false; 00084 i++; 00085 } 00086 00087 return true; 00088 } 00089 00090 QByteArray TokenIterator::next() 00091 { 00092 QPair<int, int> token = m_tokens[m_currentToken++]; 00093 //fromRawData brings some speed advantage but also the requirement to keep the text buffer 00094 //around. this together with implicit sharing (you don't know where copies end up) 00095 //is dangerous! 00096 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00097 return QByteArray(&m_buffer[token.first], token.second - token.first); 00098 } 00099 00100 QByteArray TokenIterator::current() const 00101 { 00102 QPair<int, int> token = m_tokens[m_currentToken - 1]; 00103 //return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 00104 return QByteArray(&m_buffer[token.first], token.second - token.first); 00105 } 00106 00107 QList<QByteArray> TokenIterator::all() const 00108 { 00109 QList<QByteArray> ret; 00110 for (int i = 0; i < m_tokens.count(); i++) { 00111 QPair<int, int> token = m_tokens[i]; 00112 ret.append(QByteArray(&m_buffer[token.first], token.second - token.first)); 00113 } 00114 return ret; 00115 } 00116 00117 00118 HeaderTokenizer::HeaderTokenizer(char *buffer) 00119 : m_buffer(buffer) 00120 { 00121 // add information about available headers and whether they have one or multiple, 00122 // comma-separated values. 00123 00124 //The following response header fields are from RFC 2616 unless otherwise specified. 00125 //Hint: search the web for e.g. 'http "accept-ranges header"' to find information about 00126 //a header field. 00127 static const HeaderFieldTemplate headerFieldTemplates[] = { 00128 {"accept-ranges", false}, 00129 {"age", false}, 00130 {"cache-control", true}, 00131 {"connection", true}, 00132 {"content-disposition", false}, //is multi-valued in a way, but with ";" separator! 00133 {"content-encoding", true}, 00134 {"content-language", true}, 00135 {"content-length", false}, 00136 {"content-location", false}, 00137 {"content-md5", false}, 00138 {"content-type", false}, 00139 {"date", false}, 00140 {"dav", true}, //RFC 2518 00141 {"etag", false}, 00142 {"expires", false}, 00143 {"keep-alive", true}, //RFC 2068 00144 {"last-modified", false}, 00145 {"link", false}, //RFC 2068, multi-valued with ";" separator 00146 {"location", false}, 00147 {"p3p", true}, // http://www.w3.org/TR/P3P/ 00148 {"pragma", true}, 00149 {"proxy-authenticate", false}, //complicated multi-valuedness: quoted commas don't separate 00150 //multiple values. we handle this at a higher level. 00151 {"proxy-connection", true}, //inofficial but well-known; to avoid misunderstandings 00152 //when using "connection" when talking to a proxy. 00153 {"refresh", false}, //not sure, only found some mailing list posts mentioning it 00154 {"set-cookie", false}, //RFC 2109; the multi-valuedness seems to be usually achieved 00155 //by sending several instances of this field as opposed to 00156 //usually comma-separated lists with maybe multiple instances. 00157 {"transfer-encoding", true}, 00158 {"upgrade", true}, 00159 {"warning", true}, 00160 {"www-authenticate", false} //see proxy-authenticate 00161 }; 00162 00163 for (uint i = 0; i < sizeof(headerFieldTemplates) / sizeof(HeaderFieldTemplate); i++) { 00164 const HeaderFieldTemplate &ft = headerFieldTemplates[i]; 00165 insert(QByteArray(ft.name), HeaderField(ft.isMultiValued)); 00166 } 00167 } 00168 00169 int HeaderTokenizer::tokenize(int begin, int end) 00170 { 00171 char *buf = m_buffer; //keep line length in check :/ 00172 int idx = begin; 00173 int startIdx = begin; //multi-purpose start of current token 00174 bool multiValuedEndedWithComma = false; //did the last multi-valued line end with a comma? 00175 QByteArray headerKey; 00176 do { 00177 00178 if (buf[idx] == ' ' || buf [idx] == '\t') { 00179 // line continuation; preserve startIdx except (see below) 00180 if (headerKey.isEmpty()) { 00181 continue; 00182 } 00183 // turn CR/LF into spaces for later parsing convenience 00184 int backIdx = idx - 1; 00185 while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) { 00186 buf[backIdx--] = ' '; 00187 } 00188 00189 // multiple values, comma-separated: add new value or continue previous? 00190 if (operator[](headerKey).isMultiValued) { 00191 if (multiValuedEndedWithComma) { 00192 // start new value; this is almost like no line continuation 00193 skipSpace(buf, &idx, end); 00194 startIdx = idx; 00195 } else { 00196 // continue previous value; this is tricky. unit tests to the rescue! 00197 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00198 // remove entry, it will be re-added because already idx != startIdx 00199 operator[](headerKey).beginEnd.removeLast(); 00200 } else { 00201 // no comma, no entry: the prev line was whitespace only - start new value 00202 skipSpace(buf, &idx, end); 00203 startIdx = idx; 00204 } 00205 } 00206 } 00207 00208 } else { 00209 // new field 00210 startIdx = idx; 00211 // also make sure that there is at least one char after the colon 00212 while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') { 00213 buf[idx] = tolower(buf[idx]); 00214 idx++; 00215 } 00216 if (buf[idx] != ':') { 00217 //malformed line: no colon 00218 headerKey.clear(); 00219 continue; 00220 } 00221 headerKey = QByteArray(&buf[startIdx], idx - startIdx); 00222 if (!contains(headerKey)) { 00223 //we don't recognize this header line 00224 headerKey.clear(); 00225 continue; 00226 } 00227 // skip colon & leading whitespace 00228 idx++; 00229 skipSpace(buf, &idx, end); 00230 startIdx = idx; 00231 } 00232 00233 // we have the name/key of the field, now parse the value 00234 if (!operator[](headerKey).isMultiValued) { 00235 00236 // scan to end of line 00237 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') { 00238 idx++; 00239 } 00240 if (!operator[](headerKey).beginEnd.isEmpty()) { 00241 // there already is an entry; are we just in a line continuation? 00242 if (operator[](headerKey).beginEnd.last().first == startIdx) { 00243 // line continuation: delete previous entry and later insert a new, longer one. 00244 operator[](headerKey).beginEnd.removeLast(); 00245 } 00246 } 00247 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00248 00249 } else { 00250 00251 // comma-separated list 00252 while (true) { 00253 //skip one value 00254 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') { 00255 idx++; 00256 } 00257 if (idx != startIdx) { 00258 operator[](headerKey).beginEnd.append(QPair<int, int>(startIdx, idx)); 00259 } 00260 multiValuedEndedWithComma = buf[idx] == ','; 00261 //skip comma(s) and leading whitespace, if any respectively 00262 while (idx < end && buf[idx] == ',') { 00263 idx++; 00264 } 00265 skipSpace(buf, &idx, end); 00266 //next value or end-of-line / end of header? 00267 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') { 00268 break; 00269 } 00270 //next value 00271 startIdx = idx; 00272 } 00273 } 00274 } while (nextLine(buf, &idx, end)); 00275 return idx; 00276 } 00277 00278 00279 TokenIterator HeaderTokenizer::iterator(const char *key) const 00280 { 00281 QByteArray keyBa = QByteArray::fromRawData(key, strlen(key)); 00282 if (contains(keyBa)) { 00283 return TokenIterator(value(keyBa).beginEnd, m_buffer); 00284 } else { 00285 return TokenIterator(m_nullTokens, m_buffer); 00286 } 00287 } 00288 00289 static void skipLWS(const QString &str, int &pos) 00290 { 00291 while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) { 00292 ++pos; 00293 } 00294 } 00295 00296 // keep the common ending, this allows the compiler to join them 00297 static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?="; 00298 static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?="; 00299 static const char valueSpecials[] = "()<>@,;:\\\"/[]?="; 00300 00301 static bool specialChar(const QChar &ch, const char *specials) 00302 { 00303 // WORKAROUND: According to RFC 2616, any character other than ascii 00304 // characters should NOT be allowed in unquoted content-disposition file 00305 // names. However, since none of the major browsers follow this rule, we do 00306 // the same thing here and allow all printable unicode characters. See 00307 // https://bugs.kde.org/show_bug.cgi?id=261223 for the detials. 00308 if (!ch.isPrint()) { 00309 return true; 00310 } 00311 00312 for (int i = qstrlen(specials) - 1; i >= 0; i--) { 00313 if (ch == QLatin1Char(specials[i])) { 00314 return true; 00315 } 00316 } 00317 00318 return false; 00319 } 00320 00336 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials) 00337 { 00338 QString out; 00339 skipLWS(str, pos); 00340 bool valid = true; 00341 00342 while (pos < str.length() && (str[pos] != term)) { 00343 out += str[pos]; 00344 valid = (valid && !specialChar(str[pos], specials)); 00345 ++pos; 00346 } 00347 00348 if (pos < str.length()) { // Stopped due to finding term 00349 ++pos; 00350 } 00351 00352 if (!valid) { 00353 return QString(); 00354 } 00355 00356 // Remove trailing linear whitespace... 00357 while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) { 00358 out.chop(1); 00359 } 00360 00361 if (out.contains(QLatin1Char(' '))) { 00362 out.clear(); 00363 } 00364 00365 return out; 00366 } 00367 00368 // As above, but also handles quotes.. 00369 // pos is set to -1 on parse error 00370 static QString extractMaybeQuotedUntil(const QString &str, int &pos) 00371 { 00372 const QChar term = QLatin1Char(';'); 00373 00374 skipLWS(str, pos); 00375 00376 // Are we quoted? 00377 if (pos < str.length() && str[pos] == QLatin1Char('"')) { 00378 QString out; 00379 00380 // Skip the quote... 00381 ++pos; 00382 00383 // when quoted we also need an end-quote 00384 bool endquote = false; 00385 00386 // Parse until trailing quote... 00387 while (pos < str.length()) { 00388 if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) { 00389 // quoted-pair = "\" CHAR 00390 out += str[pos + 1]; 00391 pos += 2; // Skip both... 00392 } else if (str[pos] == QLatin1Char('"')) { 00393 ++pos; 00394 endquote = true; 00395 break; 00396 } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2 00397 break; 00398 } else { 00399 out += str[pos]; 00400 ++pos; 00401 } 00402 } 00403 00404 if (!endquote) { 00405 pos = -1; 00406 return QString(); 00407 } 00408 00409 // Skip until term.. 00410 while (pos < str.length() && (str[pos] != term)) { 00411 if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) { 00412 pos = -1; 00413 return QString(); 00414 } 00415 ++pos; 00416 } 00417 00418 if (pos < str.length()) { // Stopped due to finding term 00419 ++pos; 00420 } 00421 00422 return out; 00423 } else { 00424 return extractUntil(str, term, pos, valueSpecials); 00425 } 00426 } 00427 00428 static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition) 00429 { 00430 kDebug(7113) << "disposition: " << disposition; 00431 int pos = 0; 00432 const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower(); 00433 00434 QMap<QString, QString> parameters; 00435 QMap<QString, QString> contparams; // all parameters that contain continuations 00436 QMap<QString, QString> encparams; // all parameters that have character encoding 00437 00438 // the type is invalid, the complete header is junk 00439 if (strDisposition.isEmpty()) { 00440 return parameters; 00441 } 00442 00443 parameters.insert(QLatin1String("type"), strDisposition); 00444 00445 while (pos < disposition.length()) { 00446 QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower(); 00447 00448 if (key.isEmpty()) { 00449 // parse error in this key: do not parse more, but add up 00450 // everything we already got 00451 kDebug(7113) << "parse error in key, abort parsing"; 00452 break; 00453 } 00454 00455 QString val; 00456 if (key.endsWith(QLatin1Char('*'))) { 00457 val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials); 00458 } else { 00459 val = extractMaybeQuotedUntil(disposition, pos); 00460 } 00461 00462 if (val.isEmpty()) { 00463 if (pos == -1) { 00464 kDebug(7113) << "parse error in value, abort parsing"; 00465 break; 00466 } 00467 continue; 00468 } 00469 00470 const int spos = key.indexOf(QLatin1Char('*')); 00471 if (spos == key.length() - 1) { 00472 key.chop(1); 00473 encparams.insert(key, val); 00474 } else if (spos >= 0) { 00475 contparams.insert(key, val); 00476 } else if (parameters.contains(key)) { 00477 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00478 parameters.remove(key); 00479 return parameters; 00480 } else { 00481 parameters.insert(key, val); 00482 } 00483 } 00484 00485 QMap<QString, QString>::iterator i = contparams.begin(); 00486 while (i != contparams.end()) { 00487 QString key = i.key(); 00488 int spos = key.indexOf(QLatin1Char('*')); 00489 bool hasencoding = false; 00490 00491 if (key.at(spos + 1) != QLatin1Char('0')) { 00492 ++i; 00493 continue; 00494 } 00495 00496 // no leading zeros allowed, so delete the junk 00497 int klen = key.length(); 00498 if (klen > spos + 2) { 00499 // nothing but continuations and encodings may insert * into parameter name 00500 if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) { 00501 kDebug(7113) << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2); 00502 i = contparams.erase(i); 00503 continue; 00504 } 00505 hasencoding = true; 00506 } 00507 00508 int seqnum = 1; 00509 QMap<QString, QString>::iterator partsi; 00510 // we do not need to care about encoding specifications: only the first 00511 // part is allowed to have one 00512 QString val = i.value(); 00513 00514 key.chop(hasencoding ? 2 : 1); 00515 00516 while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) { 00517 val += partsi.value(); 00518 contparams.erase(partsi); 00519 } 00520 00521 i = contparams.erase(i); 00522 00523 key.chop(1); 00524 if (hasencoding) { 00525 encparams.insert(key, val); 00526 } else { 00527 if (parameters.contains(key)) { 00528 kDebug(7113) << "duplicate key" << key << "found, ignoring everything more"; 00529 parameters.remove(key); 00530 return parameters; 00531 } 00532 00533 parameters.insert(key, val); 00534 } 00535 } 00536 00537 for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) { 00538 QString val = i.value(); 00539 00540 // RfC 2231 encoded character set in filename 00541 int spos = val.indexOf(QLatin1Char('\'')); 00542 if (spos == -1) { 00543 continue; 00544 } 00545 int npos = val.indexOf(QLatin1Char('\''), spos + 1); 00546 if (npos == -1) { 00547 continue; 00548 } 00549 00550 const QString charset = val.left(spos); 00551 const QString lang = val.mid(spos + 1, npos - spos - 1); 00552 const QByteArray encodedVal = val.mid(npos + 1).toAscii(); 00553 00554 if ( ! isValidPercentEncoding(encodedVal) ) 00555 continue; 00556 00557 const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal); 00558 00559 if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) { 00560 bool valid = true; 00561 for (int j = rawval.length() - 1; (j >= 0) && valid; j--) { 00562 valid = (rawval.at(j) >= 32); 00563 } 00564 00565 if (!valid) 00566 continue; 00567 val = QString::fromAscii(rawval.constData()); 00568 } else { 00569 QTextCodec *codec = QTextCodec::codecForName(charset.toAscii()); 00570 if (!codec) 00571 continue; 00572 val = codec->toUnicode(rawval); 00573 } 00574 00575 parameters.insert(i.key(), val); 00576 } 00577 00578 return parameters; 00579 } 00580 00581 static QMap<QString, QString> contentDispositionParser(const QString &disposition) 00582 { 00583 QMap<QString, QString> parameters = contentDispositionParserInternal(disposition); 00584 00585 const QLatin1String fn("filename"); 00586 if (parameters.contains(fn)) { 00587 // Content-Disposition is not allowed to dictate directory 00588 // path, thus we extract the filename only. 00589 const QString val = QDir::toNativeSeparators(parameters[fn]); 00590 int slpos = val.lastIndexOf(QDir::separator()); 00591 00592 if (slpos > -1) { 00593 parameters.insert(fn, val.mid(slpos + 1)); 00594 } 00595 } 00596 00597 return parameters; 00598 }
This file is part of the KDE documentation.
Documentation copyright © 1996-2012 The KDE developers.
Generated on Thu May 10 2012 20:57:54 by doxygen 1.8.0 written by Dimitri van Heesch, © 1997-2006
Documentation copyright © 1996-2012 The KDE developers.
Generated on Thu May 10 2012 20:57:54 by doxygen 1.8.0 written by Dimitri van Heesch, © 1997-2006
KDE's Doxygen guidelines are available online.