KDECore
kstringhandler.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 Copyright (C) 1999 Ian Zepp (icszepp@islc.net) 00003 Copyright (C) 2006 by Dominic Battre <dominic@battre.de> 00004 Copyright (C) 2006 by Martin Pool <mbp@canonical.com> 00005 00006 This library is free software; you can redistribute it and/or 00007 modify it under the terms of the GNU Library General Public 00008 License as published by the Free Software Foundation; either 00009 version 2 of the License, or (at your option) any later version. 00010 00011 This library is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 Library General Public License for more details. 00015 00016 You should have received a copy of the GNU Library General Public License 00017 along with this library; see the file COPYING.LIB. If not, write to 00018 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 Boston, MA 02110-1301, USA. 00020 */ 00021 00022 #include "kstringhandler.h" 00023 00024 #include <stdlib.h> // random() 00025 00026 #include <kglobal.h> 00027 00028 #include <QtCore/QRegExp> // for the word ranges 00029 #include <QtCore/QCharRef> 00030 #include <QtCore/QMutableStringListIterator> 00031 00032 00033 00034 // 00035 // Capitalization routines 00036 // 00037 QString KStringHandler::capwords( const QString &text ) 00038 { 00039 if ( text.isEmpty() ) { 00040 return text; 00041 } 00042 00043 const QString strippedText = text.trimmed(); 00044 const QString space = QString(QLatin1Char(' ')); 00045 const QStringList words = capwords(strippedText.split(space)); 00046 00047 QString result = text; 00048 result.replace(strippedText, words.join(space)); 00049 return result; 00050 } 00051 00052 QStringList KStringHandler::capwords( const QStringList &list ) 00053 { 00054 QStringList tmp = list; 00055 for ( QStringList::Iterator it = tmp.begin(); it != tmp.end(); ++it ) { 00056 *it = ( *it )[ 0 ].toUpper() + ( *it ).mid( 1 ); 00057 } 00058 return tmp; 00059 } 00060 00061 00062 QString KStringHandler::lsqueeze( const QString & str, int maxlen ) 00063 { 00064 if (str.length() > maxlen) { 00065 int part = maxlen-3; 00066 return QString::fromLatin1("...") + str.right(part); 00067 } 00068 else return str; 00069 } 00070 00071 QString KStringHandler::csqueeze( const QString & str, int maxlen ) 00072 { 00073 if (str.length() > maxlen && maxlen > 3) { 00074 const int part = (maxlen-3)/2; 00075 return str.left(part) + QLatin1String("...") + str.right(part); 00076 } 00077 else return str; 00078 } 00079 00080 QString KStringHandler::rsqueeze( const QString & str, int maxlen ) 00081 { 00082 if (str.length() > maxlen) { 00083 int part = maxlen-3; 00084 return str.left(part) + QLatin1String("..."); 00085 } 00086 else return str; 00087 } 00088 00089 QStringList KStringHandler::perlSplit(const QString & sep, const QString & s, int max) 00090 { 00091 bool ignoreMax = 0 == max; 00092 00093 QStringList l; 00094 00095 int searchStart = 0; 00096 00097 int tokenStart = s.indexOf(sep, searchStart); 00098 00099 while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) 00100 { 00101 if (!s.mid(searchStart, tokenStart - searchStart).isEmpty()) 00102 l << s.mid(searchStart, tokenStart - searchStart); 00103 00104 searchStart = tokenStart + sep.length(); 00105 tokenStart = s.indexOf(sep, searchStart); 00106 } 00107 00108 if (!s.mid(searchStart, s.length() - searchStart).isEmpty()) 00109 l << s.mid(searchStart, s.length() - searchStart); 00110 00111 return l; 00112 } 00113 00114 QStringList KStringHandler::perlSplit(const QChar & sep, const QString & s, int max) 00115 { 00116 bool ignoreMax = 0 == max; 00117 00118 QStringList l; 00119 00120 int searchStart = 0; 00121 00122 int tokenStart = s.indexOf(sep, searchStart); 00123 00124 while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) 00125 { 00126 if (!s.mid(searchStart, tokenStart - searchStart).isEmpty()) 00127 l << s.mid(searchStart, tokenStart - searchStart); 00128 00129 searchStart = tokenStart + 1; 00130 tokenStart = s.indexOf(sep, searchStart); 00131 } 00132 00133 if (!s.mid(searchStart, s.length() - searchStart).isEmpty()) 00134 l << s.mid(searchStart, s.length() - searchStart); 00135 00136 return l; 00137 } 00138 00139 QStringList KStringHandler::perlSplit(const QRegExp & sep, const QString & s, int max) 00140 { 00141 bool ignoreMax = 0 == max; 00142 00143 QStringList l; 00144 00145 int searchStart = 0; 00146 int tokenStart = sep.indexIn(s, searchStart); 00147 int len = sep.matchedLength(); 00148 00149 while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) 00150 { 00151 if (!s.mid(searchStart, tokenStart - searchStart).isEmpty()) 00152 l << s.mid(searchStart, tokenStart - searchStart); 00153 00154 searchStart = tokenStart + len; 00155 tokenStart = sep.indexIn(s, searchStart); 00156 len = sep.matchedLength(); 00157 } 00158 00159 if (!s.mid(searchStart, s.length() - searchStart).isEmpty()) 00160 l << s.mid(searchStart, s.length() - searchStart); 00161 00162 return l; 00163 } 00164 00165 QString KStringHandler::tagUrls( const QString& text ) 00166 { 00167 /*static*/ QRegExp urlEx(QLatin1String("(www\\.(?!\\.)|(fish|(f|ht)tp(|s))://)[\\d\\w\\./,:_~\\?=&;#@\\-\\+\\%\\$]+[\\d\\w/]")); 00168 00169 QString richText( text ); 00170 int urlPos = 0, urlLen; 00171 while ((urlPos = urlEx.indexIn(richText, urlPos)) >= 0) 00172 { 00173 urlLen = urlEx.matchedLength(); 00174 QString href = richText.mid( urlPos, urlLen ); 00175 // Qt doesn't support (?<=pattern) so we do it here 00176 if((urlPos > 0) && richText[urlPos-1].isLetterOrNumber()){ 00177 urlPos++; 00178 continue; 00179 } 00180 // Don't use QString::arg since %01, %20, etc could be in the string 00181 QString anchor = QString::fromLatin1("<a href=\"") + href + QLatin1String("\">") + href + QLatin1String("</a>"); 00182 richText.replace( urlPos, urlLen, anchor ); 00183 00184 00185 urlPos += anchor.length(); 00186 } 00187 return richText; 00188 } 00189 00190 QString KStringHandler::obscure( const QString &str ) 00191 { 00192 QString result; 00193 const QChar *unicode = str.unicode(); 00194 for ( int i = 0; i < str.length(); ++i ) 00195 // yes, no typo. can't encode ' ' or '!' because 00196 // they're the unicode BOM. stupid scrambling. stupid. 00197 result += ( unicode[ i ].unicode() <= 0x21 ) ? unicode[ i ] : 00198 QChar( 0x1001F - unicode[ i ].unicode() ); 00199 00200 return result; 00201 } 00202 00203 00204 bool KStringHandler::isUtf8( const char *buf ) 00205 { 00206 int i, n; 00207 register unsigned char c; 00208 bool gotone = false; 00209 00210 if (!buf) 00211 return true; // whatever, just don't crash 00212 00213 #define F 0 /* character never appears in text */ 00214 #define T 1 /* character appears in plain ASCII text */ 00215 #define I 2 /* character appears in ISO-8859 text */ 00216 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 00217 00218 static const unsigned char text_chars[256] = { 00219 /* BEL BS HT LF FF CR */ 00220 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 00221 /* ESC */ 00222 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 00223 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 00224 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 00225 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 00226 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 00227 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 00228 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 00229 /* NEL */ 00230 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 00231 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 00232 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 00233 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 00234 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 00235 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 00236 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 00237 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 00238 }; 00239 00240 /* *ulen = 0; */ 00241 for (i = 0; (c = buf[i]); ++i) { 00242 if ((c & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 00243 /* 00244 * Even if the whole file is valid UTF-8 sequences, 00245 * still reject it if it uses weird control characters. 00246 */ 00247 00248 if (text_chars[c] != T) 00249 return false; 00250 00251 } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 00252 return false; 00253 } else { /* 11xxxxxx begins UTF-8 */ 00254 int following; 00255 00256 if ((c & 0x20) == 0) { /* 110xxxxx */ 00257 following = 1; 00258 } else if ((c & 0x10) == 0) { /* 1110xxxx */ 00259 following = 2; 00260 } else if ((c & 0x08) == 0) { /* 11110xxx */ 00261 following = 3; 00262 } else if ((c & 0x04) == 0) { /* 111110xx */ 00263 following = 4; 00264 } else if ((c & 0x02) == 0) { /* 1111110x */ 00265 following = 5; 00266 } else 00267 return false; 00268 00269 for (n = 0; n < following; ++n) { 00270 i++; 00271 if (!(c = buf[i])) 00272 goto done; 00273 00274 if ((c & 0x80) == 0 || (c & 0x40)) 00275 return false; 00276 } 00277 gotone = true; 00278 } 00279 } 00280 done: 00281 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 00282 } 00283 00284 #undef F 00285 #undef T 00286 #undef I 00287 #undef X 00288 00289 QString KStringHandler::from8Bit( const char *str ) 00290 { 00291 if (!str) 00292 return QString(); 00293 if (!*str) { 00294 static const QString &emptyString = KGlobal::staticQString(""); 00295 return emptyString; 00296 } 00297 return KStringHandler::isUtf8( str ) ? 00298 QString::fromUtf8( str ) : 00299 QString::fromLocal8Bit( str ); 00300 } 00301 00302 int KStringHandler::naturalCompare(const QString &_a, const QString &_b, Qt::CaseSensitivity caseSensitivity) 00303 { 00304 // This method chops the input a and b into pieces of 00305 // digits and non-digits (a1.05 becomes a | 1 | . | 05) 00306 // and compares these pieces of a and b to each other 00307 // (first with first, second with second, ...). 00308 // 00309 // This is based on the natural sort order code code by Martin Pool 00310 // http://sourcefrog.net/projects/natsort/ 00311 // Martin Pool agreed to license this under LGPL or GPL. 00312 00313 // FIXME: Using toLower() to implement case insensitive comparison is 00314 // sub-optimal, but is needed because we compare strings with 00315 // localeAwareCompare(), which does not know about case sensitivity. 00316 // A task has been filled for this in Qt Task Tracker with ID 205990. 00317 // http://trolltech.com/developer/task-tracker/index_html?method=entry&id=205990 00318 QString a; 00319 QString b; 00320 if (caseSensitivity == Qt::CaseSensitive) { 00321 a = _a; 00322 b = _b; 00323 } else { 00324 a = _a.toLower(); 00325 b = _b.toLower(); 00326 } 00327 00328 const QChar* currA = a.unicode(); // iterator over a 00329 const QChar* currB = b.unicode(); // iterator over b 00330 00331 if (currA == currB) { 00332 return 0; 00333 } 00334 00335 while (!currA->isNull() && !currB->isNull()) { 00336 const QChar* begSeqA = currA; // beginning of a new character sequence of a 00337 const QChar* begSeqB = currB; 00338 if (currA->unicode() == QChar::ObjectReplacementCharacter) { 00339 return 1; 00340 } 00341 00342 if (currB->unicode() == QChar::ObjectReplacementCharacter) { 00343 return -1; 00344 } 00345 00346 if (currA->unicode() == QChar::ReplacementCharacter) { 00347 return 1; 00348 } 00349 00350 if (currB->unicode() == QChar::ReplacementCharacter) { 00351 return -1; 00352 } 00353 00354 // find sequence of characters ending at the first non-character 00355 while (!currA->isNull() && !currA->isDigit() && !currA->isPunct() && !currA->isSpace()) { 00356 ++currA; 00357 } 00358 00359 while (!currB->isNull() && !currB->isDigit() && !currB->isPunct() && !currB->isSpace()) { 00360 ++currB; 00361 } 00362 00363 // compare these sequences 00364 const QStringRef& subA(a.midRef(begSeqA - a.unicode(), currA - begSeqA)); 00365 const QStringRef& subB(b.midRef(begSeqB - b.unicode(), currB - begSeqB)); 00366 const int cmp = QStringRef::localeAwareCompare(subA, subB); 00367 if (cmp != 0) { 00368 return cmp < 0 ? -1 : +1; 00369 } 00370 00371 if (currA->isNull() || currB->isNull()) { 00372 break; 00373 } 00374 00375 // find sequence of characters ending at the first non-character 00376 while ((currA->isPunct() || currA->isSpace()) && (currB->isPunct() || currB->isSpace())) { 00377 if (*currA != *currB) { 00378 return (*currA < *currB) ? -1 : +1; 00379 } 00380 ++currA; 00381 ++currB; 00382 if (currA->isNull() || currB->isNull()) { 00383 break; 00384 } 00385 } 00386 00387 // now some digits follow... 00388 if ((*currA == QLatin1Char('0')) || (*currB == QLatin1Char('0'))) { 00389 // one digit-sequence starts with 0 -> assume we are in a fraction part 00390 // do left aligned comparison (numbers are considered left aligned) 00391 while (1) { 00392 if (!currA->isDigit() && !currB->isDigit()) { 00393 break; 00394 } else if (!currA->isDigit()) { 00395 return +1; 00396 } else if (!currB->isDigit()) { 00397 return -1; 00398 } else if (*currA < *currB) { 00399 return -1; 00400 } else if (*currA > *currB) { 00401 return + 1; 00402 } 00403 ++currA; 00404 ++currB; 00405 } 00406 } else { 00407 // No digit-sequence starts with 0 -> assume we are looking at some integer 00408 // do right aligned comparison. 00409 // 00410 // The longest run of digits wins. That aside, the greatest 00411 // value wins, but we can't know that it will until we've scanned 00412 // both numbers to know that they have the same magnitude. 00413 00414 bool isFirstRun = true; 00415 int weight = 0; 00416 while (1) { 00417 if (!currA->isDigit() && !currB->isDigit()) { 00418 if (weight != 0) { 00419 return weight; 00420 } 00421 break; 00422 } else if (!currA->isDigit()) { 00423 if (isFirstRun) { 00424 return *currA < *currB ? -1 : +1; 00425 } else { 00426 return -1; 00427 } 00428 } else if (!currB->isDigit()) { 00429 if (isFirstRun) { 00430 return *currA < *currB ? -1 : +1; 00431 } else { 00432 return +1; 00433 } 00434 } else if ((*currA < *currB) && (weight == 0)) { 00435 weight = -1; 00436 } else if ((*currA > *currB) && (weight == 0)) { 00437 weight = + 1; 00438 } 00439 ++currA; 00440 ++currB; 00441 isFirstRun = false; 00442 } 00443 } 00444 } 00445 00446 if (currA->isNull() && currB->isNull()) { 00447 return 0; 00448 } 00449 00450 return currA->isNull() ? -1 : + 1; 00451 } 00452 00453 QString KStringHandler::preProcessWrap(const QString &text) 00454 { 00455 const QChar zwsp(0x200b); 00456 00457 QString result; 00458 result.reserve(text.length()); 00459 00460 for (int i = 0; i < text.length(); i++) { 00461 const QChar c = text[i]; 00462 bool openingParens = (c == QLatin1Char('(') || c == QLatin1Char('{') || c == QLatin1Char('[')); 00463 bool singleQuote = (c == QLatin1Char('\'') ); 00464 bool closingParens = (c == QLatin1Char(')') || c == QLatin1Char('}') || c == QLatin1Char(']')); 00465 bool breakAfter = (closingParens || c.isPunct() || c.isSymbol()); 00466 bool nextIsSpace = (i == (text.length() - 1) || text[i + 1].isSpace()); 00467 bool prevIsSpace = (i == 0 || text[i - 1].isSpace() || result[result.length() - 1] == zwsp); 00468 00469 // Provide a breaking opportunity before opening parenthesis 00470 if (openingParens && !prevIsSpace) 00471 result += zwsp; 00472 00473 // Provide a word joiner before the single quote 00474 if (singleQuote && !prevIsSpace) 00475 result += QChar(0x2060); 00476 00477 result += c; 00478 00479 if (breakAfter && !openingParens && !nextIsSpace && !singleQuote) 00480 result += zwsp; 00481 } 00482 00483 return result; 00484 } 00485
KDE 4.6 API Reference