KDECore
ksycocadict.cpp
Go to the documentation of this file.
00001 /* This file is part of the KDE libraries 00002 * Copyright (C) 1999 Waldo Bastian <bastian@kde.org> 00003 * 00004 * This library is free software; you can redistribute it and/or 00005 * modify it under the terms of the GNU Library General Public 00006 * License version 2 as published by the Free Software Foundation; 00007 * 00008 * This library is distributed in the hope that it will be useful, 00009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00011 * Library General Public License for more details. 00012 * 00013 * You should have received a copy of the GNU Library General Public License 00014 * along with this library; see the file COPYING.LIB. If not, write to 00015 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00016 * Boston, MA 02110-1301, USA. 00017 **/ 00018 00019 #include "ksycocadict_p.h" 00020 #include <kservice.h> 00021 #include "ksycocaentry.h" 00022 #include "ksycoca.h" 00023 #include "kdebug.h" 00024 00025 #include <QtCore/QBitArray> 00026 00027 namespace 00028 { 00029 struct string_entry { 00030 string_entry(const QString& _key, const KSycocaEntry::Ptr& _payload) 00031 : hash(0), length(_key.length()), keyStr(_key), key(keyStr.unicode()), payload(_payload) 00032 {} 00033 uint hash; 00034 const int length; 00035 const QString keyStr; 00036 const QChar * const key; // always points to keyStr.unicode(); just an optimization 00037 const KSycocaEntry::Ptr payload; 00038 }; 00039 } 00040 00041 class KSycocaDictStringList : public QList<string_entry*> 00042 { 00043 public: 00044 ~KSycocaDictStringList() { 00045 qDeleteAll(*this); 00046 } 00047 }; 00048 00049 class KSycocaDict::Private 00050 { 00051 public: 00052 Private() 00053 : stringlist( 0 ), 00054 stream( 0 ), 00055 offset( 0 ) 00056 { 00057 } 00058 00059 ~Private() 00060 { 00061 delete stringlist; 00062 } 00063 00064 // Helper for find_string and findMultiString 00065 qint32 offsetForKey(const QString& key) const; 00066 00067 // Calculate hash - can be used during loading and during saving. 00068 quint32 hashKey(const QString & key) const; 00069 00070 KSycocaDictStringList *stringlist; 00071 QDataStream *stream; 00072 qint64 offset; 00073 quint32 hashTableSize; 00074 QList<qint32> hashList; 00075 }; 00076 00077 KSycocaDict::KSycocaDict() 00078 : d( new Private ) 00079 { 00080 } 00081 00082 KSycocaDict::KSycocaDict(QDataStream *str, int offset) 00083 : d( new Private ) 00084 { 00085 d->stream = str; 00086 d->offset = offset; 00087 00088 quint32 test1, test2; 00089 str->device()->seek(offset); 00090 (*str) >> test1 >> test2; 00091 if ((test1 > 0x000fffff) || (test2 > 1024)) 00092 { 00093 KSycoca::flagError(); 00094 d->hashTableSize = 0; 00095 d->offset = 0; 00096 return; 00097 } 00098 00099 str->device()->seek(offset); 00100 (*str) >> d->hashTableSize; 00101 (*str) >> d->hashList; 00102 d->offset = str->device()->pos(); // Start of hashtable 00103 } 00104 00105 KSycocaDict::~KSycocaDict() 00106 { 00107 delete d; 00108 } 00109 00110 void 00111 KSycocaDict::add(const QString &key, const KSycocaEntry::Ptr& payload) 00112 { 00113 if (key.isEmpty()) return; // Not allowed (should never happen) 00114 if (!payload) return; // Not allowed! 00115 if (!d->stringlist) 00116 { 00117 d->stringlist = new KSycocaDictStringList; 00118 } 00119 00120 string_entry *entry = new string_entry(key, payload); 00121 d->stringlist->append(entry); 00122 } 00123 00124 void 00125 KSycocaDict::remove(const QString &key) 00126 { 00127 if (!d || !d->stringlist) { 00128 return; 00129 } 00130 00131 bool found = false; 00132 for(KSycocaDictStringList::Iterator it = d->stringlist->begin(); it != d->stringlist->end(); ++it) { 00133 string_entry* entry = *it; 00134 if (entry->keyStr == key) { 00135 d->stringlist->erase(it); 00136 delete entry; 00137 found = true; 00138 break; 00139 } 00140 } 00141 if (!found) { 00142 kWarning(7011) << "key not found:" << key; 00143 } 00144 } 00145 00146 int KSycocaDict::find_string(const QString &key ) const 00147 { 00148 Q_ASSERT(d); 00149 00150 //kDebug(7011) << QString("KSycocaDict::find_string(%1)").arg(key); 00151 qint32 offset = d->offsetForKey(key); 00152 00153 //kDebug(7011) << QString("offset is %1").arg(offset,8,16); 00154 if (offset == 0) 00155 return 0; 00156 00157 if (offset > 0) 00158 return offset; // Positive ID 00159 00160 // Lookup duplicate list. 00161 offset = -offset; 00162 00163 d->stream->device()->seek(offset); 00164 //kDebug(7011) << QString("Looking up duplicate list at %1").arg(offset,8,16); 00165 00166 while(true) 00167 { 00168 (*d->stream) >> offset; 00169 if (offset == 0) break; 00170 QString dupkey; 00171 (*d->stream) >> dupkey; 00172 //kDebug(7011) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey); 00173 if (dupkey == key) return offset; 00174 } 00175 //kWarning(7011) << "Not found!"; 00176 00177 return 0; 00178 } 00179 00180 00181 QList<int> KSycocaDict::findMultiString(const QString &key ) const 00182 { 00183 qint32 offset = d->offsetForKey(key); 00184 QList<int> offsetList; 00185 if (offset == 0) 00186 return offsetList; 00187 00188 if (offset > 0) { // Positive ID: one entry found 00189 offsetList.append(offset); 00190 return offsetList; 00191 } 00192 00193 // Lookup duplicate list. 00194 offset = -offset; 00195 00196 d->stream->device()->seek(offset); 00197 //kDebug(7011) << QString("Looking up duplicate list at %1").arg(offset,8,16); 00198 00199 while(true) 00200 { 00201 (*d->stream) >> offset; 00202 if (offset == 0) break; 00203 QString dupkey; 00204 (*d->stream) >> dupkey; 00205 //kDebug(7011) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey); 00206 if (dupkey == key) 00207 offsetList.append(offset); 00208 } 00209 return offsetList; 00210 } 00211 00212 uint KSycocaDict::count() const 00213 { 00214 if ( !d || !d->stringlist ) return 0; 00215 00216 return d->stringlist->count(); 00217 } 00218 00219 void 00220 KSycocaDict::clear() 00221 { 00222 delete d; 00223 d = 0; 00224 } 00225 00226 uint KSycocaDict::Private::hashKey( const QString &key) const 00227 { 00228 int len = key.length(); 00229 uint h = 0; 00230 00231 for(int i = 0; i < hashList.count(); i++) 00232 { 00233 int pos = hashList[i]; 00234 if (pos < 0) { 00235 pos = -pos-1; 00236 if (pos < len) 00237 h = ((h * 13) + (key[len-pos].cell() % 29)) & 0x3ffffff; 00238 } else { 00239 pos = pos-1; 00240 if (pos < len) 00241 h = ((h * 13) + (key[pos].cell() % 29)) & 0x3ffffff; 00242 } 00243 } 00244 return h; 00245 } 00246 00247 // If we have the strings 00248 // hello 00249 // world 00250 // kde 00251 // Then we end up with 00252 // ABCDE 00253 // where A = diversity of 'h' + 'w' + 'k' etc. 00254 // Also, diversity(-2) == 'l'+'l'+'d' (second character from the end) 00255 00256 // The hasList is used for hashing: 00257 // hashList = (-2, 1, 3) means that the hash key comes from 00258 // the 2nd character from the right, then the 1st from the left, then the 3rd from the left. 00259 00260 // Calculate the diversity of the strings at position 'pos' 00261 // NOTE: this code is slow, it takes 12% of the _overall_ `kbuildsycoca4 --noincremental` running time 00262 static int 00263 calcDiversity(KSycocaDictStringList *stringlist, int inPos, uint sz) 00264 { 00265 if (inPos == 0) return 0; 00266 QBitArray matrix(sz); 00267 int pos; 00268 00269 //static const int s_maxItems = 50; 00270 //int numItem = 0; 00271 00272 if (inPos < 0) { 00273 pos = -inPos-1; 00274 for(KSycocaDictStringList::const_iterator it = stringlist->constBegin(), end = stringlist->constEnd(); it != end; ++it) 00275 { 00276 string_entry* entry = *it; 00277 int len = entry->length; 00278 if (pos < len && pos != 0) { 00279 uint hash = ((entry->hash * 13) + (entry->key[len-pos].cell() % 29)) & 0x3ffffff; 00280 matrix.setBit( hash % sz, true ); 00281 } 00282 //if (++numItem == s_maxItems) 00283 // break; 00284 } 00285 } else { 00286 pos = inPos-1; 00287 for(KSycocaDictStringList::const_iterator it = stringlist->constBegin(), end = stringlist->constEnd(); it != end; ++it) 00288 { 00289 string_entry* entry = *it; 00290 if (pos < entry->length) { 00291 uint hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3ffffff; 00292 matrix.setBit( hash % sz, true ); 00293 } 00294 //if (++numItem == s_maxItems) 00295 // break; 00296 } 00297 } 00298 return matrix.count(true); 00299 } 00300 00301 // 00302 // Add the diversity of the strings at position 'pos' 00303 static void 00304 addDiversity(KSycocaDictStringList *stringlist, int pos) 00305 { 00306 if (pos == 0) return; 00307 if (pos < 0) { 00308 pos = -pos-1; 00309 for(KSycocaDictStringList::const_iterator it = stringlist->constBegin(); it != stringlist->constEnd(); ++it) 00310 { 00311 string_entry* entry = *it; 00312 register int l = entry->length; 00313 if (pos < l) 00314 entry->hash = ((entry->hash * 13) + (entry->key[l-pos].cell() % 29)) & 0x3fffffff; 00315 } 00316 } else { 00317 pos = pos - 1; 00318 for(KSycocaDictStringList::const_iterator it = stringlist->constBegin(); it != stringlist->constEnd(); ++it) 00319 { 00320 string_entry* entry = *it; 00321 if (pos < entry->length) 00322 entry->hash = ((entry->hash * 13) + (entry->key[pos].cell() % 29)) & 0x3fffffff; 00323 } 00324 } 00325 } 00326 00327 00328 void 00329 KSycocaDict::save(QDataStream &str) 00330 { 00331 if (count() == 0) 00332 { 00333 d->hashTableSize = 0; 00334 d->hashList.clear(); 00335 str << d->hashTableSize; 00336 str << d->hashList; 00337 return; 00338 } 00339 00340 d->offset = str.device()->pos(); 00341 00342 //kDebug(7011) << "KSycocaDict:" << count() << "entries."; 00343 00344 //kDebug(7011) << "Calculating hash keys.."; 00345 00346 int maxLength = 0; 00347 //kDebug(7011) << "Finding maximum string length"; 00348 for(KSycocaDictStringList::const_iterator it = d->stringlist->constBegin(); it != d->stringlist->constEnd(); ++it) 00349 { 00350 string_entry* entry = *it; 00351 entry->hash = 0; 00352 if (entry->length > maxLength) 00353 maxLength = entry->length; 00354 } 00355 00356 //kDebug(7011) << "Max string length=" << maxLength << "existing hashList=" << d->hashList; 00357 00358 // use "almost prime" number for sz (to calculate diversity) and later 00359 // for the table size of big tables 00360 // int sz = d->stringlist->count()*5-1; 00361 register unsigned int sz = count()*4 + 1; 00362 while(!(((sz % 3) && (sz % 5) && (sz % 7) && (sz % 11) && (sz % 13)))) 00363 sz+=2; 00364 00365 d->hashList.clear(); 00366 00367 // Times (with warm caches, i.e. after multiple runs) 00368 // kbuildsycoca4 --noincremental 2.83s user 0.20s system 95% cpu 3.187 total 00369 // kbuildsycoca4 --noincremental 2.74s user 0.25s system 93% cpu 3.205 total 00370 // unittest: 0.50-60 msec per iteration / 0.40-50 msec per iteration 00371 00372 // Now that MimeTypes are not parsed anymore: 00373 // kbuildsycoca4 --noincremental 2.18s user 0.30s system 91% cpu 2.719 total 00374 // kbuildsycoca4 --noincremental 2.07s user 0.34s system 89% cpu 2.681 total 00375 00376 // If I enabled s_maxItems = 50, it goes down to 00377 // but I don't know if that's a good idea. 00378 // kbuildsycoca4 --noincremental 1.73s user 0.31s system 85% cpu 2.397 total 00379 // kbuildsycoca4 --noincremental 1.84s user 0.29s system 95% cpu 2.230 total 00380 00381 // try to limit diversity scan by "predicting" positions 00382 // with high diversity 00383 QVector<int> oldvec(maxLength*2+1); 00384 oldvec.fill(0); 00385 int mindiv=0; 00386 int lastDiv = 0; 00387 00388 while(true) 00389 { 00390 int divsum=0,divnum=0; 00391 00392 int maxDiv = 0; 00393 int maxPos = 0; 00394 for (int pos = -maxLength; pos <= maxLength; ++pos) { 00395 // cut off 00396 if (oldvec[pos+maxLength] < mindiv) { oldvec[pos+maxLength]=0; continue; } 00397 00398 const int diversity = calcDiversity(d->stringlist, pos, sz); 00399 if (diversity > maxDiv) { 00400 maxDiv = diversity; 00401 maxPos = pos; 00402 } 00403 oldvec[pos + maxLength] = diversity; 00404 divsum += diversity; 00405 ++divnum; 00406 } 00407 // arbitrary cut-off value 3/4 of average seems to work 00408 if (divnum) 00409 mindiv=(3*divsum)/(4*divnum); 00410 00411 if (maxDiv <= lastDiv) 00412 break; 00413 //kDebug() << "Max Div=" << maxDiv << "at pos" << maxPos; 00414 lastDiv = maxDiv; 00415 addDiversity(d->stringlist, maxPos); 00416 d->hashList.append(maxPos); 00417 } 00418 00419 00420 for(KSycocaDictStringList::Iterator it = d->stringlist->begin(); it != d->stringlist->end(); ++it) { 00421 (*it)->hash = d->hashKey((*it)->keyStr); 00422 } 00423 // fprintf(stderr, "Calculating minimum table size..\n"); 00424 00425 d->hashTableSize = sz; 00426 00427 //kDebug() << "hashTableSize=" << sz << "hashList=" << d->hashList << "oldvec=" << oldvec; 00428 00429 struct hashtable_entry { 00430 string_entry *entry; 00431 QList<string_entry*>* duplicates; 00432 qint64 duplicate_offset; 00433 }; 00434 00435 hashtable_entry *hashTable = new hashtable_entry[ sz ]; 00436 00437 //kDebug(7011) << "Clearing hashtable..."; 00438 for (unsigned int i=0; i < sz; i++) 00439 { 00440 hashTable[i].entry = 0; 00441 hashTable[i].duplicates = 0; 00442 } 00443 00444 //kDebug(7011) << "Filling hashtable..."; 00445 for(KSycocaDictStringList::const_iterator it = d->stringlist->constBegin(); it != d->stringlist->constEnd(); ++it) 00446 { 00447 string_entry* entry = *it; 00448 //kDebug(7011) << "entry keyStr=" << entry->keyStr << entry->payload.data() << entry->payload->entryPath(); 00449 int hash = entry->hash % sz; 00450 if (!hashTable[hash].entry) 00451 { // First entry 00452 hashTable[hash].entry = entry; 00453 } 00454 else 00455 { 00456 if (!hashTable[hash].duplicates) 00457 { // Second entry, build duplicate list. 00458 hashTable[hash].duplicates = new QList<string_entry*>; 00459 hashTable[hash].duplicates->append(hashTable[hash].entry); 00460 hashTable[hash].duplicate_offset = 0; 00461 } 00462 hashTable[hash].duplicates->append(entry); 00463 } 00464 } 00465 00466 str << d->hashTableSize; 00467 str << d->hashList; 00468 00469 d->offset = str.device()->pos(); // d->offset points to start of hashTable 00470 //kDebug(7011) << QString("Start of Hash Table, offset = %1").arg(d->offset,8,16); 00471 00472 // Write the hashtable + the duplicates twice. 00473 // The duplicates are after the normal hashtable, but the offset of each 00474 // duplicate entry is written into the normal hashtable. 00475 for(int pass = 1; pass <= 2; pass++) 00476 { 00477 str.device()->seek(d->offset); 00478 //kDebug(7011) << QString("Writing hash table (pass #%1)").arg(pass); 00479 for(uint i=0; i < d->hashTableSize; i++) 00480 { 00481 qint32 tmpid; 00482 if (!hashTable[i].entry) 00483 tmpid = (qint32) 0; 00484 else if (!hashTable[i].duplicates) 00485 tmpid = (qint32) hashTable[i].entry->payload->offset(); // Positive ID 00486 else 00487 tmpid = (qint32) -hashTable[i].duplicate_offset; // Negative ID 00488 str << tmpid; 00489 //kDebug(7011) << QString("Hash table : %1").arg(tmpid,8,16); 00490 } 00491 //kDebug(7011) << QString("End of Hash Table, offset = %1").arg(str.device()->at(),8,16); 00492 00493 //kDebug(7011) << QString("Writing duplicate lists (pass #%1)").arg(pass); 00494 for(uint i=0; i < d->hashTableSize; i++) 00495 { 00496 const QList<string_entry*> *dups = hashTable[i].duplicates; 00497 if (dups) 00498 { 00499 hashTable[i].duplicate_offset = str.device()->pos(); 00500 00501 /*kDebug(7011) << QString("Duplicate lists: Offset = %1 list_size = %2") .arg(hashTable[i].duplicate_offset,8,16).arg(dups->count()); 00502 */ 00503 for(QList<string_entry*>::ConstIterator dup = dups->begin(); dup != dups->end(); ++dup) 00504 { 00505 const qint32 offset = (*dup)->payload->offset(); 00506 if (!offset) { 00507 const QString storageId = (*dup)->payload->storageId(); 00508 kDebug() << "about to assert! dict=" << this << "storageId=" << storageId << (*dup)->payload.data(); 00509 if ((*dup)->payload->isType(KST_KService)) { 00510 KService::Ptr service = KService::Ptr::staticCast((*dup)->payload); 00511 kDebug() << service->storageId() << service->entryPath(); 00512 } 00513 // save() must have been called on the entry 00514 Q_ASSERT_X( offset, "KSycocaDict::save", 00515 QByteArray("entry offset is 0, save() was not called on " 00516 + (*dup)->payload->storageId().toLatin1() 00517 + " entryPath=" 00518 + (*dup)->payload->entryPath().toLatin1()) 00519 ); 00520 } 00521 str << offset ; // Positive ID 00522 str << (*dup)->keyStr; // Key (QString) 00523 } 00524 str << (qint32) 0; // End of list marker (0) 00525 } 00526 } 00527 //kDebug(7011) << QString("End of Dict, offset = %1").arg(str.device()->at(),8,16); 00528 } 00529 00530 //kDebug(7011) << "Cleaning up hash table."; 00531 for(uint i=0; i < d->hashTableSize; i++) 00532 { 00533 delete hashTable[i].duplicates; 00534 } 00535 delete [] hashTable; 00536 } 00537 00538 qint32 KSycocaDict::Private::offsetForKey(const QString& key) const 00539 { 00540 if ( !stream || !offset ) 00541 { 00542 kError() << "No ksycoca4 database available!" << endl; 00543 return 0; 00544 } 00545 00546 if (hashTableSize == 0) 00547 return 0; // Unlikely to find anything :-] 00548 00549 // Read hash-table data 00550 const uint hash = hashKey(key) % hashTableSize; 00551 //kDebug(7011) << "hash is" << hash; 00552 00553 const qint32 off = offset+sizeof(qint32)*hash; 00554 //kDebug(7011) << QString("off is %1").arg(off,8,16); 00555 stream->device()->seek( off ); 00556 00557 qint32 retOffset; 00558 (*stream) >> retOffset; 00559 return retOffset; 00560 }
KDE 4.7 API Reference