Kate
katetextloader.h
Go to the documentation of this file.
00001 /* This file is part of the Kate project. 00002 * 00003 * Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org> 00004 * 00005 * This library is free software; you can redistribute it and/or 00006 * modify it under the terms of the GNU Library General Public 00007 * License as published by the Free Software Foundation; either 00008 * version 2 of the License, or (at your option) any later version. 00009 * 00010 * This library is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 * Library General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Library General Public License 00016 * along with this library; see the file COPYING.LIB. If not, write to 00017 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00018 * Boston, MA 02110-1301, USA. 00019 */ 00020 00021 #ifndef KATE_TEXTLOADER_H 00022 #define KATE_TEXTLOADER_H 00023 00024 #include <QtCore/QString> 00025 #include <QtCore/QFile> 00026 00027 // on the fly compression 00028 #include <kfilterdev.h> 00029 #include <kmimetype.h> 00030 00031 namespace Kate { 00032 00038 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; 00039 00043 class TextLoader 00044 { 00045 public: 00051 TextLoader (const QString &filename, KEncodingProber::ProberType proberType) 00052 : m_codec (0) 00053 , m_eof (false) // default to not eof 00054 , m_lastWasEndOfLine (true) // at start of file, we had a virtual newline 00055 , m_lastWasR (false) // we have not found a \r as last char 00056 , m_position (0) 00057 , m_lastLineStart (0) 00058 , m_eol (TextBuffer::eolUnknown) // no eol type detected atm 00059 , m_buffer (KATE_FILE_LOADER_BS, 0) 00060 , m_converterState (0) 00061 , m_bomFound (false) 00062 , m_firstRead (true) 00063 , m_proberType (proberType) 00064 { 00065 // try to get mimetype for on the fly decompression, don't rely on filename! 00066 QFile testMime (filename); 00067 if (testMime.open (QIODevice::ReadOnly)) 00068 m_mimeType = KMimeType::findByContent (&testMime)->name (); 00069 else 00070 m_mimeType = KMimeType::findByPath (filename, 0, false)->name (); 00071 00072 // construct filter device 00073 m_file = KFilterDev::deviceForFile (filename, m_mimeType, false); 00074 } 00075 00079 ~TextLoader () 00080 { 00081 delete m_file; 00082 delete m_converterState; 00083 } 00084 00090 bool open (QTextCodec *codec) 00091 { 00092 m_codec = codec; 00093 m_eof = false; 00094 m_lastWasEndOfLine = true; 00095 m_lastWasR = false; 00096 m_position = 0; 00097 m_lastLineStart = 0; 00098 m_eol = TextBuffer::eolUnknown; 00099 m_text.clear (); 00100 delete m_converterState; 00101 m_converterState = new QTextCodec::ConverterState (QTextCodec::ConvertInvalidToNull); 00102 m_bomFound = false; 00103 m_firstRead = true; 00104 00105 // if already opened, close the file... 00106 if (m_file->isOpen()) 00107 m_file->close (); 00108 00109 return m_file->open (QIODevice::ReadOnly); 00110 } 00111 00116 bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); } 00117 00123 TextBuffer::EndOfLineMode eol () const { return m_eol; } 00124 00129 bool byteOrderMarkFound () const { return m_bomFound; } 00130 00135 const QString &mimeTypeForFilterDev () const { return m_mimeType; } 00136 00141 const QChar *unicode () const { return m_text.unicode(); } 00142 00147 QTextCodec *textCodec () const { return m_codec; } 00148 00155 bool readLine (int &offset, int &length) 00156 { 00157 length = 0; 00158 offset = 0; 00159 bool encodingError = false; 00160 00161 static const QLatin1Char cr(QLatin1Char('\r')); 00162 static const QLatin1Char lf(QLatin1Char('\n')); 00163 00164 while (m_position <= m_text.length()) 00165 { 00166 if (m_position == m_text.length()) 00167 { 00168 // try to load more text if something is around 00169 if (!m_eof) 00170 { 00171 int c = m_file->read (m_buffer.data(), m_buffer.size()); 00172 00173 // kill the old lines... 00174 m_text.remove (0, m_lastLineStart); 00175 00176 // if any text is there, append it.... 00177 if (c > 0) 00178 { 00179 // detect byte order marks & codec for byte order markers on first read 00180 int bomBytes = 0; 00181 if (m_firstRead) { 00182 // use first 16 bytes max to allow BOM detection of codec 00183 QByteArray bom (m_buffer.data(), qMin (16, c)); 00184 QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, 0); 00185 00186 // if codec != null, we found a BOM! 00187 if (codecForByteOrderMark) { 00188 m_bomFound = true; 00189 00190 // eat away the different boms! 00191 int mib = codecForByteOrderMark->mibEnum (); 00192 if (mib == 106) // utf8 00193 bomBytes = 3; 00194 if (mib == 1013 || mib == 1014) // utf16 00195 bomBytes = 2; 00196 if (mib == 1018 || mib == 1019) // utf32 00197 bomBytes = 4; 00198 } 00199 00203 if (!m_codec) { 00207 if (codecForByteOrderMark) 00208 m_codec = codecForByteOrderMark; 00209 else { 00213 KEncodingProber prober (m_proberType); 00214 prober.feed (m_buffer.constData(), c); 00215 00216 // we found codec with some confidence? 00217 if (prober.confidence() > 0.5) 00218 m_codec = QTextCodec::codecForName(prober.encoding()); 00219 00220 // no codec, no chance, encoding error 00221 if (!m_codec) 00222 return false; 00223 } 00224 } 00225 00226 m_firstRead = false; 00227 } 00228 00229 Q_ASSERT (m_codec); 00230 QString unicode = m_codec->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes, m_converterState); 00231 00232 // detect broken encoding 00233 for (int i = 0; i < unicode.size(); ++i) { 00234 if (unicode[i] == 0) { 00235 encodingError = true; 00236 break; 00237 } 00238 } 00239 00240 m_text.append (unicode); 00241 } 00242 00243 // is file completely read ? 00244 m_eof = (c == -1) || (c == 0); 00245 00246 // recalc current pos and last pos 00247 m_position -= m_lastLineStart; 00248 m_lastLineStart = 0; 00249 } 00250 00251 // oh oh, end of file, escape ! 00252 if (m_eof && (m_position == m_text.length())) 00253 { 00254 m_lastWasEndOfLine = false; 00255 00256 // line data 00257 offset = m_lastLineStart; 00258 length = m_position-m_lastLineStart; 00259 00260 m_lastLineStart = m_position; 00261 00262 return !encodingError; 00263 } 00264 } 00265 00266 if (m_text.at(m_position) == lf) 00267 { 00268 m_lastWasEndOfLine = true; 00269 00270 if (m_lastWasR) 00271 { 00272 m_lastLineStart++; 00273 m_lastWasR = false; 00274 m_eol = TextBuffer::eolDos; 00275 } 00276 else 00277 { 00278 // line data 00279 offset = m_lastLineStart; 00280 length = m_position-m_lastLineStart; 00281 00282 m_lastLineStart = m_position+1; 00283 m_position++; 00284 00285 // only win, if not dos! 00286 if (m_eol != TextBuffer::eolDos) 00287 m_eol = TextBuffer::eolUnix; 00288 00289 return !encodingError; 00290 } 00291 } 00292 else if (m_text.at(m_position) == cr) 00293 { 00294 m_lastWasEndOfLine = true; 00295 m_lastWasR = true; 00296 00297 // line data 00298 offset = m_lastLineStart; 00299 length = m_position-m_lastLineStart; 00300 00301 m_lastLineStart = m_position+1; 00302 m_position++; 00303 00304 // should only win of first time! 00305 if (m_eol == TextBuffer::eolUnknown) 00306 m_eol = TextBuffer::eolMac; 00307 00308 return !encodingError; 00309 } 00310 else if (m_text.at(m_position) == QChar::LineSeparator) 00311 { 00312 m_lastWasEndOfLine = true; 00313 00314 // line data 00315 offset = m_lastLineStart; 00316 length = m_position-m_lastLineStart; 00317 00318 m_lastLineStart = m_position+1; 00319 m_position++; 00320 00321 return !encodingError; 00322 } 00323 else 00324 { 00325 m_lastWasEndOfLine = false; 00326 m_lastWasR = false; 00327 } 00328 00329 m_position++; 00330 } 00331 00332 return !encodingError; 00333 } 00334 00335 private: 00336 QTextCodec *m_codec; 00337 bool m_eof; 00338 bool m_lastWasEndOfLine; 00339 bool m_lastWasR; 00340 int m_position; 00341 int m_lastLineStart; 00342 TextBuffer::EndOfLineMode m_eol; 00343 QString m_mimeType; 00344 QIODevice *m_file; 00345 QByteArray m_buffer; 00346 QString m_text; 00347 QTextCodec::ConverterState *m_converterState; 00348 bool m_bomFound; 00349 bool m_firstRead; 00350 KEncodingProber::ProberType m_proberType; 00351 }; 00352 00353 } 00354 00355 #endif
KDE 4.6 API Reference