00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
/// Forward declaration: within this header CXMLElement is only used as a
/// const-reference constructor parameter of CAcIFFileSystem, so the full
/// definition is not needed here.
00026 class CXMLElement;
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050 #ifndef _CACIFFILESYSTEM
00051 #define _CACIFFILESYSTEM
00052 #include "libGIFTAcInvertedFile/include/uses-declarations.h"
00053 #include <string>
00054 #include "libMRML/include/TID.h"
00055 #include "libMRML/include/CSelfDestroyPointer.h"
00056 #include "libMRML/include/CArraySelfDestroyPointer.h"
00057 #include "libGIFTAcInvertedFile/include/CDocumentFrequencyList.h"
00058 #include "libMRML/include/CMutex.h"
00059
00060 #include "libGIFTAcInvertedFile/include/CADIHash.h"
00061 #include "libGIFTAcURL2FTS/include/CAcURL2FTS.h"
00062 #include "libGIFTAcInvertedFile/include/CAcInvertedFile.h"
00063 #include <iostream>
00064 #include <fstream>
00065 #include <map>
00066 #include <vector>
// Select the associative container used for the TID-keyed tables below:
// hash_map (from <hash_map>) when HAS_HASH_MAP is defined, otherwise map.
// NOTE(review): HAS_HASH_MAP is presumably set by the build configuration --
// confirm where it is defined.
00067 #ifdef HAS_HASH_MAP
00068 #include <hash_map>
00069 #define HASH_MAP hash_map
00070 #else
00071 #define HASH_MAP map
00072 #endif
00073 #include <functional>
00074 #include <algorithm>
00075
00076 #include "libMRML/include/CMagic.h"
00077
00078
/// Alias of TID used in signatures whose ID argument denotes a feature
/// (e.g. FeatureToList, FeatureToCollectionFrequency).
00079 typedef TID TFeatureID ;
00080
/// Accessor derived from CAcInvertedFile. It declares members for three
/// files (an inverted file, an offset file and a feature-description file),
/// several TID-keyed lookup tables, and query methods that map between
/// URLs, document IDs, feature IDs and CDocumentFrequencyList objects.
///
/// NOTE(review): the original documentation comments are missing from this
/// listing. Everything below that goes beyond the declared names and types
/// is an inference and should be confirmed against the implementation file.
/// NOTE(review): list<> and pair<> are used below although <list> and
/// <utility> are not included directly by this header; presumably they
/// arrive transitively or via uses-declarations.h -- confirm.
00091 class CAcIFFileSystem:public CAcInvertedFile{
00092
00093 protected:
/// Mutex member. NOTE(review): presumably serialises access to the mutable
/// stream members from const methods -- confirm in the implementation.
00095 CMutex mMutex;
/// CAcURL2FTS accessor held through a CSelfDestroyPointer.
/// NOTE(review): presumably used for the URL/ID translations below.
00101 CSelfDestroyPointer<CAcURL2FTS> mURL2FTS;
/// NOTE(review): presumably the value returned by getMaximumFeatureID().
00103 TID mMaximumFeatureID;
/// Buffer associated with the inverted file. Its type depends on V295:
/// a string when V295 is not defined, an array pointer to char otherwise.
00106 #ifndef V295
00107 string mInvertedFileBuffer;
00108 #else
00109 CArraySelfDestroyPointer<char> mInvertedFileBuffer;
00110 #endif
00111
/// NOTE(review): presumably the base name for temporary files written
/// while generating the inverted file -- confirm.
00113 string mTemporaryIndexingFileBase;
/// Input stream for the inverted file; mutable so that const methods can
/// use it.
00115 mutable CSelfDestroyPointer<istream> mInvertedFile;
00116
/// Input stream for the offset file; mutable so that const methods can
/// use it.
00118 mutable ifstream mOffsetFile;
00119
/// Input stream for the feature-description file.
00121 ifstream mFeatureDescriptionFile;
00122
/// File name belonging to mInvertedFile (by naming).
00124 string mInvertedFileName;
00125
/// File name belonging to mOffsetFile (by naming).
00127 string mOffsetFileName;
00128
/// File name belonging to mFeatureDescriptionFile (by naming).
00130 string mFeatureDescriptionFileName;
00131
/// Table type mapping a TID to an unsigned int.
/// NOTE(review): presumably feature ID -> byte offset in the inverted file.
00133 typedef HASH_MAP<TID,unsigned int> CIDToOffset;
/// Instance of the TID -> offset table.
00135 CIDToOffset mIDToOffset;
00136
/// TID -> double table; mutable, so const methods may modify it.
/// NOTE(review): presumably backs FeatureToCollectionFrequency().
00138 mutable HASH_MAP<TID,double> mFeatureToCollectionFrequency;
00139
/// TID -> unsigned int table.
/// NOTE(review): presumably backs getFeatureDescription().
00143 HASH_MAP<TID,unsigned int> mFeatureDescription;
00144
/// Per-document information store (CADIHash).
/// NOTE(review): presumably backs the DIDTo...() accessors -- confirm.
00148 CADIHash mDocumentInformation;
00150
/// Writes one offset-file element to an already open output stream.
/// @param inFeatureID       feature ID of the element
/// @param inPosition        position value of the element
///                          (presumably an offset into the inverted file)
/// @param inOpenOffsetFile  destination stream
00153 void writeOffsetFileElement(TID inFeatureID,
00154 int inPosition,
00155 ostream& inOpenOffsetFile);
/// Returns a CDocumentFrequencyList for the given file name.
/// NOTE(review): returned as a raw pointer; ownership is not visible here
/// -- presumably the caller owns the result; confirm.
00157 CDocumentFrequencyList* getFeatureFile(string inFileName)const;
00158 public:
/// Function-call operator returning bool.
/// NOTE(review): presumably a validity check like operator bool() -- confirm.
00160 bool operator()()const;
00161
/// Constructs the accessor from an XML element.
/// @param inCollectionElement  presumably the collection's configuration
///                             element -- confirm the attributes read.
00188 CAcIFFileSystem(const CXMLElement& inCollectionElement);
/// Initialisation step taking an unnamed bool flag; returns bool.
/// NOTE(review): the flag's meaning is not visible in this header.
00190 bool init(bool);
00191
/// Destructor.
00193 ~CAcIFFileSystem();
00194
/// Maps a TID to a URL string.
00196 string IDToURL(TID inID)const;
00197
/// Returns a CDocumentFrequencyList for a feature ID.
/// NOTE(review): raw-pointer return; ownership not visible here.
00201 CDocumentFrequencyList* FeatureToList(TFeatureID)const;
00202
/// Returns a CDocumentFrequencyList for a URL.
/// NOTE(review): raw-pointer return; ownership not visible here.
00204 CDocumentFrequencyList* URLToFeatureList(string inURL)const;
00205
/// Returns a CDocumentFrequencyList for a document ID.
/// NOTE(review): raw-pointer return; ownership not visible here.
00207 CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;
00208
00210
00211
/// Returns a double for the given feature ID (collection frequency, by name).
00215 double FeatureToCollectionFrequency(TFeatureID)const;
00216
/// Returns an unsigned int describing the given feature ID.
00218 unsigned int getFeatureDescription(TID inFeatureID)const;
00220
/// Returns a double for the given document ID
/// (maximum document frequency, by name).
00224 double DIDToMaxDocumentFrequency(TID)const;
00225
/// Returns a double for the given document ID
/// (sum of squared document frequencies, by name).
00227 double DIDToDFSquareSum(TID)const;
00228
/// Returns a double for the given document ID.
/// NOTE(review): by name, a sum over squared (DF * log ICF) terms -- confirm
/// the exact formula in the implementation.
00230 double DIDToSquareDFLogICFSum(TID)const;
00232
00233
00235
/// Generates the inverted file; returns bool
/// (presumably true on success -- confirm).
00243 bool generateInvertedFile();
00244
/// Second inverted-file generation entry point; returns bool.
/// NOTE(review): how it differs from generateInvertedFile() is not visible
/// in this header.
00252 bool newGenerateInvertedFile();
00253
/// Consistency check; returns bool (presumably true when consistent).
00256 bool checkConsistency();
00257
/// Looks for a (feature, document, document-frequency) triple; returns bool.
/// NOTE(review): presumably searches the inverted-file stream -- confirm.
00264 bool findWithinStream(TID inFeatureID,
00265 TID inDocumentID,
00266 double inDocumentFrequency)const;
00267
00269
/// Maps a URL to an ID.
/// @return a (bool, TID) pair; NOTE(review): presumably the bool reports
///         whether the URL was found -- confirm.
00275 virtual pair<bool,TID> URLToID(const string& inURL)const;
00276
/// Passes IDs to the caller through the list argument.
00278 void getAllIDs(list<TID>&)const;
/// Passes accessor elements to the caller through the list argument.
00281 void getAllAccessorElements(list<CAccessorElement>&)const;
/// Passes randomly chosen IDs (by name) to the caller through the list
/// argument; the second argument is a list size
/// (presumably the number requested).
00286 void getRandomIDs(list<TID>&,
00287 list<TID>::size_type)const;
/// Passes randomly chosen accessor elements (by name) to the caller.
/// @param outResult  list that receives the result
/// @param inSize     list size (presumably the number requested)
00296 void getRandomAccessorElements(list<CAccessorElement>& outResult,
00297 list<CAccessorElement>::size_type inSize)const;
/// Returns a size as int.
/// NOTE(review): presumably the number of documents in the collection.
00299 int size()const;
00301
/// Returns a TID (the maximum feature ID, by name).
00302 TID getMaximumFeatureID()const;
/// Returns a pointer to a list of TIDs (all feature IDs, by name).
/// NOTE(review): raw-pointer return; ownership not visible here.
00310 list<TID>* getAllFeatureIDs()const;
/// Maps an ID to an accessor element.
/// @return a (bool, CAccessorElement) pair; NOTE(review): presumably the
///         bool reports whether the ID was found -- confirm.
00316 virtual pair<bool,CAccessorElement> IDToAccessorElement(TID inID)const;
/// Implicit conversion to bool.
/// NOTE(review): presumably reports whether the accessor is usable.
00318 operator bool()const;
00319
00320 };
00321
00322 #endif