forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 9
/
IndexReadInfo.h
143 lines (109 loc) · 4.28 KB
/
IndexReadInfo.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// Matt Wells, copyright Oct 2001
// . used for looking up IndexLists for queries
// . call init() to get initial read info per IndexList (1 per termId in query)
// . call update() to update read info for next read of lists
// . use getStartKeys(), getEndKeys() and getReadSize()/getReadSizes() to
//   extract read info
// . tries to keep the amount of reading to a minimum
// . if the # of results is not achieved then call update() to get read info
//   for another read to hopefully get the # of requested docIds
#ifndef _INDEXREADINFO_H_
#define _INDEXREADINFO_H_
#include "Query.h" // MAX_QUERY_TERMS
#include "IndexList.h"
#include "Titledb.h"
#include "Indexdb.h"
// maximum number of tiers an index list may be broken into
#define MAX_TIERS 3
// . default read size of each stage
// . each docid is 6 bytes, but the first is 12
// . each stage is written as (number of docids) * (6 bytes per docid)
// . stage0 was 5000 docids, then 8000; the stages are now powers of ten
//   so we can see how those perform
#define STAGE0 (10000 *6)
#define STAGE1 (100000 *6)
#define STAGE2 (1000000 *6)
// combined read size of all three stages
#define STAGESUM (STAGE0 + STAGE1 + STAGE2) // a "+ STAGE3" term appears to have been dropped
class IndexReadInfo {

 public:

	// constructor; just sets m_numLists to 0
	IndexReadInfo ( );

	// . calculates minStartKey and maxEndKey for each termId in "q"
	// . the inputs are NOT copied, so the caller must keep them alive
	//   (don't trash the stack they live on)
	// . "stage0" is the first # of docIds to read from each IndexList
	//   (dynamic truncation)
	// . NOTE(review): the exact meaning of "callNum", "tierStage" and the
	//   date arguments is defined by the implementation file -- confirm
	//   there before relying on them
	void init ( Query    *q            ,
		    int64_t  *termFreqs    ,
		    int32_t   docsWanted   ,
		    char      callNum      ,
		    int32_t   stage0       ,
		    int32_t  *tierStage    ,
		    bool      useDateLists ,
		    bool      sortByDate   ,
		    uint32_t  date1        ,
		    uint32_t  date2        ,
		    bool      isDebug      );

	// . prepares the start keys and the amount to read for each list
	//   ahead of another read
	// . call it once a read has completed and IndexTable::addLists() has
	//   hashed the lists and counted the results it got
	// . moves the start key of list #i to lastKey + 1 of lists[i]
	void update ( IndexList *lists , int32_t numLists , char callNum );

	// tier-based variant of update()
	void update2 ( int32_t tier );

	// (a former updateForMsg3b ( char *lastParts , int64_t *termFreqs ,
	//  int32_t numLists ) variant is disabled)

	// variant of update() driven by term frequencies
	void update ( int64_t *termFreqs ,
		      int32_t  numLists  ,
		      char     callNum   );

	// variant of update() for when the full lists are not available:
	// takes just the last part and the size of each list
	void update ( char    *lastParts ,
		      int32_t *listSizes ,
		      int32_t  numLists  );

	//
	// per-list read info; call these after update()
	//

	// raw key buffers, one slot per list
	char *getStartKeys ( ) { return m_startKeys; }
	char *getEndKeys   ( ) { return m_endKeys;   }

	// the m_ignore entry of list #i
	char getIgnored ( int32_t i ) { return m_ignore[i]; }

	char getHalfKeySize ( ) { return m_hks; }

	// direct access to the read sizes, same idea as above
	int32_t  getReadSize  ( int32_t i ) { return m_readSizes[i]; }
	int32_t *getReadSizes ( )           { return m_readSizes;    }

	// . true if we got the # of required results, or if every list is
	//   exhausted
	// . only meaningful AFTER update() has been called
	bool isDone ( ) { return m_isDone; }

	// estimate of the total # of results; only valid after init()
	int64_t getEstimatedTotalHits ( );

	int32_t getNumLists ( ) { return m_numLists; }

	int32_t getStage0Default ( );

 private:

	// . where the next portion of each list gets read from
	// . set up by init() and moved forward by update()
	// . one list may be read several times if we don't get enough hits
	// . these used to be key_t / key128_t arrays; they are now raw byte
	//   buffers with MAX_KEY_BYTES reserved per query term
	char m_startKeys [ MAX_QUERY_TERMS * MAX_KEY_BYTES ];
	char m_endKeys   [ MAX_QUERY_TERMS * MAX_KEY_BYTES ];

	// how much to read from each list (docIds/recs/keys)
	int32_t m_readSizes [ MAX_QUERY_TERMS ];

	// per-list flag handed out by getIgnored()
	char m_ignore [ MAX_QUERY_TERMS ];

	// . the query being run
	// . the arrays above parallel the arrays in m_q: one entry per termId
	Query *m_q;

	// number of index lists being read
	int32_t m_numLists;

	// may become true after update() is called
	bool m_isDone;

	// . dynamic truncation: the first # of docs to read from each list
	// . the stages can be chosen per query
	// . (a former m_stageSum member is disabled)
	int32_t m_stage [ MAX_TIERS ];

	// m_hks is the half key size returned by getHalfKeySize();
	// m_ks is presumably the full key size -- TODO confirm
	char m_ks;
	char m_hks;

	// NOTE(review): these look like saved copies of the matching init()
	// arguments -- confirm in the implementation file
	char     m_useDateLists;
	char     m_sortByDate;
	uint32_t m_date1;
	uint32_t m_date2;
	bool     m_isDebug;
};
#endif