forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 9
/
Clusterdb.h
178 lines (140 loc) · 5.25 KB
/
Clusterdb.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// Copyright Matt Wells, Jul 2002
// . a clusterRec now no longer exists, per se
// . it is the same thing as the key of the titleRec in titledb
// . titleRecs now contain the site and content hashes in the low bits
// of their key.
// . this allows us to store much cluster info in Titledb's RdbMap
// . so to get cluster info, just read in the titleRec, you do not even
// need to uncompress it, just get the info from its key
// . we still use the cache here, however, to cache the keys (clusterRecs)
// . later, i may have to do some fancy footwork if we want to store all
// clusterRecs (titleKeys) in memory.
// . TODO: what if stored file offsets in tfndb, too, then titledb RdbMap
// would not be necessary?
//
// . clusterdb will now serve to help do fast site clustering by retaining
// docids and site hashes in memory
//
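// key layout (96 bits: n1 is the high 32 bits, n0 is the low 64 bits)
//
// n1: 00000000 00000000 0000000d dddddddd    d = docid (38 bits total)
// n0: dddddddd dddddddd dddddddd dddddfll    f = family filter bit
//     llllssss ssssssss ssssssss sssssshz    l = language bits (6 bits)
//                                            s = site hash (26 bits)
//                                            h = half bit
//                                            z = del bit
// (q = year quarter bits: listed in the old legend, but no q bits appear
//  in the layout above)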
#ifndef _CLUSTERDB_H_
#define _CLUSTERDB_H_
//#include "TitleRec.h" // SAMPLE_VECTOR_SIZE
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Titledb.h"
//#include "DiskPageCache.h"
// these are now just TitleRec keys
// . size in bytes of one cluster rec, which is exactly one key_t
#define CLUSTER_REC_SIZE (sizeof(key_t))
// this now includes the gigabit vector
// . size in bytes of a key followed by a sample vector and a gigabit vector
// . NOTE(review): SAMPLE_VECTOR_SIZE and GIGABIT_VECTOR_SIZE are not defined
//   in this header and the "TitleRec.h" include above is commented out, so
//   this macro only expands cleanly where another header supplies them;
//   confirm it is still used
#define VECTOR_REC_SIZE (sizeof(key_t)+SAMPLE_VECTOR_SIZE+GIGABIT_VECTOR_SIZE)
class Clusterdb {
public:
// reset rdb
void reset();
// set up our private rdb
bool init ( );
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool init2 ( int32_t treeMem );
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
Rdb *getRdb ( ) { return &m_rdb; };
// make the cluster rec
void makeRecFromTitleRec ( char *rec,
class TitleRec *titleRec,
bool isDelKey );
// make the cluster rec
void makeRecFromTitleRecKey ( char *rec,
char *key,
bool isDelKey );
// make the cluster rec key
key_t makeClusterRecKey ( int64_t docId,
bool familyFilter,
uint8_t languageBits,
int32_t siteHash,
bool isDelKey,
bool isHalfKey = false );
key_t makeFirstClusterRecKey ( int64_t docId ) {
return makeClusterRecKey ( docId, false, 0, 0, true ); };
key_t makeLastClusterRecKey ( int64_t docId ) {
return makeClusterRecKey ( docId, true, 0xff, 0xffffffff,
false, true ); };
// convert a titlerec key into a clusterec key
key_t convertTitleRecKey ( key_t titleKey );
/*
uint32_t getGroupId ( int64_t docId ) {
return g_titledb.getGroupId ( docId ); };
// cluster rec should be stored on same host as titleRec with the
// same docId that this key contains
uint32_t getGroupIdFromKey ( key_t *key ) {
return g_titledb.getGroupId ( getDocId ( *key ) ); };
*/
// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
// // docId occupies the most significant bytes of the key
// now docId occupies the bits after the first 23
int64_t getDocId ( void *k ) {
//int64_t docId = (k.n0) >> (32+24);
//docId |= ( ((uint64_t)(k.n1)) << 8 );
int64_t docId = (((key_t *)k)->n0) >> 35;
docId |= ( ((uint64_t)(((key_t *)k)->n1)) << 29 );
return docId;
};
//int64_t getDocId ( char *r ) {
// return getDocId(*(key_t*)r);
//}
uint32_t getSiteHash26 ( char *r ) {
//return g_titledb.getSiteHash ( (key_t *)r ); };
return ((uint32_t)(((key_t*)r)->n0 >> 2) & 0x03FFFFFF);
};
uint32_t hasAdultContent ( char *r ) {
//return g_titledb.hasAdultContent ( *(key_t *)r ); };
return ((uint32_t)(((key_t*)r)->n0 >> 34) & 0x00000001);
};
unsigned char getLanguage ( char *r ) {
return ((unsigned char)(((key_t*)r)->n0 >> 28) & 0x0000003F);
}
// NOTE: THESE USE THE OLD "CLUSTERDB" REC GENERATED BY MSG22 (VECTOR)
//uint32_t getContentHash ( char *r ) {
// return g_titledb.getContentHash ( *(key_t *)r ); };
char getFamilyFilter ( char *r ) {
if ( (*(int64_t *)r) & 0x0000000400000000LL ) return 1;
return 0;
};
//uint32_t hasAdultWords ( char *r ) {
// return g_titledb.hasAdultWords ( *(key_t *)r ); };
//uint32_t hasAdultCategory ( char *r ) {
// return g_titledb.hasAdultCategory ( *(key_t *)r ); };
//unsigned char getLanguageFromVector ( char *r ) {
// return 0;
//}
// the random sample vector
/*
void getSampleVector ( char *vec ,
class Doc *doc,
char *coll ,
int32_t collLen ,
int32_t niceness = 0 );
*/
//void getSampleVector ( char *vec , class TermTable *table );
char getSampleSimilarity ( char *vec0 , char *vec1 , int32_t size );
// get the content vector from a cluster rec (used by Msg38.cpp)
//char *getSampleVector ( char *rec ) { return rec + sizeof(key_t); };
//char *getGigabitVector ( char *rec ) {
// return rec + sizeof(key_t) + SAMPLE_VECTOR_SIZE ; };
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
// int32_t *qtable , int32_t numSlots ) ;
//DiskPageCache *getDiskPageCache() { return &m_pc; };
private:
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
//DiskPageCache m_pc;
};
// . the two global Clusterdb instances; only declared here, they are
//   defined in another translation unit
// . NOTE(review): g_clusterdb2 is presumably the rebuild/secondary instance
//   that Clusterdb::init2() sets up for PageRepair.cpp; confirm at the
//   definition site
extern class Clusterdb g_clusterdb;
extern class Clusterdb g_clusterdb2;
#endif