forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 9
/
AdultBit.cpp
165 lines (146 loc) · 3.42 KB
/
AdultBit.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#include "gb-include.h"
#include "AdultBit.h"
#include "HashTableX.h"
// . returns true if document is adult, false otherwise
bool AdultBit::getBit ( char *s , int32_t niceness) {
	// . scans the NUL-terminated text "s" one alphabetic word at a time
	// . returns true as soon as a single obscene word is seen, or as
	//   soon as the running "dirty" score reaches 5
	// . returns false if the end of the text is reached first
	//
	// scoring, where
	//   W = non-dirty word
	//   D = dirty word
	//   . = punctuation/digits and/or words of 1 or 2 letters (ignored)
	// each D adds 2 to the score, each W subtracts 1 (never below 0):
	//   . D . D . D .       (score hits 6)
	//   . D . W . D . D .   (score hits 5)
	// so roughly 3 dirty words out of 4 consecutive words flags the
	// whole document as adult
	int32_t score = 0;
	int32_t pos   = 0;

	for ( ; ; ) {
		// advance to the first char of the next word
		while ( s[pos] && ! is_alpha_a(s[pos]) ) pos++;
		// ran off the end of the text without tripping anything
		if ( ! s[pos] ) return false;

		// remember where the word starts, then find where it ends
		const int32_t start = pos;
		pos = start + 1;
		while ( s[pos] && is_alpha_a(s[pos]) ) pos++;
		const int32_t wlen = pos - start;

		// words of 1 or 2 letters do not affect the score at all
		if ( wlen <= 2 ) continue;

		char *word = s + start;

		// one obscene word is enough
		if ( isObscene ( word , wlen ) ) return true;

		// dirty words push the score up
		if ( isDirty ( word , wlen ) ) {
			score += 2;
			if ( score >= 5 ) return true;
			continue;
		}

		// clean words pull it back down, bottoming out at zero
		if ( score > 0 ) score--;

		// only reached after a clean word of 3+ letters
		QUICKPOLL((niceness));
	}
}
static HashTableX s_dtable;

// . returns true if the "len" chars at "s" match one of the words in the
//   s_dirty list below, false otherwise
// . the match is done by hash: the list words are hashed with hash64b()
//   and the candidate with hash64Lower_a() (presumably a lower-casing
//   variant of the same hash -- confirm in hash.h)
// . s_dtable is filled on the first call; if that fails this call returns
//   early (log()'s return value, or false) and the next call tries again
// . a dirty word alone does not flag a document; getBit() adds 2 to its
//   running score for each one
bool AdultBit::isDirty ( char *s , int32_t len ) {
	static bool s_isInitialized = false;
	// every word appears exactly once; a repeated entry would only add
	// the same hash to the table a second time
	static char *s_dirty[] = {
		"anal",
		"analsex",
		"blowjob",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"cock",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"gangbang",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"porn",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"tits",
		"video",
		"videos",
		"xxx",
		"xxxx"
	};
	if ( ! s_isInitialized ) {
		// set up the hash table: 8-byte keys hold the 64-bit word
		// hash, and 4 is presumably the size of the stored score.
		// NOTE(review): sizeof(s_dirty) is the array size in bytes,
		// not the number of words, so the third argument is larger
		// than twice the word count -- looks like harmless
		// over-allocation, confirm against HashTableX::set()
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0,
				      "adulttab"))
			return log("build: Error initializing "
				   "dirty word hash table." );
		// now add in all the dirty words. the stored value is the
		// 1-based list position so that it is never zero.
		const int32_t n =
			static_cast<int32_t>(sizeof(s_dirty) /
					     sizeof(s_dirty[0]));
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		// only mark as ready once every word made it in
		s_isInitialized = true;
	}
	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );
	// get from table; the value is converted to bool on return
	// (presumably 0 when the hash is absent -- confirm in HashTableX)
	return s_dtable.getScore ( &h );
}
static HashTableX s_otable;
bool AdultBit::isObscene ( char *s , int32_t len ) {
static bool s_isInitialized = false;
static char *s_obscene[] = {
"clit",
"clits",
// "cum", magna cum laude
"cums",
"cumshot",
"cunt",
"cunts",
"milf",
"rimjob",
"felch",
"fuck",
"fucked",
"fucker",
"fucking",
"fucks",
"whore",
"whores"
};
if ( ! s_isInitialized ) {
// set up the hash table
if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
"obscenetab") )
return log("build: Error initializing "
"obscene word hash table." );
// now add in all the stop words
int32_t n = sizeof(s_obscene) / sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
int64_t h = hash64b ( s_obscene[i] );
if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
}
s_isInitialized = true;
}
// compute the hash of the word "s"
int64_t h = hash64Lower_a ( s , len );
// get from table
return s_otable.getScore ( &h );
}
void resetAdultBit ( ) {
s_dtable.reset();
s_otable.reset();
}