Skip to content

Commit

Permalink
feat: selectively zero match lists
Browse files Browse the repository at this point in the history
Instead of completely zeroing match lists at the end of each scan
(which is O(num_strings) and thus scales badly with large rule sets),
keep track of which list entries have been used. Only those entries
need to be zeroed at the end. This essentially changes the
O(num_strings) to O(num_matched_strings), which is usually far smaller.
  • Loading branch information
secDre4mer committed Jun 13, 2024
1 parent 4fc1ff8 commit 361301f
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 48 deletions.
20 changes: 10 additions & 10 deletions libyara/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1438,7 +1438,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)

case OP_FOUND:
pop(r1);
r2.i = context->matches[r1.s->idx].tail != NULL ? 1 : 0;
r2.i = context->matches.entries[r1.s->idx].tail != NULL ? 1 : 0;
YR_DEBUG_FPRINTF(
2,
stderr,
Expand All @@ -1460,7 +1460,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r2.p);
#endif

match = context->matches[r2.s->idx].head;
match = context->matches.entries[r2.s->idx].head;
r3.i = false;

while (match != NULL)
Expand Down Expand Up @@ -1494,7 +1494,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r3.p);
#endif

match = context->matches[r3.s->idx].head;
match = context->matches.entries[r3.s->idx].head;
r4.i = false;

while (match != NULL && !r4.i)
Expand Down Expand Up @@ -1522,7 +1522,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r1.p);
#endif

r2.i = context->matches[r1.s->idx].count;
r2.i = context->matches.entries[r1.s->idx].count;
push(r2);
break;

Expand All @@ -1540,7 +1540,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r3.p);
#endif

match = context->matches[r3.s->idx].head;
match = context->matches.entries[r3.s->idx].head;
r4.i = 0;

while (match != NULL)
Expand Down Expand Up @@ -1571,7 +1571,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r2.p);
#endif

match = context->matches[r2.s->idx].head;
match = context->matches.entries[r2.s->idx].head;

i = 1;
r3.i = YR_UNDEFINED;
Expand Down Expand Up @@ -1599,7 +1599,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
ensure_within_rules_arena(r2.p);
#endif

match = context->matches[r2.s->idx].head;
match = context->matches.entries[r2.s->idx].head;

i = 1;
r3.i = YR_UNDEFINED;
Expand Down Expand Up @@ -1629,7 +1629,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
{
if (r2.i == OF_STRING_SET)
{
if (context->matches[r1.s->idx].tail != NULL)
if (context->matches.entries[r1.s->idx].tail != NULL)
{
found++;
}
Expand Down Expand Up @@ -1714,7 +1714,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
#if YR_PARANOID_EXEC
ensure_within_rules_arena(r3.p);
#endif
match = context->matches[r3.s->idx].head;
match = context->matches.entries[r3.s->idx].head;

while (match != NULL)
{
Expand Down Expand Up @@ -1790,7 +1790,7 @@ int yr_execute_code(YR_SCAN_CONTEXT* context)
#if YR_PARANOID_EXEC
ensure_within_rules_arena(r1.p);
#endif
match = context->matches[r1.s->idx].head;
match = context->matches.entries[r1.s->idx].head;

while (match != NULL)
{
Expand Down
2 changes: 1 addition & 1 deletion libyara/include/yara/rules.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
string = STRING_IS_LAST_IN_RULE(string) ? NULL : string + 1)

#define yr_string_matches_foreach(context, string, match) \
for (match = context->matches[string->idx].head; match != NULL; \
for (match = context->matches.entries[string->idx].head; match != NULL; \
match = match->next) \
/* private matches are skipped */ \
if (match->is_private) \
Expand Down
30 changes: 26 additions & 4 deletions libyara/include/yara/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ typedef struct YR_AC_MATCH YR_AC_MATCH;
typedef struct YR_NAMESPACE YR_NAMESPACE;
typedef struct YR_META YR_META;
typedef struct YR_MATCHES YR_MATCHES;
typedef struct YR_MATCHLIST YR_MATCHLIST;
typedef struct YR_STRING YR_STRING;
typedef struct YR_RULE YR_RULE;
typedef struct YR_RULES YR_RULES;
Expand Down Expand Up @@ -479,6 +480,28 @@ struct YR_MATCHES
YR_MATCH* tail;

int32_t count;

// If true, this YR_MATCHES instance contained a YR_MATCH at some point
// of the scan and will need to be zeroed at the end of the scan.
bool dirty;
};

// Per-string match bookkeeping: one YR_MATCHES list per string, plus a record
// of which of those lists have been used, so that only the used ones have to
// be cleaned before the match list is reused.
struct YR_MATCHLIST
{
// Array of match lists, one per string. Item N in the array holds the
// list of matches for the string with index N.
// Total size is equal to length.
YR_MATCHES* entries;

// Array with the indices of "dirty" entries, that is, elements of "entries"
// that must be cleaned before this match list can be reused.
// Total size is equal to length, but only the first dirty_count elements
// are actually valid; all further elements must be ignored.
int32_t* dirty_entries;
// Number of valid indices currently stored in dirty_entries.
int32_t dirty_count;

// Size of this match list, i.e. the number of elements in "entries" and
// the capacity of "dirty_entries".
int32_t length;
};

struct YR_MATCH
Expand Down Expand Up @@ -816,17 +839,16 @@ struct YR_SCAN_CONTEXT
// N has too many matches.
YR_BITMASK* strings_temp_disabled;

// Array with pointers to lists of matches. Item N in the array has the
// list of matches for string with index N.
YR_MATCHES* matches;
// A match list containing the matches per string.
YR_MATCHLIST matches;

// "unconfirmed_matches" is like "matches" but for strings that are part of
// a chain. Let's suppose that the string S is split in two chained strings
// S1 <- S2. When a match is found for S1, we can't be sure that S matches
// until a match for S2 is found (within the range defined by chain_gap_min
// and chain_gap_max), so the matches for S1 are put in "unconfirmed_matches"
// until they can be confirmed or discarded.
YR_MATCHES* unconfirmed_matches;
YR_MATCHLIST unconfirmed_matches;

// A bitmap with one bit per rule, bit N is set if the corresponding rule
// must be evaluated.
Expand Down
52 changes: 39 additions & 13 deletions libyara/scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ static void _yr_scan_update_match_chain_length(
if (string->chained_to == NULL)
return;

match = context->unconfirmed_matches[string->chained_to->idx].head;
match = context->unconfirmed_matches.entries[string->chained_to->idx].head;

while (match != NULL)
{
Expand All @@ -289,9 +289,23 @@ static void _yr_scan_update_match_chain_length(

static int _yr_scan_add_match_to_list(
YR_MATCH* match,
YR_MATCHES* matches_list,
YR_MATCHLIST* matchlist,
int string_index,
int replace_if_exists)
{
YR_MATCHES* matches_list = &matchlist->entries[string_index];

// Mark entry as dirty if it isn't marked as such yet
if (!matches_list->dirty)
{
matches_list->dirty = true;
// Since dirty_entries has sufficient size for all strings, and
// this one is not marked as dirty and thus not part of dirty_entries yet,
// we can safely add it to dirty_entries without exceeding its maximum size.
matchlist->dirty_entries[matchlist->dirty_count] = string_index;
matchlist->dirty_count++;
}

int result = ERROR_SUCCESS;

#if YR_DEBUG_VERBOSITY > 0
Expand Down Expand Up @@ -369,8 +383,11 @@ _exit:;

static void _yr_scan_remove_match_from_list(
YR_MATCH* match,
YR_MATCHES* matches_list)
YR_MATCHLIST* matchlist,
int string_index)
{
YR_MATCHES* matches_list = &matchlist->entries[string_index];

if (match->prev != NULL)
match->prev->next = match->next;

Expand All @@ -386,6 +403,10 @@ static void _yr_scan_remove_match_from_list(
matches_list->count--;
match->next = NULL;
match->prev = NULL;

// If matches_list->count == 0 now, we could mark this matchlist entry as no longer dirty.
// However, finding and removing the entry from the dirty list takes more time than leaving
// it dirty and (unnecessarily) cleaning it later on, so we don't bother.
}

//
Expand Down Expand Up @@ -449,7 +470,7 @@ static int _yr_scan_verify_chained_string_match(
// list of unconfirmed matches. Unconfirmed matches are sorted in ascending
// offset order. If no unconfirmed match exists, the lowest possible offset
// is the offset of the current match.
match = context->unconfirmed_matches[matching_string->idx].head;
match = context->unconfirmed_matches.entries[matching_string->idx].head;

if (match != NULL)
lowest_offset = match->offset;
Expand All @@ -460,7 +481,7 @@ static int _yr_scan_verify_chained_string_match(
// precedes the currently matching string. If we have a string chain like:
// S1 <- S2 <- S3, and we just found a match for S2, we are iterating the
// list of unconfirmed matches of S1.
match = context->unconfirmed_matches[matching_string->chained_to->idx].head;
match = context->unconfirmed_matches.entries[matching_string->chained_to->idx].head;

while (match != NULL)
{
Expand All @@ -481,7 +502,8 @@ static int _yr_scan_verify_chained_string_match(
// match can't be an actual match)
_yr_scan_remove_match_from_list(
match,
&context->unconfirmed_matches[matching_string->chained_to->idx]);
&context->unconfirmed_matches,
matching_string->chained_to->idx);
}
else if (
ending_offset + matching_string->chain_gap_max >= match_offset &&
Expand Down Expand Up @@ -517,7 +539,7 @@ static int _yr_scan_verify_chained_string_match(
// every unconfirmed match in all the strings in the chain up to the head
// of the chain.
match =
context->unconfirmed_matches[matching_string->chained_to->idx].head;
context->unconfirmed_matches.entries[matching_string->chained_to->idx].head;

while (match != NULL)
{
Expand All @@ -543,7 +565,7 @@ static int _yr_scan_verify_chained_string_match(
}

// "string" points now to the head of the strings chain.
match = context->unconfirmed_matches[string->idx].head;
match = context->unconfirmed_matches.entries[string->idx].head;

// Iterate over the list of unconfirmed matches of the head of the chain,
// and move to the list of confirmed matches those with a chain_length
Expand All @@ -556,7 +578,9 @@ static int _yr_scan_verify_chained_string_match(
if (match->chain_length == full_chain_length)
{
_yr_scan_remove_match_from_list(
match, &context->unconfirmed_matches[string->idx]);
match,
&context->unconfirmed_matches,
string->idx);

match->match_length = (int32_t) (match_offset - match->offset +
match_length);
Expand All @@ -580,7 +604,7 @@ static int _yr_scan_verify_chained_string_match(
yr_bitmask_set(context->required_eval, string->rule_idx);

FAIL_ON_ERROR(_yr_scan_add_match_to_list(
match, &context->matches[string->idx], false));
match, &context->matches, string->idx, false));
}

match = next_match;
Expand Down Expand Up @@ -627,7 +651,8 @@ static int _yr_scan_verify_chained_string_match(
// an actual match until finding the remaining parts of the chain.
FAIL_ON_ERROR(_yr_scan_add_match_to_list(
new_match,
&context->unconfirmed_matches[matching_string->idx],
&context->unconfirmed_matches,
matching_string->idx,
false));
}
}
Expand Down Expand Up @@ -758,7 +783,8 @@ static int _yr_scan_match_callback(

FAIL_ON_ERROR(_yr_scan_add_match_to_list(
new_match,
&callback_args->context->matches[string->idx],
&callback_args->context->matches,
string->idx,
STRING_IS_GREEDY_REGEXP(string)));
}
}
Expand Down Expand Up @@ -1059,7 +1085,7 @@ int yr_scan_verify_match(
return ERROR_SUCCESS;

if (context->flags & SCAN_FLAGS_FAST_MODE && STRING_IS_SINGLE_MATCH(string) &&
context->matches[string->idx].head != NULL)
context->matches.entries[string->idx].head != NULL)
return ERROR_SUCCESS;

if (STRING_IS_FIXED_OFFSET(string) &&
Expand Down
Loading

0 comments on commit 361301f

Please sign in to comment.