Skip to content

Commit

Permalink
Improve performance of finding indexables
Browse files Browse the repository at this point in the history
Currently, all folders and files in the current tree are turned into
IndexablePath, and then excluded files are filtered out after.

When there are large file trees that are meant to be excluded, this
results in a lot of unnecessary work.

ActiveStorage stores files in the `tmp` directory in many many small
folders. So does Bootsnap. Ruby LSP has to traverse all of these files,
even though the entire directory should just be ignored.

Rubocop has solved this by breaking the `includes` patterns up into many
patterns, applying the exclusions *before* the `Dir.glob`, so I followed
in their footsteps. This works great for exclusions that end in "**/*".
We still need to loop through all IndexablePath objects and see if
they're excluded, in the case that an extension was provided on the
excluded path, but this can cut down load time dramatically.

Before this PR in my Rails app, `indexables` took 76 seconds to run. Now
it takes, 0.17 seconds. Before and after code both return the same exact
file list.
  • Loading branch information
natematykiewicz committed May 23, 2024
1 parent 85b9e29 commit 460e046
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 15 deletions.
79 changes: 65 additions & 14 deletions lib/ruby_indexer/lib/ruby_indexer/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,16 @@ class Configuration
def initialize
@excluded_gems = T.let(initial_excluded_gems, T::Array[String])
@included_gems = T.let([], T::Array[String])
@excluded_patterns = T.let([File.join("**", "*_test.rb"), File.join("**", "tmp", "**", "*")], T::Array[String])
@excluded_patterns = T.let(
[
File.join("**", "*_test.rb"),
File.join("**", "tmp", "**", "*"),
File.join("**", "node_modules", "**", "*"),
],
T::Array[String],
)
path = Bundler.settings["path"]
@excluded_patterns << File.join(File.expand_path(path, Dir.pwd), "**", "*.rb") if path
@excluded_patterns << File.join(File.expand_path(path, Dir.pwd), "**", "*") if path

@included_patterns = T.let([File.join(Dir.pwd, "**", "*.rb")], T::Array[String])
@excluded_magic_comments = T.let(
Expand All @@ -43,6 +50,15 @@ def initialize
)
end

sig { returns(String) }
def exclude_pattern
relative_patterns = @excluded_patterns
.select { |p| p.end_with?("/**/*") }
.map { |p| p.delete_prefix("#{Dir.pwd}/") }

"#{Dir.pwd}/{#{relative_patterns.join(",")}}"
end

sig { returns(T::Array[IndexablePath]) }
def indexables
excluded_gems = @excluded_gems - @included_gems
Expand All @@ -52,21 +68,25 @@ def indexables
# having duplicates if BUNDLE_PATH is set to a folder inside the project structure

# Add user specified patterns
indexables = @included_patterns.flat_map do |pattern|
patterns = indexable_included_file_patterns.sort.map! { |dir| File.join(dir, "*.rb") }
patterns = [File.join(Dir.pwd, "**/*.rb")] if patterns.empty?

indexables = patterns.flat_map do |pattern|
load_path_entry = T.let(nil, T.nilable(String))

Dir.glob(pattern, File::FNM_PATHNAME | File::FNM_EXTGLOB).map! do |path|
path = File.expand_path(path)
# All entries for the same pattern match the same $LOAD_PATH entry. Since searching the $LOAD_PATH for every
# entry is expensive, we memoize it until we find a path that doesn't belong to that $LOAD_PATH. This happens
# on repositories that define multiple gems, like Rails. All frameworks are defined inside the Dir.pwd, but
# each one of them belongs to a different $LOAD_PATH entry
if load_path_entry.nil? || !path.start_with?(load_path_entry)
load_path_entry = $LOAD_PATH.find { |load_path| path.start_with?(load_path) }
Dir.glob(pattern, File::FNM_PATHNAME | File::FNM_EXTGLOB)
.map! do |path|
path = File.expand_path(path)
# All entries for the same pattern match the same $LOAD_PATH entry. Since searching the $LOAD_PATH for every
# entry is expensive, we memoize it until we find a path that doesn't belong to that $LOAD_PATH. This
# happens on repositories that define multiple gems, like Rails. All frameworks are defined inside the
# Dir.pwd, but each one of them belongs to a different $LOAD_PATH entry
if load_path_entry.nil? || !path.start_with?(load_path_entry)
load_path_entry = $LOAD_PATH.find { |load_path| path.start_with?(load_path) }
end

IndexablePath.new(load_path_entry, path)
end

IndexablePath.new(load_path_entry, path)
end
end

# Remove user specified patterns
Expand Down Expand Up @@ -158,6 +178,37 @@ def apply_config(config)

private

sig { params(base_directory: String, excluded_pattern: String).returns(T::Array[String]) }
def indexable_included_file_patterns(base_directory = Dir.pwd, excluded_pattern = exclude_pattern)
flags = File::FNM_PATHNAME | File::FNM_EXTGLOB

# Escape File.fnmatch? wildcards in the directory
base_directory = base_directory.gsub(/[\\\{\}\[\]\*\?]/) do |reserved_glob_character|
"\\#{reserved_glob_character}"
end

dirs = Dir.glob(File.join(base_directory, "*/"), flags)
.reject do |dir|
next true if dir.end_with?("/./", "/../")
next true if File.fnmatch?(excluded_pattern, dir, flags)

symlink_excluded_or_infinite_loop?(base_directory, dir, excluded_pattern, flags)
end

dirs
.flat_map { |dir| indexable_included_file_patterns(dir, excluded_pattern) }
.unshift(base_directory)
end

sig { params(base_dir: String, current_dir: String, exclude_pattern: String, flags: Integer).returns(T::Boolean) }
def symlink_excluded_or_infinite_loop?(base_dir, current_dir, exclude_pattern, flags)
dir_realpath = File.realpath(current_dir)
File.symlink?(current_dir.chomp("/")) && (
File.fnmatch?(exclude_pattern, "#{dir_realpath}/", flags) ||
File.realpath(base_dir).start_with?(dir_realpath)
)
end

sig { params(config: T::Hash[String, T.untyped]).void }
def validate_config!(config)
errors = config.filter_map do |key, value|
Expand Down
9 changes: 8 additions & 1 deletion lib/ruby_indexer/test/configuration_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ def test_indexables_does_not_include_default_gem_path_when_in_bundle
)
end

def test_exclude_pattern
assert_equal(
@config.exclude_pattern,
"#{Dir.pwd}/{**/tmp/**/*,**/node_modules/**/*,vendor/bundle/**/*}",
)
end

def test_indexables_includes_default_gems
indexables = @config.indexables.map(&:full_path)

Expand All @@ -71,7 +78,7 @@ def test_indexables_avoids_duplicates_if_bundle_path_is_inside_project
Bundler.settings.temporary(path: "vendor/bundle") do
config = Configuration.new

assert_includes(config.instance_variable_get(:@excluded_patterns), "#{Dir.pwd}/vendor/bundle/**/*.rb")
assert_includes(config.instance_variable_get(:@excluded_patterns), "#{Dir.pwd}/vendor/bundle/**/*")
end
end

Expand Down

0 comments on commit 460e046

Please sign in to comment.