Skip to content

Commit

Permalink
subset_pattern & subset_level, works on #26
Browse files Browse the repository at this point in the history
  • Loading branch information
bbauerma committed Mar 29, 2022
1 parent fe69182 commit fe936e7
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 66 deletions.
77 changes: 37 additions & 40 deletions src/geopathfinder/folder_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,7 @@ def remove_smartpath(self, key):
self.count_dirs()


def get_subtree_matching(self, level, pattern, register_file_pattern=None):
def get_subtree_matching(self, level, level_pattern, register_file_pattern=None):
'''
Returns a subtree of the SmartTree with branches comprising
ALL matches with the pattern at the given level.
Expand All @@ -855,9 +855,15 @@ def get_subtree_matching(self, level, pattern, register_file_pattern=None):
level : str
Name of level in hierarchy.
e.g. 'wflow'
pattern : str
level_pattern : str
string defining search pattern at given level
e.g. 'C1003'
register_file_pattern : str tuple, optional
strings defining search pattern for file search for file_register
e.g. ('C1003', 'E048N012T6')
No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Be careful: If the tree is large, this can take a while!
Returns
-------
Expand All @@ -868,10 +874,10 @@ def get_subtree_matching(self, level, pattern, register_file_pattern=None):

branch = copy.deepcopy(self)

branch_paths = self.collect_level_string(level, pattern=pattern, unique=True)
branch_paths = self.collect_level_string(level, pattern=level_pattern, unique=True)

if len(branch_paths) == 0:
warnings.warn('get_subtree_unique_rebased(): No matches for "pattern" at "level"!')
warnings.warn('get_subtree_unique_rebased(): No matches for "level_pattern" at "level"!')
return NullSmartTree(self.root)
else:
for d in self.dirs.keys():
Expand All @@ -897,7 +903,7 @@ def get_subtree_matching(self, level, pattern, register_file_pattern=None):
return branch


def get_subtree_unique_rebased(self, level, pattern, register_file_pattern=None):
def get_subtree_unique_rebased(self, level, level_pattern, register_file_pattern=None):
'''
Returns a single branch (a subtree) of the SmartTree
with ONE UNIQUE match with the pattern at the given level.
Expand All @@ -908,9 +914,15 @@ def get_subtree_unique_rebased(self, level, pattern, register_file_pattern=None)
level : str
Name of level in hierarchy.
e.g. 'wflow'
pattern : str
level_pattern : str
string defining search pattern at given level
e.g. 'C1003'
register_file_pattern : str tuple, optional
strings defining search pattern for file search for file_register
e.g. ('C1003', 'E048N012T6')
No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Be careful: If the tree is large, this can take a while!
Returns
-------
Expand All @@ -921,13 +933,13 @@ def get_subtree_unique_rebased(self, level, pattern, register_file_pattern=None)

branch = copy.deepcopy(self)

branch_path = self.collect_level_string(level, pattern=pattern, unique=True)
branch_path = self.collect_level_string(level, pattern=level_pattern, unique=True)

if len(branch_path) == 0:
warnings.warn('get_subtree_unique_rebased(): No matches for "pattern" at "level"!')
warnings.warn('get_subtree_unique_rebased(): No matches for "level_pattern" at "level"!')
return NullSmartTree(self.root)
elif len(branch_path) > 1:
warnings.warn('get_subtree_unique_rebased(): Multiple matches for "pattern" at "level"!')
warnings.warn('get_subtree_unique_rebased(): Multiple matches for "level_pattern" at "level"!')
return NullSmartTree(self.root)
else:
for d in self.dirs.keys():
Expand Down Expand Up @@ -1085,9 +1097,7 @@ def create_smartpath(root, hierarchy, levels, make_dir=False):
def build_smarttree(root,
hierarchy,
target_level=None,
register_file_pattern=None,
trim_level=None,
trim_pattern=None):
register_file_pattern=None):
'''
Function walking through directories in root path for building a structure
of SmartPaths. Can also search for files.
Expand All @@ -1111,17 +1121,11 @@ def build_smarttree(root,
No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Be careful: If the tree is large, this can take a while!
trim_level : str
Name of level in hierarchy that is subject to the trimming
e.g. 'grid'
trim_pattern : str or list of str
string defining search pattern at trimming level, meaning only paths
matching this pattern at "trim_level" will be included in the
SmartTree()
e.g. 'EQUI7_EU500M'
Returns
-------
SmartTree
Tree object for the dataset.
'''

Expand All @@ -1140,13 +1144,12 @@ def build_smarttree(root,
depth += [len(dirpath.split(os.sep)) - root_depth]
# if set, then files are registered
# (they are in the memory anyway at this moment)
if trim_level is None:
if register_file_pattern is not None:
files, count = regex_file_search(dirpath,
register_file_pattern,
full_paths=True)
smart_tree.file_register += files
smart_tree.file_count += count
if register_file_pattern is not None:
files, count = regex_file_search(dirpath,
register_file_pattern,
full_paths=True)
smart_tree.file_register += files
smart_tree.file_count += count
alldirs = np.array(alldirs)

# only select paths reaching given target level
Expand Down Expand Up @@ -1189,11 +1192,10 @@ def build_smarttree(root,
smart_tree.add_smartpath(smart_path)

# to register only files down to target level
if trim_level is None:
if register_file_pattern is not None and target_level is not None:
smart_path.build_file_register(down_to_level=target_level,
pattern=register_file_pattern)
file_register += smart_path.file_register
if register_file_pattern is not None and target_level is not None:
smart_path.build_file_register(down_to_level=target_level,
pattern=register_file_pattern)
file_register += smart_path.file_register

smart_path = None
levels = None
Expand All @@ -1202,14 +1204,9 @@ def build_smarttree(root,
smart_tree.count_dirs()

# register only files in paths down to target level
if trim_level is None:
if register_file_pattern is not None and target_level is not None:
smart_tree.file_register = list(set(file_register))
smart_tree.file_count = len(smart_tree.file_register)

if trim_level is not None and trim_pattern is not None:
smart_tree = smart_tree.get_subtree_unique_rebased(trim_level, pattern=trim_pattern,
register_file_pattern=register_file_pattern)
if register_file_pattern is not None and target_level is not None:
smart_tree.file_register = list(set(file_register))
smart_tree.file_count = len(smart_tree.file_register)

if register_file_pattern is not None:
smart_tree.has_register = True
Expand Down
29 changes: 28 additions & 1 deletion src/geopathfinder/naming_conventions/sgrt_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,8 @@ def sgrt_path(root, mode=None, group=None, datalog=None,
make_dir=make_dir)


def sgrt_tree(root, target_level=None, register_file_pattern=None):
def sgrt_tree(root, target_level=None, register_file_pattern=None,
subset_level=('grid'), subset_pattern=('EQUI7'), subset_unique=False):

"""
Realisation of the full SGRT folder naming convention, yielding a
Expand All @@ -458,6 +459,21 @@ def sgrt_tree(root, target_level=None, register_file_pattern=None):
No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Be careful: If the tree is large, this can take a while!
subset_level : str tuple, optional
Name of level in tree's hierarchy where the subset should be applied
e.g. ('tile').
Default level is ('grid')
subset_pattern : str tuple, optional
Strings defining search pattern for subset_level, meaning only paths
matching this pattern at "subset_level" will be included in the SmartTree().
Default pattern is ('EQUI7').
e.g. ('EQUI7', '500M'), or ('500M'). No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
subset_unique : bool, optional
defines whether the subset will deliver...
True: just one single subtree that matches uniquely the subset_pattern,
and which is rebased to the subset_level.
False: all subtrees that match the subset_pattern (Default).
Returns
-------
Expand All @@ -479,6 +495,17 @@ def sgrt_tree(root, target_level=None, register_file_pattern=None):
raise ValueError('Root-directory "{}" does is '
'not a valid SGRT folder!'.format(root))

# limit the tree to a subtree with all paths that match the subset_pattern at subset_level
if subset_level is not None and not subset_unique:
sgrt_tree = sgrt_tree.get_subtree_matching(subset_level, subset_pattern,
register_file_pattern=register_file_pattern)

# limit the tree to a single, unique, small subtree that matches the subset_pattern at subset_level,
# which is re-rooted to that level.
elif subset_level is not None:
sgrt_tree = sgrt_tree.get_subtree_unique_rebased(subset_level, subset_pattern,
register_file_pattern=register_file_pattern)

return sgrt_tree


Expand Down
50 changes: 34 additions & 16 deletions src/geopathfinder/naming_conventions/yeoda_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,12 @@ def encode_extra_field(self, relative_orbit):
return relative_orbit


def yeoda_path(root, product=None, data_version=None, grid=None, tile=None, qlook=True,
def yeoda_path(root, product, data_version, grid=None, tile=None, qlook=True,
make_dir=False):
"""
Realisation of the full yeoda folder naming convention, yielding a single
SmartPath.
If a keyword is not specified, the yeoda_path is shortened so that it ends one level above the missing keyword.
Parameters
----------
Expand All @@ -271,13 +272,13 @@ def yeoda_path(root, product=None, data_version=None, grid=None, tile=None, qloo
e.g. "ssm"
data_version : int or str
e.g. 2 or "V1M3R2"
grid : str
grid : str, optional
e.g. "EQUI7_EU500M"
tile : str
tile : str, optional
e.g. "E048N012T6"
qlook : bool
qlook : bool, optional
if the quicklook subdir should be integrated
make_dir : bool
make_dir : bool, optional
if the directory should be created on the filesystem
Returns
Expand Down Expand Up @@ -306,11 +307,12 @@ def yeoda_path(root, product=None, data_version=None, grid=None, tile=None, qloo
return create_smartpath(root, hierarchy=hierarchy, levels=levels, make_dir=make_dir)


def yeoda_tree(root, target_level=None, register_file_pattern=None, grid_pattern=('EQUI7')):
def yeoda_tree(root, target_level=None, register_file_pattern=None,
subset_level=('grid'), subset_pattern=('EQUI7'), subset_unique=False):

"""
Realisation of the full yeoda folder naming convention, yielding a
SmartTree(), reflecting all subfolders as SmartPath()
SmartTree(), reflecting all compatible subfolders as SmartPath()
Parameters
----------
Expand All @@ -329,12 +331,21 @@ def yeoda_tree(root, target_level=None, register_file_pattern=None, grid_pattern
No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Be careful: If the tree is large, this can take a while!
grid_pattern : str tuple, optional
strings defining search pattern for file search for file_register
e.g. ('EQUI7', '500M'), or ('500M')
No asterisk is needed ('*')!
subset_level : str tuple, optional
Name of level in tree's hierarchy where the subset should be applied
e.g. ('tile').
Default level is ('grid')
subset_pattern : str tuple, optional
Strings defining search pattern for subset_level, meaning only paths
matching this pattern at "subset_level" will be included in the SmartTree()
Default pattern is ('EQUI7').
e.g. ('EQUI7', '500M'), or ('500M'). No asterisk is needed ('*')!
Sequence of strings in given tuple is crucial!
Default is 'EQUI7'
subset_unique : bool, optional
defines whether the subset will deliver...
True: just one single subtree that matches uniquely the subset_pattern,
and which is rebased to the subset_level.
False: all subtrees that match the subset_pattern (Default).
Returns
-------
Expand All @@ -345,15 +356,22 @@ def yeoda_tree(root, target_level=None, register_file_pattern=None, grid_pattern
# defining the hierarchy
hierarchy = ['product', 'data_version', 'grid', 'tile', 'qlook']

sgrt_tree = build_smarttree(root, hierarchy,
yeoda_tree = build_smarttree(root, hierarchy,
target_level=target_level,
register_file_pattern=register_file_pattern)

# limit the tree to a subtree with all paths that match the subset_pattern at subset_level
if subset_level is not None and not subset_unique:
yeoda_tree = yeoda_tree.get_subtree_matching(subset_level, subset_pattern,
register_file_pattern=register_file_pattern)

if grid_pattern is not None:
sgrt_tree = sgrt_tree.get_subtree_matching('grid', grid_pattern)
# limit the tree to a single, unique, small subtree that matches the subset_pattern at subset_level,
# which is re-rooted to that level.
elif subset_level is not None:
yeoda_tree = yeoda_tree.get_subtree_unique_rebased(subset_level, subset_pattern,
register_file_pattern=register_file_pattern)

return sgrt_tree
return yeoda_tree


if __name__ == '__main__':
Expand Down
13 changes: 8 additions & 5 deletions tests/test_folder_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,10 @@ def test_tree_dir_n_file_count(self):
Tests the dir_ and file_count of SmartTree()
"""
self.assertEqual(self.stt_1.dir_count, 9)
self.assertEqual(self.stt_1.file_count, 17)
# PS: "logfiles" are kicked out with subset_level in
# self.stt_1 = sgrt_tree(self.test_dir, register_file_pattern='.tif', subset_level='grid')
self.assertEqual(self.stt_1.dir_count, 8)
self.assertEqual(self.stt_1.file_count, 16)
# TODO: should be 16: "file_too_deep.tif" should not be included in the
# file_register!

Expand Down Expand Up @@ -423,8 +425,7 @@ def test_copy_smarttree_on_fs_level_pattern(self):
level='wflow', level_pattern='A0202')

files = next(os.walk(self.copy_dir))[2]
file_count = sum(
[len(files) for r, d, files in os.walk(self.copy_dir)])
file_count = sum([len(files) for r, d, files in os.walk(self.copy_dir)])

self.assertEqual(file_count, 4)

Expand Down Expand Up @@ -474,8 +475,10 @@ def test_get_disk_usage(self):
self.assertAlmostEqual(result['du'][0], sig0_du, places=2)

# test complete query result
# PS: "logfiles" are kicked out with subset_level in
# self.stt_1 = sgrt_tree(self.test_dir, register_file_pattern='.tif', subset_level='grid')
result = self.stt_1.get_disk_usage(unit='KB')
self.assertEqual(result.shape, (33, 10))
self.assertEqual(result.shape, (32, 10))
should = ['E006N006T1', 'E006N006T1', 'E006N006T6', 'E006N006T6', 'E006N006T6', 'E006N006T6', 'E006N006T6',
'E006N006T6', 'E006N006T6', 'E006N006T6', 'E006N006T6', 'E006N012T6', 'E006N012T6', 'E048N012T6',
'E048N012T6', 'E048N012T6', 'E048N012T6', 'E048N012T6']
Expand Down
14 changes: 10 additions & 4 deletions tests/test_yeoda_naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,9 @@ def tearDown(self):
if os.path.exists(self.test_dir):
shutil.rmtree(self.test_dir)

def test_grid_pattern(self):
def test_subtree_subsetting(self):
"""
Test the tree to handle logfiles correctly
Test the tree to handle subsetting correctly
"""

Expand All @@ -330,11 +330,17 @@ def test_grid_pattern(self):

os.makedirs(os.path.join(stp1.get_level('data_version'), 'logfiles', 'dummy'))

yt = yeoda_tree(self.test_dir, grid_pattern=('EQUI7', '500M'))

# test get_subtree_matching() to get limited number of paths matching the level pattern.
yt = yeoda_tree(self.test_dir, subset_pattern=('EQUI7', '500M'))
self.assertEqual(yt.dir_count, 2)
self.assertEqual(sorted(yt.collect_level_topnames('grid')), ['EQUI7_AF500M', 'EQUI7_EU500M'])

# test get_subtree_unique_rebased() to get small, single, unique subtree,
# which is re-rooted to that level
st = yt.get_subtree_unique_rebased('tile', 'E048N012T6')
self.assertEqual(st.dir_count, 1)
self.assertEqual(st.root, stp1.get_level('tile'))


if __name__ == "__main__":
unittest.main()

0 comments on commit fe936e7

Please sign in to comment.