Merge pull request #8 from cioos-atlantic/improved-filtering
implemented decode->flatten->filter->unflatten->encode enhancement
aianta authored Jul 12, 2021
2 parents bc1f030 + b4f7b79 commit 2a48687
Showing 4 changed files with 251 additions and 92 deletions.
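The decode->flatten->filter->unflatten->encode enhancement described in the commit message is built around the flatten_dict package: flatten(..., reducer='path') collapses nested dictionary keys into single '/'-separated path strings (the same form used by the field lists in constants.py below), and unflatten(..., splitter='path') rebuilds the nesting afterwards. A minimal standalone sketch of that round trip, using made-up data rather than the extension's own:

# Round trip between a nested dict and '/'-separated path keys,
# using the flatten_dict package (PyPI: flatten-dict).
from flatten_dict import flatten, unflatten

nested = {
    "title": "Example dataset",
    "organization": {
        "title_translated": {"en": "Ocean Data", "fr": "Donnees oceaniques"}
    }
}

flat = flatten(nested, reducer='path')
# {'title': 'Example dataset',
#  'organization/title_translated/en': 'Ocean Data',
#  'organization/title_translated/fr': 'Donnees oceaniques'}

restored = unflatten(flat, splitter='path')
assert restored == nested

Filtering the flattened form lets a single pass over path keys authorize deeply nested fields in one loop.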
128 changes: 128 additions & 0 deletions ckanext/vitality_prototype/constants.py
@@ -0,0 +1,128 @@
DATASET_FIELDS = [
"author",
"author_email",
"bbox-east-long",
"bbox-north-lat",
"bbox-south-lat",
"bbox-west-long",
"cited-responsible-party",
"creator_user_id",
"dataset-reference-date",
"eov",
"extras",
"frequency-of-update",
"groups",
"id",
"isopen",
"keywords/en",
"keywords/fr",
"license_id",
"license_title",
"license_url",
"maintainer",
"maintainer_email",
"metadata_created",
"metadata_modified",
"metadata-language",
"metadata-point-of-contact/contact-info_email",
"metadata-point-of-contact/contact-info_online-resource_application-profile",
"metadata-point-of-contact/contact-info_online-resource_description",
"metadata-point-of-contact/contact-info_online-resource_function",
"metadata-point-of-contact/contact-info_online-resource_name",
"metadata-point-of-contact/contact-info_online-resource_protocol",
"metadata-point-of-contact/contact-info_online-resource_protocol-request",
"metadata-point-of-contact/contact-info_online-resource_url",
"metadata-point-of-contact/individual-name",
"metadata-point-of-contact/organisation-name",
"metadata-point-of-contact/position-name",
"metadata-point-of-contact/role",
"metadata-reference-date",
"name",
"notes/en",
"notes/fr",
"notes_translated/en",
"notes_translated/fr",
"num_resources",
"num_tags",
"organization/approval_status",
"organization/created",
"organization/description",
"organization/description_translated/en",
"organization/description_translated/fr",
"organization/id",
"organization/image_url",
"organization/image_url_translated/en",
"organization/image_url_translated/fr",
"organization/is_organization",
"organization/revision_id",
"organization/state",
"organization/title",
"organization/title_translated/en",
"organization/title_translated/fr",
"organization/type",
"owner_org",
"private",
"progress",
"relationships_as_object",
"relationships_as_subject",
"resources",
"resource-type",
"revision_id",
"spatial/coordinates",
"spatial/type",
"state",
"tags",
"temporal-extent/begin",
"temporal-extent/end",
"title",
"title_translated/en",
"title_translated/fr",
"tracking_summary/recent",
"tracking_summary/total",
"type",
"unique-resource-identifier-full/authority",
"unique-resource-identifier-full/code",
"unique-resource-identifier-full/code-space",
"unique-resource-identifier-full/version",
"url",
"vertical-extent",
"xml_location_url",
"organization/name"
]

PUBLIC_FIELDS = [
"id",
"resources",
"type",
"name",
"state",
"organization/approval_status",
"orgnaization/created",
"organization/description",
"organization/description_translated/en",
"organization/description_translated/fr",
"organization/id",
"organization/image_url",
"organization/image_url_translated/en",
"organization/image_url_translated/fr",
"organization/is_organization",
"organization/revision_id",
"organization/state",
"organization/title",
"organization/title_translated/en",
"organization/title_translated/fr",
"organization/type",
"organization/name",
"title_translated/en",
"title_translated/fr"
]

STRINGIFIED_FIELDS = [
"metadata-point-of-contact",
"spatial",
"temporal-extent",
"unique-resource-identifier-full",
"notes",
"cited-responsible-party",
"dataset-reference-date"
]
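PUBLIC_FIELDS reads as the subset of DATASET_FIELDS meant to stay visible regardless of authorization, and STRINGIFIED_FIELDS lists the keys whose values arrive as JSON packed into a single string rather than as nested dictionaries; the new _decode and _encode helpers in meta_authorize.py below parse and re-stringify exactly these keys. A hypothetical illustration of such a value (the real harvested payloads may differ):

# Hypothetical shape of a stringified field in a CKAN package dict;
# the actual harvested values can differ.
import json

package = {
    "name": "example-dataset",
    "spatial": '{"type": "Polygon", "coordinates": [[[-66.0, 43.0], [-52.6, 43.0], [-52.6, 52.0], [-66.0, 52.0], [-66.0, 43.0]]]}'
}

spatial = json.loads(package["spatial"])    # decode the string into a dict
assert spatial["type"] == "Polygon"
package["spatial"] = json.dumps(spatial)    # re-encode before handing back to CKAN

Decoding first is what allows individual paths such as spatial/type and spatial/coordinates in DATASET_FIELDS to be whitelisted separately.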
107 changes: 95 additions & 12 deletions ckanext/vitality_prototype/meta_authorize.py
@@ -1,5 +1,10 @@
from enum import Enum
import logging
import json
import copy
import constants
from flatten_dict import flatten
from flatten_dict import unflatten


'''
@@ -148,28 +153,28 @@ def filter_dict(self, input, fields, whitelist):
the original dictionary input with fields corresponding to whitelist.
"""

#log.info(fields)

# Trivially check input type
if type(input) != dict:
raise TypeError("Only dicts can be filtered recursively! Attempted to filter " + str(type(input)))

# DECODE Stringified JSON elements
decoded = self._decode(copy.deepcopy(input))

# FLATTEN decoded input
flattened = flatten(decoded, reducer='path')

# Iterate through the dictionary entries

# TODO: I would use a comprehension + helper function.
# i.e. {k: v for k, v in input.items if filterLogicFn(k, v)}
# The original dictionary will stay intact and in memory though.
- for key,value in input.items():
+ for key,value in flattened.items():

log.info("Checking authorization for %s", str(key))

# TODO - this needs to be handled way better
if key == "en" or key == "fr":
continue

# Pop unknown fields
if key.encode('utf-8') not in fields:
- input.pop(key, None)
+ flattened.pop(key, None)
log.warn("Popped unknown field: " + str(key))
continue

@@ -178,20 +183,98 @@ def filter_dict(self, input, fields, whitelist):

# If the current field's id does not appear in the whitelist, pop it from the input
if curr_field_id not in whitelist:
- input.pop(key, None)
+ flattened.pop(key, None)
log.info("Key rejected!")
continue

# If the value is a dict, recurse
if type(value) is dict:

# Overwrite value with filtered dict
- input[key] = self.filter_dict(value, fields, whitelist)
+ flattened[key] = self.filter_dict(value, fields, whitelist)

log.info("Key authorized!")

# UNFLATTEN filtered dictionary
unflattened = unflatten(flattened, splitter='path')

# STRINGIFY required json fields
encoded = self._encode(unflattened)

return encoded


def _decode(self, input):
"""
Decode dictionary containing string encoded JSON objects.
Parameters
----------
input: dict or stringified JSON
The dictionary to decode
Returns
-------
A dictionary where all fields that contained stringified JSON are now
expanded into dictionaries.
"""
if type(input) == str or type(input) == unicode:
root = MetaAuthorize._parse_json(input)
elif type(input) == dict:
root = input
else:
raise TypeError("_decode can only decode str or dict inputs! Got {}".format(str(type(input))))

if root != None:
for key,value in root.items():
# If the value is a string attempt to parse it as json
#log.info("Attempting to decode: %s - %s ", key, str(type(value)))
#TODO - this may need to change for python3
if type(value) == str or type(value) == unicode:
#log.info("%s is a str/unicode!", key)
parsed_json = MetaAuthorize._parse_json(value, key)

# If the string parsed
if parsed_json != None:
# into a dictionary
if type(parsed_json) == dict:
# decode the parsed dict
parsed_json = self._decode(parsed_json)
log.info('%s - parsed type %s', key, type(parsed_json))
# replace the value at the current key
root[key] = parsed_json
# into a list
elif type(parsed_json) == list:
# replace the value at the current key
root[key] = parsed_json


# Else if the value is a dictionary, recurse!
elif type(value) == dict:
root[key] = self._decode(value)

# log.info("Filtered input")
# log.info(input)

return root

def _encode(self, input):
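"""
Re-encode the fields listed in constants.STRINGIFIED_FIELDS back into JSON strings.
"""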

for key,value in input.items():

if key in constants.STRINGIFIED_FIELDS:
log.info("Stringifying %s", key)
input[key] = unicode(json.dumps(value),'utf-8')

return input


@staticmethod
def _parse_json(value, key=None):
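"""
Try to parse a string as JSON, returning None if it cannot be parsed.
"""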
try:
# TODO: Unicode stuff may need rework for python 3
return json.loads(value.encode('utf-8'))
except ValueError:
#log.info("Value could not be parsed as JSON. %s", key)
return None
except TypeError:
#log.warn("Value could not be parsed as JSON, %s", key)
return None
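
Read end to end, the new filter_dict pipeline is: decode any stringified JSON values, flatten the result to '/'-path keys, drop keys that are unknown or not whitelisted, unflatten, and re-stringify the STRINGIFIED_FIELDS entries. A condensed standalone approximation of that flow, which omits the extension's field-id lookup, logging, and Python 2 unicode handling and filters on key names directly:

# Condensed approximation of the decode->flatten->filter->unflatten->encode flow.
# Not the extension's code: the whitelist here is a plain set of path keys.
import json
from flatten_dict import flatten, unflatten

STRINGIFIED = {"spatial"}

def filter_package(pkg, allowed_paths):
    # decode stringified JSON fields into real dicts
    decoded = {k: (json.loads(v) if k in STRINGIFIED and isinstance(v, str) else v)
               for k, v in pkg.items()}
    # flatten nested keys into 'a/b/c' paths
    flat = flatten(decoded, reducer='path')
    # keep only whitelisted paths
    kept = {k: v for k, v in flat.items() if k in allowed_paths}
    # rebuild the nested structure
    nested = unflatten(kept, splitter='path')
    # re-stringify the fields CKAN expects as JSON strings
    for k in STRINGIFIED:
        if k in nested:
            nested[k] = json.dumps(nested[k])
    return nested

pkg = {
    "name": "example-dataset",
    "private": True,
    "spatial": '{"type": "Polygon", "coordinates": []}'
}
print(filter_package(pkg, {"name", "spatial/type"}))
# {'name': 'example-dataset', 'spatial': '{"type": "Polygon"}'}

The key property is that authorization decisions happen on flat path keys, so nested metadata such as organization/title_translated/en can be granted or withheld individually.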
