Merge pull request #8 from cioos-atlantic/improved-filtering
implemented decode->flatten->filter->unflatten->encode enhancement
aianta authored Jul 12, 2021
2 parents bc1f030 + b4f7b79 commit 2a48687
Showing 4 changed files with 251 additions and 92 deletions.
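The decode->flatten->filter->unflatten->encode enhancement described in the commit message is built around the flatten_dict package: flatten(..., reducer='path') collapses nested dictionary keys into single '/'-separated path strings (the same form used by the field lists in constants.py below), and unflatten(..., splitter='path') rebuilds the nesting afterwards. A minimal standalone sketch of that round trip, using made-up data rather than the extension's own:

# Round trip between a nested dict and '/'-separated path keys,
# using the flatten_dict package (PyPI: flatten-dict).
from flatten_dict import flatten, unflatten

nested = {
    "title": "Example dataset",
    "organization": {
        "title_translated": {"en": "Ocean Data", "fr": "Donnees oceaniques"}
    }
}

flat = flatten(nested, reducer='path')
# {'title': 'Example dataset',
#  'organization/title_translated/en': 'Ocean Data',
#  'organization/title_translated/fr': 'Donnees oceaniques'}

restored = unflatten(flat, splitter='path')
assert restored == nested

Filtering the flattened form lets a single pass over path keys authorize deeply nested fields in one loop.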
128 changes: 128 additions & 0 deletions ckanext/vitality_prototype/constants.py
@@ -0,0 +1,128 @@
DATASET_FIELDS = [
"author",
"author_email",
"bbox-east-long",
"bbox-north-lat",
"bbox-south-lat",
"bbox-west-long",
"cited-responsible-party",
"creator_user_id",
"dataset-reference-date",
"eov",
"extras",
"frequency-of-update",
"groups",
"id",
"isopen",
"keywords/en",
"keywords/fr",
"license_id",
"license_title",
"license_url",
"maintainer",
"maintainer_email",
"metadata_created",
"metadata_modified",
"metadata-language",
"metadata-point-of-contact/contact-info_email",
"metadata-point-of-contact/contact-info_online-resource_application-profile",
"metadata-point-of-contact/contact-info_online-resource_description",
"metadata-point-of-contact/contact-info_online-resource_function",
"metadata-point-of-contact/contact-info_online-resource_name",
"metadata-point-of-contact/contact-info_online-resource_protocol",
"metadata-point-of-contact/contact-info_online-resource_protocol-request",
"metadata-point-of-contact/contact-info_online-resource_url",
"metadata-point-of-contact/individual-name",
"metadata-point-of-contact/organisation-name",
"metadata-point-of-contact/position-name",
"metadata-point-of-contact/role",
"metadata-reference-date",
"name",
"notes/en",
"notes/fr",
"notes_translated/en",
"notes_translated/fr",
"num_resources",
"num_tags",
"organization/approval_status",
"organization/created",
"organization/description",
"organization/description_translated/en",
"organization/description_translated/fr",
"organization/id",
"organization/image_url",
"organization/image_url_translated/en",
"organization/image_url_translated/fr",
"organization/is_organization",
"organization/revision_id",
"organization/state",
"organization/title",
"organization/title_translated/en",
"organization/title_translated/fr",
"organization/type",
"owner_org",
"private",
"progress",
"relationships_as_object",
"relationships_as_subject",
"resources",
"resource-type",
"revision_id",
"spatial/coordinates",
"spatial/type",
"state",
"tags",
"temporal-extent/begin",
"temporal-extent/end",
"title",
"title_translated/en",
"title_translated/fr",
"tracking_summary/recent",
"tracking_summary/total",
"type",
"unique-resource-identifier-full/authority",
"unique-resource-identifier-full/code",
"unique-resource-identifier-full/code-space",
"unique-resource-identifier-full/version",
"url",
"vertical-extent",
"xml_location_url",
"organization/name"
]

PUBLIC_FIELDS = [
"id",
"resources",
"type",
"name",
"state",
"organization/approval_status",
"orgnaization/created",
"organization/description",
"organization/description_translated/en",
"organization/description_translated/fr",
"organization/id",
"organization/image_url",
"organization/image_url_translated/en",
"organization/image_url_translated/fr",
"organization/is_organization",
"organization/revision_id",
"organization/state",
"organization/title",
"organization/title_translated/en",
"organization/title_translated/fr",
"organization/type",
"organization/name",
"title_translated/en",
"title_translated/fr"
]

STRINGIFIED_FIELDS = [
"metadata-point-of-contact",
"spatial",
"temporal-extent",
"unique-resource-identifier-full",
"notes",
"cited-responsible-party",
"dataset-reference-date"
]
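PUBLIC_FIELDS reads as the subset of DATASET_FIELDS meant to stay visible regardless of authorization, and STRINGIFIED_FIELDS lists the keys whose values arrive as JSON packed into a single string rather than as nested dictionaries; the new _decode and _encode helpers in meta_authorize.py below parse and re-stringify exactly these keys. A hypothetical illustration of such a value (the real harvested payloads may differ):

# Hypothetical shape of a stringified field in a CKAN package dict;
# the actual harvested values can differ.
import json

package = {
    "name": "example-dataset",
    "spatial": '{"type": "Polygon", "coordinates": [[[-66.0, 43.0], [-52.6, 43.0], [-52.6, 52.0], [-66.0, 52.0], [-66.0, 43.0]]]}'
}

spatial = json.loads(package["spatial"])    # decode the string into a dict
assert spatial["type"] == "Polygon"
package["spatial"] = json.dumps(spatial)    # re-encode before handing back to CKAN

Decoding first is what allows individual paths such as spatial/type and spatial/coordinates in DATASET_FIELDS to be whitelisted separately.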
107 changes: 95 additions & 12 deletions ckanext/vitality_prototype/meta_authorize.py
@@ -1,5 +1,10 @@
from enum import Enum
import logging
import json
import copy
import constants
from flatten_dict import flatten
from flatten_dict import unflatten


'''
@@ -148,28 +153,28 @@ def filter_dict(self, input, fields, whitelist):
the original dictionary input with fields corresponding to whitelist.
"""

#log.info(fields)

# Trivially check input type
if type(input) != dict:
raise TypeError("Only dicts can be filtered recursively! Attempted to filter " + str(type(input)))

# DECODE Stringified JSON elements
decoded = self._decode(copy.deepcopy(input))

# FLATTEN decoded input
flattened = flatten(decoded, reducer='path')

# Iterate through the dictionary entries

# TODO: I would use a comprehension + helper function.
# i.e. {k: v for k, v in input.items if filterLogicFn(k, v)}
# The original dictionary will stay intact and in memory though.
- for key,value in input.items():
+ for key,value in flattened.items():

log.info("Checking authorization for %s", str(key))

# TODO - this needs to be handled way better
if key == "en" or key == "fr":
continue

# Pop unknown fields
if key.encode('utf-8') not in fields:
- input.pop(key, None)
+ flattened.pop(key, None)
log.warn("Popped unknown field: " + str(key))
continue

@@ -178,20 +183,98 @@ def filter_dict(self, input, fields, whitelist):

# If the current field's id does not appear in the whitelist, pop it from the input
if curr_field_id not in whitelist:
- input.pop(key, None)
+ flattened.pop(key, None)
log.info("Key rejected!")
continue

# If the value is a dict, recurse
if type(value) is dict:

# Overwrite value with filtered dict
- input[key] = self.filter_dict(value, fields, whitelist)
+ flattened[key] = self.filter_dict(value, fields, whitelist)

log.info("Key authorized!")

# UNFLATTEN filtered dictionary
unflattened = unflatten(flattened, splitter='path')

# STRINGIFY required json fields
encoded = self._encode(unflattened)

return encoded


def _decode(self, input):
"""
Decode dictionary containing string encoded JSON objects.
Parameters
----------
input: dict or stringified JSON
The dictionary to decode
Returns
-------
A dictionary where all fields that contained stringified JSON are now
expanded into dictionaries.
"""
if type(input) == str or type(input) == unicode:
root = MetaAuthorize._parse_json(input)
elif type(input) == dict:
root = input
else:
raise TypeError("_decode can only decode str or dict inputs! Got {}".format(str(type(input))))

if root != None:
for key,value in root.items():
# If the value is a string attempt to parse it as json
#log.info("Attempting to decode: %s - %s ", key, str(type(value)))
#TODO - this may need to change for python3
if type(value) == str or type(value) == unicode:
#log.info("%s is a str/unicode!", key)
parsed_json = MetaAuthorize._parse_json(value, key)

# If the string parsed
if parsed_json != None:
# into a dictionary
if type(parsed_json) == dict:
# decode the parsed dict
parsed_json = self._decode(parsed_json)
log.info('%s - parsed type %s', key, type(parsed_json))
# replace the value at the current key
root[key] = parsed_json
# into a list
elif type(parsed_json) == list:
# replace the value at the current key
root[key] = parsed_json


# Else if the value is a dictionary, recurse!
elif type(value) == dict:
root[key] = self._decode(value)

# log.info("Filtered input")
# log.info(input)

return root

def _encode(self, input):
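"""
Re-encode the fields listed in constants.STRINGIFIED_FIELDS back into JSON strings.
"""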

for key,value in input.items():

if key in constants.STRINGIFIED_FIELDS:
log.info("Stringifying %s", key)
input[key] = unicode(json.dumps(value),'utf-8')

return input


@staticmethod
def _parse_json(value, key=None):
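"""
Try to parse a string as JSON, returning None if it cannot be parsed.
"""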
try:
# TODO: Unicode stuff may need rework for python 3
return json.loads(value.encode('utf-8'))
except ValueError:
#log.info("Value could not be parsed as JSON. %s", key)
return None
except TypeError:
#log.warn("Value could not be parsed as JSON, %s", key)
return None
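
Read end to end, the new filter_dict pipeline is: decode any stringified JSON values, flatten the result to '/'-path keys, drop keys that are unknown or not whitelisted, unflatten, and re-stringify the STRINGIFIED_FIELDS entries. A condensed standalone approximation of that flow, which omits the extension's field-id lookup, logging, and Python 2 unicode handling and filters on key names directly:

# Condensed approximation of the decode->flatten->filter->unflatten->encode flow.
# Not the extension's code: the whitelist here is a plain set of path keys.
import json
from flatten_dict import flatten, unflatten

STRINGIFIED = {"spatial"}

def filter_package(pkg, allowed_paths):
    # decode stringified JSON fields into real dicts
    decoded = {k: (json.loads(v) if k in STRINGIFIED and isinstance(v, str) else v)
               for k, v in pkg.items()}
    # flatten nested keys into 'a/b/c' paths
    flat = flatten(decoded, reducer='path')
    # keep only whitelisted paths
    kept = {k: v for k, v in flat.items() if k in allowed_paths}
    # rebuild the nested structure
    nested = unflatten(kept, splitter='path')
    # re-stringify the fields CKAN expects as JSON strings
    for k in STRINGIFIED:
        if k in nested:
            nested[k] = json.dumps(nested[k])
    return nested

pkg = {
    "name": "example-dataset",
    "private": True,
    "spatial": '{"type": "Polygon", "coordinates": []}'
}
print(filter_package(pkg, {"name", "spatial/type"}))
# {'name': 'example-dataset', 'spatial': '{"type": "Polygon"}'}

The key property is that authorization decisions happen on flat path keys, so nested metadata such as organization/title_translated/en can be granted or withheld individually.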
