Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhc refactor doi pipeline #339

Draft
wants to merge 10 commits into
base: dev
Choose a base branch
from
21 changes: 7 additions & 14 deletions cmr/cmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,24 +221,17 @@ def cmr_parameter_transform(input_str, reverse=False):
query_parameter (str): cmr query parameter matching the input_str
"""

mapping = {
"instrument": "instrument",
"platform": "platform",
"campaign": "project",
}
mapping = {"instrument": "instrument", "platform": "platform", "campaign": "project"}

input_str = input_str.lower()

if reverse:
if input_str not in [cmr_param for table_name, cmr_param in mapping.items()]:
raise ValueError("cmr_param must be project, instrument, or platform")
result = {v: k for k, v in mapping.items()}[input_str]
else:
if input_str not in [table_name for table_name, cmr_param in mapping.items()]:
raise ValueError("table_name must be campaign, instrument, or platform")
result = mapping[input_str]

return result
mapping = {v: k for k, v in mapping.items()}

if input_str not in mapping.keys():
raise ValueError(f"Invalid input_str. Input must be one of {', '.join(mapping.keys())}")

return mapping[input_str]


def query_and_process_cmr(table_name, aliases):
Expand Down
25 changes: 8 additions & 17 deletions cmr/doi_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self):

def universal_get(self, table_name, uuid):
"""Queries the database for a uuid within a table name, but searches
the database propper as well as change objects, preferentially returning
the database proper as well as change objects, preferentially returning
results from the main db.

Args:
Expand All @@ -40,8 +40,7 @@ def universal_get(self, table_name, uuid):

# if the published object isn't found, search the drafts
except model.DoesNotExist:
model = apps.get_model("api_app", "change")
obj = model.objects.get(uuid=uuid)
obj = Change.objects.get(uuid=uuid)
data = json.loads(serializers.serialize("json", [obj,]))[0][
"fields"
]["update"]
Expand All @@ -67,13 +66,13 @@ def valid_object_list_generator(self, table_name, query_parameter=None, query_va
uuid_list (list): List of strings of uuids for the valid objects from a table
"""

# TODO: Why does this exclude published things??
valid_objects = Change.objects.filter(
content_type__model=table_name, action=Change.Actions.CREATE
).exclude(action=Change.Actions.DELETE, status=Change.Statuses.PUBLISHED)

if query_parameter:
query_parameter = "update__" + query_parameter
kwargs = {query_parameter: query_value}
kwargs = {f"update__{query_parameter}": query_value}
valid_objects = valid_objects.filter(**kwargs)

valid_object_uuids = [str(uuid) for uuid in valid_objects.values_list("uuid", flat=True)]
Expand Down Expand Up @@ -306,6 +305,7 @@ def add_to_db(self, doi):

return "Draft created for DOI"

# TODO: what if there was an edit draft made for a doi?
uuid = existing_doi_uuids[0]
existing_doi = self.universal_get("doi", uuid)
# if item exists as a draft, directly update using db functions with same methodology as above
Expand All @@ -322,18 +322,9 @@ def add_to_db(self, doi):

# if db item exists, replace cmr metadata fields and append suggestion fields as an update
existing_doi = DOI.objects.all().filter(uuid=uuid).first()
existing_campaigns = [str(c.uuid) for c in existing_doi.campaigns.all()]
existing_instruments = [str(c.uuid) for c in existing_doi.instruments.all()]
existing_platforms = [str(c.uuid) for c in existing_doi.platforms.all()]
existing_collection_periods = [str(c.uuid) for c in existing_doi.collection_periods.all()]

doi["campaigns"].extend(existing_campaigns)
doi["instruments"].extend(existing_instruments)
doi["platforms"].extend(existing_platforms)
doi["collection_periods"].extend(existing_collection_periods)

for field in ["campaigns", "instruments", "platforms", "collection_periods"]:
doi[field] = list(set(doi[field]))
for field_name in ["campaigns", "instruments", "platforms", "collection_periods"]:
doi[field_name].extend([str(field.uuid) for field in getattr(existing_doi, field_name).all()])
doi[field_name] = list(set(doi[field_name]))

doi_obj = Change(
content_type=ContentType.objects.get(model="doi"),
Expand Down
31 changes: 16 additions & 15 deletions cmr/process_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,22 @@ def extract_doi(concept):

def process_data_product(dp):
    """Build the metadata dict for a single CMR data product.

    Args:
        dp (dict): a single entry from the campaign metadata list. It is
            indexed with the "meta" and "umm" keys, so both must be present.

    Returns:
        dict: the keys concept_id, doi, cmr_projects, cmr_short_name,
            cmr_entry_title, cmr_dates, cmr_plats_and_insts and
            cmr_data_format.
    """
    meta = dp["meta"]
    umm = dp["umm"]

    # The [{}] default means a record without FileDistributionInformation
    # yields [""] rather than an empty list.
    file_distribution_info = umm.get("ArchiveAndDistributionInformation", {}).get(
        "FileDistributionInformation", [{}]
    )

    # NOTE(review): the DOI model field added alongside this change is named
    # `cmr_data_formats` (plural) while this key is singular -- confirm which
    # name the consumer of this dict expects.
    return {
        "concept_id": meta.get("concept-id"),
        "doi": extract_doi(dp),
        "cmr_projects": umm.get("Projects"),
        "cmr_short_name": umm.get("ShortName"),
        "cmr_entry_title": umm.get("EntryTitle"),
        "cmr_dates": umm.get("TemporalExtents", []),
        "cmr_plats_and_insts": umm.get("Platforms", []),
        "cmr_data_format": [info.get("Format", "") for info in file_distribution_info],
    }


def process_metadata_list(metadata_list):
    """Run process_data_product over every entry of a metadata list.

    Args:
        metadata_list (iterable): data product entries, each passed unchanged
            to process_data_product.

    Returns:
        list: one processed metadata dict per input entry, in input order.
    """
    return [process_data_product(dp) for dp in metadata_list]
22 changes: 22 additions & 0 deletions data_models/migrations/0045_doi_cmr_data_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 3.1.3 on 2022-05-04 18:24

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the `cmr_data_formats` array field to the `doi` model."""

    # Must be applied after the previous data_models migration.
    dependencies = [('data_models', '0044_auto_20220203_1500')]

    operations = [
        # New column: a Postgres array of strings (each up to 512 characters).
        # `default=list` gives every row its own empty list, and `size=None`
        # leaves the array length unbounded.
        migrations.AddField(
            model_name='doi',
            name='cmr_data_formats',
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(blank=True, default='', max_length=512),
                blank=True,
                default=list,
                size=None,
            ),
        )
    ]
42 changes: 12 additions & 30 deletions data_models/models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import uuid
import urllib.parse
import uuid

from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation
from django.contrib.contenttypes.models import ContentType
from django.contrib.gis.db import models as geomodels
from django.contrib.postgres.fields import ArrayField
from django.contrib.postgres.search import SearchQuery, SearchVector
from django.db import models

Expand Down Expand Up @@ -73,9 +74,7 @@ class LimitedInfo(BaseModel):
short_name = models.CharField(max_length=256, blank=False, unique=True)
long_name = models.CharField(max_length=512, default="", blank=True)
notes_internal = models.TextField(
default="",
blank=True,
help_text="Free text notes for ADMG staff - not visible to public.",
default="", blank=True, help_text="Free text notes for ADMG staff - not visible to public."
)
notes_public = models.TextField(
default="",
Expand All @@ -97,11 +96,7 @@ class Meta:

class PlatformType(LimitedInfoPriority):
parent = models.ForeignKey(
"PlatformType",
on_delete=models.CASCADE,
related_name="sub_types",
null=True,
blank=True,
"PlatformType", on_delete=models.CASCADE, related_name="sub_types", null=True, blank=True
)

gcmd_uuid = models.UUIDField(null=True, blank=True)
Expand All @@ -126,11 +121,7 @@ class Meta(LimitedInfo.Meta):

class MeasurementType(LimitedInfoPriority):
parent = models.ForeignKey(
"MeasurementType",
on_delete=models.CASCADE,
related_name="sub_types",
null=True,
blank=True,
"MeasurementType", on_delete=models.CASCADE, related_name="sub_types", null=True, blank=True
)
example = models.CharField(max_length=1024, blank=True, default="")

Expand Down Expand Up @@ -335,9 +326,7 @@ class Website(BaseModel):
title = models.TextField(default="", blank=True)
description = models.TextField(default="", blank=True)
notes_internal = models.TextField(
default="",
blank=True,
help_text="Free text notes for ADMG staff - not visible to public.",
default="", blank=True, help_text="Free text notes for ADMG staff - not visible to public."
)

def __str__(self):
Expand Down Expand Up @@ -561,12 +550,7 @@ def platforms(self):

@staticmethod
def search_fields():
return [
"short_name",
"long_name",
"description_short",
"focus_phenomena",
]
return ["short_name", "long_name", "description_short", "focus_phenomena"]

def get_absolute_url(self):
return urllib.parse.urljoin(FRONTEND_URL, f"/campaign/{self.uuid}/")
Expand Down Expand Up @@ -940,10 +924,7 @@ class CollectionPeriod(BaseModel):
)

platform_owner = models.CharField(
max_length=256,
default="",
blank=True,
help_text="Organization that owns the platform",
max_length=256, default="", blank=True, help_text="Organization that owns the platform"
)
platform_technical_contact = models.CharField(
max_length=256,
Expand All @@ -967,9 +948,7 @@ class CollectionPeriod(BaseModel):
)

notes_internal = models.TextField(
default="",
blank=True,
help_text="Free text notes for ADMG staff - not visible to public.",
default="", blank=True, help_text="Free text notes for ADMG staff - not visible to public."
)
notes_public = models.TextField(
default="",
Expand All @@ -996,6 +975,9 @@ class DOI(BaseModel):
cmr_projects = models.JSONField(default=None, blank=True, null=True)
cmr_dates = models.JSONField(default=None, blank=True, null=True)
cmr_plats_and_insts = models.JSONField(default=None, blank=True, null=True)
cmr_data_formats = ArrayField(
models.CharField(max_length=512, blank=True, default=""), blank=True, default=list
)

date_queried = models.DateTimeField()

Expand Down