From 30a0ca6ef072b666c9d7e436088eb0032b874636 Mon Sep 17 00:00:00 2001 From: Felipe Carlos Date: Tue, 12 Dec 2023 13:40:54 -0300 Subject: [PATCH 1/4] validation: adding initial validation of links from records --- geo_rdm_records/tasks.py | 37 ++++++++++++++++ geo_rdm_records/validation.py | 81 +++++++++++++++++++++++++++++++++++ setup.cfg | 1 + 3 files changed, 119 insertions(+) create mode 100644 geo_rdm_records/tasks.py create mode 100644 geo_rdm_records/validation.py diff --git a/geo_rdm_records/tasks.py b/geo_rdm_records/tasks.py new file mode 100644 index 0000000..fe97948 --- /dev/null +++ b/geo_rdm_records/tasks.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Tasks module.""" + +from celery import shared_task +from invenio_rdm_records.records.models import RDMRecordMetadata as GEORecordMetadata + +from geo_rdm_records import validation +from geo_rdm_records.modules.packages.records.models import GEOPackageRecordMetadata + + +@shared_task(ignore_result=True) +def check_records_links(): + """Check records links. + + ToDos: + - Implement chunk system with multi worker support (one worker for each chunk) + """ + # ToDo: Review the chunk size + chunk_size = 25 + + # Packages: validating links + # ToDo: Include report function + validation.check_links( + GEOPackageRecordMetadata, chunk_size, validation.check_chunk_of_package, None + ) + + # Resources: validating links + # ToDo: Include report function + validation.check_links( + GEORecordMetadata, chunk_size, validation.check_chunk_of_resources, None + ) diff --git a/geo_rdm_records/validation.py b/geo_rdm_records/validation.py new file mode 100644 index 0000000..f745e68 --- /dev/null +++ b/geo_rdm_records/validation.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022 GEO Secretariat. 
+# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Tasks module.""" + +from invenio_links_checker import checker_validate_links +from invenio_links_checker.contrib.chunking import checker_create_chunks + +from geo_rdm_records.customizations.records.api import GEORecord +from geo_rdm_records.modules.packages.records.api import GEOPackageRecord + + +def check_chunk_of_package(packages): + """Check links from chunks of Knowledge Packages.""" + results = [] + + for package in packages: + package = GEOPackageRecord.get_record(package.id) + + package_is_latest = package.versions.is_latest + package_is_published = package.is_published + + # Check only the latest versions of published packages + if package_is_latest and package_is_published: + # Extracting resources + package_resources = [r.resolve() for r in package.relationship.resources] + + # Validating links from resources + package_resources_links_status = checker_validate_links(package_resources) + + # Validating links from the package + package_links_status = checker_validate_links([package]) + + # Saving the result + results.append( + dict( + package=package_links_status[0], + resources=package_resources_links_status, + ) + ) + + return results + + +def check_chunk_of_resources(resources): + """Check links from chunks of resources.""" + valid_resources = [] + + for resource in resources: + resource = GEORecord.get_record(resource.id) + + resource_is_published = resource.is_published + resource_is_latest = resource.versions.is_latest + resource_is_managed = resource.parent.relationship.managed_by is None + + # Select only the latest versions of not-managed published resources + if resource_is_latest and resource_is_published and resource_is_managed: + valid_resources.append(resource) + + checker_validate_links(valid_resources) + + +def check_links(metadata_cls, chunk_size, chunk_validation_fnc, report_fnc): 
+ """Check links from any type of InvenioRDM records.""" + # Loading records + records = metadata_cls.query.all() + + # Creating chunks + records_chunks = checker_create_chunks(records, chunk_size) + + # Validating chunks + for record_chunk in records_chunks: + chunk_results = chunk_validation_fnc(record_chunk) + + # ToDo: Create report + # for chunk_result in chunk_results: + # report_fnc(chunk_result) diff --git a/setup.cfg b/setup.cfg index 7394282..a4e5a36 100644 --- a/setup.cfg +++ b/setup.cfg @@ -96,6 +96,7 @@ invenio_db.alembic = geo_rdm_records = geo_rdm_records:alembic invenio_celery.tasks = geo_rdm_records_packages = geo_rdm_records.modules.packages.services.tasks + geo_rdm_records_tasks = geo_rdm_records.tasks [build_sphinx] source-dir = docs/ From dce4c91db04fd705799e3e0149cbe456e5e3f73f Mon Sep 17 00:00:00 2001 From: Felipe Carlos Date: Fri, 12 Jan 2024 09:04:46 -0300 Subject: [PATCH 2/4] validation: improving validation workflow --- geo_rdm_records/config.py | 15 + geo_rdm_records/tasks.py | 37 - .../reports/records-report.html | 1272 +++++++++++++++++ geo_rdm_records/validation.py | 81 -- geo_rdm_records/validation/__init__.py | 8 + geo_rdm_records/validation/config.py | 35 + geo_rdm_records/validation/links.py | 123 ++ geo_rdm_records/validation/records.py | 115 ++ geo_rdm_records/validation/report.py | 61 + geo_rdm_records/validation/tasks.py | 22 + setup.cfg | 2 +- 11 files changed, 1652 insertions(+), 119 deletions(-) delete mode 100644 geo_rdm_records/tasks.py create mode 100644 geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html delete mode 100644 geo_rdm_records/validation.py create mode 100644 geo_rdm_records/validation/__init__.py create mode 100644 geo_rdm_records/validation/config.py create mode 100644 geo_rdm_records/validation/links.py create mode 100644 geo_rdm_records/validation/records.py create mode 100644 geo_rdm_records/validation/report.py create mode 100644 geo_rdm_records/validation/tasks.py diff 
--git a/geo_rdm_records/config.py b/geo_rdm_records/config.py index 3dcba24..57adde4 100644 --- a/geo_rdm_records/config.py +++ b/geo_rdm_records/config.py @@ -177,6 +177,21 @@ # E-mail configuration GEO_RDM_NOTIFICATION_DEFAULT_RECEIVER_EMAILS = [] +# +# Checker configuration +# +GEO_RDM_CHECKER_RETRY_CONFIG = {"retries": 5, "backoff_factor": 0.3} +"""Retry configurations (based on retry-requests library).""" + +GEO_RDM_CHECKER_REQUEST_CONFIG = {"timeout": 10} +"""Request configuration (based on requests (get method) library.""" + +GEO_RDM_CHECKER_CHUNK_SIZE = 10 +"""Chunk size used by the checker.""" + +GEO_RDM_CHECKER_SLEEP_TIME = 5 +"""Sleep time to be applied between chunks (in seconds).""" + # OAI-PMH # ======= # See https://github.com/inveniosoftware/invenio-oaiserver/blob/master/invenio_oaiserver/config.py diff --git a/geo_rdm_records/tasks.py b/geo_rdm_records/tasks.py deleted file mode 100644 index fe97948..0000000 --- a/geo_rdm_records/tasks.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. - -"""Tasks module.""" - -from celery import shared_task -from invenio_rdm_records.records.models import RDMRecordMetadata as GEORecordMetadata - -from geo_rdm_records import validation -from geo_rdm_records.modules.packages.records.models import GEOPackageRecordMetadata - - -@shared_task(ignore_result=True) -def check_records_links(): - """Check records links. 
- - ToDos: - - Implement chunk system with multi worker support (one worker for each chunk) - """ - # ToDo: Review the chunk size - chunk_size = 25 - - # Packages: validating links - # ToDo: Include report function - validation.check_links( - GEOPackageRecordMetadata, chunk_size, validation.check_chunk_of_package, None - ) - - # Resources: validating links - # ToDo: Include report function - validation.check_links( - GEORecordMetadata, chunk_size, validation.check_chunk_of_resources, None - ) diff --git a/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html b/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html new file mode 100644 index 0000000..3b9d035 --- /dev/null +++ b/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html @@ -0,0 +1,1272 @@ + + + + + + + + + + + + + + + +Thanks for using the GEO Knowledge Hub. + + + + + + + diff --git a/geo_rdm_records/validation.py b/geo_rdm_records/validation.py deleted file mode 100644 index f745e68..0000000 --- a/geo_rdm_records/validation.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. 
- -"""Tasks module.""" - -from invenio_links_checker import checker_validate_links -from invenio_links_checker.contrib.chunking import checker_create_chunks - -from geo_rdm_records.customizations.records.api import GEORecord -from geo_rdm_records.modules.packages.records.api import GEOPackageRecord - - -def check_chunk_of_package(packages): - """Check links from chunks of Knowledge Packages.""" - results = [] - - for package in packages: - package = GEOPackageRecord.get_record(package.id) - - package_is_latest = package.versions.is_latest - package_is_published = package.is_published - - # Check only the latest versions of published packages - if package_is_latest and package_is_published: - # Extracting resources - package_resources = [r.resolve() for r in package.relationship.resources] - - # Validating links from resources - package_resources_links_status = checker_validate_links(package_resources) - - # Validating links from the package - package_links_status = checker_validate_links([package]) - - # Saving the result - results.append( - dict( - package=package_links_status[0], - resources=package_resources_links_status, - ) - ) - - return results - - -def check_chunk_of_resources(resources): - """Check links from chunks of resources.""" - valid_resources = [] - - for resource in resources: - resource = GEORecord.get_record(resource.id) - - resource_is_published = resource.is_published - resource_is_latest = resource.versions.is_latest - resource_is_managed = resource.parent.relationship.managed_by is None - - # Select only the latest versions of not-managed published resources - if resource_is_latest and resource_is_published and resource_is_managed: - valid_resources.append(resource) - - checker_validate_links(valid_resources) - - -def check_links(metadata_cls, chunk_size, chunk_validation_fnc, report_fnc): - """Check links from any type of InvenioRDM records.""" - # Loading records - records = metadata_cls.query.all() - - # Creating chunks - records_chunks = 
checker_create_chunks(records, chunk_size) - - # Validating chunks - for record_chunk in records_chunks: - chunk_results = chunk_validation_fnc(record_chunk) - - # ToDo: Create report - # for chunk_result in chunk_results: - # report_fnc(chunk_result) diff --git a/geo_rdm_records/validation/__init__.py b/geo_rdm_records/validation/__init__.py new file mode 100644 index 0000000..aec145d --- /dev/null +++ b/geo_rdm_records/validation/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Validation module.""" diff --git a/geo_rdm_records/validation/config.py b/geo_rdm_records/validation/config.py new file mode 100644 index 0000000..7da5c83 --- /dev/null +++ b/geo_rdm_records/validation/config.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Validation Helper module.""" + +from flask import current_app + + +def get_retry_config(confvar: str = "GEO_RDM_CHECKER_RETRY_CONFIG"): + """Helper to get configurations of retry.""" + return current_app.config[confvar] + + +def get_requests_config(confvar: str = "GEO_RDM_CHECKER_REQUEST_CONFIG"): + """Helper to get configurations of the ``requests.get`` method.""" + return current_app.config[confvar] + + +def get_chunks_config(confvar: str = "GEO_RDM_CHECKER_CHUNK_SIZE"): + """Helper to get configurations of the checker chunking system.""" + return current_app.config[confvar] + + +def get_sleep_config(confvar: str = "GEO_RDM_CHECKER_SLEEP_TIME"): + """Helper to get configurations of the sleep time used between the chunks processing.""" + return current_app.config[confvar] + + +def create_checker_config(): + """Create configuration object for the Link Checker.""" + return dict(requests_config=get_requests_config(), retry_config=get_retry_config()) diff --git a/geo_rdm_records/validation/links.py b/geo_rdm_records/validation/links.py new file mode 100644 index 0000000..51c915f --- /dev/null +++ b/geo_rdm_records/validation/links.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Tasks module.""" + +from time import sleep + +from invenio_links_checker import checker_validate_links +from invenio_links_checker.contrib.chunking import checker_create_chunks_by_owners +from invenio_rdm_records.records.models import RDMRecordMetadata as GEORecordMetadata + +from geo_rdm_records.customizations.records.api import GEORecord +from geo_rdm_records.modules.packages.records.api import GEOPackageRecord +from geo_rdm_records.modules.packages.records.models import GEOPackageRecordMetadata +from geo_rdm_records.validation import config, report +from geo_rdm_records.validation.records import enrich_status_objects + + +def _check_chunk_packages(packages): + """Check links from chunks of Knowledge Packages.""" + results = [] + + # reading checker configuration + checker_configuration = config.create_checker_config() + + for package in packages: + package = GEOPackageRecord.get_record(package.id) + + package_is_latest = package.versions.is_latest + package_is_published = package.is_published + + # Check only the latest versions of published packages + if package_is_latest and package_is_published: + # Extracting resources + package_resources = [r.resolve() for r in package.relationship.resources] + + # Validating links from resources + package_resources_links_status = checker_validate_links(package_resources) + + # Validating links from the package + package_links_status = checker_validate_links( + [package], **checker_configuration + ) + + # Saving the result + results.append( + dict( + package=package_links_status[0], + resources=package_resources_links_status, + ) + ) + + return results + + +def _check_chunk_resources(resources): + """Check links from chunks of resources.""" + valid_resources = [] + + # reading checker configuration + checker_configuration = config.create_checker_config() + + for resource in resources: + resource = GEORecord.get_record(resource.id) + + resource_is_published = resource.is_published + resource_is_latest = 
resource.versions.is_latest + resource_is_managed = resource.parent.relationship.managed_by is None + + # select only the latest versions of not-managed published resources + if resource_is_latest and resource_is_published and resource_is_managed: + valid_resources.append(resource) + + return checker_validate_links(valid_resources, **checker_configuration) + + +def _check_chunk(record_chunk): + """Check links of the records in the chunk. + + Note: + In the GEO Knowledge Hub case, it is required to group the records by type and then validate them. + """ + # filtering the records by type. + packages = list(filter(lambda x: x.parent.json["type"] == "package", record_chunk)) + resources = list(filter(lambda x: x.parent.json["type"] != "package", record_chunk)) + + # validating the links. + return [*_check_chunk_packages(packages), *_check_chunk_resources(resources)] + + +def check_links(chunk_size): + """Check links from any type of InvenioRDM records.""" + # reading configurations + sleep_time = config.get_sleep_config() + + # loading records + resources = GEORecordMetadata.query.all() + records = GEOPackageRecordMetadata.query.all() + + records.extend(resources) + + # creating chunks + chunks = checker_create_chunks_by_owners(records, chunk_size) + + # validating chunks + for chunk in chunks: + chunk_owner = chunk["owner"] + chunk_records = chunk["records"] + + chunk_results = _check_chunk(chunk_records) + chunk_results = enrich_status_objects(chunk_results) + + report.send_report(chunk_results, chunk_owner) + + # note: In the GEO Knowledge Hub case, it is possible to stop between + # chunks as we don't have a lot of resources. In other cases, another approach + # should be considered. 
+ if sleep_time: + sleep(sleep_time) diff --git a/geo_rdm_records/validation/records.py b/geo_rdm_records/validation/records.py new file mode 100644 index 0000000..f88a4bd --- /dev/null +++ b/geo_rdm_records/validation/records.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Record manipulation module.""" + +from geo_rdm_records.customizations.records.api import GEORecord +from geo_rdm_records.modules.packages.records.api import GEOPackageRecord + + +def _calculate_links(record): + """Calculate the number of links.""" + return len(list(record["links_status"])) + + +def _calculate_links_with_errors(record): + """Calculate the number of links with errors.""" + return len(list(filter(lambda x: not x["is_available"], record["links_status"]))) + + +def _summarize_records_total(packages, resources): + """Summarize total number of records (packages and resources).""" + nrecords = len(resources) + + for package in packages: + # 1 package + n resources + nrecords += 1 + len(package["resources"]) + + return nrecords + + +def _summarize(records, key): + """Summarize total number of links errors (from packages and resources).""" + return sum(map(lambda x: x[key], records)) + + +def _enrich_resource(resource): + """Load metadata from a resource object.""" + resource_obj = GEORecord.pid.resolve(resource["id"]) + + nlinks = _calculate_links(resource) + nerrors = _calculate_links_with_errors(resource) + + return { + **resource, + "metadata": resource_obj["metadata"], + "owners": resource_obj.parent["access"]["owned_by"], + "errors": nerrors, + "links": nlinks, + } + + +def _enrich_package(package): + """Load metadata from a package object.""" + # processing the package + package_obj = GEOPackageRecord.pid.resolve(package["package"]["id"]) + + nlinks = 
_calculate_links(package["package"]) + nerrors = _calculate_links_with_errors(package["package"]) + + package_result = { + **package["package"], + "metadata": package_obj["metadata"], + "owners": package_obj.parent["access"]["owned_by"], + "errors": nerrors, + "links": nlinks, + } + + # processing resources + resources_result = [] + package_resources = package["resources"] + + for resource in package_resources: + resource = _enrich_resource(resource) + resources_result.append(resource) + + nlinks += resource["links"] + nerrors += resource["errors"] + + return dict( + package=package_result, resources=resources_result, errors=nerrors, links=nlinks + ) + + +def enrich_status_objects(records): + """Inject extra metadata in the links status object.""" + packages = [] + resources = [] + + for record in records: + record_is_package = True if "package" in record else False + + if record_is_package: + packages.append(_enrich_package(record)) + else: + resources.append(_enrich_resource(record)) + + # summarizing some metrics + number_of_records = _summarize_records_total(packages, resources) + number_of_errors = _summarize(packages, "errors") + _summarize(resources, "errors") + + packages_links = _summarize(packages, "links") + resources_links = _summarize(resources, "links") + + return dict( + packages=packages, + resources=resources, + total_errors=number_of_errors, + total_records=number_of_records, + total_packages_links=packages_links, + total_resources_links=resources_links, + ) diff --git a/geo_rdm_records/validation/report.py b/geo_rdm_records/validation/report.py new file mode 100644 index 0000000..950397b --- /dev/null +++ b/geo_rdm_records/validation/report.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Validations report module.""" + +from flask import current_app +from invenio_access.permissions import system_identity +from invenio_mail.api import TemplatedMessage +from invenio_users_resources.proxies import current_users_service + + +def _check_owner_can_receive_report(owner_profile): + """Check if an owner can receive a report.""" + # Checking if user can receive emails. To receive an email, user + # must have the following properties: + # 1. Must be `Active`; + # 2. Must have `Email` confirmed. + is_active = owner_profile["active"] + is_email_confirmed = owner_profile["confirmed"] + + return is_active and is_email_confirmed + + +def _build_report_message( + records, records_owner_profile, report_title, report_template +): + """Build the report message.""" + # formatting the user e-mail. + records_owner_email = [records_owner_profile["email"]] + + return TemplatedMessage( + subject=report_title, + template_html=report_template, + recipients=records_owner_email, + ctx={**records}, + ) + + +def send_report(records, records_owner): + """Send a report to Knowledge Provider.""" + report_base_title = "GEO Knowledge Hub - Links status from your records" + report_base_template = "geo_rdm_records/reports/records-report.html" + + # reading owner profile + owner_profile = current_users_service.read(system_identity, records_owner).to_dict() + + # checking if the owner can receive a report + can_receive_report = _check_owner_can_receive_report(owner_profile) + + if can_receive_report: + # building the message + report_message = _build_report_message( + records, owner_profile, report_base_title, report_base_template + ) + + # sending the message + current_app.extensions["mail"].send(report_message) diff --git a/geo_rdm_records/validation/tasks.py b/geo_rdm_records/validation/tasks.py new file mode 100644 index 0000000..2334dac --- /dev/null +++ b/geo_rdm_records/validation/tasks.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO 
Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Validation tasks module.""" + +from celery import shared_task + +from geo_rdm_records.validation.config import get_chunks_config +from geo_rdm_records.validation.links import check_links + + +@shared_task(ignore_result=True) +def check_records_links(): + """Check records links.""" + chunk_size = get_chunks_config() + + # validating links of packages and resources. + check_links(chunk_size) diff --git a/setup.cfg b/setup.cfg index a4e5a36..7d730af 100644 --- a/setup.cfg +++ b/setup.cfg @@ -96,7 +96,7 @@ invenio_db.alembic = geo_rdm_records = geo_rdm_records:alembic invenio_celery.tasks = geo_rdm_records_packages = geo_rdm_records.modules.packages.services.tasks - geo_rdm_records_tasks = geo_rdm_records.tasks + geo_rdm_records_validation = geo_rdm_records.validation.tasks [build_sphinx] source-dir = docs/ From 23474db62088cf45c1813fe493c669d7a6ddf1e2 Mon Sep 17 00:00:00 2001 From: Felipe Carlos Date: Fri, 12 Jan 2024 09:07:08 -0300 Subject: [PATCH 3/4] package: updating package version --- geo_rdm_records/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geo_rdm_records/__init__.py b/geo_rdm_records/__init__.py index b667901..c5a1ae5 100644 --- a/geo_rdm_records/__init__.py +++ b/geo_rdm_records/__init__.py @@ -9,5 +9,5 @@ from .ext import GEORDMRecords -__version__ = "0.7.0" +__version__ = "0.8.0" __all__ = ("__version__", "GEORDMRecords") From 6f1122e8b1a27529da44bdaf465ae249ee401664 Mon Sep 17 00:00:00 2001 From: Felipe Carlos Date: Thu, 18 Jan 2024 07:50:07 -0300 Subject: [PATCH 4/4] checker: reviewing checker module --- geo_rdm_records/config.py | 9 +- geo_rdm_records/modules/checker/__init__.py | 8 + .../modules/checker/links/__init__.py | 8 + .../checker/links/checker}/__init__.py | 2 +- .../modules/checker/links/checker/check.py | 53 ++ 
.../modules/checker/links/checker/metadata.py | 46 ++ .../modules/checker/links/checker/network.py | 67 ++ .../modules/checker/links/service/__init__.py | 8 + .../modules/checker/links/service/config.py | 44 + .../modules/checker/links/service/records.py | 299 +++++++ .../checker/links/service}/report.py | 49 +- .../modules/checker/links/validation.py | 124 +++ geo_rdm_records/modules/checker/schema.py | 33 + geo_rdm_records/modules/checker/tasks.py | 24 + .../reports/records-report.html | 754 ++++++++---------- geo_rdm_records/validation/config.py | 35 - geo_rdm_records/validation/links.py | 123 --- geo_rdm_records/validation/records.py | 115 --- geo_rdm_records/validation/tasks.py | 22 - setup.cfg | 6 +- 20 files changed, 1108 insertions(+), 721 deletions(-) create mode 100644 geo_rdm_records/modules/checker/__init__.py create mode 100644 geo_rdm_records/modules/checker/links/__init__.py rename geo_rdm_records/{validation => modules/checker/links/checker}/__init__.py (89%) create mode 100644 geo_rdm_records/modules/checker/links/checker/check.py create mode 100644 geo_rdm_records/modules/checker/links/checker/metadata.py create mode 100644 geo_rdm_records/modules/checker/links/checker/network.py create mode 100644 geo_rdm_records/modules/checker/links/service/__init__.py create mode 100644 geo_rdm_records/modules/checker/links/service/config.py create mode 100644 geo_rdm_records/modules/checker/links/service/records.py rename geo_rdm_records/{validation => modules/checker/links/service}/report.py (55%) create mode 100644 geo_rdm_records/modules/checker/links/validation.py create mode 100644 geo_rdm_records/modules/checker/schema.py create mode 100644 geo_rdm_records/modules/checker/tasks.py delete mode 100644 geo_rdm_records/validation/config.py delete mode 100644 geo_rdm_records/validation/links.py delete mode 100644 geo_rdm_records/validation/records.py delete mode 100644 geo_rdm_records/validation/tasks.py diff --git a/geo_rdm_records/config.py 
b/geo_rdm_records/config.py index 57adde4..af53518 100644 --- a/geo_rdm_records/config.py +++ b/geo_rdm_records/config.py @@ -7,6 +7,7 @@ """GEO RDM Records module configurations.""" +from flask_babelex import lazy_gettext as _ from invenio_rdm_records.services import facets as rdm_facets from geo_rdm_records.base.services import facets as geo_facets @@ -186,11 +187,11 @@ GEO_RDM_CHECKER_REQUEST_CONFIG = {"timeout": 10} """Request configuration (based on requests (get method) library.""" -GEO_RDM_CHECKER_CHUNK_SIZE = 10 -"""Chunk size used by the checker.""" +GEO_RDM_CHECKER_REPORT_TITLE = _("GEO Knowledge Hub - Links status from your records") +"""Report title.""" -GEO_RDM_CHECKER_SLEEP_TIME = 5 -"""Sleep time to be applied between chunks (in seconds).""" +GEO_RDM_CHECKER_REPORT_TEMPLATE = "geo_rdm_records/reports/records-report.html" +"""Report jinja2 template.""" # OAI-PMH # ======= diff --git a/geo_rdm_records/modules/checker/__init__.py b/geo_rdm_records/modules/checker/__init__.py new file mode 100644 index 0000000..9ae5b88 --- /dev/null +++ b/geo_rdm_records/modules/checker/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Records checker module.""" diff --git a/geo_rdm_records/modules/checker/links/__init__.py b/geo_rdm_records/modules/checker/links/__init__.py new file mode 100644 index 0000000..ff2b690 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Links checker module.""" diff --git a/geo_rdm_records/validation/__init__.py b/geo_rdm_records/modules/checker/links/checker/__init__.py similarity index 89% rename from geo_rdm_records/validation/__init__.py rename to geo_rdm_records/modules/checker/links/checker/__init__.py index aec145d..7e1fecd 100644 --- a/geo_rdm_records/validation/__init__.py +++ b/geo_rdm_records/modules/checker/links/checker/__init__.py @@ -5,4 +5,4 @@ # geo-rdm-records is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. -"""Validation module.""" +"""Checker module.""" diff --git a/geo_rdm_records/modules/checker/links/checker/check.py b/geo_rdm_records/modules/checker/links/checker/check.py new file mode 100644 index 0000000..fcdaa28 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/checker/check.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Checker module.""" + +from .metadata import extract_links_from_record +from .network import is_link_available + + +def _check_links(record_links, **kwargs): + """Check links from a given record. + + Args: + record_links (list): List of links from a given record. + + **kwargs: Extra configurations for the `is_link_available` function. + + Returns: + list: List with the links' status. + """ + return [ + dict(link=link, is_available=is_link_available(link, **kwargs)) + for link in record_links + ] + + +def checker_validate_links(records, **kwargs): + """Check links from records. + + Args: + records (list): List of ``invenio_records.api.Record`` objects. + + **kwargs: Extra configurations for the `is_link_available` function. + + Returns: + list: List with the links' status. 
+ """ + result = [] + + for record in records: + record_id = record.pid.pid_value + + # extracting the links and checking its status + record_links = extract_links_from_record(record) + record_links_status = _check_links(record_links, **kwargs) + + result.append(dict(id=record_id, links_status=record_links_status)) + + return result diff --git a/geo_rdm_records/modules/checker/links/checker/metadata.py b/geo_rdm_records/modules/checker/links/checker/metadata.py new file mode 100644 index 0000000..da09ef1 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/checker/metadata.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Checker metadata management module.""" + +import re + +from pydash import py_ + + +def _extract_links(record_document): + """Extract all links from a string. + + Args: + record_document (str): Record document as a string. + + Returns: + list: List containing all links found in the record document. + """ + # Regex pattern for extracting URLs + url_pattern = r'https?://[^\s<>"\',]+|www\.[^\s<>"\',]+' + + # Find all non-overlapping matches of the pattern in the string + return re.findall(url_pattern, record_document) + + +def extract_links_from_record(record): + """Extract all links available in a record. + + Args: + record (invenio_records.api.Record): Record object + + Returns: + list: List containing all links found in the record document. 
+ """ + record_metadata_as_string = str(record.dumps()) + + # Extracting links + record_links = _extract_links(record_metadata_as_string) + + # Removing duplicates + return py_.uniq(record_links) diff --git a/geo_rdm_records/modules/checker/links/checker/network.py b/geo_rdm_records/modules/checker/links/checker/network.py new file mode 100644 index 0000000..dd24117 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/checker/network.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Checker Network management module.""" + +from datetime import timedelta + +import requests +from requests_cache import CachedSession +from retry_requests import retry + + +def is_link_available( + url: str, + requests_config=None, + retry_config=None, + cache_config=None, +): + """Check if a link is available. + + Args: + url (str): URL to be checked. + + requests_config (dict): ``requests.get`` configurations + + retry_config (dict): ``retry_requests.retry`` configurations + + cache_config (dict): ``requests_cache.CachedSession`` configurations. + + Note: + By default, the following cases are used to define a link as unavailable: + - Case 1: Delay to answer longer than 5 seconds; + - Case 2: No access to the server or a dropped connection; + - Case 3: An HTTP Answer of 500, 502, or 504. + + To change the value of ``Case 1``, you can use the Request config (``requests_config``). Also, + to change the value of ``Case 2`` and ``Case 3``, you can use the Retry config (``retry_config``). 
+ """ + is_available = True + + cache_config = {} if cache_config is None else cache_config + retry_config = {} if retry_config is None else retry_config + requests_config = {} if requests_config is None else requests_config + + # building the session object + session = CachedSession( + "geo_rdm_records_links_checker", + cache_control=False, + expire_after=timedelta(days=30), + allowable_codes=[200, 400], + **cache_config + ) + + session = retry(session, **retry_config) + + try: + session.get(url, **requests_config) + # nothing to do + except requests.RequestException as e: + # If there is any request-related error, the link is not available + is_available = False + + return is_available diff --git a/geo_rdm_records/modules/checker/links/service/__init__.py b/geo_rdm_records/modules/checker/links/service/__init__.py new file mode 100644 index 0000000..ad234be --- /dev/null +++ b/geo_rdm_records/modules/checker/links/service/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Checker service module.""" diff --git a/geo_rdm_records/modules/checker/links/service/config.py b/geo_rdm_records/modules/checker/links/service/config.py new file mode 100644 index 0000000..5ef5031 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/service/config.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Configuration utility module.""" + +from flask import current_app + + +def _get_retry_config(confvar: str = "GEO_RDM_CHECKER_RETRY_CONFIG"): + """Get configurations of retry.""" + return current_app.config[confvar] + + +def _get_requests_config(confvar: str = "GEO_RDM_CHECKER_REQUEST_CONFIG"): + """Get configurations of the ``requests.get`` method.""" + return current_app.config[confvar] + + +def _get_report_title(confvar: str = "GEO_RDM_CHECKER_REPORT_TITLE"): + """Get configurations of the report title.""" + return current_app.config[confvar] + + +def _get_report_template(confvar: str = "GEO_RDM_CHECKER_REPORT_TEMPLATE"): + """Get configurations of the report template.""" + return current_app.config[confvar] + + +def get_checker_config(): + """Get configuration object for the Link Checker.""" + return dict( + requests_config=_get_requests_config(), retry_config=_get_retry_config() + ) + + +def get_report_config(): + """Get configuration object for the Report.""" + return dict( + report_title=_get_report_title(), report_template=_get_report_template() + ) diff --git a/geo_rdm_records/modules/checker/links/service/records.py b/geo_rdm_records/modules/checker/links/service/records.py new file mode 100644 index 0000000..8c948e6 --- /dev/null +++ b/geo_rdm_records/modules/checker/links/service/records.py @@ -0,0 +1,299 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Records manipulation utility module.""" + +from invenio_access.permissions import system_identity +from invenio_rdm_records.proxies import current_rdm_records_service + +from geo_rdm_records.base.records.types import GEORecordTypes +from geo_rdm_records.customizations.records.api import GEORecord +from geo_rdm_records.modules.checker.schema import EmailRecordJSONSerializer +from geo_rdm_records.modules.packages.records.api import GEOPackageRecord +from geo_rdm_records.proxies import current_geo_packages_service + + +# +# Utilities +# +def _calculate_links(record): + """Calculate the number of links. + + Args: + record (dict): Dict containing links associated with a record. + Returns: + int: Number of links available in the given record. + """ + return len(list(record["links_status"])) + + +def _calculate_links_with_errors(record): + """Calculate the number of links with errors. + + Args: + record (dict): Dict containing links associated with a record. + + Returns: + int: Number of links with an error. + """ + return len(list(filter(lambda x: not x["is_available"], record["links_status"]))) + + +def _summarize_records_total(packages, resources): + """Summarize total number of records (packages and resources). + + Args: + packages (list): List of packages + + resources (list): List of individual resources (not associated with packages). + Returns: + int: Total number of records. + """ + nrecords = len(resources) + + for package in packages: + # 1 package + n resources + nrecords += 1 + len(package["resources"]) + + return nrecords + + +def _summarize(records, key): + """Summarize total number of objects in a given key. + + Args: + records (list): List of dict + + key (str): Key to be checked from each record in ``records`` + Returns: + int: Number of objects available in the given key. + """ + return sum(map(lambda x: x[key], records)) + + +def _serialize_record(record): + """Serialize record. 
+ + Args: + record (dict): Record (Package or resource) with metadata to be serialized. + + Returns: + dict: Record with serialized fields. + """ + return EmailRecordJSONSerializer().dump_obj(record) + + +def _enrich_resource(resource): + """Enrich a link status object from a resource. + + Args: + resource (dict): Record metadata to be enriched. + Returns: + dict: Record enriched. + """ + resource_obj = _serialize_record(resource) + + nlinks = _calculate_links(resource) + nerrors = _calculate_links_with_errors(resource) + + return { + **resource, + "metadata": resource_obj["metadata"], + "ui": resource_obj["ui"], + "owners": resource_obj["parent"]["access"]["owned_by"], + "nerrors": nerrors, + "nlinks": nlinks, + } + + +def _enrich_package(package): + """Enrich a link status object from a package. + + Args: + package (dict): Record metadata to be enriched. + Returns: + dict: Record enriched. + """ + # processing the package + package_obj = _serialize_record(package["package"]) + + nlinks = _calculate_links(package["package"]) + nerrors = _calculate_links_with_errors(package["package"]) + + package_result = { + **package["package"], + "metadata": package_obj["metadata"], + "ui": package_obj["ui"], + "owners": package_obj["parent"]["access"]["owned_by"], + "errors": nerrors, + "nlinks": nlinks, + } + + # processing resources + resources_result = [] + package_resources = package["resources"] + + for resource in package_resources: + resource = _enrich_resource(resource) + resources_result.append(resource) + + nlinks += resource["nlinks"] + nerrors += resource["nerrors"] + + return dict( + package=package_result, + resources=resources_result, + nerrors=nerrors, + nlinks=nlinks, + ) + + +def _read_record_metadata(rid_, cache_, type_): + """Read metadata of a record. + + Args: + rid_ (str): Record ID. + + cache_ (list): List of dicts containing already loaded metadata. + + type_ (str): Type of record. + + Returns: + dict: Record metadata. 
+ """ + result_data = list(filter(lambda x: x["id"] == rid_, cache_)) + + if not result_data: + if type_ == GEORecordTypes.package: + result_data = current_geo_packages_service.read( + identity=system_identity, id_=rid_ + ).to_dict() + + else: + result_data = current_rdm_records_service.read( + identity=system_identity, id_=rid_ + ).to_dict() + + else: + result_data = result_data[0] + + return result_data + + +def _merge_metadata(records, cache): + """Merge metadata inside record link status objects. + + Args: + records (list): List containing links status objects. + + cache (dict): Dict with already loaded metadata from packages and resources. + + Yields: + dict: Record metadata object. + """ + for record in records: + if "package" in record: + # preparing package metadata + package_metadata = _read_record_metadata( + record["package"]["id"], cache["packages"], GEORecordTypes.package + ) + package_metadata = {**record["package"], **package_metadata} + + # preparing resources metadata + resources_metadata = [] + + for resource in record["resources"]: + resource_metadata = _read_record_metadata( + resource["id"], cache["resources"], GEORecordTypes.resource + ) + + resources_metadata.append({**resource, **resource_metadata}) + + yield dict(package=package_metadata, resources=resources_metadata) + + else: + individual_resource_metadata = _read_record_metadata( + record["id"], cache["resources"], GEORecordTypes.resource + ) + + yield {**record, **individual_resource_metadata} + + +# +# Records high-level functions. +# +def enrich_status_objects(records_status_object, metadata_cache): + """Inject extra metadata in the links status object. + + Args: + records_status_object (list): List of record status link object. + + metadata_cache (dict): Already loaded metadata. + + Returns: + list: Records with extra metadata. 
+ """ + packages = [] + resources = [] + + # first, we merge status objects with the metadata already available + records_status_object = _merge_metadata(records_status_object, metadata_cache) + + for record_status_object in records_status_object: + record_is_package = True if "package" in record_status_object else False + + if record_is_package: + packages.append(_enrich_package(record_status_object)) + else: + resources.append(_enrich_resource(record_status_object)) + + # summarizing some metrics + number_of_records = _summarize_records_total(packages, resources) + number_of_errors = _summarize(packages, "nerrors") + _summarize( + resources, "nerrors" + ) + + packages_links = _summarize(packages, "nlinks") + resources_links = _summarize(resources, "nlinks") + + return dict( + packages=packages, + resources=resources, + total_errors=number_of_errors, + total_records=number_of_records, + total_packages_links=packages_links, + total_resources_links=resources_links, + ) + + +def get_records_by_owner(owner_id): + """Get all records associated with an owner. + + Args: + owner_id (int): Owner ID + + Returns: + tuple: Tuple containing a list of records and its metadata. 
+ """ + search_params = {"q": f"parent.access.owned_by.user: {owner_id}"} + + # searching all records (packages and resources) and its metadata + packages_metadata = list( + current_geo_packages_service.search(system_identity, params=search_params).hits + ) + + resources_metadata = list( + current_rdm_records_service.search(system_identity, params=search_params).hits + ) + + # reading reference from database + records_obj = [ + *list(map(lambda x: GEOPackageRecord.pid.resolve(x["id"]), packages_metadata)), + *list(map(lambda x: GEORecord.pid.resolve(x["id"]), resources_metadata)), + ] + + return records_obj, dict(packages=packages_metadata, resources=resources_metadata) diff --git a/geo_rdm_records/validation/report.py b/geo_rdm_records/modules/checker/links/service/report.py similarity index 55% rename from geo_rdm_records/validation/report.py rename to geo_rdm_records/modules/checker/links/service/report.py index 950397b..240d620 100644 --- a/geo_rdm_records/validation/report.py +++ b/geo_rdm_records/modules/checker/links/service/report.py @@ -5,7 +5,9 @@ # geo-rdm-records is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. -"""Validations report module.""" +"""Report utility module.""" + +from datetime import datetime from flask import current_app from invenio_access.permissions import system_identity @@ -14,7 +16,14 @@ def _check_owner_can_receive_report(owner_profile): - """Check if an owner can receive a report.""" + """Check if an owner can receive a report. + + Args: + owner_profile (dict): Dict containing owner profile. + + Returns: + bool: Flag indicating if a given owner can receive e-mails. + """ # Checking if user can receive emails. To receive an email, user # must have the following properties: # 1. 
Must be `Active`; @@ -28,23 +37,45 @@ def _check_owner_can_receive_report(owner_profile): def _build_report_message( records, records_owner_profile, report_title, report_template ): - """Build the report message.""" + """Build a report message. + + Args: + records (list): List containing status of the links from the + records (Knowledge Packages and Knowledge Resources). + + records_owner_profile (dict): Dict with the profile of the records' owner. + + report_title (str): Report's e-mails title. + + report_template (str): Report's e-mails template. + + Returns: + invenio_mail.api.TemplatedMessage: Email message. + """ # formatting the user e-mail. records_owner_email = [records_owner_profile["email"]] + report_date = datetime.now().strftime("%B %d, %Y") + return TemplatedMessage( subject=report_title, template_html=report_template, recipients=records_owner_email, - ctx={**records}, + ctx={**records, "report_date": report_date}, ) -def send_report(records, records_owner): - """Send a report to Knowledge Provider.""" - report_base_title = "GEO Knowledge Hub - Links status from your records" - report_base_template = "geo_rdm_records/reports/records-report.html" +def send_report(records, records_owner, report_configuration): + """Send a report to Knowledge Provider. + + Args: + records (list): List containing status of the links from the + records (Knowledge Packages and Knowledge Resources). + + records_owner (int): Record owner's ID. 
+ report_configuration (dict): Report configuration + """ # reading owner profile owner_profile = current_users_service.read(system_identity, records_owner).to_dict() @@ -54,7 +85,7 @@ def send_report(records, records_owner): if can_receive_report: # building the message report_message = _build_report_message( - records, owner_profile, report_base_title, report_base_template + records, owner_profile, **report_configuration ) # sending the message diff --git a/geo_rdm_records/modules/checker/links/validation.py b/geo_rdm_records/modules/checker/links/validation.py new file mode 100644 index 0000000..2fa92bf --- /dev/null +++ b/geo_rdm_records/modules/checker/links/validation.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Validation links module.""" + +from invenio_access.models import User + +from geo_rdm_records.modules.checker.links.checker import check +from geo_rdm_records.modules.checker.links.service import records, report + + +def _validate_packages_links(packages, checker_configuration): + """Validate links from Knowledge Packages. + + Args: + packages (list): List of ``GEOPackageRecord``. + + checker_configuration (dict): Extra configurations for the link checker. + + Returns: + list: List containing status of the links from the packages. 
+ """ + results = [] + + for package in packages: + package_is_latest = package.versions.is_latest + package_is_published = package.is_published + + # Check only the latest versions of published packages + if package_is_latest and package_is_published: + # Extracting resources + package_resources = [r.resolve() for r in package.relationship.resources] + + # Validating links from resources + package_resources_links_status = check.checker_validate_links( + package_resources + ) + + # Validating links from the package + package_links_status = check.checker_validate_links( + [package], **checker_configuration + ) + + # Saving the result + results.append( + dict( + package=package_links_status[0], + resources=package_resources_links_status, + ) + ) + + return results + + +def _validate_resources_links(resources, checker_configuration): + """Validate links from Knowledge Resources. + + Args: + resources (list): List of ``GEORecord`` + + checker_configuration (dict): Extra configurations for the link checker. + + Returns: + list: List containing status of the links from the resources. + """ + valid_resources = [] + + for resource in resources: + resource_is_published = resource.is_published + resource_is_latest = resource.versions.is_latest + resource_is_managed = resource.parent.relationship.managed_by is None + + # select only the latest versions of not-managed published resources + if resource_is_latest and resource_is_published and resource_is_managed: + valid_resources.append(resource) + + return check.checker_validate_links(valid_resources, **checker_configuration) + + +def _validate_records_links(records, checker_configuration): + """Check links from records (Knowledge Packages and Knowledge Resources). + + Args: + records (list): List of ``GEOPackageRecord`` and ``GEORecord``. + + checker_configuration (dict): Extra configurations for the link checker. + + Returns: + list: List containing status of the links from the records. 
+ """ + # filtering the records by type. + packages = list(filter(lambda x: x.parent["type"] == "package", records)) + resources = list(filter(lambda x: x.parent["type"] != "package", records)) + + # validating the links. + return [ + *_validate_packages_links(packages, checker_configuration), + *_validate_resources_links(resources, checker_configuration), + ] + + +def validate_records_links(checker_configuration, report_configuration): + """Validate links from GEO Knowledge Hub records (Knowledge Packages and Knowledge Resources).""" + for user in User.query.yield_per(1000): + records_owner_id = user.id + + # reading records associated with the current author (packages and resources) + records_obj, records_metadata = records.get_records_by_owner(records_owner_id) + + if not len(records_obj): + continue + + # checking links + validation_results = _validate_records_links(records_obj, checker_configuration) + validation_results = records.enrich_status_objects( + validation_results, metadata_cache=records_metadata + ) + + # reporting results + report.send_report(validation_results, records_owner_id, report_configuration) diff --git a/geo_rdm_records/modules/checker/schema.py b/geo_rdm_records/modules/checker/schema.py new file mode 100644 index 0000000..8914b0b --- /dev/null +++ b/geo_rdm_records/modules/checker/schema.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Schema module.""" + +from flask_resources import BaseObjectSchema, MarshmallowSerializer +from flask_resources.serializers import JSONSerializer +from invenio_vocabularies.resources import VocabularyL10Schema +from marshmallow import fields + + +class EmailRecordSchema(BaseObjectSchema): + """Record schema for e-mails.""" + + resource_type = fields.Nested( + VocabularyL10Schema, attribute="metadata.resource_type" + ) + + +class EmailRecordJSONSerializer(MarshmallowSerializer): + """Record serializer for e-mails.""" + + def __init__(self): + """Initializer.""" + super().__init__( + format_serializer_cls=JSONSerializer, + object_schema_cls=EmailRecordSchema, + schema_context={"object_key": "ui"}, + ) diff --git a/geo_rdm_records/modules/checker/tasks.py b/geo_rdm_records/modules/checker/tasks.py new file mode 100644 index 0000000..3210fbd --- /dev/null +++ b/geo_rdm_records/modules/checker/tasks.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 GEO Secretariat. +# +# geo-rdm-records is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Validation tasks module.""" + +from celery import shared_task + +from geo_rdm_records.modules.checker.links.service import config +from geo_rdm_records.modules.checker.links.validation import validate_records_links + + +@shared_task(ignore_result=True) +def check_records_links(): + """Check records links.""" + # reading configurations + report_configuration = config.get_report_config() + checker_configuration = config.get_checker_config() + + # validating links! 
+ validate_records_links(checker_configuration, report_configuration) diff --git a/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html b/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html index 3b9d035..8d9ced7 100644 --- a/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html +++ b/geo_rdm_records/templates/semantic-ui/geo_rdm_records/reports/records-report.html @@ -1,16 +1,15 @@ - + - - - - - + + + + + + Thanks for using the GEO Knowledge Hub. +>Your links health report is here. - This report presents the accessibility status of the links - you shared in your Knowledge Packages and Resources. - Below are the general metrics and detailed information - about all links. + This report presents the accessibility status of the + links you shared in your Knowledge Packages and + Resources. Below are the general metrics and detailed + information about all links.

@@ -697,13 +485,10 @@ > Status of all your active records -
+ + + + + + + + +
+ + + + + + +
+
+ Records checked on + {{ report_date }} +
+
+
@@ -835,16 +668,18 @@ - {% for package in packages %} - {% if package.links > 0 %} - -
- +

- {{ package.package.metadata.title | safe }} + {{ + package.package.metadata.title | + safe }}

- - {% if package.package.links > 0 %} + {% if package.package.nlinks > 0 %}
-
+

Package's links

+ +
+ Access +
-
-
-
+
+
+
- {% for link in package.package.links_status %} + {% for link in + package.package.links_status %} {% endfor %} @@ -944,17 +799,13 @@ {% else %} -

This Knowledge Package doesn't have links.

- {% endif %} - - {# Show only if there are more links than the ones from the package. #} - {% if package.links > package.package.links %} +

+ This Knowledge Package doesn't have links. +

+ {% endif %} {# Show only if there are more + links than the ones from the package. #}
- {% for resource in package.resources %} - {% if resource.links > 0 %} - - - - - - {% endif %} - {% endfor %} + + + + {% endif %} {% endfor %}
+ package.package.nlinks %} {% for type, + items in package.resources | + groupby("ui.resource_type.title_l10n") %} + + + + - - -
+
-
- Resource: {{ resource.metadata.title | safe }} -
-
-
-
- + {{ type }} + + + + + + + {% for resource in items %} {% if + resource.nlinks > 0 %} +
+
+ - {% for link in resource.links_status %} - - - - - {% endfor %} + Access + + +
- {{ link.link | truncate(50, True) }} - +
+ Resource: {{ + resource.metadata.title | + safe }} +
+
- {{ 'Available' if link.is_available else 'Unavailable' }} -
+
+ + + {% for link in + resource.links_status %} + + + + + {% endfor %} + + +
- {% endif %} - {% endfor %} + {% endif %} {% endfor %} {% endfor %} {% + else %} +

+ This Knowledge Package doesn't have + resources with links. +

+ {% endif %}
- {% else %} -

This Knowledge Package doesn't have resources with links.

- {% endif %} -
@@ -1063,15 +981,16 @@ {% for resource in resources %} - {% if resource.links > 0 %} - - - - {% endif %} + + + + {% endfor %}
- +

- {{ resource.metadata.title | safe }} + {{ resource.metadata.title | + safe }}

@@ -1110,11 +1030,7 @@
-
-

+

Resource's links

+
+ +
+ Access - Resource's links -
-
-
-
- - - {% for link in resource.links_status %} - - - - - {% endfor %} - - +
+ {% if resource.nlinks > 0 %} +
+
+ + + {% for link in + resource.links_status %} + + + + + {% endfor %} + + +
-
+ {% else %} +

+ This Knowledge Resource doesn't have + links. +

+ {% endif %}
-
diff --git a/geo_rdm_records/validation/config.py b/geo_rdm_records/validation/config.py deleted file mode 100644 index 7da5c83..0000000 --- a/geo_rdm_records/validation/config.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022-2024 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. - -"""Validation Helper module.""" - -from flask import current_app - - -def get_retry_config(confvar: str = "GEO_RDM_CHECKER_RETRY_CONFIG"): - """Helper to get configurations of retry.""" - return current_app.config[confvar] - - -def get_requests_config(confvar: str = "GEO_RDM_CHECKER_REQUEST_CONFIG"): - """Helper to get configurations of the ``requests.get`` method.""" - return current_app.config[confvar] - - -def get_chunks_config(confvar: str = "GEO_RDM_CHECKER_CHUNK_SIZE"): - """Helper to get configurations of the checker chunking system.""" - return current_app.config[confvar] - - -def get_sleep_config(confvar: str = "GEO_RDM_CHECKER_SLEEP_TIME"): - """Helper to get configurations of the sleep time used between the chunks processing.""" - return current_app.config[confvar] - - -def create_checker_config(): - """Create configuration object for the Link Checker.""" - return dict(requests_config=get_requests_config(), retry_config=get_retry_config()) diff --git a/geo_rdm_records/validation/links.py b/geo_rdm_records/validation/links.py deleted file mode 100644 index 51c915f..0000000 --- a/geo_rdm_records/validation/links.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022-2024 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. 
- -"""Tasks module.""" - -from time import sleep - -from invenio_links_checker import checker_validate_links -from invenio_links_checker.contrib.chunking import checker_create_chunks_by_owners -from invenio_rdm_records.records.models import RDMRecordMetadata as GEORecordMetadata - -from geo_rdm_records.customizations.records.api import GEORecord -from geo_rdm_records.modules.packages.records.api import GEOPackageRecord -from geo_rdm_records.modules.packages.records.models import GEOPackageRecordMetadata -from geo_rdm_records.validation import config, report -from geo_rdm_records.validation.records import enrich_status_objects - - -def _check_chunk_packages(packages): - """Check links from chunks of Knowledge Packages.""" - results = [] - - # reading checker configuration - checker_configuration = config.create_checker_config() - - for package in packages: - package = GEOPackageRecord.get_record(package.id) - - package_is_latest = package.versions.is_latest - package_is_published = package.is_published - - # Check only the latest versions of published packages - if package_is_latest and package_is_published: - # Extracting resources - package_resources = [r.resolve() for r in package.relationship.resources] - - # Validating links from resources - package_resources_links_status = checker_validate_links(package_resources) - - # Validating links from the package - package_links_status = checker_validate_links( - [package], **checker_configuration - ) - - # Saving the result - results.append( - dict( - package=package_links_status[0], - resources=package_resources_links_status, - ) - ) - - return results - - -def _check_chunk_resources(resources): - """Check links from chunks of resources.""" - valid_resources = [] - - # reading checker configuration - checker_configuration = config.create_checker_config() - - for resource in resources: - resource = GEORecord.get_record(resource.id) - - resource_is_published = resource.is_published - resource_is_latest = 
resource.versions.is_latest - resource_is_managed = resource.parent.relationship.managed_by is None - - # select only the latest versions of not-managed published resources - if resource_is_latest and resource_is_published and resource_is_managed: - valid_resources.append(resource) - - return checker_validate_links(valid_resources, **checker_configuration) - - -def _check_chunk(record_chunk): - """Check links of the records in the chunk. - - Note: - In the GEO Knowledge Hub case, it is required to group the records by type and then validate them. - """ - # filtering the records by type. - packages = list(filter(lambda x: x.parent.json["type"] == "package", record_chunk)) - resources = list(filter(lambda x: x.parent.json["type"] != "package", record_chunk)) - - # validating the links. - return [*_check_chunk_packages(packages), *_check_chunk_resources(resources)] - - -def check_links(chunk_size): - """Check links from any type of InvenioRDM records.""" - # reading configurations - sleep_time = config.get_sleep_config() - - # loading records - resources = GEORecordMetadata.query.all() - records = GEOPackageRecordMetadata.query.all() - - records.extend(resources) - - # creating chunks - chunks = checker_create_chunks_by_owners(records, chunk_size) - - # validating chunks - for chunk in chunks: - chunk_owner = chunk["owner"] - chunk_records = chunk["records"] - - chunk_results = _check_chunk(chunk_records) - chunk_results = enrich_status_objects(chunk_results) - - report.send_report(chunk_results, chunk_owner) - - # note: In the GEO Knowledge Hub case, it is possible to stop between - # chunks as we don't have a lot of resources. In other cases, another approach - # should be considered. 
- if sleep_time: - sleep(sleep_time) diff --git a/geo_rdm_records/validation/records.py b/geo_rdm_records/validation/records.py deleted file mode 100644 index f88a4bd..0000000 --- a/geo_rdm_records/validation/records.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022-2024 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. - -"""Record manipulation module.""" - -from geo_rdm_records.customizations.records.api import GEORecord -from geo_rdm_records.modules.packages.records.api import GEOPackageRecord - - -def _calculate_links(record): - """Calculate the number of links.""" - return len(list(record["links_status"])) - - -def _calculate_links_with_errors(record): - """Calculate the number of links with errors.""" - return len(list(filter(lambda x: not x["is_available"], record["links_status"]))) - - -def _summarize_records_total(packages, resources): - """Summarize total number of records (packages and resources).""" - nrecords = len(resources) - - for package in packages: - # 1 package + n resources - nrecords += 1 + len(package["resources"]) - - return nrecords - - -def _summarize(records, key): - """Summarize total number of links errors (from packages and resources).""" - return sum(map(lambda x: x[key], records)) - - -def _enrich_resource(resource): - """Load metadata from a resource object.""" - resource_obj = GEORecord.pid.resolve(resource["id"]) - - nlinks = _calculate_links(resource) - nerrors = _calculate_links_with_errors(resource) - - return { - **resource, - "metadata": resource_obj["metadata"], - "owners": resource_obj.parent["access"]["owned_by"], - "errors": nerrors, - "links": nlinks, - } - - -def _enrich_package(package): - """Load metadata from a package object.""" - # processing the package - package_obj = GEOPackageRecord.pid.resolve(package["package"]["id"]) - - nlinks = 
_calculate_links(package["package"]) - nerrors = _calculate_links_with_errors(package["package"]) - - package_result = { - **package["package"], - "metadata": package_obj["metadata"], - "owners": package_obj.parent["access"]["owned_by"], - "errors": nerrors, - "links": nlinks, - } - - # processing resources - resources_result = [] - package_resources = package["resources"] - - for resource in package_resources: - resource = _enrich_resource(resource) - resources_result.append(resource) - - nlinks += resource["links"] - nerrors += resource["errors"] - - return dict( - package=package_result, resources=resources_result, errors=nerrors, links=nlinks - ) - - -def enrich_status_objects(records): - """Inject extra metadata in the links status object.""" - packages = [] - resources = [] - - for record in records: - record_is_package = True if "package" in record else False - - if record_is_package: - packages.append(_enrich_package(record)) - else: - resources.append(_enrich_resource(record)) - - # summarizing some metrics - number_of_records = _summarize_records_total(packages, resources) - number_of_errors = _summarize(packages, "errors") + _summarize(resources, "errors") - - packages_links = _summarize(packages, "links") - resources_links = _summarize(resources, "links") - - return dict( - packages=packages, - resources=resources, - total_errors=number_of_errors, - total_records=number_of_records, - total_packages_links=packages_links, - total_resources_links=resources_links, - ) diff --git a/geo_rdm_records/validation/tasks.py b/geo_rdm_records/validation/tasks.py deleted file mode 100644 index 2334dac..0000000 --- a/geo_rdm_records/validation/tasks.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2022-2024 GEO Secretariat. -# -# geo-rdm-records is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. 
- -"""Validation tasks module.""" - -from celery import shared_task - -from geo_rdm_records.validation.config import get_chunks_config -from geo_rdm_records.validation.links import check_links - - -@shared_task(ignore_result=True) -def check_records_links(): - """Check records links.""" - chunk_size = get_chunks_config() - - # validating links of packages and resources. - check_links(chunk_size) diff --git a/setup.cfg b/setup.cfg index 7d730af..c488414 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,10 @@ packages = find: python_requires = >=3.7 zip_safe = False install_requires = + pydash>=7.0.6 requests>=2.28.2 + retry-requests>=2.0.0,<2.1.0 + requests-cache>=1.1.0,<1.2.0 invenio-i18n>=1.2.0 invenio-oaiserver>=2.0.0,<2.2.0 geo-config @ git+https://github.com/geo-knowledge-hub/geo-config@b-0.5 @@ -52,6 +55,7 @@ tests = sphinx>=4.5.0 tripoli~=2.0.0 requests-mock>=1.10.0 + docker-services-cli<=0.7.1 opensearch2 = invenio-search[opensearch2]>=2.1.0,<3.0.0 @@ -96,7 +100,7 @@ invenio_db.alembic = geo_rdm_records = geo_rdm_records:alembic invenio_celery.tasks = geo_rdm_records_packages = geo_rdm_records.modules.packages.services.tasks - geo_rdm_records_validation = geo_rdm_records.validation.tasks + geo_rdm_records_checker = geo_rdm_records.modules.checker.tasks [build_sphinx] source-dir = docs/