Skip to content

Commit

Permalink
Merge pull request #173 from M3nin0/validation
Browse files Browse the repository at this point in the history
[WIP] validation: adding initial validation of links from records
  • Loading branch information
M3nin0 authored Jan 18, 2024
2 parents 6beba4c + 6f1122e commit 09414e2
Show file tree
Hide file tree
Showing 17 changed files with 2,040 additions and 1 deletion.
2 changes: 1 addition & 1 deletion geo_rdm_records/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@

from .ext import GEORDMRecords

__version__ = "0.7.0"
__version__ = "0.8.0"
__all__ = ("__version__", "GEORDMRecords")
16 changes: 16 additions & 0 deletions geo_rdm_records/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

"""GEO RDM Records module configurations."""

from flask_babelex import lazy_gettext as _
from invenio_rdm_records.services import facets as rdm_facets

from geo_rdm_records.base.services import facets as geo_facets
Expand Down Expand Up @@ -177,6 +178,21 @@
# E-mail configuration
GEO_RDM_NOTIFICATION_DEFAULT_RECEIVER_EMAILS = []

#
# Checker configuration
#
# Options forwarded to the retry helper used by the links checker.
GEO_RDM_CHECKER_RETRY_CONFIG = {"retries": 5, "backoff_factor": 0.3}
"""Retry configuration (based on the ``retry-requests`` library)."""

# Options forwarded to the HTTP ``get`` call made for each checked link.
GEO_RDM_CHECKER_REQUEST_CONFIG = {"timeout": 10}
"""Request configuration (based on the ``requests.get`` method)."""

GEO_RDM_CHECKER_REPORT_TITLE = _("GEO Knowledge Hub - Links status from your records")
"""Report title."""

GEO_RDM_CHECKER_REPORT_TEMPLATE = "geo_rdm_records/reports/records-report.html"
"""Report Jinja2 template."""

# OAI-PMH
# =======
# See https://github.com/inveniosoftware/invenio-oaiserver/blob/master/invenio_oaiserver/config.py
Expand Down
8 changes: 8 additions & 0 deletions geo_rdm_records/modules/checker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Records checker module."""
8 changes: 8 additions & 0 deletions geo_rdm_records/modules/checker/links/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Links checker module."""
8 changes: 8 additions & 0 deletions geo_rdm_records/modules/checker/links/checker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Checker module."""
53 changes: 53 additions & 0 deletions geo_rdm_records/modules/checker/links/checker/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Checker module."""

from .metadata import extract_links_from_record
from .network import is_link_available


def _check_links(record_links, **kwargs):
    """Check the availability of every link of a record.

    Args:
        record_links (list): Links extracted from a single record.

        **kwargs: Extra options forwarded to ``is_link_available``.

    Returns:
        list: One dictionary per link, holding the link itself (``link``) and
        the result of the availability check (``is_available``).
    """
    links_status = []

    for record_link in record_links:
        availability = is_link_available(record_link, **kwargs)
        links_status.append({"link": record_link, "is_available": availability})

    return links_status


def checker_validate_links(records, **kwargs):
    """Check the links found in each of the given records.

    Args:
        records (list): Record objects. Each one must expose
            ``pid.pid_value`` and be accepted by ``extract_links_from_record``.

        **kwargs: Extra options forwarded to ``is_link_available``.

    Returns:
        list: One dictionary per record, holding the record identifier (``id``)
        and the status of its links (``links_status``).
    """

    def _links_status_of(record):
        """Build the status entry of a single record."""
        pid_value = record.pid.pid_value

        # extract first, then check, so each record is processed in one pass
        links = extract_links_from_record(record)
        return {"id": pid_value, "links_status": _check_links(links, **kwargs)}

    return [_links_status_of(record) for record in records]
46 changes: 46 additions & 0 deletions geo_rdm_records/modules/checker/links/checker/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Checker metadata management module."""

import re

from pydash import py_


def _extract_links(record_document):
"""Extract all links from a string.
Args:
record_document (str): Record document as a string.
Returns:
list: List containing all links found in the record document.
"""
# Regex pattern for extracting URLs
url_pattern = r'https?://[^\s<>"\',]+|www\.[^\s<>"\',]+'

# Find all non-overlapping matches of the pattern in the string
return re.findall(url_pattern, record_document)


def _iter_record_strings(node):
    """Yield every piece of text found in a (possibly nested) dumped record."""
    if isinstance(node, str):
        yield node
    elif isinstance(node, dict):
        for key, value in node.items():
            yield from _iter_record_strings(key)
            yield from _iter_record_strings(value)
    elif isinstance(node, (list, tuple, set)):
        for item in node:
            yield from _iter_record_strings(item)
    else:
        # Any other leaf is stringified, as the whole document used to be.
        yield str(node)


def extract_links_from_record(record):
    """Extract all links available in a record.

    The dumped record is scanned value by value instead of through its
    ``str()`` representation: in the representation, characters such as a
    newline appear as escape sequences (``\\n``), which are not whitespace and
    would be appended to the link that precedes them.

    Args:
        record (invenio_records.api.Record): Record object. Only its
            ``dumps()`` method is used.

    Returns:
        list: List containing all links found in the record document, without
        duplicates and in order of first appearance.
    """
    record_links = []

    # Extracting links
    for text in _iter_record_strings(record.dumps()):
        record_links.extend(_extract_links(text))

    # Removing duplicates
    return py_.uniq(record_links)
67 changes: 67 additions & 0 deletions geo_rdm_records/modules/checker/links/checker/network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Checker Network management module."""

from datetime import timedelta

import requests
from requests_cache import CachedSession
from retry_requests import retry


def is_link_available(
    url: str,
    requests_config=None,
    retry_config=None,
    cache_config=None,
):
    """Check if a link is available.

    A link is reported as unavailable when requesting it raises a
    ``requests.RequestException``; the response itself is not inspected.

    Args:
        url (str): URL to be checked. Links starting with ``www.`` are
            requested through ``http://``.

        requests_config (dict): ``requests.get`` configurations. A ``timeout``
            of 5 seconds is used when none is given.

        retry_config (dict): ``retry_requests.retry`` configurations.

        cache_config (dict): ``requests_cache.CachedSession`` configurations.
            Values given here override the defaults of this function
            (``cache_name``, ``cache_control``, ``expire_after`` and
            ``allowable_codes``).

    Returns:
        bool: ``True`` if the request finished without a request-related
        error, ``False`` otherwise.

    Note:
        By default, the following cases are used to define a link as unavailable:

            - Case 1: Delay to answer longer than 5 seconds;

            - Case 2: No access to the server or a dropped connection;

            - Case 3: An HTTP Answer that is still in the retry list after all
              retries (presumably 500, 502, or 504 with the ``retry-requests``
              defaults - confirm against the installed version).

        To change the value of ``Case 1``, you can use the Request config (``requests_config``). Also,
        to change the value of ``Case 2`` and ``Case 3``, you can use the Retry config (``retry_config``).
    """
    retry_config = {} if retry_config is None else retry_config

    # Defaults come first so user-provided values override them, and the
    # dictionaries received from the caller are never mutated.
    requests_config = {"timeout": 5, **(requests_config or {})}
    session_config = {
        "cache_name": "geo_rdm_records_links_checker",
        "cache_control": False,
        "expire_after": timedelta(days=30),
        "allowable_codes": [200, 400],
        **(cache_config or {}),
    }
    cache_name = session_config.pop("cache_name")

    # ``www.`` links carry no scheme and would always be rejected by requests.
    if url.startswith("www."):
        url = f"http://{url}"

    # building the session object
    session = CachedSession(cache_name, **session_config)

    try:
        session = retry(session, **retry_config)
        session.get(url, **requests_config)
    except requests.RequestException:
        # If there is any request-related error, the link is not available
        return False
    finally:
        # Release the connections and cache resources held by the session.
        session.close()

    return True
8 changes: 8 additions & 0 deletions geo_rdm_records/modules/checker/links/service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Checker service module."""
44 changes: 44 additions & 0 deletions geo_rdm_records/modules/checker/links/service/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 GEO Secretariat.
#
# geo-rdm-records is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Configuration utility module."""

from flask import current_app


def _get_retry_config(confvar: str = "GEO_RDM_CHECKER_RETRY_CONFIG"):
    """Read the retry options of the links checker.

    Args:
        confvar (str): Name of the application configuration variable to read.

    Returns:
        The value stored under ``confvar`` in the current application config.
    """
    app_config = current_app.config
    return app_config[confvar]


def _get_requests_config(confvar: str = "GEO_RDM_CHECKER_REQUEST_CONFIG"):
    """Read the ``requests.get`` options of the links checker.

    Args:
        confvar (str): Name of the application configuration variable to read.

    Returns:
        The value stored under ``confvar`` in the current application config.
    """
    app_config = current_app.config
    return app_config[confvar]


def _get_report_title(confvar: str = "GEO_RDM_CHECKER_REPORT_TITLE"):
    """Get the title of the links status report.

    Args:
        confvar (str): Name of the application configuration variable to read.

    Returns:
        The value stored under ``confvar`` in the current application config.
    """
    return current_app.config[confvar]


def _get_report_template(confvar: str = "GEO_RDM_CHECKER_REPORT_TEMPLATE"):
    """Get the template used to render the links status report.

    Args:
        confvar (str): Name of the application configuration variable to read.

    Returns:
        The value stored under ``confvar`` in the current application config.
    """
    return current_app.config[confvar]


def get_checker_config():
    """Build the keyword arguments used to configure the Link Checker.

    Returns:
        dict: Dictionary with the keys ``requests_config`` and
        ``retry_config``, both read from the current application config.
    """
    requests_options = _get_requests_config()
    retry_options = _get_retry_config()

    return {"requests_config": requests_options, "retry_config": retry_options}


def get_report_config():
    """Build the keyword arguments used to configure the Report.

    Returns:
        dict: Dictionary with the keys ``report_title`` and
        ``report_template``, both read from the current application config.
    """
    title = _get_report_title()
    template = _get_report_template()

    return {"report_title": title, "report_template": template}
Loading

0 comments on commit 09414e2

Please sign in to comment.