-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #173 from M3nin0/validation
[WIP] validation: adding initial validation of links from records
- Loading branch information
Showing
17 changed files
with
2,040 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Records checker module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Links checker module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Checker module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Checker module.""" | ||
|
||
from .metadata import extract_links_from_record | ||
from .network import is_link_available | ||
|
||
|
||
def _check_links(record_links, **kwargs):
    """Evaluate the availability of every link of a single record.

    Args:
        record_links (list): Links extracted from one record.

        **kwargs: Options forwarded unchanged to ``is_link_available``.

    Returns:
        list: One ``dict`` per link, in input order, holding the link itself
        (``link``) and the result of the availability check (``is_available``).
    """
    statuses = []

    for url in record_links:
        statuses.append(
            {"link": url, "is_available": is_link_available(url, **kwargs)}
        )

    return statuses
|
||
|
||
def checker_validate_links(records, **kwargs):
    """Build the link-status report for a collection of records.

    Args:
        records (list): List of ``invenio_records.api.Record`` objects. Each one
        is identified in the result by its ``pid.pid_value``.

        **kwargs: Extra configurations for the ``is_link_available`` function.

    Returns:
        list: One ``dict`` per record with the record identifier (``id``) and
        the status of each link found in it (``links_status``).
    """
    # For every record: read its identifier, extract its links, and then check
    # the status of each extracted link.
    return [
        {
            "id": record.pid.pid_value,
            "links_status": _check_links(
                extract_links_from_record(record), **kwargs
            ),
        }
        for record in records
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Checker metadata management module.""" | ||
|
||
import re | ||
|
||
from pydash import py_ | ||
|
||
|
||
def _extract_links(record_document): | ||
"""Extract all links from a string. | ||
Args: | ||
record_document (str): Record document as a string. | ||
Returns: | ||
list: List containing all links found in the record document. | ||
""" | ||
# Regex pattern for extracting URLs | ||
url_pattern = r'https?://[^\s<>"\',]+|www\.[^\s<>"\',]+' | ||
|
||
# Find all non-overlapping matches of the pattern in the string | ||
return re.findall(url_pattern, record_document) | ||
|
||
|
||
def extract_links_from_record(record):
    """List the distinct links mentioned anywhere in a record.

    Args:
        record (invenio_records.api.Record): Record object

    Returns:
        list: Links found in the string form of ``record.dumps()``, with
        duplicates removed.
    """
    # The dumped record is turned into one string so that a single scan covers
    # every field; ``py_.uniq`` then drops the repeated links.
    return py_.uniq(_extract_links(str(record.dumps())))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Checker Network management module.""" | ||
|
||
from datetime import timedelta | ||
|
||
import requests | ||
from requests_cache import CachedSession | ||
from retry_requests import retry | ||
|
||
|
||
def is_link_available(
    url: str,
    requests_config=None,
    retry_config=None,
    cache_config=None,
):
    """Check if a link is available.

    Args:
        url (str): URL to be checked. A scheme-less address starting with
        ``www.`` is checked over ``http://``.

        requests_config (dict): ``requests.get`` configurations

        retry_config (dict): ``retry_requests.retry`` configurations

        cache_config (dict): ``requests_cache.CachedSession`` configurations.
        Values given here override the built-in session defaults.

    Returns:
        bool: ``False`` when the request raises a ``requests.RequestException``,
        ``True`` otherwise.

    Note:
        By default, the following cases are used to define a link as unavailable:

            - Case 1: Delay to answer longer than 5 seconds;

            - Case 2: No access to the server or a dropped connection;

            - Case 3: An HTTP Answer of 500, 502, or 504.

        To change the value of ``Case 1``, you can use the Request config
        (``requests_config``, key ``timeout``). Also, to change the value of
        ``Case 2`` and ``Case 3``, you can use the Retry config (``retry_config``).
    """
    # Defaults come first and caller values last, so a caller can override any
    # default instead of triggering a "multiple values for keyword" TypeError.
    session_options = {
        "cache_name": "geo_rdm_records_links_checker",
        "cache_control": False,
        "expire_after": timedelta(days=30),
        "allowable_codes": [200, 400],
        **(cache_config or {}),
    }

    # Case 1 of the note above: without an explicit timeout the request could
    # wait indefinitely, so the documented 5 seconds are applied by default.
    request_options = {"timeout": 5, **(requests_config or {})}
    retry_options = retry_config or {}

    # ``requests`` rejects URLs without a scheme, which would make every bare
    # ``www.`` link look unavailable regardless of the server state.
    target = f"http://{url}" if url.startswith("www.") else url

    # The context manager closes the session (and its cache backend) on exit,
    # so repeated calls do not accumulate open connections.
    with CachedSession(**session_options) as session:
        try:
            retry(session, **retry_options).get(target, **request_options)
        except requests.RequestException:
            # If there is any request-related error, the link is not available
            return False

    return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Checker service module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 GEO Secretariat. | ||
# | ||
# geo-rdm-records is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Configuration utility module.""" | ||
|
||
from flask import current_app | ||
|
||
|
||
def _get_retry_config(confvar: str = "GEO_RDM_CHECKER_RETRY_CONFIG"):
    """Read the retry settings of the Link Checker.

    Args:
        confvar (str): Name of the configuration variable to read.

    Returns:
        Value stored under ``confvar`` in the current application configuration.
    """
    app_config = current_app.config

    return app_config[confvar]
|
||
|
||
def _get_requests_config(confvar: str = "GEO_RDM_CHECKER_REQUEST_CONFIG"):
    """Read the settings used for the ``requests.get`` method.

    Args:
        confvar (str): Name of the configuration variable to read.

    Returns:
        Value stored under ``confvar`` in the current application configuration.
    """
    app_config = current_app.config

    return app_config[confvar]
|
||
|
||
def _get_report_title(confvar: str = "GEO_RDM_CHECKER_REPORT_TITLE"):
    """Get the configured title of the checker report.

    Args:
        confvar (str): Name of the configuration variable to read.

    Returns:
        Value stored under ``confvar`` in the current application configuration.
    """
    return current_app.config[confvar]
|
||
|
||
def _get_report_template(confvar: str = "GEO_RDM_CHECKER_REPORT_TEMPLATE"):
    """Get the configured template of the checker report.

    Args:
        confvar (str): Name of the configuration variable to read.

    Returns:
        Value stored under ``confvar`` in the current application configuration.
    """
    return current_app.config[confvar]
|
||
|
||
def get_checker_config():
    """Assemble the configuration object for the Link Checker.

    Returns:
        dict: The ``requests_config`` and ``retry_config`` entries, each read
        from the application configuration.
    """
    return {
        "requests_config": _get_requests_config(),
        "retry_config": _get_retry_config(),
    }
|
||
|
||
def get_report_config():
    """Assemble the configuration object for the Report.

    Returns:
        dict: The ``report_title`` and ``report_template`` entries, each read
        from the application configuration.
    """
    return {
        "report_title": _get_report_title(),
        "report_template": _get_report_template(),
    }
Oops, something went wrong.