LUMC · rhpvorderman · Mar 18, 2024 · Jan 13, 2023 · Jan 20, 2023 · Jun 22, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - "3.7"
+          - "3.8"
     steps:
       - uses: actions/checkout@v2.3.4
       - name: Set up Python ${{ matrix.python-version }}
@@ -42,11 +42,11 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - "3.7"
           - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
     steps:
       - uses: actions/checkout@v2.3.4
       - name: Set up Python ${{ matrix.python-version }}
@@ -64,7 +64,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7"]
+        python-version: ["3.11"]
         test-program: [snakemake, miniwdl]
     steps:
       - uses: actions/checkout@v2.3.4

diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -0,0 +1,13 @@
+version: 2  # Read the Docs configuration file format version
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"  # Quoted so YAML keeps it a string
+
+sphinx:
+  configuration: docs/conf.py  # Sphinx configuration for the docs build
+
+python:
+  install:
+    - requirements: requirements-docs.txt  # Installed before the build
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,11 +2,24 @@
 Changelog
 ==========
 
+
 .. Newest changes should be on top.
 
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 2.1.0
+---------------------------
++ Support for Python 3.7 is dropped because it has reached end-of-life.
+  Support for Python 3.12 is added.
++ Fixed a bug where pytest 8.1+ would raise a ``PluginValidationError`` because
+  the hook ``pytest_collect_file()`` has finally dropped the deprecated
+  argument ``path`` from its specification.
++ Added the ``extract_md5sum`` check, which compares the md5sum of the
+  uncompressed contents of a compressed output file. Gzipped files contain a
+  timestamp, which makes it hard to directly compare their md5sums.
++ Documented the naming conventions for Python test discovery.
+
 version 2.0.1
 ---------------------------
 + Fixed a bug where pytest-workflow would crash on logs that used non-ASCII

diff --git a/README.rst b/README.rst
@@ -127,6 +127,7 @@ predefined tests as well as custom tests are possible.
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"

diff --git a/docs/writing_tests.rst b/docs/writing_tests.rst
@@ -64,6 +64,7 @@ Test options
       - path: "TomCruise.txt.gz"       # Gzipped files can also be searched, provided their extension is '.gz'
         contains:
           - "starring"
+        extract_md5sum: e27c52f6b5f8152aa3ef58be7bdacc4d   # Md5sum of the uncompressed file (optional)
     stderr:                            # Options for testing stderr (optional)
       contains:                        # A list of strings which should be in stderr (optional)
         - "BSOD error, please contact the IT crowd"
@@ -89,6 +90,12 @@ Please see the `Python documentation on regular expressions
 <https://docs.python.org/3/library/re.html>`_ to see how Python handles escape
 sequences.
 
+The ``extract_md5sum`` option is used to uncompress a file and then compare
+the md5sum of the uncompressed file with the supplied md5sum. This option is
+particularly useful when testing gzipped files, which may contain a file
+creation timestamp in the gzip header. The supported compressed file
+formats for this option are gzip, bzip2, xz and Zstandard.
+
 .. note::
     Workflow names must be unique. Pytest workflow will crash when multiple
     workflows have the same name, even if they are in different files.
@@ -160,6 +167,10 @@ Multiple workflows can use the same custom test like this:
 points to the folder where the named workflow was executed. This allows writing
 of advanced python tests for each file produced by the workflow.
 
+Custom tests must follow the `conventions for Python test discovery
+<https://docs.pytest.org/en/latest/explanation/goodpractices.html#conventions-for-python-test-discovery>`_,
+which constrain the names of files and functions containing custom tests.
+
 .. note::
 
     stdout and stderr are available as files in the root of the

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 pyyaml
 pytest>=7.0.0
-jsonschema
+jsonschema
+xopen>=1.7.0
+zstandard
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name="pytest-workflow",
-    version="2.0.1",
+    version="2.1.0",
     description="A pytest plugin for configuring workflow/pipeline tests "
                 "using YAML files",
     author="Leiden University Medical Center",
@@ -39,22 +39,24 @@
     classifiers=[
         "Programming Language :: Python :: 3 :: Only",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
         "Development Status :: 5 - Production/Stable",
         "License :: OSI Approved :: "
         "GNU Affero General Public License v3 or later (AGPLv3+)",
         "Framework :: Pytest",
     ],
-    # Because we cannot test anymore on Python 3.6.
-    python_requires=">=3.7",
+    # Because we cannot test anymore on Python 3.7.
+    python_requires=">=3.8",
     install_requires=[
         "pytest>=7.0.0",  # To use pathlib Path's in pytest
         "pyyaml",
-        "jsonschema"
+        "jsonschema",
+        "xopen>=1.7.0",  # Keep in sync with requirements.txt
+        "zstandard",
     ],
     # This line makes sure the plugin is automatically loaded when it is
     # installed in the same environment as pytest. No need to configure

diff --git a/src/pytest_workflow/file_tests.py b/src/pytest_workflow/file_tests.py
@@ -22,7 +22,7 @@
 
 from .content_tests import ContentTestCollector
 from .schema import FileTest
-from .util import file_md5sum
+from .util import extract_md5sum, file_md5sum
 from .workflow import Workflow
 
 
@@ -76,7 +76,16 @@
                 parent=self,
                 filepath=filepath,
                 md5sum=self.filetest.md5sum,
-                workflow=self.workflow)]
+                workflow=self.workflow,
+                extract=False)]
+
+        if self.filetest.extract_md5sum:
+            tests += [FileMd5.from_parent(
+                parent=self,
+                filepath=filepath,
+                md5sum=self.filetest.extract_md5sum,
+                workflow=self.workflow,
+                extract=True)]
 
         return tests
 
@@ -119,32 +128,37 @@
 
 class FileMd5(pytest.Item):
     def __init__(self, parent: pytest.Collector, filepath: Path,
-                 md5sum: str, workflow: Workflow):
+                 md5sum: str, workflow: Workflow, extract: bool = False):
         """
-        Create a tests for the file md5sum.
+        Create a test for the file md5sum.
         :param parent: The collector that started this item
         :param filepath: The path to the file
         :param md5sum:  The expected md5sum
         :param workflow: The workflow running to generate the file
+        :param extract: Whether to decompress the file before hashing it
         """
-        name = "md5sum"
+        name = "extract_md5sum" if extract else "md5sum"
         super().__init__(name, parent)
         self.filepath = filepath
         self.expected_md5sum = md5sum
         self.observed_md5sum = None
         self.workflow = workflow
+        self.extract = extract
 
     def runtest(self):
         # Wait for the workflow to finish before we check the md5sum of a file.
         self.workflow.wait()
         if not self.workflow.matching_exitcode():
-            pytest.skip(f"'{self.parent.workflow.name}' did not exit with"
+            pytest.skip(f"'{self.parent.workflow.name}' did not exit with "
                         f"desired exit code.")
-        self.observed_md5sum = file_md5sum(self.filepath)
+        sum_func = extract_md5sum if self.extract else file_md5sum
+        self.observed_md5sum = sum_func(self.filepath)
         assert self.observed_md5sum == self.expected_md5sum
 
     def repr_failure(self, excinfo, style=None):
+        metric = "extract_md5sum" if self.extract else "md5sum"
         return (
-            f"Observed md5sum '{self.observed_md5sum}' not equal to expected "
-            f"md5sum '{self.expected_md5sum}' for file '{self.filepath}'"
+            f"Observed {metric} '{self.observed_md5sum}' not equal to "
+            f"expected {metric} '{self.expected_md5sum}' for file "
+            f"'{self.filepath}'"
         )
diff --git a/src/pytest_workflow/plugin.py b/src/pytest_workflow/plugin.py
@@ -117,11 +117,12 @@ def addoption(self, *args, **kwargs):
     return parser
 
 
-def pytest_collect_file(file_path, path, parent):
+def pytest_collect_file(file_path, parent):
     """Collection hook
     This collects the yaml files that start with "test" and end with
     .yaml or .yml"""
-    if path.ext in [".yml", ".yaml"] and path.basename.startswith("test"):
+    is_yaml = file_path.suffix in [".yml", ".yaml"]
+    if is_yaml and file_path.name.startswith("test"):
         return YamlFile.from_parent(parent, path=file_path)
     return None
 

diff --git a/src/pytest_workflow/schema.py b/src/pytest_workflow/schema.py
@@ -125,6 +125,7 @@ def __init__(self, contains: Optional[List[str]] = None,
 class FileTest(ContentTest):
     """A class that contains all the properties of a to be tested file."""
     def __init__(self, path: str, md5sum: Optional[str] = None,
+                 extract_md5sum: Optional[str] = None,
                  should_exist: bool = DEFAULT_FILE_SHOULD_EXIST,
                  contains: Optional[List[str]] = None,
                  must_not_contain: Optional[List[str]] = None,
@@ -135,6 +136,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
         A container object
         :param path: the path to the file
         :param md5sum: md5sum of the file contents
+        :param extract_md5sum: md5sum of the extracted file contents
         :param should_exist: whether the file should exist or not
         :param contains: a list of strings that should be present in the file
         :param must_not_contain: a list of strings that should not be present
@@ -150,6 +152,7 @@ def __init__(self, path: str, md5sum: Optional[str] = None,
                          encoding=encoding)
         self.path = Path(path)
         self.md5sum = md5sum
+        self.extract_md5sum = extract_md5sum
         self.should_exist = should_exist
 
 

diff --git a/src/pytest_workflow/schema/schema.json b/src/pytest_workflow/schema/schema.json
@@ -123,6 +123,10 @@
             "should_exist": {
               "type": "boolean"
             },
+            "extract_md5sum": {
+              "type": "string",
+              "pattern": "^[a-f0-9]{32}$"
+            },
             "contains": {
               "type": "array",
               "items": {

diff --git a/src/pytest_workflow/util.py b/src/pytest_workflow/util.py
@@ -7,7 +7,10 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import Callable, IO, Iterator, List, Optional, Set, Tuple, Union, \
+                   cast
+
+from xopen import xopen
 
 Filepath = Union[str, os.PathLike]
 
@@ -204,10 +207,32 @@ def file_md5sum(filepath: Path, block_size=64 * 1024) -> str:
     :param block_size: Block size in bytes
     :return: a md5sum as hexadecimal string.
     """
-    hasher = hashlib.md5()
     with filepath.open('rb') as file_handler:  # Read the file in bytes
-        for block in iter(lambda: file_handler.read(block_size), b''):
-            hasher.update(block)
+        return file_handle_md5sum(file_handler, block_size)
+
+
+def extract_md5sum(filepath: Path, block_size=64 * 1024) -> str:
+    """
+    Generates a md5sum for the uncompressed contents of a compressed file.
+    Reads file in blocks to save memory.
+    :param filepath: a pathlib.Path to the compressed file
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    with xopen(filepath, 'rb') as file_handler:  # xopen decompresses on read
+        return file_handle_md5sum(cast(IO[bytes], file_handler), block_size)
+
+
+def file_handle_md5sum(file_handler: IO[bytes], block_size: int) -> str:
+    """
+    Generates a md5sum for a file handle. Reads file in blocks to save memory.
+    :param file_handler: a readable binary file handle
+    :param block_size: Block size in bytes
+    :return: a md5sum as hexadecimal string.
+    """
+    hasher = hashlib.md5()
+    for block in iter(lambda: file_handler.read(block_size), b''):
+        hasher.update(block)
     return hasher.hexdigest()
 
 

diff --git a/tests/functional/simple_snakefile_test_cases.yml b/tests/functional/simple_snakefile_test_cases.yml
@@ -1,26 +1,26 @@
 - name: test-dry-run
-  command: snakemake -n -r -p -s SimpleSnakefile --config N_LINES_TO_READ=1
+  command: snakemake -n -p -s SimpleSnakefile --config N_LINES_TO_READ=1
 - name: test-config-missing
-  command: snakemake -n -r -p -s SimpleSnakefile
+  command: snakemake -n -p -s SimpleSnakefile
   exit_code: 1
-  stdout:
+  stdout:  # Dry run output should be stdout. See https://github.com/snakemake/snakemake/issues/2757
     contains:
       - "You must set --config N_LINES_TO_READ=<a value>."
 - name: test-config-wrong-type
-  command: snakemake -n -r -p -s SimpleSnakefile --config N_LINES_TO_READ=one
+  command: snakemake -n -p -s SimpleSnakefile --config N_LINES_TO_READ=one
   exit_code: 1
   stdout:
     contains:
       - "N_LINES_TO_READ must be an integer."
 - name: test-config-invalid-value
-  command: snakemake -n -r -p -s SimpleSnakefile --config N_LINES_TO_READ=-1
+  command: snakemake -n -p -s SimpleSnakefile --config N_LINES_TO_READ=-1
   exit_code: 1
   stdout:
     contains:
       - "N_LINES_TO_READ must at least be 1."
 - name: test-snakemake-run
   command: >-
-    snakemake --cores 1 -r -p -s SimpleSnakefile --config N_LINES_TO_READ=500
+    snakemake --cores 1 -p -s SimpleSnakefile --config N_LINES_TO_READ=500
   files:
     - path: rand/0.txt
     - path: rand/1.txt

diff --git a/tests/test_schema.py b/tests/test_schema.py
@@ -171,6 +171,7 @@ def test_filetest_defaults():
     assert file_test.contains_regex == []
     assert file_test.must_not_contain_regex == []
     assert file_test.md5sum is None
+    assert file_test.extract_md5sum is None
     assert file_test.should_exist
 
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with pytest-workflow.  If not, see <https://www.gnu.org/licenses/
+import gzip
 import hashlib
 import itertools
 import os
@@ -25,7 +26,7 @@
 import pytest
 
 from pytest_workflow.util import decode_unaligned, duplicate_tree, \
-    file_md5sum, git_check_submodules_cloned, git_root, \
+    extract_md5sum, file_md5sum, git_check_submodules_cloned, git_root, \
     is_in_dir, link_tree, replace_whitespace
 
 WHITESPACE_TESTS = [
@@ -163,6 +164,14 @@ def test_file_md5sum(hash_file: Path):
     assert whole_file_md5 == per_line_md5
 
 
+def test_extract_md5sum():
+    compressed_file = HASH_FILE_DIR / "LICENSE.gz"
+    with gzip.open(compressed_file, "rb") as uncompressed:
+        expected_md5 = hashlib.md5(uncompressed.read()).hexdigest()
+    observed_md5 = extract_md5sum(compressed_file)
+    assert expected_md5 == observed_md5
+
+
 def create_git_repo(path):
     dir = Path(path)
     os.mkdir(dir)