[BD-21] Implement multi-line code annotations

Make it possible (again) to write multi-line code annotations, such as: .. pii: This is an annotation that spans multiple lines and allows developers to write more extensive docs. The only problem with multi-line annotations is that we did not find a way to get rid of empty spaces that prefix every new line after the first.
openedx · Jul 22, 2020 · 4cfcf96 · 4cfcf96
1 parent bb3f1aa
commit 4cfcf96
Show file tree

Hide file tree

Showing 11 changed files with 202 additions and 64 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -16,6 +16,11 @@ Unreleased
 
 *
 
+[0.4.0] - 2020-07-22
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multi-line code annotations
+
 [0.3.4] - 2020-05-06
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.3.4'
+__version__ = '0.4.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
@@ -4,7 +4,7 @@
 import re
 from abc import ABCMeta, abstractmethod
 
-from code_annotations.helpers import clean_abs_path, get_annotation_regex
+from code_annotations.helpers import clean_abs_path, clean_annotation, get_annotation_regex
 
 
 class AnnotationExtension(object, metaclass=ABCMeta):
@@ -56,24 +56,9 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     Returns a 2-tuple of:
      - ("Comment text", None) in the case of a multi-line comment OR
      - (None, "Comment text") in the case of a single-line comment
-
-    TODO: Make this handle multi-line annotation comments again.
     """
     comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
 
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
-
-    [\s\S]*? - Strip out any characters between the start of the comment and the annotation
-    ({})     - {} is a Python format string that will be replaced with a regex escaped and
-               then or-joined to make a list of the annotation tokens we're looking for
-               Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)     - and capture all characters until the end of the line
-
-    Returns a 2-tuple of found annotation token and annotation comment
-
-    TODO: Make multi line annotation comments work again.
-    """
     def __init__(self, config, echo):
         """
         Set up the extension and create the regexes used to do searches.
@@ -126,25 +111,23 @@ def search(self, file_handle):
                     # annotation token itself.
                     line = txt.count('\n', 0, match.start()) + 1
 
-                    # No matter how long the regex is, there should only be 2 non-None items,
-                    # with the first being the annotation token and the 2nd being the comment.
-                    cleaned_groups = [item for item in inner_match.groups() if item is not None]
-
-                    if len(cleaned_groups) != 2:  # pragma: no cover
-                        raise Exception('{}::{}: Number of found items in the list is not 2. Found: {}'.format(
+                    try:
+                        annotation_token = inner_match.group('token')
+                        annotation_data = inner_match.group('data')
+                    except IndexError:
+                        # pragma: no cover
+                        raise ValueError('{}::{}: Could not find "data" or "token" groups. Found: {}'.format(
                             fname,
                             line,
-                            cleaned_groups
+                            inner_match.groupdict()
                         ))
-
-                    annotation, comment = cleaned_groups
-
+                    annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
                     found_annotations.append({
                         'found_by': self.extension_name,
                         'filename': fname,
                         'line_number': line,
-                        'annotation_token': annotation.strip(),
-                        'annotation_data': comment.strip()
+                        'annotation_token': annotation_token,
+                        'annotation_data': annotation_data,
                     })
 
         return found_annotations
diff --git a/code_annotations/find_django.py b/code_annotations/find_django.py
@@ -12,7 +12,7 @@
 from django.db import models
 
 from code_annotations.base import BaseSearch
-from code_annotations.helpers import fail, get_annotation_regex
+from code_annotations.helpers import clean_annotation, fail, get_annotation_regex
 
 DEFAULT_SAFELIST_FILE_PATH = '.annotation_safe_list.yml'
 
@@ -108,33 +108,30 @@ def _append_model_annotations(self, model_type, model_id, query, model_annotatio
         with open(filename, 'r') as file_handle:
             txt = file_handle.read()
 
-        for inner_match in query.finditer(model_type.__doc__):
-            # TODO: This is duplicated code with extensions/base.py
-            # No matter how long the regex is, there should only be 2 non-None items,
-            # with the first being the annotation token and the 2nd being the comment.
-            cleaned_groups = [item for item in inner_match.groups() if item is not None]
+        # Get the line number by counting newlines + 1 (for the first line).
+        # Note that this is the line number of the beginning of the comment, not the
+        # annotation token itself. We find based on the entire code content of the model
+        # as that seems to be the only way to be sure we're getting the correct line number.
+        # It is slow and should be replaced if we can find a better way that is accurate.
+        line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
 
-            if len(cleaned_groups) != 2:  # pragma: no cover
-                raise Exception('{}: Number of found items in the list is not 2. Found: {}'.format(
+        for inner_match in query.finditer(model_type.__doc__):
+            try:
+                annotation_token = inner_match.group('token')
+                annotation_data = inner_match.group('data')
+            except IndexError:
+                # pragma: no cover
+                raise ValueError('{}: Could not find "data" or "token" groups. Found: {}'.format(
                     self.get_model_id(model_type),
-                    cleaned_groups
+                    inner_match.groupdict()
                 ))
-
-            annotation, comment = cleaned_groups
-
-            # Get the line number by counting newlines + 1 (for the first line).
-            # Note that this is the line number of the beginning of the comment, not the
-            # annotation token itself. We find based on the entire code content of the model
-            # as that seems to be the only way to be sure we're getting the correct line number.
-            # It is slow and should be replaced if we can find a better way that is accurate.
-            line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
-
+            annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
             model_annotations.append({
                 'found_by': "django",
                 'filename': filename,
                 'line_number': line,
-                'annotation_token': annotation.strip(),
-                'annotation_data': comment.strip(),
+                'annotation_token': annotation_token,
+                'annotation_data': annotation_data,
                 'extra': {
                     'object_id': model_id,
                     'full_comment': model_type.__doc__.strip()

diff --git a/code_annotations/helpers.py b/code_annotations/helpers.py
@@ -113,25 +113,59 @@ def clean_abs_path(filename_to_clean, parent_path):
 def get_annotation_regex(annotation_regexes):
     """
     Return the full regex to search inside comments for configured annotations.
-    A match against the regex will returns a 2-tuple of found annotation token and annotation comment
+
+    A successful match against the regex will return two groups of interest: 'token'
+    and 'data'.
+
+    This regular expression supports annotation tokens that span multiple lines. To do
+    so, indent each line after the first by at least two additional spaces. E.g.:
+
+        .. pii: First line
+          second line
+
+    Unfortunately, the indenting spaces will find their way to the content of the "data" group.
 
     Args:
         annotation_regexes: List of re.escaped annotation tokens to search for.
 
     Returns:
         Regex ready for searching comments for annotations.
-    """
-    # pylint: disable=pointless-string-statement
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
-
     """
     annotation_regex = r"""
-    [\s\S]*?                   # Strip out any characters between the start of the comment and the annotation
-    ({})                       # Python format string that will be replaced with a regex escaped and
-                               # then or-joined to make a list of the annotation tokens we're looking for
-                               # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)                       # capture all characters until the end of the line
+    (?P<space>[\ \t]*)               # Leading empty spaces
+    (?P<token>{tokens})              # Python format string that will be replaced with a
+                                     # regex, escaped and then or-joined to make a list
+                                     # of the annotation tokens we're looking for
+                                     # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
+    (?P<data>                        # Captured annotation data
+        (?:                          # non-capture mode
+            .                        # any non-newline character
+            |                        # or new line of multi-line annotation data
+            (?:                      # non-capture mode
+                \n{{1,}}             # at least one newline,
+                (?P=space)           # followed by as much space as the prefix,
+                (?P<indent>\ {{2,}}) # at least two spaces,
+                (?=[^\ ])            # and a non-space character (look-ahead)
+                (?!{tokens})         # that does not match any of the token regexes
+            )                        #
+        )*                           # any number of times
+    )
     """
-    annotation_regex = annotation_regex.format('|'.join(annotation_regexes))
+    annotation_regex = annotation_regex.format(tokens='|'.join(annotation_regexes))
     return re.compile(annotation_regex, flags=re.VERBOSE)
+
+
+def clean_annotation(token, data):
+    """
+    Clean annotation token and data by stripping all leading and trailing whitespace.
+
+    Args:
+        token (str)
+        data (str)
+
+    Returns:
+        (str, str): Tuple of cleaned token, data
+    """
+    token = token.strip()
+    data = data.strip()
+    return token, data
diff --git a/docs/writing_annotations.rst b/docs/writing_annotations.rst
@@ -13,8 +13,10 @@ comments into two parts- the annotation token, and the annotation data.
 
 - Annotation data
     Annotation data can either be a simple free text comment that is on the same line as the token, or a choice list.
-    The choices in a choice list are configured in the configuration file and can be separated by spaces or commas when
-    used in comments. As such, the choices themselves should not contain spaces or commas.
+    Free text annotations can span multiple lines, provided all lines after the first
+    are indented by at least two spaces. The choices in a choice list are configured in
+    the configuration file and can be separated by spaces or commas when used in
+    comments. As such, the choices themselves should not contain spaces or commas.
 
 The information below applies to both the Static Search and Django Model Search tools, with the exception that the
 Django Model Search only looks in model docstrings.
@@ -51,6 +53,30 @@ When a report is run against this code an entry like this will be generated in t
 
 *Note that the rest of the comment is ignored in the report.*
 
+An annotation can also span multiple lines. For instance:
+
+.. code-block:: python
+
+    """
+        This function handles setting the price on an item in the database.
+
+        .. fun_fact: This code is the only remaining piece of our first commit!
+          To write long texts, prepend at least two additional spaces at the start
+          of every line after the first.
+    """
+
+This code would result in the following report:
+
+.. code-block:: yaml
+
+    - annotation_data: "This code is the only remaining piece of our first commit!\n \
+        \     To write long texts, prepend at least two additional spaces at the start\n\
+        \      of every line after the first."
+      annotation_token: '.. fun_fact:'
+      filename: foo/bar/something.py
+      found_by: python
+      line_number: 1
+
 Configuration for an "async" annotation type, denoted by the annotation token ``.. async:`` and choices denoting the
 types of asynchronous processors hooked up to it:
 

diff --git a/tests/extensions/python_test_files/multiline_empty_first_line.pyt b/tests/extensions/python_test_files/multiline_empty_first_line.pyt
@@ -0,0 +1,7 @@
+"""
+.. pii:
+  This is an annotation that
+  spans multiple lines and allows developers to
+  write more extensive docs.
+Comment after annotation and being annotated
+"""
diff --git a/tests/extensions/python_test_files/multiline_indented.pyt b/tests/extensions/python_test_files/multiline_indented.pyt
@@ -0,0 +1,6 @@
+"""
+    .. pii: A long description that
+        spans multiple indented
+        lines
+    .. pii_types: id, name
+"""
diff --git a/tests/extensions/python_test_files/multiline_paragraphs.pyt b/tests/extensions/python_test_files/multiline_paragraphs.pyt
@@ -0,0 +1,15 @@
+"""
+.. pii: This is an annotation that
+  spans multiple paragraphs.
+
+  This allows developers to write even more
+  extensive docs.
+Comment after annotation and being annotated
+"""
+
+"""
+Docstring
+.. pii: Annotation 1 with:
+
+     Multi-line and multi-paragraph.
+"""
diff --git a/tests/extensions/python_test_files/multiline_simple.pyt b/tests/extensions/python_test_files/multiline_simple.pyt
@@ -0,0 +1,8 @@
+"""
+Docstring
+
+.. pii: A long description that
+  spans multiple
+  lines
+.. pii_types: id, name
+"""
diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py
@@ -3,6 +3,9 @@
 """
 import pytest
 
+from code_annotations.base import AnnotationConfig
+from code_annotations.extensions.python import PythonAnnotationExtension
+from code_annotations.helpers import VerboseEcho
 from tests.helpers import EXIT_CODE_FAILURE, EXIT_CODE_SUCCESS, call_script
 
 
@@ -31,3 +34,57 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me
 
     if expected_exit_code == EXIT_CODE_FAILURE:
         assert "Search failed due to linting errors!" in result.output
+
+
+@pytest.mark.parametrize('test_file,annotations', [
+    (
+        'multiline_simple.pyt',
+        [
+            ('.. pii:', """A long description that
+  spans multiple
+  lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
+    (
+        'multiline_indented.pyt',
+        [
+            ('.. pii:', """A long description that
+        spans multiple indented
+        lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
+    (
+        'multiline_empty_first_line.pyt',
+        [
+            ('.. pii:', """This is an annotation that
+  spans multiple lines and allows developers to
+  write more extensive docs."""),
+        ]
+    ),
+    (
+        'multiline_paragraphs.pyt',
+        [
+            ('.. pii:', """This is an annotation that
+  spans multiple paragraphs.
+
+  This allows developers to write even more
+  extensive docs."""),
+            ('.. pii:', """Annotation 1 with:
+
+     Multi-line and multi-paragraph.""")
+        ]
+    ),
+])
+def test_multi_line_annotations(test_file, annotations):
+    config = AnnotationConfig('tests/test_configurations/.annotations_test')
+    annotator = PythonAnnotationExtension(config, VerboseEcho())
+
+    with open('tests/extensions/python_test_files/{}'.format(test_file)) as fi:
+        result_annotations = annotator.search(fi)
+
+    assert len(annotations) == len(result_annotations)
+    for annotation, result_annotation in zip(annotations, result_annotations):
+        assert result_annotation['annotation_token'] == annotation[0]
+        assert result_annotation['annotation_data'] == annotation[1]